<a href="https://colab.research.google.com/github/Papa-Panda/industry_algo/blob/main/Deep_Interest_Evolution_Network_(DIEN)_Implementation_in_PyTorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score

print(f"PyTorch Version: {torch.__version__}")

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- 1. Synthetic Data Generation ---
# Reusing the data generation from the DIN PyTorch example.

def generate_synthetic_data(num_samples=10000, num_users=1000, num_items=500, history_length=10, embedding_dim=16):
    """
    Generates synthetic data for a Deep Interest Evolution Network.
    """
    user_ids = np.random.randint(1, num_users + 1, num_samples)
    candidate_item_ids = np.random.randint(1, num_items + 1, num_samples)

    historical_item_ids = []
    for _ in range(num_samples):
        current_history_len = np.random.randint(1, history_length + 1)
        history = np.random.randint(1, num_items + 1, current_history_len).tolist()
        history.extend([0] * (history_length - len(history))) # Pad with 0s
        historical_item_ids.append(history)
    historical_item_ids = np.array(historical_item_ids)

    labels = np.random.randint(0, 2, num_samples)

    item_features_matrix = np.random.rand(num_items + 1, embedding_dim).astype(np.float32)

    print(f"Generated synthetic data: {num_samples} samples.")
    print(f"  User IDs shape: {user_ids.shape}")
    print(f"  Candidate Item IDs shape: {candidate_item_ids.shape}")
    print(f"  Historical Item IDs shape: {historical_item_ids.shape}")
    print(f"  Labels shape: {labels.shape}")
    print(f"  Item Features matrix shape: {item_features_matrix.shape}")

    user_ids_t = torch.LongTensor(user_ids)
    candidate_item_ids_t = torch.LongTensor(candidate_item_ids)
    historical_item_ids_t = torch.LongTensor(historical_item_ids)
    labels_t = torch.FloatTensor(labels).unsqueeze(1)

    return user_ids_t, candidate_item_ids_t, historical_item_ids_t, labels_t, item_features_matrix

# Generate the data
num_users = 1000
num_items = 500
history_length = 10
embedding_dim = 16
gru_units = 32 # Units for GRU layers

user_ids_data, candidate_item_ids_data, historical_item_ids_data, labels_data, item_features_data_np = \
    generate_synthetic_data(num_samples=50000, num_users=num_users, num_items=num_items, history_length=history_length, embedding_dim=embedding_dim)

# --- 2. DIEN Model Architecture in PyTorch ---

# Custom Activation Function (Dice) - Reused from DIN example
class Dice(nn.Module):
    """
    Data Adaptive Activation Function (DICE) for DIEN in PyTorch.
    """
    def __init__(self, input_dim, epsilon=1e-9):
        super().__init__()
        self.epsilon = epsilon
        self.alphas = nn.Parameter(torch.zeros(input_dim))
        self.beta = nn.Parameter(torch.zeros(input_dim))

    def forward(self, x):
        reduction_axes = tuple(range(x.dim() - 1))
        mean = torch.mean(x, dim=reduction_axes, keepdim=True)
        variance = torch.mean(torch.pow(x - mean, 2), dim=reduction_axes, keepdim=True)
        x_normed = (x - mean) / torch.sqrt(variance + self.epsilon)
        p = torch.sigmoid(self.alphas * x_normed + self.beta)
        return p * x + (1 - p) * self.alphas * x

# Attention Layer for DIEN's Interest Evolving Layer
class DIENAttention(nn.Module):
    """
    Attention mechanism for DIEN's interest evolving layer.
    Computes attention weights for each historical state w.r.t. the candidate item.
    """
    def __init__(self, embedding_dim, attention_hidden_units=[32, 16]):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.attention_hidden_units = attention_hidden_units

        attention_mlp_layers = []
        # Input dim: candidate_emb + history_state + product + difference
        input_dim_mlp = embedding_dim * 4

        for i, units in enumerate(attention_hidden_units):
            attention_mlp_layers.append(nn.Linear(input_dim_mlp, units))
            attention_mlp_layers.append(Dice(units))
            input_dim_mlp = units

        self.attention_mlp = nn.Sequential(*attention_mlp_layers)
        self.output_layer = nn.Linear(input_dim_mlp, 1) # Outputs a single attention score

    def forward(self, candidate_item_embedding, historical_sequence_states):
        # candidate_item_embedding: (batch_size, embedding_dim)
        # historical_sequence_states: (batch_size, seq_len, gru_units) from the first GRU

        batch_size, seq_len, _ = historical_sequence_states.shape

        # Expand candidate_item_embedding to match seq_len for concatenation
        candidate_item_embedding_tiled = candidate_item_embedding.unsqueeze(1).expand(-1, seq_len, -1)

        # Concatenate features for attention MLP
        concatenated_features = torch.cat([
            candidate_item_embedding_tiled,
            historical_sequence_states,
            candidate_item_embedding_tiled * historical_sequence_states,
            candidate_item_embedding_tiled - historical_sequence_states
        ], dim=-1)

        # Apply attention MLP
        attention_logits = self.attention_mlp(concatenated_features) # (batch_size, seq_len, 1)
        attention_logits = self.output_layer(attention_logits) # (batch_size, seq_len, 1)

        # Mask out padded items (where historical_sequence_states are all zeros)
        mask = (historical_sequence_states.abs().sum(dim=-1, keepdim=True) > 0).float()
        attention_logits = attention_logits - (1.0 - mask) * 1e9 # Apply large negative value to masked logits

        attention_weights = F.softmax(attention_logits, dim=1) # (batch_size, seq_len, 1)

        return attention_weights

# Attention-Guided GRU Layer (Simplified AUGRU concept)
class AttentionGRULayer(nn.Module):
    """
    A simplified implementation of the Attention Update Gate (AUGRU) concept for DIEN.
    It applies attention weights to the input sequence before feeding it into a standard GRU.
    """
    def __init__(self, units):
        super().__init__()
        self.units = units
        # Standard GRU layer
        self.gru = nn.GRU(input_size=units, hidden_size=units, batch_first=True)

    def forward(self, input_sequence_embedding, attention_weights, initial_state=None):
        # input_sequence_embedding: (batch_size, seq_len, units) from the first GRU's output
        # attention_weights: (batch_size, seq_len, 1) from DIENAttention layer

        # Apply attention weights to the input sequence
        # This effectively modulates the input to the GRU based on attention.
        attention_weighted_inputs = input_sequence_embedding * attention_weights

        # Process the attention-weighted sequence with the GRU
        # If initial_state is None, GRU initializes it with zeros.
        _, final_state = self.gru(attention_weighted_inputs, initial_state)

        # final_state is (1, batch_size, units) for non-bidirectional GRU
        return final_state.squeeze(0) # (batch_size, units)

# Deep Interest Evolution Network (DIEN) Model
class DIEN(nn.Module):
    def __init__(self, num_users, num_items, history_length, embedding_dim, gru_units, item_features_matrix):
        super().__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.history_length = history_length
        self.embedding_dim = embedding_dim
        self.gru_units = gru_units

        # Embedding layers
        self.user_embedding = nn.Embedding(num_users + 1, embedding_dim)
        self.item_embedding = nn.Embedding.from_pretrained(
            torch.from_numpy(item_features_matrix).float(),
            freeze=False
        )

        # Interest Extractor Layer: GRU to get sequential interest states
        # input_size=embedding_dim (from item embeddings), hidden_size=gru_units
        # In PyTorch, return_sequences=True is implied when batch_first=True and you take the first output of GRU
        self.interest_extractor_gru = nn.GRU(input_size=embedding_dim, hidden_size=gru_units, batch_first=True)

        # Interest Evolving Layer components
        self.dien_attention = DIENAttention(embedding_dim=gru_units, attention_hidden_units=[32, 16]) # Attention over GRU states
        self.interest_evolving_gru = AttentionGRULayer(units=gru_units) # Attention-guided GRU

        # Final Prediction MLP
        # Input dim: user_emb + candidate_item_emb + evolved_interest_state
        input_mlp_dim = embedding_dim + embedding_dim + gru_units
        self.mlp = nn.Sequential(
            nn.Linear(input_mlp_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
        )
        self.output_layer = nn.Linear(32, 1)

    def forward(self, user_id, candidate_item_id, historical_item_ids):
        user_id = user_id.squeeze(-1)
        candidate_item_id = candidate_item_id.squeeze(-1)

        user_emb = self.user_embedding(user_id)
        candidate_item_emb = self.item_embedding(candidate_item_id)
        historical_item_embs = self.item_embedding(historical_item_ids)

        # Interest Extractor Layer
        # historical_item_embs: (batch_size, history_length, embedding_dim)
        # interest_states: (batch_size, history_length, gru_units)
        interest_states, _ = self.interest_extractor_gru(historical_item_embs)

        # Interest Evolving Layer
        # Project candidate_item_emb to gru_units before passing to attention
        candidate_item_emb_proj = nn.Linear(self.embedding_dim, self.gru_units).to(device)(candidate_item_emb)

        attention_weights = self.dien_attention(candidate_item_emb_proj, interest_states) # (batch_size, history_length, 1)

        # 2. Evolve Interest using AttentionGRULayer
        evolved_interest_state = self.interest_evolving_gru(interest_states, attention_weights) # (batch_size, gru_units)

        # Concatenate all features
        concatenated_features = torch.cat([
            user_emb,
            candidate_item_emb,
            evolved_interest_state
        ], dim=-1)

        # Pass through prediction MLP
        mlp_output = self.mlp(concatenated_features)
        logits = self.output_layer(mlp_output)

        return logits

# Instantiate and move model to device
dien_model = DIEN(
    num_users=num_users,
    num_items=num_items,
    history_length=history_length,
    embedding_dim=embedding_dim,
    gru_units=gru_units,
    item_features_matrix=item_features_data_np
).to(device)

print("\nDIEN Model Summary:")
print(dien_model)


# --- 3. Prepare Data for Training (PyTorch DataLoader) ---
dataset = TensorDataset(user_ids_data, candidate_item_ids_data, historical_item_ids_data, labels_data)

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

batch_size = 256
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# --- 4. Train the Model ---
optimizer = torch.optim.Adam(dien_model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()

num_epochs = 5
print(f"\n--- Training the DIEN Model for {num_epochs} epochs ---")

for epoch in range(num_epochs):
    dien_model.train()
    total_loss = 0
    predictions_train = []
    labels_train = []

    for batch_idx, (user_ids, candidate_ids, historical_ids, labels) in enumerate(train_loader):
        user_ids, candidate_ids, historical_ids, labels = \
            user_ids.to(device), candidate_ids.to(device), historical_ids.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = dien_model(user_ids, candidate_ids, historical_ids)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        predictions_train.extend(outputs.sigmoid().detach().cpu().numpy().flatten())
        labels_train.extend(labels.detach().cpu().numpy().flatten())

    avg_train_loss = total_loss / len(train_loader)
    train_accuracy = accuracy_score(labels_train, np.round(predictions_train))
    train_auc = roc_auc_score(labels_train, predictions_train)

    # --- Validation ---
    dien_model.eval()
    val_total_loss = 0
    predictions_val = []
    labels_val = []

    with torch.no_grad():
        for user_ids, candidate_ids, historical_ids, labels in val_loader:
            user_ids, candidate_ids, historical_ids, labels = \
                user_ids.to(device), candidate_ids.to(device), historical_ids.to(device), labels.to(device)

            outputs = dien_model(user_ids, candidate_ids, historical_ids)
            loss = criterion(outputs, labels)
            val_total_loss += loss.item()

            predictions_val.extend(outputs.sigmoid().cpu().numpy().flatten())
            labels_val.extend(labels.cpu().numpy().flatten())

    avg_val_loss = val_total_loss / len(val_loader)
    val_accuracy = accuracy_score(labels_val, np.round(predictions_val))
    val_auc = roc_auc_score(labels_val, predictions_val)

    print(f"Epoch {epoch+1}/{num_epochs}:")
    print(f"  Train Loss: {avg_train_loss:.4f}, Train Acc: {train_accuracy:.4f}, Train AUC: {train_auc:.4f}")
    print(f"  Val Loss: {avg_val_loss:.4f}, Val Acc: {val_accuracy:.4f}, Val AUC: {val_auc:.4f}")

print("\nTraining complete.")

# --- 5. Make Predictions (Example) ---
print("\n--- Making Predictions (Example) ---")

dien_model.eval()

num_predict_samples = 5
random_indices = np.random.choice(len(user_ids_data), num_predict_samples, replace=False)

sample_user_ids = user_ids_data[random_indices].to(device)
sample_candidate_item_ids = candidate_item_ids_data[random_indices].to(device)
sample_historical_item_ids = historical_item_ids_data[random_indices].to(device)
sample_labels = labels_data[random_indices].to(device)

with torch.no_grad():
    sample_outputs = dien_model(sample_user_ids, sample_candidate_item_ids, sample_historical_item_ids)
    sample_predictions = sample_outputs.sigmoid().cpu().numpy().flatten()

print("\nSample Predictions:")
for i in range(num_predict_samples):
    print(f"  Sample {i+1}:")
    print(f"    User ID: {sample_user_ids[i].item()}")
    print(f"    Candidate Item ID: {sample_candidate_item_ids[i].item()}")
    print(f"    Historical Item IDs: {sample_historical_item_ids[i].cpu().numpy().tolist()}")
    print(f"    True Label: {sample_labels[i].item()}")
    print(f"    Predicted Probability: {sample_predictions[i]:.4f}")
    print("-" * 30)

PyTorch Version: 2.6.0+cu124
Using device: cpu
Generated synthetic data: 50000 samples.
  User IDs shape: (50000,)
  Candidate Item IDs shape: (50000,)
  Historical Item IDs shape: (50000, 10)
  Labels shape: (50000,)
  Item Features matrix shape: (501, 16)

DIEN Model Summary:
DIEN(
  (user_embedding): Embedding(1001, 16)
  (item_embedding): Embedding(501, 16)
  (interest_extractor_gru): GRU(16, 32, batch_first=True)
  (dien_attention): DIENAttention(
    (attention_mlp): Sequential(
      (0): Linear(in_features=128, out_features=32, bias=True)
      (1): Dice()
      (2): Linear(in_features=32, out_features=16, bias=True)
      (3): Dice()
    )
    (output_layer): Linear(in_features=16, out_features=1, bias=True)
  )
  (interest_evolving_gru): AttentionGRULayer(
    (gru): GRU(32, 32, batch_first=True)
  )
  (mlp): Sequential(
    (0): Linear(in_features=64, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=128, out_feat