<a href="https://colab.research.google.com/github/Papa-Panda/Paper_reading/blob/main/Deep_Interest_Network_(DIN)_Implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# https://gemini.google.com/app/72f602f1dcb0baa4
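# Background reading: 屠龙少年与龙：漫谈深度学习驱动的广告推荐技术发展周期 ("The Dragon-Slaying Youth and the Dragon: on the development cycles of deep-learning-driven ad recommendation technology") - article by 朱小强 (Zhu Xiaoqiang) on Zhihu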
# https://zhuanlan.zhihu.com/p/398041971

In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import pandas as pd

# Record the installed TensorFlow version in the run output.
print(f"TensorFlow Version: {tf.__version__}")

# --- 1. Synthetic Data Generation ---
# This dataset simulates user interactions with items.
# Each user has a history of clicked items, and we'll predict if they click a new candidate item.

def generate_synthetic_data(num_samples=10000, num_users=1000, num_items=500, history_length=10,
                            embedding_dim=16):
    """
    Generates synthetic data for a Deep Interest Network.

    User and item IDs start at 1; ID 0 is reserved for padding. Labels are
    drawn uniformly at random, so they carry no signal about the features.

    Args:
        num_samples (int): Total number of data points (user-candidate pairs).
        num_users (int): Number of unique users.
        num_items (int): Number of unique items.
        history_length (int): Maximum length of a user's historical clicked items.
        embedding_dim (int): Width of the simulated item feature vectors.

    Returns:
        tuple: A tuple containing:
            - np.array: User IDs, shape (num_samples,).
            - np.array: Candidate Item IDs, shape (num_samples,).
            - np.array: Historical Item IDs, shape (num_samples, history_length),
              right-padded with 0s.
            - np.array: Labels (0 or 1), shape (num_samples,).
            - np.array: Item feature embeddings, shape (num_items + 1, embedding_dim).
              Row 0 (the padding item) is all zeros.
    """
    user_ids = np.random.randint(1, num_users + 1, num_samples)
    candidate_item_ids = np.random.randint(1, num_items + 1, num_samples)

    # Each sample keeps between 1 and history_length real items. Draw a full grid of
    # item IDs, then zero everything at or beyond each row's valid length so the padding
    # sits at the tail of the row.
    valid_lengths = np.random.randint(1, history_length + 1, num_samples)
    drawn_item_ids = np.random.randint(1, num_items + 1, (num_samples, history_length))
    is_real_item = np.arange(history_length) < valid_lengths[:, None]
    historical_item_ids = np.where(is_real_item, drawn_item_ids, 0)

    # Purely random click labels.
    labels = np.random.randint(0, 2, num_samples)

    # One feature row per item ID, plus row 0 for the padding ID.
    item_features = np.random.rand(num_items + 1, embedding_dim)
    # The padding row must be exactly zero: downstream code identifies padded history
    # positions by their all-zero embedding.
    item_features[0] = 0.0

    print(f"Generated synthetic data: {num_samples} samples.")
    print(f"  User IDs shape: {user_ids.shape}")
    print(f"  Candidate Item IDs shape: {candidate_item_ids.shape}")
    print(f"  Historical Item IDs shape: {historical_item_ids.shape}")
    print(f"  Labels shape: {labels.shape}")
    print(f"  Item Features shape: {item_features.shape}")

    return user_ids, candidate_item_ids, historical_item_ids, labels, item_features

# Experiment configuration, then the synthetic dataset drawn from it
num_users, num_items = 1000, 500
history_length = 10  # Max length of user behavior sequence
embedding_dim = 16  # Dimensionality of item and user embeddings

(
    user_ids_data,
    candidate_item_ids_data,
    historical_item_ids_data,
    labels_data,
    item_features_data,
) = generate_synthetic_data(
    num_samples=50000,
    num_users=num_users,
    num_items=num_items,
    history_length=history_length,
)

# --- 2. DIN Model Architecture ---

# Custom layers for DIN: the Dice activation and the attention pooling layer
class Dice(layers.Layer):
    """
    Data Adaptive Activation Function (Dice) for DIN.

    A PReLU-like activation with a data-dependent gate. Each feature is
    standardised with the mean/variance of the current batch, passed through
    a sigmoid to obtain a gate ``p`` in (0, 1), and the output is

        p * x + (1 - p) * alpha * x

    where ``alpha`` is a learned per-feature slope for the "off" side.
    ``beta`` is a learned per-feature shift added inside the sigmoid; it is
    zero-initialised, so the gate starts as ``sigmoid(standardised x)``.

    Args:
        axis (int): Feature axis. Statistics are reduced over every other
            axis, and ``alpha`` / ``beta`` hold one value per index along it.
        epsilon (float): Added to the variance before the square root.

    Note:
        Statistics are always computed from the batch passed to ``call``
        (no moving averages are kept), so outputs depend on which samples
        share a batch, including at prediction time.
    """
    def __init__(self, axis=-1, epsilon=1e-9, **kwargs):
        super().__init__(**kwargs)
        self.axis = axis
        self.epsilon = epsilon

    def build(self, input_shape):
        # One parameter per feature along the configured feature axis.
        num_features = input_shape[self.axis]
        self.alphas = self.add_weight(
            shape=(num_features,),
            initializer='zeros',
            trainable=True,
            name='dice_alpha'
        )
        self.beta = self.add_weight(
            shape=(num_features,),
            initializer='zeros',
            trainable=True,
            name='dice_beta'
        )
        super().build(input_shape)

    def call(self, inputs):
        rank = len(inputs.shape)
        feature_axis = self.axis % rank  # normalise negative axes such as -1

        # Reduce over every axis except the feature axis so each feature gets
        # its own mean and variance.
        reduce_axes = [ax for ax in range(rank) if ax != feature_axis]
        mean = tf.reduce_mean(inputs, axis=reduce_axes, keepdims=True)
        variance = tf.reduce_mean(tf.square(inputs - mean), axis=reduce_axes, keepdims=True)

        # Normalize the input
        x_normed = (inputs - mean) / tf.sqrt(variance + self.epsilon)

        alphas, beta = self.alphas, self.beta
        if feature_axis != rank - 1:
            # The 1-D parameters broadcast against the last axis by default;
            # reshape them when the feature axis lives elsewhere.
            broadcast_shape = [1] * rank
            broadcast_shape[feature_axis] = -1
            alphas = tf.reshape(alphas, broadcast_shape)
            beta = tf.reshape(beta, broadcast_shape)

        # Gate driven by the standardised input. It must not be scaled by
        # alpha: alpha starts at zero, which would pin the gate at 0.5.
        p = tf.sigmoid(x_normed + beta)

        # Blend the identity branch and the alpha-scaled branch.
        return p * inputs + (1 - p) * alphas * inputs

    def get_config(self):
        config = super().get_config()
        config.update({
            "axis": self.axis,
            "epsilon": self.epsilon,
        })
        return config

class AttentionPoolingLayer(layers.Layer):
    """
    Attention pooling layer for Deep Interest Network (DIN).

    Scores every historical item against the candidate item with a small MLP
    (Dense + Dice blocks followed by a 1-unit Dense), turns the scores into
    softmax weights over the history axis, and returns the weighted sum of
    the historical item embeddings.

    Args:
        embedding_dim (int): Item embedding width (stored for ``get_config``).
        hidden_units (sequence of int): Widths of the attention MLP's hidden
            layers, in order.

    Call inputs:
        ``[candidate_item_embedding, historical_item_embeddings]`` or
        ``[candidate_item_embedding, historical_item_embeddings, history_mask]``

        - candidate_item_embedding: (batch_size, embedding_dim)
        - historical_item_embeddings: (batch_size, history_length, embedding_dim)
        - history_mask (optional): (batch_size, history_length); non-zero/True
          marks a real item, zero/False marks padding. When omitted, a position
          is treated as padding only if its embedding vector is exactly all
          zeros, which requires the padding embedding row to be (and stay) zero.

    Returns:
        Tensor of shape (batch_size, embedding_dim). Rows whose history is
        entirely padding pool to zeros.
    """
    def __init__(self, embedding_dim, hidden_units=(80, 40), **kwargs):
        super().__init__(**kwargs)
        self.embedding_dim = embedding_dim
        # Copy into a fresh list so instances never share (or mutate) the default.
        self.hidden_units = list(hidden_units)

        # Attention network: Dense (linear) followed by Dice for each hidden width.
        self.dense_layers = []
        for units in self.hidden_units:
            self.dense_layers.append(layers.Dense(units, activation=None))
            self.dense_layers.append(Dice())

        self.output_layer = layers.Dense(1, activation=None)  # Scalar attention score per position

    def call(self, inputs):
        if len(inputs) == 3:
            candidate_item_embedding, historical_item_embeddings, history_mask = inputs
        else:
            candidate_item_embedding, historical_item_embeddings = inputs
            history_mask = None

        # Repeat the candidate along the history axis:
        # (batch_size, embedding_dim) -> (batch_size, history_length, embedding_dim)
        candidate_item_embedding_expanded = tf.expand_dims(candidate_item_embedding, 1)
        candidate_item_embedding_tiled = tf.tile(
            candidate_item_embedding_expanded,
            [1, tf.shape(historical_item_embeddings)[1], 1]
        )

        # Pairwise interaction features per history position:
        # candidate, history item, element-wise product, difference
        # -> (batch_size, history_length, embedding_dim * 4)
        concatenated_features = tf.concat([
            candidate_item_embedding_tiled,
            historical_item_embeddings,
            candidate_item_embedding_tiled * historical_item_embeddings,
            candidate_item_embedding_tiled - historical_item_embeddings
        ], axis=-1)

        # Pass through attention network
        attention_logits = concatenated_features
        for layer in self.dense_layers:
            attention_logits = layer(attention_logits)

        attention_logits = self.output_layer(attention_logits)  # (batch_size, history_length, 1)

        # Work out which history positions hold real items.
        if history_mask is None:
            # Fallback: a position is padding iff its embedding is exactly zero.
            is_real_item = tf.reduce_sum(
                tf.abs(historical_item_embeddings), axis=-1, keepdims=True
            ) > 0
        else:
            is_real_item = tf.expand_dims(tf.cast(history_mask, tf.bool), axis=-1)
        mask = tf.cast(is_real_item, attention_logits.dtype)  # (batch_size, history_length, 1)

        # Push padded positions to a very negative logit so softmax ignores them.
        attention_logits = attention_logits - (1.0 - mask) * 1e9

        attention_weights = tf.nn.softmax(attention_logits, axis=1)  # (batch_size, history_length, 1)
        # If every position is padding the softmax is uniform rather than zero;
        # re-applying the mask makes such rows contribute nothing.
        attention_weights = attention_weights * mask

        # Weighted sum over the history axis:
        # (batch_size, history_length, embedding_dim) -> (batch_size, embedding_dim)
        weighted_history_embedding = tf.reduce_sum(attention_weights * historical_item_embeddings, axis=1)

        return weighted_history_embedding

    def get_config(self):
        config = super().get_config()
        config.update({
            "embedding_dim": self.embedding_dim,
            "hidden_units": self.hidden_units,
        })
        return config


def build_din_model(num_users, num_items, history_length, embedding_dim, item_features_matrix):
    """
    Builds and compiles the Deep Interest Network (DIN) model.

    The model takes a user ID, a candidate item ID and a fixed-length,
    0-padded list of historical item IDs. Candidate and history share one item
    embedding table; the history is pooled by ``AttentionPoolingLayer`` with
    respect to the candidate, and the user embedding, candidate embedding and
    pooled history are concatenated and fed to an MLP with a sigmoid output.

    Args:
        num_users (int): Total number of unique users (IDs 1..num_users).
        num_items (int): Total number of unique items (IDs 1..num_items; 0 is padding).
        history_length (int): Maximum length of user historical behavior sequence.
        embedding_dim (int): Dimensionality of item and user embeddings.
        item_features_matrix (np.array): Initial item embeddings of shape
            (num_items + 1, embedding_dim). The array is copied, not modified;
            row 0 of the copy (the padding ID) is forced to zero.

    Returns:
        keras.Model: Compiled DIN model (Adam, binary cross-entropy, accuracy + AUC).

    Raises:
        ValueError: If ``item_features_matrix`` does not have shape
            (num_items + 1, embedding_dim).
    """
    # Work on a float32 copy so the caller's array is left untouched.
    initial_item_embeddings = np.array(item_features_matrix, dtype='float32')
    expected_shape = (num_items + 1, embedding_dim)
    if initial_item_embeddings.shape != expected_shape:
        raise ValueError(
            f"item_features_matrix must have shape {expected_shape}, "
            f"got {initial_item_embeddings.shape}"
        )
    # AttentionPoolingLayer recognises padded history positions by an all-zero
    # embedding, so the padding row (ID 0) has to start out as zeros.
    initial_item_embeddings[0] = 0.0

    # Input Layers
    user_id_input = keras.Input(shape=(1,), name='user_id_input', dtype='int32')
    candidate_item_id_input = keras.Input(shape=(1,), name='candidate_item_id_input', dtype='int32')
    historical_item_ids_input = keras.Input(shape=(history_length,), name='historical_item_ids_input', dtype='int32')

    # User embeddings: simple lookup
    user_embedding_layer = layers.Embedding(
        input_dim=num_users + 1,  # +1 because user IDs start at 1
        output_dim=embedding_dim,
        name='user_embedding'
    )
    user_embedding = user_embedding_layer(user_id_input)  # (batch_size, 1, embedding_dim)
    user_embedding = layers.Reshape((embedding_dim,))(user_embedding)  # (batch_size, embedding_dim)

    # Item embeddings: one table shared by the candidate and the history,
    # initialised from the supplied matrix and fine-tuned during training.
    item_embedding_layer = layers.Embedding(
        input_dim=num_items + 1,  # +1 for 0-padding
        output_dim=embedding_dim,
        weights=[initial_item_embeddings],
        trainable=True,
        name='item_embedding'
    )

    # Candidate item embedding
    candidate_item_embedding = item_embedding_layer(candidate_item_id_input)  # (batch_size, 1, embedding_dim)
    candidate_item_embedding = layers.Reshape((embedding_dim,))(candidate_item_embedding)  # (batch_size, embedding_dim)

    # Historical items embeddings
    historical_item_embeddings = item_embedding_layer(historical_item_ids_input)  # (batch_size, history_length, embedding_dim)

    # DIN attention: weighted sum of the history, weighted by relevance to the candidate.
    attention_output = AttentionPoolingLayer(
        embedding_dim=embedding_dim,
        hidden_units=[80, 40],  # Attention MLP hidden units
        name='din_attention_pooling'
    )([candidate_item_embedding, historical_item_embeddings])

    # Concatenate user embedding, candidate embedding and pooled history
    concatenated_features = layers.concatenate([
        user_embedding,
        candidate_item_embedding,
        attention_output
    ], axis=-1)

    # Prediction MLP (Deep Network)
    mlp_output = layers.Dense(128, activation='relu')(concatenated_features)
    mlp_output = layers.Dropout(0.3)(mlp_output)
    mlp_output = layers.Dense(64, activation='relu')(mlp_output)
    mlp_output = layers.Dropout(0.3)(mlp_output)
    mlp_output = layers.Dense(32, activation='relu')(mlp_output)

    # Output layer (sigmoid for binary classification)
    output = layers.Dense(1, activation='sigmoid', name='output')(mlp_output)

    # Create the model
    model = keras.Model(
        inputs=[user_id_input, candidate_item_id_input, historical_item_ids_input],
        outputs=output,
        name='deep_interest_network'
    )

    # Compile the model
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy', keras.metrics.AUC(name='auc')]
    )

    return model

# Instantiate and compile the DIN model from the shared configuration
din_model = build_din_model(
    num_users,
    num_items,
    history_length,
    embedding_dim,
    item_features_data,
)

din_model.summary()

# --- 3. Prepare Data for Training ---
# Feed arrays by name; each key matches the `name` of a keras.Input in the model
model_inputs = dict(
    user_id_input=user_ids_data,
    candidate_item_id_input=candidate_item_ids_data,
    historical_item_ids_input=historical_item_ids_data,
)

# --- 4. Train the Model ---
print("\n--- Training the DIN Model ---")
history = din_model.fit(
    x=model_inputs,
    y=labels_data,
    validation_split=0.2,  # hold out 20% of the samples for validation
    epochs=5,  # kept small: this is a demonstration run
    batch_size=256,
    verbose=1,
)

print("\nTraining complete.")
# history.history maps each metric name to a per-epoch list; keep the last epoch only.
_final_epoch = {metric: values[-1] for metric, values in history.history.items()}
print(f"Final training accuracy: {_final_epoch['accuracy']:.4f}")
print(f"Final validation accuracy: {_final_epoch['val_accuracy']:.4f}")
print(f"Final training AUC: {_final_epoch['auc']:.4f}")
print(f"Final validation AUC: {_final_epoch['val_auc']:.4f}")

# --- 5. Make Predictions (Example) ---
print("\n--- Making Predictions (Example) ---")

# Pick a few distinct random samples from the generated data
num_predict_samples = 5
random_indices = np.random.choice(len(user_ids_data), num_predict_samples, replace=False)

sample_user_ids = user_ids_data[random_indices]
sample_candidate_item_ids = candidate_item_ids_data[random_indices]
sample_historical_item_ids = historical_item_ids_data[random_indices]
sample_labels = labels_data[random_indices]

sample_inputs = {
    'user_id_input': sample_user_ids,
    'candidate_item_id_input': sample_candidate_item_ids,
    'historical_item_ids_input': sample_historical_item_ids
}

# One sigmoid output per sample; flatten so each sample maps to a single probability
predictions = din_model.predict(sample_inputs)
predicted_probabilities = np.ravel(predictions)

print("\nSample Predictions:")
sample_rows = zip(
    sample_user_ids,
    sample_candidate_item_ids,
    sample_historical_item_ids,
    sample_labels,
    predicted_probabilities,
)
for sample_number, (user_id, candidate_id, item_history, label, probability) in enumerate(sample_rows, start=1):
    print(f"  Sample {sample_number}:")
    # np.ravel(...)[0] yields the scalar ID whether the ID arrays are shaped (n,) or (n, 1)
    print(f"    User ID: {np.ravel(user_id)[0]}")
    print(f"    Candidate Item ID: {np.ravel(candidate_id)[0]}")
    print(f"    Historical Item IDs: {item_history.tolist()}")
    print(f"    True Label: {label}")
    print(f"    Predicted Click Probability: {probability:.4f}")