# Taobao DSSM Training Pipeline

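This notebook implements the complete training pipeline for the DSSM (Deep Structured Semantic Model) recommendation model: data preprocessing, feature engineering, model training, evaluation, and saving.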

In [None]:
# Environment Setup and Data Loading

import pandas as pd
import numpy as np
import os
import sys
import pickle
import tensorflow as tf
from tensorflow import keras
from keras.callbacks import EarlyStopping
import logging

# Configure logging: basicConfig sets up the root logger, and `logger` is the
# module-level logger used by the rest of the notebook.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Add src to path for imports. The membership check keeps the cell idempotent:
# re-running it no longer appends a duplicate entry to sys.path each time.
PROJECT_CODE_DIR = '/kaggle/input/dssm-project-code'
if PROJECT_CODE_DIR not in sys.path:
    sys.path.append(PROJECT_CODE_DIR)

# Import project modules (these must come after the sys.path update above,
# since `src` is resolved through the directory that was just added).
from src import config
from src import data_processing
from src import feature_engineering
from src import model

# Set data path: the dataset folder is looked up relative to the working
# directory, under ../input/.
DATA_PATH = f'../input/{config.DATASET_FOLDER_NAME}/'

logger.info("Environment and modules imported successfully!")
logger.info(f"Configuration loaded: BATCH_SIZE={config.BATCH_SIZE}, LEARNING_RATE={config.LEARNING_RATE}")

## Data Preprocessing

In [None]:
# Data Preprocessing

# Preprocessing is skipped only when every CSV that is loaded afterwards is
# already present. Checking train_data.csv alone would skip the pipeline for a
# partially written output directory and then fail on the missing
# test_data.csv.
required_outputs = ('train_data.csv', 'test_data.csv')
missing_outputs = [
    filename
    for filename in required_outputs
    if not os.path.exists(os.path.join(config.OUTPUT_DIR, filename))
]

if missing_outputs:
    logger.info("Starting data preprocessing pipeline...")
    # assumes run_pipeline (re)writes both CSVs into output_dir — TODO confirm
    data_processing.run_pipeline(
        data_path=DATA_PATH,
        output_dir=config.OUTPUT_DIR,
        chunk_size=config.CHUNKSIZE,
        negative_ratio=config.NEG_POS_RATIO
    )
else:
    logger.info("Data files already exist, skipping preprocessing.")

## Feature Engineering

In [None]:
# Feature Engineering

# Load the train/test CSVs from config.OUTPUT_DIR (train first, then test).
logger.info("Loading data...")
train_df, test_df = (
    pd.read_csv(os.path.join(config.OUTPUT_DIR, csv_name))
    for csv_name in ('train_data.csv', 'test_data.csv')
)

logger.info(f"Training data shape: {train_df.shape}")
logger.info(f"Test data shape: {test_df.shape}")

# Run the project's feature-engineering step on both frames and rebind the
# names to the frames it returns.
# presumably this also writes fitted encoders into config.OUTPUT_DIR — verify
# against feature_engineering.perform_feature_engineering
engineered_frames = feature_engineering.perform_feature_engineering(
    train_df,
    test_df,
    config.SPARSE_FEATURES,
    config.DENSE_FEATURES,
    config.OUTPUT_DIR,
)
train_df, test_df = engineered_frames

logger.info("Feature engineering completed!")
logger.info(f"Processed training data shape: {train_df.shape}")

## Model Training

In [None]:
# Model Training

# 1. Create model input dictionaries
def _as_model_input(frame):
    """Map each name in config.ALL_FEATURES to the matching column of *frame*."""
    return {feature: frame[feature] for feature in config.ALL_FEATURES}


# Features are passed as a name -> column mapping; labels as a plain array.
train_model_input = _as_model_input(train_df)
test_model_input = _as_model_input(test_df)
train_label = train_df['label'].values
test_label = test_df['label'].values

# Log split sizes and the sum of the label column for each split.
logger.info(f"Training samples: {len(train_label)}")
logger.info(f"Test samples: {len(test_label)}")
logger.info(f"Positive samples in training: {train_df['label'].sum()}")
logger.info(f"Positive samples in test: {test_df['label'].sum()}")

In [None]:
# 2. Create feature columns
from typing import List, Union

# Load the fitted encoders saved under config.OUTPUT_DIR.
# NOTE: pickle.load can execute arbitrary code; only load a file produced by
# this pipeline, never one from an untrusted source.
encoders_path = os.path.join(config.OUTPUT_DIR, 'feature_encoders.pkl')
with open(encoders_path, 'rb') as encoders_file:
    feature_encoders = pickle.load(encoders_file)


def _sparse_feature_columns(feature_names):
    """Return one model.SparseFeat per name in *feature_names*.

    The vocabulary size is the number of classes known to that feature's
    encoder plus one.
    """
    # presumably the extra slot is reserved for an unseen/padding id — TODO confirm
    return [
        model.SparseFeat(
            name=feature_name,
            vocabulary_size=len(feature_encoders[feature_name].classes_) + 1,
            embedding_dim=config.EMBEDDING_DIM,
        )
        for feature_name in feature_names
    ]


# User feature columns
user_feature_columns = _sparse_feature_columns(config.USER_SPARSE_FEATURES)

# Item feature columns
item_feature_columns: List[Union[model.SparseFeat, model.DenseFeat]] = _sparse_feature_columns(
    config.ITEM_SPARSE_FEATURES
)

# Add dense features
# NOTE(review): 'price' is hard-coded here although config.DENSE_FEATURES
# exists — confirm the two stay in sync.
item_feature_columns.append(model.DenseFeat(name='price', dimension=1))

logger.info(f"User feature columns: {len(user_feature_columns)}")
logger.info(f"Item feature columns: {len(item_feature_columns)}")

In [None]:
# 3. Build and compile model
dssm_model = model.DSSM(
    user_feature_columns,
    item_feature_columns,
    dnn_units=config.DNN_UNITS,
    temp=config.TEMP,
)
dssm_model.summary()

# The metric is named 'auc', so its validation counterpart is reported under
# the key 'val_auc'.
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=config.LEARNING_RATE)
auc_metric = tf.keras.metrics.AUC(name='auc')
dssm_model.compile(
    optimizer=adam_optimizer,
    loss="binary_crossentropy",
    metrics=[auc_metric],
)

logger.info("Model compiled successfully!")

In [None]:
# 4. Train model
# Stop training once validation AUC has gone `patience` epochs without
# improving, and restore the weights from the best epoch.
# The callback is taken from tf.keras, the same namespace that provides the
# optimizer and metric used at compile time, instead of the standalone
# `keras` package; mixing the two can break when the installed `keras` and
# TensorFlow's bundled Keras are different versions.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc',
    mode='max',
    patience=2,
    verbose=1,
    restore_best_weights=True
)

logger.info("Starting model training...")

# NOTE(review): the test split is used as validation data, so early stopping
# and best-weight restoration are selected on the same data that is later
# reported as the test result — consider holding out a separate validation set.
history = dssm_model.fit(
    train_model_input,
    train_label,
    batch_size=config.BATCH_SIZE,
    epochs=config.EPOCHS,
    validation_data=(test_model_input, test_label),
    callbacks=[early_stopping]
)

logger.info("Model training completed!")

## Model Evaluation and Saving

In [None]:
# Model Evaluation

# Evaluate on test set; the result is unpacked as (loss, AUC).
evaluation_results = dssm_model.evaluate(test_model_input, test_label)
test_loss, test_auc = evaluation_results
logger.info(f"Test Loss: {test_loss:.4f}")
logger.info(f"Test AUC: {test_auc:.4f}")

# Save model
# NOTE(review): Keras 3 `Model.save` requires a `.keras` or `.h5` file
# extension — confirm this extension-less path works with the installed
# Keras version.
model_save_path = os.path.join(config.OUTPUT_DIR, 'dssm_model')
dssm_model.save(model_save_path)
logger.info(f"Model saved to: {model_save_path}")

# Save training history (only the per-epoch metrics dict, not the History
# object itself).
history_save_path = os.path.join(config.OUTPUT_DIR, 'training_history.pkl')
if history is None:
    logger.warning("No training history to save")
else:
    with open(history_save_path, 'wb') as history_file:
        pickle.dump(history.history, history_file)
    logger.info(f"Training history saved to: {history_save_path}")