# Two-Tower Recommendation System Training on Kaggle

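This notebook demonstrates how to train the two-tower recommendation model on a Kaggle GPU: it installs the dependencies, clones the project repository, downloads and preprocesses the AliEC dataset, and trains the model with early stopping on validation NDCG@10.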

In [None]:
# Install required packages
!pip install -q torch numpy pandas scikit-learn tqdm pyyaml

In [None]:
# Clone the repository
!git clone https://github.com/your-username/two-tower-rec.git
!cd two-tower-rec

In [None]:
# Download AliEC dataset
!mkdir -p data/raw
!kaggle datasets download -d your-username/aliec-dataset
!unzip aliec-dataset.zip -d data/raw

In [None]:
import sys

# Make the cloned repository's packages importable. The membership check keeps
# the cell idempotent: re-running it does not add duplicate sys.path entries.
SRC_DIR = 'two-tower-rec/src'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

import torch
from models.two_tower import TwoTowerModel
from trainers.two_tower_trainer import TwoTowerTrainer
from data.data_loader import get_dataloader
from utils.config import load_config, DEFAULT_CONFIG

In [None]:
# Pick the compute device: CUDA when a GPU is visible, otherwise the CPU.
has_cuda = torch.cuda.is_available()
device = torch.device('cuda') if has_cuda else torch.device('cpu')
print(f'Using device: {device}')

# Report the name and total memory (in GB, base 10) of GPU 0 when present.
if has_cuda:
    print(f'GPU: {torch.cuda.get_device_name(0)}')
    gpu_props = torch.cuda.get_device_properties(0)
    print(f'Memory: {gpu_props.total_memory / 1e9:.2f} GB')

In [None]:
# Preprocess data: run the repository's preprocessing script as a separate
# process from the current working directory.
# NOTE(review): presumably it reads data/raw and writes data/processed (the
# directory the data-loader cell reads from) — confirm against preprocess.py.
!python two-tower-rec/src/data/preprocess.py

In [None]:
# Build the train and validation loaders. Both read the same processed-data
# directory with the configured batch size; only `mode` differs.
loader_kwargs = dict(
    data_path='data/processed',
    batch_size=DEFAULT_CONFIG['training']['batch_size'],
)

train_loader = get_dataloader(mode='train', **loader_kwargs)
valid_loader = get_dataloader(mode='valid', **loader_kwargs)

In [None]:
# Initialize the model and move it to the selected device (GPU when available)
model = TwoTowerModel(DEFAULT_CONFIG['model']).to(device)

# Gradient scaler for mixed precision training. Scaling only applies on CUDA,
# so it is enabled explicitly based on the selected device: on CPU a disabled
# scaler makes `scaler.step(optimizer)` a plain `optimizer.step()` instead of
# relying on PyTorch to warn and disable it implicitly.
# NOTE(review): newer PyTorch releases prefer `torch.amp.GradScaler('cuda', ...)`;
# the `torch.cuda.amp` spelling is kept for compatibility with older versions.
scaler = torch.cuda.amp.GradScaler(enabled=(device.type == 'cuda'))

# Initialize optimizer with the configured learning rate
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=DEFAULT_CONFIG['training']['learning_rate']
)

# Initialize trainer; it receives the model, optimizer, device and scaler
trainer = TwoTowerTrainer(
    model=model,
    optimizer=optimizer,
    device=device,
    scaler=scaler
)

In [None]:
# Training loop with early stopping on validation NDCG@10.
# The weights of the best epoch so far are written to 'best_model.pt'; training
# stops once `patience` consecutive epochs fail to improve on the best score.
best_metric = float('-inf')
patience = DEFAULT_CONFIG['training']['early_stopping_patience']
patience_counter = 0
num_epochs = DEFAULT_CONFIG['training']['num_epochs']

for epoch in range(num_epochs):
    # One pass over the training data, then evaluate on the validation set
    train_loss = trainer.train_epoch(train_loader)
    metrics = trainer.validate(valid_loader)

    # Per-epoch report: training loss followed by every validation metric
    print(f'Epoch {epoch + 1}/{num_epochs}')
    print(f'Train Loss: {train_loss:.4f}')
    for name, value in metrics.items():
        print(f'{name}: {value:.4f}')

    current_ndcg = metrics['ndcg@10']
    improved = current_ndcg > best_metric

    if improved:
        # New best: remember the score, reset patience, checkpoint the weights
        best_metric = current_ndcg
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model.pt')
        continue

    # No improvement this epoch: stop once patience is used up
    patience_counter += 1
    if patience_counter >= patience:
        print('Early stopping triggered')
        break

In [None]:
# Report the best validation NDCG@10 reached during training (print only;
# the matching weights were written to 'best_model.pt' by the training loop).
print('Best NDCG@10: {:.4f}'.format(best_metric))