# Training Transformer Recommender on INSPIRED Dataset

This notebook trains the Transformer-based movie recommender on the INSPIRED dataset.

## Environment Setup

In [1]:
import os
import sys
from pathlib import Path

# Set working directory to the project root.
# The notebook is expected to start one level below the root. An unconditional
# `os.chdir("..")` climbs one more directory every time this cell is re-run,
# which breaks the `scripts.*` imports and the relative "data" path used later.
# Only step up when the current directory does not already contain the
# `scripts/` package, so re-running the cell is a no-op.
if not Path("scripts").is_dir():
    os.chdir("..")

# Add project root to Python path so the `scripts` package can be imported
project_root = os.getcwd()
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Check current directory
current_directory = os.getcwd()
print("Project Root:", project_root)
print("Current Working Directory:", current_directory)

Project Root: C:\Users\91953\Documents\GitHub\RAG-Movie-CRS
Current Working Directory: C:\Users\91953\Documents\GitHub\RAG-Movie-CRS


In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer

import pandas as pd
import numpy as np
from tqdm import tqdm
import json

from datetime import datetime
from collections import Counter
import matplotlib.pyplot as plt

# Import your transformer model
from scripts.transformer_recommender import TransformerRecommender, INSPIREDDataProcessor

In [3]:
# Report the PyTorch build and select the compute device:
# CUDA when the runtime exposes it, otherwise fall back to the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device('cuda' if cuda_available else 'cpu')

print(
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {cuda_available}",
    f"Using device: {device}",
    sep="\n",
)

PyTorch version: 2.9.1+cpu
CUDA available: False
Using device: cpu


## DATA PREPARATION FOR TRAINING

### Load Dataset And Create Training Dataset

In [None]:
# Data processor rooted at the relative "data" directory
data_processor = INSPIREDDataProcessor(dataset_dir="data")

# Movie database: the processor returns two lookup maps
print("Loading movie database...")
movie_id_map, movie_name_map = data_processor.load_movie_database()
print(f"\nTotal movies in database: {len(movie_id_map)}")

# Dialog splits: "train" is loaded first, then "dev"; max_dialogs=None for both
print("\nLoading dialogs...")
train_dialogs, val_dialogs = (
    data_processor.load_dialogs(split=split_name, max_dialogs=None)
    for split_name in ("train", "dev")
)

# Summary of how many dialogs each split contains
print(
    f"\nDataset sizes:",
    f"Training: {len(train_dialogs)} dialogs",
    f"Validation: {len(val_dialogs)} dialogs",
    sep="\n",
)

### PyTorch Dataset Class

In [None]:
class MovieRecommendationDataset(Dataset):
    """PyTorch Dataset for INSPIRED movie conversations.

    Each item pairs a tokenized conversation with a multi-hot target vector
    of length ``num_movies``.

    Args:
        dialogs: Indexable collection of dialog records. Each record is read
            through the keys ``'conversation'`` (passed to the tokenizer) and
            ``'recommended_movies'`` (iterable of movie indices).
        num_movies: Length of the label vector produced for every item.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding='max_length', truncation=True,
            max_length=..., return_tensors='pt')``; its result must provide
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Sequence length forwarded to the tokenizer.
    """

    def __init__(self, dialogs, num_movies, tokenizer, max_length=512):
        self.dialogs = dialogs
        self.num_movies = num_movies
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of dialogs in the dataset."""
        return len(self.dialogs)

    def __getitem__(self, idx):
        """Return the tokenized conversation and multi-hot labels for one dialog.

        Returns:
            dict with keys ``'input_ids'``, ``'attention_mask'`` and
            ``'labels'`` (float tensor of length ``num_movies``).
        """
        dialog = self.dialogs[idx]

        # Tokenize conversation, padded/truncated to a fixed length
        encoding = self.tokenizer(
            dialog['conversation'],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        # Create multi-hot label vector (1 for recommended movies, 0 for others)
        labels = torch.zeros(self.num_movies, dtype=torch.float)
        for movie_id in dialog['recommended_movies']:
            # Skip ids outside [0, num_movies). The lower bound matters:
            # a negative id would otherwise wrap around via tensor indexing
            # and silently mark an unrelated movie as a positive label.
            if 0 <= movie_id < self.num_movies:
                labels[movie_id] = 1.0

        # squeeze(0) drops the leading dimension of the tokenizer output
        # (assumes a single text yields tensors shaped (1, max_length) -- TODO confirm)
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': labels
        }

print("MovieRecommendationDataset class defined")

### CREATE DATASETS AND DATALOADERS

In [None]:
# Tokenizer shared by both splits
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# One dataset per split; both use the same label width and sequence length
train_dataset, val_dataset = (
    MovieRecommendationDataset(split_dialogs, len(movie_id_map), tokenizer, max_length=512)
    for split_dialogs in (train_dialogs, val_dialogs)
)

# Batch size for both loaders (tune to the memory available on your device)
batch_size = 8

# Training batches are shuffled; validation keeps dataset order.
# num_workers=0 loads data in the main process (set to 0 for Windows).
train_loader, val_loader = (
    DataLoader(split_dataset, batch_size=batch_size, shuffle=shuffle_split, num_workers=0)
    for split_dataset, shuffle_split in ((train_dataset, True), (val_dataset, False))
)

print(
    f"Created dataloaders:",
    f"  Train batches: {len(train_loader)}",
    f"  Val batches: {len(val_loader)}",
    f"  Batch size: {batch_size}",
    sep="\n",
)

### Initialize Model

In [None]:
# Build the recommender with one output per movie and move it to the selected device
model = TransformerRecommender(
    model_name="bert-base-uncased",
    num_movies=len(movie_id_map),
).to(device)

# Parameter counts: all parameters, and the subset that receives gradients
total_params = sum(param.numel() for param in model.parameters())
trainable_params = sum(
    param.numel()
    for param in filter(lambda candidate: candidate.requires_grad, model.parameters())
)

print(
    f"Model initialized:",
    f"  Total parameters: {total_params:,}",
    f"  Trainable parameters: {trainable_params:,}",
    f"  Device: {device}",
    sep="\n",
)

## TRAINING

### Training Configuration

### Training Loop

### Validation

### Save/Load Checkpoints