# Importing Modules

In [75]:
import os
import time
import math
from typing import Any, Dict, List, Optional, Tuple
from tempfile import TemporaryDirectory
import json

import numpy as np
import polars as pl
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Data Loading

In [2]:
movies_df = pl.read_parquet('../data/processed/output.parquet')
train_df = pl.read_parquet('../data/processed/train.parquet')
test_df = pl.read_parquet('../data/processed/test.parquet')
vocabs = torch.load('../data/processed/all_vocabs.pth')

user_vocab = vocabs["user_vocab"]
movie_vocab = vocabs["movie_vocab"]
genres_vocab = vocabs["genres_vocab"]
prod_comp_vocab = vocabs["prod_comp_vocab"]
prod_countries_vocab = vocabs["prod_countries_vocab"]
languages_vocab = vocabs["languages_vocab"]
words_vocab = vocabs["words_vocab"]

vocabs = {
    "user_vocab": user_vocab,
    "movie_vocab": movie_vocab,
    "genres_vocab": genres_vocab,
    "prod_comp_vocab": prod_comp_vocab,
    "prod_countries_vocab": prod_countries_vocab,
    "languages_vocab": languages_vocab,
    "words_vocab": words_vocab,
}

movie_vocab_stoi = movie_vocab.get_stoi()
user_vocab_stoi = user_vocab.get_stoi()



In [3]:
print(f"Movies DF shape: {movies_df.shape}")
print(f"Train DF shape: {train_df.shape}")
print(f"Test DF shape: {test_df.shape}")
print("Vocab sizes:", {k: len(v) for k, v in vocabs.items()})
print("\nFirst movie row:", movies_df[0].to_dict(as_series=False))
print("First sequence row:", train_df[0].to_dict(as_series=False))

Movies DF shape: (86493, 15)
Train DF shape: (13218442, 3)
Test DF shape: (2330541, 3)
Vocab sizes: {'user_vocab': 200949, 'movie_vocab': 86494, 'genres_vocab': 21, 'prod_comp_vocab': 45546, 'prod_countries_vocab': 201, 'languages_vocab': 164, 'words_vocab': 270246}

First movie row: {'movieId_idx': [29614], 'genres_idx': [[9, 12, 1]], 'production_companies_idx': [[5787, 10790, 33238]], 'production_countries_idx': [[199, 179]], 'spoken_languages_idx': [[6, 23, 137, 47]], 'keywords_idx': [[16308, 175809, 93150, 146205, 199438, 206752, 33269, 147659, 198442, 112495, 215262, 104288, 10122, 84395, 101318, 37337, 35868, 263713, 52720, 82866]], 'overview_idx': [[176485, 24219, 129031, 130595, 204009, 102295, 187482, 46191, 265111, 75891, 120691, 82866, 7607, 80603, 210357, 60107, 248412, 24219, 171073, 185742, 207323, 80603, 208294, 186120, 230514, 250658, 48946, 24219, 162914, 55906, 185742, 145939, 86280, 230898, 120691, 50712, 7607, 233356, 69050, 257026, 247940, 24219, 99177, 12348]], 't

In [4]:
# --- Get Vocab Sizes for Model ---
len_genres = len(genres_vocab)
len_prod_comp = len(prod_comp_vocab)
len_prod_cont = len(prod_countries_vocab)
len_langs = len(languages_vocab)
len_words = len(words_vocab)
n_items = len(movie_vocab)

In [59]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

class MovieSeqDataset(Dataset):
    def __init__(self, data, movie_vocab_stoi, user_vocab_stoi):
        self.data = data
        self.movie_vocab_stoi = movie_vocab_stoi
        self.user_vocab_stoi = user_vocab_stoi
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        user, movie_sequence, rating_sequence = self.data[idx]
        movie_data = [self.movie_vocab_stoi.get(item,movie_vocab_stoi['<unk>']) for item in movie_sequence.to_list()[0]]
        user_data = self.user_vocab_stoi[user.to_list()[0]]
        rating_sequence = rating_sequence.to_list()[0]
        return torch.tensor(movie_data[:-1]), torch.tensor(user_data), torch.tensor(rating_sequence[:-1]), torch.tensor(movie_data[-1])
    
def collate_batch(batch):
    movie_list = [item[0] for item in batch]
    user_list = [item[1] for item in batch]
    rating_list = [item[2] for item in batch]
    target_list = [item[3] for item in batch]
    return pad_sequence(movie_list, padding_value=movie_vocab_stoi['<unk>'], batch_first=True), torch.stack(user_list), pad_sequence(rating_list, padding_value=3, batch_first=True), torch.stack(target_list)

In [60]:
BATCH_SIZE = 16

train_dataset = MovieSeqDataset(train_df, movie_vocab_stoi, user_vocab_stoi)
val_dataset = MovieSeqDataset(test_df, movie_vocab_stoi, user_vocab_stoi)

train_iter = DataLoader(train_dataset, batch_size=BATCH_SIZE,shuffle=True, collate_fn=collate_batch)
val_iter = DataLoader(val_dataset, batch_size=BATCH_SIZE,shuffle=False, collate_fn=collate_batch)

In [61]:
for i, (movie_data, user_data, ratings_data, y_train) in enumerate(train_iter):
    print(movie_data.shape, user_data.shape, ratings_data.shape, y_train.shape)
    break

torch.Size([16, 4]) torch.Size([16]) torch.Size([16, 4]) torch.Size([16])


# Embeddings

In [7]:
import torch 
from torch import nn
from typing import Tuple

class MovieEmbeddings(nn.Module):
    def __init__(self, 
                 d_model: int,
                 hidden_size: int,
                 num_list_features: int,
                 num_scalar_features: int,
                 n_genres: int, 
                 n_production_companies: int,
                 n_production_countries: int,
                 n_spoken_languages: int,
                 n_words: int):
        super().__init__()
        self.genres_embedding = nn.EmbeddingBag(n_genres, d_model*2, mode='mean')
        self.prod_comp_embedding = nn.EmbeddingBag(n_production_companies, d_model, mode='mean')
        self.prod_cont_embedding = nn.EmbeddingBag(n_production_countries, d_model, mode='mean')
        self.lang_embedding = nn.EmbeddingBag(n_spoken_languages, d_model, mode='mean')
        self.word_embedding = nn.EmbeddingBag(n_words, d_model*4, mode='mean')
        self.fc = nn.Linear(d_model*(10+num_list_features)+num_scalar_features,hidden_size)
        self._init_weights()

    def _init_weights(self) -> None:
        nn.init.xavier_uniform_(self.genres_embedding.weight)
        nn.init.xavier_uniform_(self.prod_comp_embedding.weight)
        nn.init.xavier_uniform_(self.prod_cont_embedding.weight)
        nn.init.xavier_uniform_(self.lang_embedding.weight)
        nn.init.xavier_uniform_(self.word_embedding.weight)
        nn.init.xavier_uniform_(self.fc.weight)
        nn.init.zeros_(self.fc.bias)

    def _prepare_embedding_inputs(self, list_of_lists) -> Tuple[torch.Tensor, torch.Tensor]:
        flat_list = []
        offsets = [0]
        for sublist in list_of_lists:
            flat_list.extend(sublist)
            offsets.append(offsets[-1] + len(sublist))
        offsets = offsets[:-1]  # Remove last cumulative sum
        offsets = torch.tensor(offsets, dtype=torch.long)
        flat_list = torch.tensor(flat_list, dtype=torch.long)
        return flat_list, offsets   

    def forward(self, row: pl.DataFrame) -> torch.Tensor:
        genres, genres_offsets = self._prepare_embedding_inputs(row['genres_idx'])
        genres_e = self.genres_embedding(genres, genres_offsets)

        comp, comp_offsets = self._prepare_embedding_inputs(row['production_companies_idx'])
        comp_e = self.prod_comp_embedding(comp, comp_offsets)

        cont, cont_offsets = self._prepare_embedding_inputs(row['production_countries_idx'])
        cont_e = self.prod_cont_embedding(cont, cont_offsets)

        lang, lang_offsets = self._prepare_embedding_inputs(row['spoken_languages_idx'])
        lang_e = self.lang_embedding(lang, lang_offsets)

        kw, kw_offsets = self._prepare_embedding_inputs(row['keywords_idx'])
        kw_e = self.word_embedding(kw, kw_offsets)

        tag, tag_offsets = self._prepare_embedding_inputs(row['tagline_idx'])
        tag_e = self.word_embedding(tag, tag_offsets)

        ov, ov_offsets = self._prepare_embedding_inputs(row['overview_idx'])
        ov_e = self.word_embedding(ov, ov_offsets)

        # Scalar features as tensors (ensure shape is [batch_size, 1])
        revenue = torch.tensor(row["revenue"], dtype=torch.float32).unsqueeze(1)
        budget = torch.tensor(row["budget"], dtype=torch.float32).unsqueeze(1)
        runtime = torch.tensor(row["runtime"], dtype=torch.float32).unsqueeze(1)
        adult_idx = torch.tensor(row["adult_idx"], dtype=torch.bool).unsqueeze(1)
        vote_average = torch.tensor(row["vote_average"], dtype=torch.float32).unsqueeze(1)
        vote_count = torch.tensor(row["vote_count"], dtype=torch.float32).unsqueeze(1)
        popularity = torch.tensor(row["popularity"], dtype=torch.float32).unsqueeze(1)

        # Concatenate all embeddings and scalar features
        master_embedding = torch.cat([
            genres_e,
            comp_e,
            cont_e,
            lang_e,
            kw_e,
            tag_e,
            ov_e,
            revenue,
            budget,
            runtime,
            adult_idx,
            vote_average,
            vote_count,
            popularity
        ], dim=1)

        return self.fc(master_embedding)

# Model

In [28]:
class TRXTransformer(nn.Module):
    def __init__(self, d_model: int, n_heads: int, num_layers: int):
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.num_layers = num_layers

        self.transformer_blocks = nn.ModuleList([nn.TransformerEncoderLayer(d_model=d_model, nhead=n_heads, batch_first=True) for _ in range(num_layers)])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x shape: (batch_size, seq_len, d_model)
        # Placeholder for transformer logic
        print(f"Input shape to transformer: {x.shape}")
        for transformer_block in self.transformer_blocks:
            x = transformer_block(x)
        # x shape: (batch_size, seq_len, d_model)
        print(f"Output shape from transformer: {x.shape}")
        return x

In [70]:
class TRXModel(nn.Module):
    def __init__(self, 
                 d_model: int,
                 n_heads: int,
                 num_layers: int,
                 hidden_size: int,
                 num_list_features: int,
                 num_scalar_features: int,
                 n_genres: int, 
                 n_production_companies: int,
                 n_production_countries: int,
                 n_spoken_languages: int,
                 movie_vocab_stoi: Dict[str, int],
                 user_vocab_stoi: Dict[str, int],
                 n_movies: int,
                 n_words: int,
                 seq_len: int = 4,
                 dropout_rate: float = 0.1):
        super().__init__()
        self.movie_embeddings = MovieEmbeddings(d_model, hidden_size, num_list_features, num_scalar_features, n_genres, n_production_companies, n_production_countries, n_spoken_languages, n_words)
        self.movie_vocab_stoi = movie_vocab_stoi
        self.user_vocab_stoi = user_vocab_stoi
        self.transformer_encoder = TRXTransformer(hidden_size, n_heads=n_heads, num_layers=num_layers)
        self.fc1 = nn.Linear(hidden_size*seq_len, n_movies)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # # rows['sequence_movie_ids'] -> List[List[Any]] of shape [batch_size, 4]
        # batch_movie_ids = rows['sequence_movie_ids'].to_list()

        # # Map movie IDs to vocab indices using movie_vocab_stoi
        # indexed_ids = [
        #     [movie_vocab_stoi.get(str(mid), movie_vocab_stoi['<unk>']) for mid in seq]
        #     for seq in batch_movie_ids
        # ]
        
        # Fetch the rows filtered by the movie IDs
        movie_rows = [movies_df.filter(pl.col('movieId_idx').is_in(x[i])) for i in range(len(x))] # List[pl.DataFrame]
        
        # Get embeddings: [batch_size, seq_len, embedding_dim]
        embeddings = torch.stack([self.movie_embeddings(row) for row in movie_rows], dim=0)

        
        # Pass Embeddings through the Transformer Encoder
        transformer_output = self.transformer_encoder(embeddings)
        # transformer_output shape: [batch_size, seq_len, d_model]

        # Reshape transformer_output to [batch_size, seq_len * d_model]
        transformer_output = transformer_output.view(transformer_output.size(0), -1)
        # transformer_output shape: [batch_size, seq_len * d_model]
        print(f"Transformer output shape: {transformer_output.shape}")

        # Pass to fc1
        fc1_output = self.fc1(transformer_output)
        # fc1_output shape: [batch_size, n_movies]

        # Apply dropout
        fc1_output = self.dropout(fc1_output)
        # fc1_output shape: [batch_size, n_movies]

        # Apply softmax to get probabilities
        probabilities = F.softmax(fc1_output, dim=1)
        # probabilities shape: [batch_size, n_movies]

        return probabilities




In [73]:
model = TRXModel(
    d_model=16,
    n_heads=4,
    num_layers=4,
    hidden_size=256,
    num_list_features=7,
    num_scalar_features=7,
    n_genres=len_genres, 
    n_production_companies=len_prod_comp, 
    n_production_countries=len_prod_cont, 
    n_spoken_languages=len_langs, 
    movie_vocab_stoi=movie_vocab_stoi,
    user_vocab_stoi=user_vocab_stoi,
    n_movies=len(movie_vocab_stoi),
    n_words=len_words
)

for idx, (movie_data, user_data, ratings_data, y_train) in enumerate(train_iter):
    print(movie_data.shape, user_data.shape, ratings_data.shape, y_train.shape)
    print(movie_data[0])
    y = model(movie_data)
    print(y.shape)
    break

torch.Size([16, 4]) torch.Size([16]) torch.Size([16, 4]) torch.Size([16])
tensor([12029, 34224, 26889, 37526])
Input shape to transformer: torch.Size([16, 4, 256])
Output shape from transformer: torch.Size([16, 4, 256])
Transformer output shape: torch.Size([16, 1024])
torch.Size([16, 86494])


In [74]:
# Total parameters in the model
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters in the model: {total_params / 1e6:.2f}M")

Total parameters in the model: 112.02M


# Training Loop

In [79]:
best_val_loss = float('inf')
epochs = 10

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = TRXModel(
    d_model=16,
    n_heads=4,
    num_layers=4,
    hidden_size=256,
    num_list_features=7,
    num_scalar_features=7,
    n_genres=len_genres, 
    n_production_companies=len_prod_comp, 
    n_production_countries=len_prod_cont, 
    n_spoken_languages=len_langs, 
    movie_vocab_stoi=movie_vocab_stoi,
    user_vocab_stoi=user_vocab_stoi,
    n_movies=len(movie_vocab_stoi),
    n_words=len_words
)
model.to(device)

TRXModel(
  (movie_embeddings): MovieEmbeddings(
    (genres_embedding): EmbeddingBag(21, 32, mode='mean')
    (prod_comp_embedding): EmbeddingBag(45546, 16, mode='mean')
    (prod_cont_embedding): EmbeddingBag(201, 16, mode='mean')
    (lang_embedding): EmbeddingBag(164, 16, mode='mean')
    (word_embedding): EmbeddingBag(270246, 64, mode='mean')
    (fc): Linear(in_features=279, out_features=256, bias=True)
  )
  (transformer_encoder): TRXTransformer(
    (transformer_blocks): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(in_features=256, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=256, bias=True)
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05,

In [80]:
def train(model: nn.Module, train_iter, epoch) -> None:
    # Switch to training mode
    model.train()
    total_loss = 0.
    log_interval = 200
    start_time = time.time()

    for i, (movie_data, _, _, target) in enumerate(train_iter):
        # Load movie sequence and user id
        movie_data = movie_data.to(device)
        target = target.to(device)

        # Predict movies
        output = model(movie_data) # [batch-size, n_movies]
        
        # Backpropogation process
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        
        total_loss += loss.item()
        # Results
        if i % log_interval == 0 and i > 0:
            lr = scheduler.get_last_lr()[0]
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            cur_loss = total_loss / log_interval
            ppl = math.exp(cur_loss)
            print(f'| epoch {epoch:3d} '
                  f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
                  f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
            total_loss = 0
            start_time = time.time()

In [78]:
def evaluate(model: nn.Module, eval_data: torch.Tensor) -> float:
    # Switch the model to evaluation mode.
    # This is necessary for layers like dropout,
    model.eval() 
    total_loss = 0.

    with torch.no_grad():
        for i, (movie_data, _, _, target) in enumerate(eval_data):
            # Load movie sequence and user id
            movie_data = movie_data.to(device)
            target = target.to(device)
            
            # Predict movies
            output = model(movie_data)

            # Calculate loss
            loss = criterion(output, target)
            total_loss += loss.item()
    return total_loss / (len(eval_data) - 1)

In [82]:
with TemporaryDirectory() as tempdir:
    best_model_params_path = os.path.join(tempdir, "best_model_params.pt")

    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()

        # Training
        train(model, train_iter, epoch)

        # Evaluation
        val_loss = evaluate(model, val_iter)

        # Compute the perplexity of the validation loss
        val_ppl = math.exp(val_loss)
        elapsed = time.time() - epoch_start_time

        # Results
        print('-' * 89)
        print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
            f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
        print('-' * 89)

        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), best_model_params_path)

        scheduler.step()
    model.load_state_dict(torch.load(best_model_params_path)) # load best model

Input shape to transformer: torch.Size([16, 4, 256])
Output shape from transformer: torch.Size([16, 4, 256])
Transformer output shape: torch.Size([16, 1024])
Input shape to transformer: torch.Size([16, 4, 256])
Output shape from transformer: torch.Size([16, 4, 256])
Transformer output shape: torch.Size([16, 1024])


KeyboardInterrupt: 