### Importing Modules

In [1]:
import torch
from torch import nn
import numpy as np
import polars as pl
import math

### Positional encoding for movie sequences

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence of embeddings.

    A table of shape ``(max_len, 1, d_model)`` is precomputed once and stored
    in the non-trainable buffer ``pe``. Even feature columns hold sine terms and
    odd feature columns hold cosine terms, each at a geometrically decreasing
    frequency. Dropout is applied to the sum of the input and the encoding.

    Args:
        d_model: Size of the last (feature) dimension of the inputs. Both even
            and odd sizes are supported.
        dropout: Dropout probability applied after adding the encoding.
        max_len: Longest sequence length the precomputed table can serve.
        batch_first: If ``False`` (default) inputs are laid out as
            ``(seq_len, batch, d_model)``; if ``True`` they are laid out as
            ``(batch, seq_len, d_model)``.
    """

    def __init__(self, d_model: int, dropout: float=0.1, max_len: int=5000,
                 batch_first: bool=False):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.batch_first = batch_first

        positions = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))

        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(positions * div_term)
        # With an odd d_model there is one fewer odd column than even columns,
        # so the cosine half only uses the first d_model // 2 frequencies.
        pe[:, 0, 1::2] = torch.cos(positions * div_term[: d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor):
        """Return ``dropout(x + positional_encoding)`` with the same shape as ``x``.

        Raises:
            ValueError: If the sequence dimension of ``x`` is longer than the
                ``max_len`` the table was built for.
        """
        seq_len = x.size(1) if self.batch_first else x.size(0)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_len {self.pe.size(0)}"
            )
        encoding = self.pe[:seq_len]  # (seq_len, 1, d_model)
        if self.batch_first:
            # (1, seq_len, d_model) so it broadcasts over the leading batch axis.
            encoding = encoding.transpose(0, 1)
        x = x + encoding
        return self.dropout(x)
    

In [None]:
# Smoke test: embed one batch of five token ids and add positional encodings.
pe = PositionalEncoding(2)
x = torch.tensor([[0,0,0,0,0]])  # (batch=1, seq_len=5)
print(x.shape)
e = nn.Embedding(10,2)
y_e = e(x)  # (batch=1, seq_len=5, d_model=2)
print(y_e.shape)
# PositionalEncoding indexes positions along dim 0. Feeding the batch-first
# tensor directly would give every token the position-0 encoding, so move the
# sequence axis to the front for the call and restore the layout afterwards.
y = pe(y_e.transpose(0, 1)).transpose(0, 1)
print(y.shape)

### Generate embeddings for a batch of movies

In [10]:
import torch 
from torch import nn
from typing import Tuple

class MovieEmbeddings(nn.Module):
    """Project a batch of movie metadata into one dense vector per movie.

    Each variable-length list column is pooled by a mean-mode ``EmbeddingBag``
    (genres at ``2 * d_model``; production companies, production countries and
    spoken languages at ``d_model``; keywords, tagline and overview share one
    word table at ``4 * d_model``). The pooled vectors are concatenated with
    seven scalar columns and passed through a single linear layer.

    Args:
        d_model: Base embedding width; see the per-column multiples above.
        hidden_size: Width of the output vector produced for each movie.
        num_list_features: Number of list-valued columns (``forward`` reads 7).
        num_scalar_features: Number of scalar columns (``forward`` reads 7).
        n_genres: Size of the genre vocabulary.
        n_production_companies: Size of the production-company vocabulary.
        n_production_countries: Size of the production-country vocabulary.
        n_spoken_languages: Size of the spoken-language vocabulary.
        n_words: Size of the word vocabulary shared by the three text columns.
    """

    def __init__(self,
                 d_model: int,
                 hidden_size: int,
                 num_list_features: int,
                 num_scalar_features: int,
                 n_genres: int,
                 n_production_companies: int,
                 n_production_countries: int,
                 n_spoken_languages: int,
                 n_words: int):
        super().__init__()
        self.genres_embedding = nn.EmbeddingBag(n_genres, d_model*2, mode='mean')
        self.prod_comp_embedding = nn.EmbeddingBag(n_production_companies, d_model, mode='mean')
        self.prod_cont_embedding = nn.EmbeddingBag(n_production_countries, d_model, mode='mean')
        self.lang_embedding = nn.EmbeddingBag(n_spoken_languages, d_model, mode='mean')
        self.word_embedding = nn.EmbeddingBag(n_words, d_model*4, mode='mean')
        # One d_model slot per list feature, plus 10 extra slots: genres are one
        # slot wider (2x) and each of the three text columns three wider (4x).
        self.fc = nn.Linear(d_model*(10+num_list_features)+num_scalar_features, hidden_size)
        self._init_weights()

    def _init_weights(self) -> None:
        """Xavier-initialise every embedding table and the projection layer."""
        nn.init.xavier_uniform_(self.genres_embedding.weight)
        nn.init.xavier_uniform_(self.prod_comp_embedding.weight)
        nn.init.xavier_uniform_(self.prod_cont_embedding.weight)
        nn.init.xavier_uniform_(self.lang_embedding.weight)
        nn.init.xavier_uniform_(self.word_embedding.weight)
        nn.init.xavier_uniform_(self.fc.weight)
        nn.init.zeros_(self.fc.bias)

    def _prepare_embedding_inputs(self, list_of_lists, device=None) -> Tuple[torch.Tensor, torch.Tensor]:
        """Flatten per-movie index lists into ``EmbeddingBag`` inputs.

        Args:
            list_of_lists: Iterable with one entry per movie; each entry is an
                iterable of integer indices, or ``None`` for a missing value.
            device: Optional device on which to create the returned tensors.

        Returns:
            ``(flat_indices, offsets)`` as ``torch.long`` tensors, where
            ``offsets[i]`` is the start of movie ``i`` inside ``flat_indices``.
        """
        flat_list = []
        offsets = []
        for sublist in list_of_lists:
            # The bag for this movie starts where the previous one ended.
            offsets.append(len(flat_list))
            # A missing list contributes no indices, i.e. an empty bag.
            if sublist is not None:
                flat_list.extend(sublist)
        flat_tensor = torch.tensor(flat_list, dtype=torch.long, device=device)
        offsets_tensor = torch.tensor(offsets, dtype=torch.long, device=device)
        return flat_tensor, offsets_tensor

    def _embed_bags(self, embedding: nn.EmbeddingBag, list_of_lists) -> torch.Tensor:
        """Pool one list-valued column with ``embedding`` on that table's device."""
        indices, offsets = self._prepare_embedding_inputs(
            list_of_lists, device=embedding.weight.device
        )
        return embedding(indices, offsets)

    def _scalar_column(self, values, dtype=torch.float32) -> torch.Tensor:
        """Turn one scalar column into a ``(batch, 1)`` tensor on the module's device."""
        return torch.tensor(values, dtype=dtype, device=self.fc.weight.device).unsqueeze(1)

    def forward(self, row: "pl.DataFrame") -> torch.Tensor:
        """Embed a batch of movies.

        Args:
            row: Batch of movies, indexed by column name. The columns read are
                ``genres_idx``, ``production_companies_idx``,
                ``production_countries_idx``, ``spoken_languages_idx``,
                ``keywords_idx``, ``tagline_idx``, ``overview_idx`` (lists of
                indices) and ``revenue``, ``budget``, ``runtime``,
                ``adult_idx``, ``vote_average``, ``vote_count``,
                ``popularity`` (scalars).

        Returns:
            Tensor of shape ``(batch, hidden_size)``.
        """
        genres_e = self._embed_bags(self.genres_embedding, row['genres_idx'])
        comp_e = self._embed_bags(self.prod_comp_embedding, row['production_companies_idx'])
        cont_e = self._embed_bags(self.prod_cont_embedding, row['production_countries_idx'])
        lang_e = self._embed_bags(self.lang_embedding, row['spoken_languages_idx'])
        # Keywords, tagline and overview all share the same word table.
        kw_e = self._embed_bags(self.word_embedding, row['keywords_idx'])
        tag_e = self._embed_bags(self.word_embedding, row['tagline_idx'])
        ov_e = self._embed_bags(self.word_embedding, row['overview_idx'])

        # Scalar features as [batch_size, 1] tensors.
        # NOTE(review): these are concatenated as-is; if they are not scaled
        # upstream, large-magnitude columns will dominate the projection —
        # TODO confirm.
        revenue = self._scalar_column(row["revenue"])
        budget = self._scalar_column(row["budget"])
        runtime = self._scalar_column(row["runtime"])
        # Read as a boolean flag, then cast explicitly so torch.cat receives
        # tensors of a single dtype.
        adult_idx = self._scalar_column(row["adult_idx"], dtype=torch.bool).to(torch.float32)
        vote_average = self._scalar_column(row["vote_average"])
        vote_count = self._scalar_column(row["vote_count"])
        popularity = self._scalar_column(row["popularity"])

        # Concatenate all embeddings and scalar features along the feature axis.
        master_embedding = torch.cat([
            genres_e,
            comp_e,
            cont_e,
            lang_e,
            kw_e,
            tag_e,
            ov_e,
            revenue,
            budget,
            runtime,
            adult_idx,
            vote_average,
            vote_count,
            popularity
        ], dim=1)

        return self.fc(master_embedding)

In [17]:
# The checkpoint holds pickled vocabulary objects (they expose `get_stoi()`),
# not plain tensors, so full unpickling is requested explicitly instead of
# relying on the `weights_only` default, which is True from PyTorch 2.6 on.
# SECURITY: unpickling can execute arbitrary code — only load trusted files.
vocabs = torch.load('../data/all_vocabs.pth', weights_only=False)

user_vocab = vocabs["user_vocab"]
movie_vocab = vocabs["movie_vocab"]
genres_vocab = vocabs["genres_vocab"]
prod_comp_vocab = vocabs["prod_comp_vocab"]
prod_countries_vocab = vocabs["prod_countries_vocab"]
languages_vocab = vocabs["languages_vocab"]
words_vocab = vocabs["words_vocab"]

# Token -> index lookups used by the dataset and collate function below.
movie_vocab_stoi = movie_vocab.get_stoi()
user_vocab_stoi = user_vocab.get_stoi()

In [12]:
# Preprocessed movie table with the index/scalar columns MovieEmbeddings reads.
movies_prepped = pl.read_parquet('../data/processed/output.parquet')

# Same configuration as before, spelled out by keyword for readability.
me = MovieEmbeddings(
    d_model=16,
    hidden_size=256,
    num_list_features=7,
    num_scalar_features=7,
    n_genres=len(genres_vocab),
    n_production_companies=len(prod_comp_vocab),
    n_production_countries=len(prod_countries_vocab),
    n_spoken_languages=len(languages_vocab),
    n_words=len(words_vocab),
)
print(f"Number of parameters: {sum(p.numel() for p in me.parameters() if p.requires_grad)}")

# Embed the first ten movies as a quick shape/value check.
y = me(movies_prepped[:10])
print(y.shape)
print(y)

Number of parameters: 18102672
torch.Size([10, 256])
tensor([[ 4347.0571,  2064.8047,  1902.0768,  ...,  3807.1909, -2614.1748,
          -671.3229],
        [ 4014.5408,  2019.2638,  1782.0667,  ...,  3566.9268, -2405.8521,
          -626.9221],
        [ 4107.3535,  1571.4141,  1645.0946,  ...,  3552.3320, -2475.6802,
          -632.4916],
        ...,
        [ 2912.2559,  2153.8618,  1563.6954,  ...,  2677.5276, -1732.4044,
          -457.1087],
        [ 3454.8982,  1460.3892,  1426.9508,  ...,  3037.6448, -2076.8008,
          -549.8853],
        [ 2934.0432,  1941.7190,  1530.2401,  ...,  2573.1045, -1762.0673,
          -430.7871]], grad_fn=<AddmmBackward0>)


### Loading Data

In [13]:
# Read both splits the same way: parquet -> polars frame -> NumPy array.
train_data_raw, test_data_raw = (
    pl.read_parquet(parquet_path).to_numpy()
    for parquet_path in (
        '../data/processed/train.parquet',
        '../data/processed/test.parquet',
    )
)

In [15]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

class MovieSeqDataset(Dataset):
    """Dataset of per-user movie sequences with their ratings.

    Args:
        data: Indexable collection whose items unpack as
            ``(user, movie_sequence, rating_sequence)``.
        movie_vocab_stoi: Mapping from movie token to integer index; must
            contain the ``'<unk>'`` token used for unknown movies.
        user_vocab_stoi: Mapping from user token to integer index.
    """

    def __init__(self, data, movie_vocab_stoi, user_vocab_stoi):
        self.data = data
        self.movie_vocab_stoi = movie_vocab_stoi
        self.user_vocab_stoi = user_vocab_stoi

    def __len__(self):
        """Return the number of sequences in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(movie_indices, user_index, ratings)`` tensors for item ``idx``.

        Movies missing from the vocabulary map to its ``'<unk>'`` index. A user
        missing from the user vocabulary raises ``KeyError``.
        """
        user, movie_sequence, rating_sequence = self.data[idx]
        # Use the vocabulary given to this dataset, not a notebook-level global
        # that happens to share the same name.
        unk_idx = self.movie_vocab_stoi['<unk>']
        movie_data = [self.movie_vocab_stoi.get(item, unk_idx) for item in movie_sequence]
        user_data = self.user_vocab_stoi[user]
        return torch.tensor(movie_data), torch.tensor(user_data), torch.tensor(rating_sequence)
    
def collate_batch(batch, movie_pad_value=None, rating_pad_value=3):
    """Collate ``(movies, user, ratings)`` samples into padded batch tensors.

    Args:
        batch: List of ``(movie_tensor, user_tensor, rating_tensor)`` triples.
        movie_pad_value: Fill value for short movie sequences. When ``None``
            (default) the ``'<unk>'`` index of the notebook-level
            ``movie_vocab_stoi`` is used, matching the previous behaviour.
        rating_pad_value: Fill value for short rating sequences (default 3).

    Returns:
        ``(movies, users, ratings)``: movies and ratings are padded batch-first
        to the longest sequence in the batch; users are stacked along dim 0.
    """
    # NOTE(review): no padding mask or lengths are returned, so padded positions
    # look like real '<unk>' movies with the fill rating — confirm that
    # downstream code accounts for this.
    if movie_pad_value is None:
        movie_pad_value = movie_vocab_stoi['<unk>']
    movie_list = [item[0] for item in batch]
    user_list = [item[1] for item in batch]
    rating_list = [item[2] for item in batch]
    padded_movies = pad_sequence(movie_list, padding_value=movie_pad_value, batch_first=True)
    padded_ratings = pad_sequence(rating_list, padding_value=rating_pad_value, batch_first=True)
    return padded_movies, torch.stack(user_list), padded_ratings

In [18]:
BATCH_SIZE = 16

# Wrap the raw train/test arrays; both splits share the same vocab lookups.
train_dataset = MovieSeqDataset(
    data=train_data_raw,
    movie_vocab_stoi=movie_vocab_stoi,
    user_vocab_stoi=user_vocab_stoi,
)
val_dataset = MovieSeqDataset(
    data=test_data_raw,
    movie_vocab_stoi=movie_vocab_stoi,
    user_vocab_stoi=user_vocab_stoi,
)

# Only the training loader shuffles; validation keeps the stored order.
train_iter = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_batch,
)
val_iter = DataLoader(
    dataset=val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_batch,
)

In [19]:
# Pull a single batch to sanity-check the collated tensor shapes; the loop
# index was never used, so iterate over the loader directly.
for movie_data, user_data, ratings_data in train_iter:
    print(movie_data.shape, user_data.shape, ratings_data.shape)
    break

torch.Size([16, 5]) torch.Size([16]) torch.Size([16, 5])
