In [6]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_squared_error
from tqdm.auto import tqdm
import csv

In [13]:
# 1. Load the datasets
movies = pd.read_csv('data/movies.csv')
ratings = pd.read_csv('data/ratings.csv', low_memory=False)

# Map raw movie IDs to titles using the frame that is already in memory
# (avoids opening and parsing movies.csv a second time with the csv module).
movie_id_to_title = dict(zip(movies['movieId'].tolist(), movies['title'].tolist()))

# Merge the datasets (inner join: only movies that have at least one rating remain)
data = pd.merge(movies, ratings, on='movieId')

# Convert user and item IDs to contiguous 0-based indices.
# pd.factorize numbers values in order of first appearance, i.e. the same
# numbering as enumerate(series.unique()), but does it in one vectorised pass
# instead of calling a Python lambda once per row.
user_codes, user_uniques = pd.factorize(data['userId'])
movie_codes, movie_uniques = pd.factorize(data['movieId'])

# raw ID -> index lookups (movie_ids is inverted later to recover titles)
user_ids = {raw_id: index for index, raw_id in enumerate(user_uniques.tolist())}
movie_ids = {raw_id: index for index, raw_id in enumerate(movie_uniques.tolist())}
n_users = len(user_ids)
n_movies = len(movie_ids)

data['userId'] = user_codes
data['movieId'] = movie_codes

# Spot-check: every rating row belonging to the user with index 69212
data[data['userId'] == 69212]

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
95290,3,Waiting to Exhale (1995),Comedy|Drama|Romance,69212,1.0,840658825
171357,9,GoldenEye (1995),Action|Adventure|Thriller,69212,2.0,840657994
222700,15,Casino (1995),Crime|Drama,69212,1.0,840658497
242167,16,Sense and Sensibility (1995),Drama|Romance,69212,5.0,840658286
276458,19,Money Train (1995),Action|Comedy|Crime|Drama|Thriller,69212,2.0,840658904
...,...,...,...,...,...,...
4753007,769,Kingpin (1996),Comedy,69212,3.0,840660583
4767266,770,Eraser (1996),Action|Drama|Thriller,69212,3.0,840658979
4804539,784,Lone Star (1996),Drama|Mystery|Western,69212,3.0,840659629
4871922,818,Chain Reaction (1996),Action|Adventure|Thriller,69212,1.0,840660315


In [14]:
# 2. Train-test split
# A seeded 80% row sample becomes the training set; every row whose index
# label was not sampled becomes the test set.
train_data = data.sample(frac=0.8, random_state=123)
sampled_for_training = data.index.isin(train_data.index)
test_data = data.loc[~sampled_for_training]

# Define Dataset
class RatingDataset(Dataset):
    """One sample per user: a dense rating vector plus the user's index.

    Args:
        data: DataFrame with integer ``userId`` and ``movieId`` columns
            (``movieId`` is used directly as a position in the rating
            vector) and a numeric ``rating`` column.
        vector_size: Optional fixed length for every rating vector. When
            omitted, the length is ``data["movieId"].max() + 1``. Pass the
            same value for train and test datasets to give them the same
            vector width.

    Raises:
        ValueError: If ``vector_size`` is too small to hold the largest
            ``movieId`` found in ``data``.

    Attributes:
        user_ids: Unique user indices in order of first appearance.
        num_users: Number of distinct users (``len(user_ids)``).
        num_movies: Largest ``movieId`` present in ``data``.
    """

    # Positional-row placeholder for a user id that has no rows.
    _NO_ROWS = np.empty(0, dtype=np.intp)

    def __init__(self, data, vector_size=None):
        self.data = data
        self.user_ids = data["userId"].unique()
        self.num_users = len(self.user_ids)

        self.num_movies = data["movieId"].max()

        if vector_size is not None and vector_size <= self.num_movies:
            raise ValueError(
                f"vector_size={vector_size} cannot hold movieId {self.num_movies}"
            )
        self.vector_size = vector_size

        # Snapshot the columns as arrays and index each user's row positions
        # once, so __getitem__ does not have to scan the whole frame per sample.
        self._movie_idx = data["movieId"].to_numpy()
        self._ratings = data["rating"].to_numpy(dtype=np.float32)
        self._rows_by_user = data.groupby("userId").indices

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, idx):
        """Return ``(rating_vector, user_id)`` for the ``idx``-th user.

        ``rating_vector`` is a float32 array in which unrated movies hold -1
        and rated movies hold the user's rating.
        """
        user_id = self.user_ids[idx]
        rows = self._rows_by_user.get(user_id, self._NO_ROWS)

        size = self.vector_size if self.vector_size is not None else self.num_movies + 1
        # Float dtype is required: an integer array would truncate ratings
        # such as 3.5 to 3 on assignment.
        rating_vector = np.full(size, -1, dtype=np.float32)
        rating_vector[self._movie_idx[rows]] = self._ratings[rows]

        return rating_vector, user_id

train_dataset = RatingDataset(train_data)
test_dataset = RatingDataset(test_data)

# Smoke check: fetch a single (rating_vector, user_id) sample
r, u = next(iter(train_dataset))


def make_holdout_collate(vector_size, missing_value=-1, seed=None):
    """Build a collate_fn that turns user rating vectors into training triples.

    For every user in the batch one rated movie is drawn at random, its
    rating is replaced by ``missing_value`` in the input vector, and that
    rating becomes the regression target ("drop out a rating").

    Args:
        vector_size: Width every rating vector is padded to, so that it
            matches the input width the model expects.
        missing_value: Marker used for "not rated" entries.
        seed: Optional seed for the generator that picks the held-out movie.

    Returns:
        A function mapping a list of ``(rating_vector, user_id)`` samples to
        ``(masked_vectors, movie_ids, ratings)`` tensors, in that order.
    """
    generator = torch.Generator()
    if seed is not None:
        generator.manual_seed(seed)

    def collate(samples):
        batch = torch.full((len(samples), vector_size), float(missing_value), dtype=torch.float32)
        for row, (rating_vector, _user_id) in enumerate(samples):
            vector = torch.as_tensor(rating_vector, dtype=torch.float32)
            # Pads on the right; a vector longer than vector_size fails loudly here.
            batch[row, :len(vector)] = vector

        # One uniformly drawn rated column per row. torch.multinomial raises if
        # a row has no rated entry at all.
        observed = (batch != missing_value).float()
        held_out = torch.multinomial(observed, 1, generator=generator).squeeze(1)

        row_index = torch.arange(batch.shape[0])
        targets = batch[row_index, held_out].clone()
        batch[row_index, held_out] = missing_value  # hide the answer from the input
        return batch, held_out, targets

    return collate


# DataLoaders yielding (user_rating_vectors, movie_id, rating) batches.
# NOTE(review): test vectors are built from test_data only, so at evaluation
# time the input holds only the user's test-split ratings -- confirm this is
# the intended protocol.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True,
                          collate_fn=make_holdout_collate(n_movies))
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False,
                         collate_fn=make_holdout_collate(n_movies, seed=0))

1

In [40]:
# 3. Define Model: two-tower recommender (user-rating tower + movie-embedding tower)
class TwoTowerSys(nn.Module):
    """Two-tower rating predictor.

    The user tower maps a dense rating vector of width ``n_movies`` to a
    ``final_embed_size`` representation. The movie tower looks up a learned
    embedding for ``movie_id`` and maps it to the same size. The prediction
    is the dot product of the two representations.

    Args:
        n_users: Accepted for interface compatibility; not used by the model.
        n_movies: Number of movies; sets both the user-tower input width and
            the size of the movie embedding table.
        embedding_size: Width of the raw movie embedding.
        user_hidden_size: Hidden layer widths of the user tower (any length).
        movie_hidden_size: Hidden layer widths of the movie tower (any length).
        final_embed_size: Width of the shared representation space.
    """

    # The list defaults are only read, never mutated.
    def __init__(self, n_users, n_movies, embedding_size=32, user_hidden_size=[2048, 1024, 1024], movie_hidden_size=[256], final_embed_size=256):
        super(TwoTowerSys, self).__init__()

        # User tower: rating vector -> shared representation
        self.user_linear = self._build_tower(n_movies, user_hidden_size, final_embed_size)

        # Movie tower: learned embedding -> shared representation
        self.movie_embedding = nn.Embedding(n_movies, embedding_size)
        self.movie_linear = self._build_tower(embedding_size, movie_hidden_size, final_embed_size)

    @staticmethod
    def _build_tower(in_features, hidden_sizes, out_features):
        """Stack Linear+ReLU per hidden width, then a final Linear projection."""
        layers = []
        for width in hidden_sizes:
            layers.append(nn.Linear(in_features, width))
            # Without a non-linearity the stacked Linear layers would collapse
            # into a single linear map.
            layers.append(nn.ReLU())
            in_features = width
        layers.append(nn.Linear(in_features, out_features))
        # nn.Sequential takes modules as positional arguments, not a list.
        return nn.Sequential(*layers)

    def forward(self, user_ratings, movie_id):
        """Return one predicted score per (user rating vector, movie id) pair.

        Args:
            user_ratings: Tensor whose last dimension has width ``n_movies``.
            movie_id: Integer tensor of movie indices, one per rating vector.
        """
        # Movie branch
        movie_embed = self.movie_embedding(movie_id)
        representational_movie_embed = self.movie_linear(movie_embed)

        # User branch; Linear layers need floating-point input
        representational_user = self.user_linear(user_ratings.float())

        # Dot product over the representation dimension
        distance = torch.mul(representational_user, representational_movie_embed).sum(dim=-1)

        return distance

In [42]:
# Initialize model and optimizer
# Use the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# nn.Module.to returns the module itself, so construction and placement chain.
model = TwoTowerSys(n_users, n_movies).to(device)

# Mean-squared error between predicted and actual ratings, optimised with AdamW.
criterion = nn.MSELoss()
optimizer = optim.AdamW(params=model.parameters(), lr=1e-3, weight_decay=1e-5)

# 4. Train the Model
def train_model(train_loader, optimizer, model, criterion, num_epochs=10):
    """Run ``num_epochs`` passes of gradient descent over ``train_loader``.

    Args:
        train_loader: Iterable of ``(user_input, movie_id, rating)`` batches.
            ``user_input`` is whatever the model expects as its first
            argument; the rating being predicted should already be hidden
            from it by the loader.
        optimizer: Optimizer bound to ``model``'s parameters.
        model: Module called as ``model(user_input, movie_id)``.
        criterion: Loss called as ``criterion(outputs, rating)``.
        num_epochs: Number of passes over ``train_loader``.

    Prints the sample-weighted mean loss after every epoch.
    """
    # Use the device the model lives on rather than a notebook-level global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    for epoch in tqdm(range(num_epochs)):
        running_loss = 0.0
        samples_seen = 0
        for user_input, movie_id, rating in train_loader:
            user_input = user_input.to(target_device)
            movie_id = movie_id.to(target_device)
            rating = rating.to(target_device).float()  # Convert to float for MSE

            optimizer.zero_grad()
            outputs = model(user_input, movie_id)
            loss = criterion(outputs, rating)
            loss.backward()
            optimizer.step()

            batch_size = len(rating)
            running_loss += loss.item() * batch_size
            samples_seen += batch_size

        # Normalise by the samples actually processed: this stays correct with
        # drop_last / samplers and does not divide by zero on an empty loader.
        epoch_loss = running_loss / samples_seen if samples_seen else float('nan')
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')

# Train the model for ten epochs with the loader, optimizer and loss set up above
train_model(
    train_loader=train_loader,
    optimizer=optimizer,
    model=model,
    criterion=criterion,
    num_epochs=10,
)



Epoch: 0


  0%|          | 0/10 [40:25<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# 5. Evaluate the Model
def evaluate_model(test_loader, model):
    """Compute and print the RMSE of ``model`` over ``test_loader``.

    Args:
        test_loader: Iterable of ``(user_input, movie_id, rating)`` batches;
            ``user_input`` is passed to the model as its first argument.
        model: Module called as ``model(user_input, movie_id)``. It is
            switched to eval mode and left there.

    Returns:
        The root-mean-squared error between predictions and ratings.

    Raises:
        ValueError: If ``test_loader`` yields no batches.
    """
    # Use the device the model lives on rather than a notebook-level global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    preds, actuals = [], []

    with torch.no_grad():
        for user_input, movie_id, rating in test_loader:
            user_input = user_input.to(target_device)
            movie_id = movie_id.to(target_device)

            outputs = model(user_input, movie_id)
            preds.append(outputs.cpu().numpy())
            # Targets are only compared on the host, so they never need to
            # visit the accelerator.
            actuals.append(rating.float().cpu().numpy())

    if not preds:
        raise ValueError("test_loader yielded no batches; cannot compute RMSE")

    preds = np.concatenate(preds)
    actuals = np.concatenate(actuals)

    rmse = np.sqrt(mean_squared_error(actuals, preds))
    print(f"Test RMSE: {rmse:.4f}")
    return rmse

# Report RMSE on the held-out loader
evaluate_model(test_loader=test_loader, model=model)

# 6. Persist only the learned parameters (state dict), not the module object
torch.save(obj=model.state_dict(), f='movie_recommendation_model.pth')

In [None]:
# 7. Load the saved model
# The module must be rebuilt with the same architecture that produced the
# checkpoint, so it is constructed exactly like the trained `model`
# (TwoTowerSys has no `hidden_size` argument).
loaded_model = TwoTowerSys(n_users, n_movies)
# map_location lets a checkpoint written on a GPU machine load on a CPU-only one.
loaded_model.load_state_dict(torch.load('movie_recommendation_model.pth', map_location=device))
loaded_model.to(device)
loaded_model.eval()

In [None]:
# 8. Recommend movies for a user
def recommend_top_n(user_id, model, top_n=10, user_ratings=None):
    """Return the titles of the ``top_n`` highest-scoring movies for a user.

    Args:
        user_id: Index-based user id (after remapping). Only used to look up
            the user's ratings when ``user_ratings`` is not supplied.
        model: Module exposing ``user_linear``, ``movie_embedding`` and
            ``movie_linear`` (the two towers).
        top_n: Number of titles to return; clamped to the number of movies.
        user_ratings: Optional dense rating vector (one entry per movie index,
            -1 for unrated). When omitted it is built from the notebook-level
            ``data`` frame.

    Returns:
        List of movie titles, best score first. Titles are resolved through
        the notebook-level ``movie_ids_inv`` and ``movie_id_to_title`` maps.
    """
    model.eval()
    target_device = next(model.parameters()).device
    n_items = model.movie_embedding.num_embeddings

    if user_ratings is None:
        # Build the user's rating vector from the remapped ratings table.
        user_rows = data[data['userId'] == user_id]
        user_ratings = np.full(n_items, -1.0, dtype=np.float32)
        user_ratings[user_rows['movieId'].to_numpy()] = user_rows['rating'].to_numpy()

    ratings = torch.as_tensor(user_ratings, dtype=torch.float32).reshape(1, -1).to(target_device)

    with torch.no_grad():
        # Project the user and every movie into the shared space
        user_repr = model.user_linear(ratings)
        all_movie_repr = model.movie_linear(model.movie_embedding.weight)

        # Compute dot product similarity
        scores = torch.matmul(user_repr, all_movie_repr.T).squeeze(0)

    top_n = min(top_n, n_items)  # torch.topk raises when k exceeds the number of items
    top_movie_ids = torch.topk(scores, top_n).indices.cpu().numpy()

    # Get corresponding movie titles
    top_movie_titles = [movie_id_to_title[movie_ids_inv[int(movie_id)]] for movie_id in top_movie_ids]
    return top_movie_titles

# Example: Recommend movies for user with ID 12
# Invert the raw-ID -> index mapping so model indices can be turned back into raw movie IDs.
movie_ids_inv = dict(zip(movie_ids.values(), movie_ids.keys()))
recommendations = recommend_top_n(12, loaded_model, top_n=10)
print("Top 10 recommendations for user 12:", recommendations)