In [3]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import csv
from tqdm import tqdm

In [8]:
movies = pd.read_csv('data/movies.csv')
ratings = pd.read_csv('data/ratings.csv', low_memory=False)

movie_id_to_title = {}
with open('data/movies.csv', 'r', encoding='utf8') as f:
    reader = csv.reader(f)
    next(reader)  # Skip header row
    for row in reader:
        movie_id = int(row[0])
        title = row[1]
        movie_id_to_title[movie_id] = title

In [9]:
# Merge the datasets
df = pd.merge(movies, ratings, on='movieId')
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2,3.5,1141415820
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0,1439472215
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4,3.0,1573944252
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,858625949
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0,890492517


In [None]:
# Convert user and item IDs to integers (index-based)
user_ids = {id: i for i, id in enumerate(df['userId'].unique())}
movie_ids = {id: i for i, id in enumerate(df['movieId'].unique())}

df['userId'] = df['userId'].apply(lambda x: user_ids[x])
df['movieId'] = df['movieId'].apply(lambda x: movie_ids[x])

# One-hot encoding genres
df['genres'] = df['genres'].str.split('|')
genres_set = set(g for sublist in df['genres'] for g in sublist)
for genre in genres_set:
    df[genre] = df['genres'].apply(lambda x: int(genre in x))

df.head()['Adventure']

In [88]:
# Split into training and test sets
train_data, test_data = train_test_split(df[['userId', 'movieId', 'rating']], test_size=0.2)

# Convert to tensors
train_tensor = torch.tensor(train_data.values, dtype=torch.float32).to('mps')
test_tensor = torch.tensor(test_data.values, dtype=torch.float32).to('mps')

In [89]:
n_users = len(user_ids)
n_movies = len(movie_ids)

# Define the Enhanced Recommendation Model
class EnhancedRecommendationModel(nn.Module):
    def __init__(self, n_users, n_movies, n_genres, n_factors=50):
        super(EnhancedRecommendationModel, self).__init__()
        # User and movie latent factors
        self.user_factors = nn.Embedding(n_users, n_factors)
        self.movie_factors = nn.Embedding(n_movies, n_factors)
        self.genre_factors = nn.Embedding(n_genres, n_factors)  # Genre embeddings
        
        # Fully connected layers
        self.fc1 = nn.Linear(n_factors * 2 + n_factors, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)

        # Dropout for regularization
        self.dropout = nn.Dropout(0.3)

    def forward(self, user, movie, genres):
        user_embedding = self.user_factors(user)
        movie_embedding = self.movie_factors(movie)
        genre_embedding = self.genre_factors(genres)

        # Concatenate user, movie, and genre embeddings
        x = torch.cat([user_embedding, movie_embedding, genre_embedding], dim=1)
        
        # Pass through fully connected layers
        x = self.dropout(torch.relu(self.fc1(x)))
        x = self.dropout(torch.relu(self.fc2(x)))
        return self.fc3(x)
    
# Initialize model, loss function, and optimizer
n_genres = len(genres_set)  # Number of unique genres
model = EnhancedRecommendationModel(n_users, n_movies, n_genres).to('mps')
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [90]:
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
def rmse(predictions, targets):
    return np.sqrt(mean_squared_error(targets.cpu().detach().numpy(), predictions.cpu().detach().numpy()))

# Function to evaluate RMSE
def rmse(predictions, targets):
    return np.sqrt(mean_squared_error(targets.cpu().detach().numpy(), predictions.cpu().detach().numpy()))

# Train the model
n_epochs = 50
batch_size = 256

In [91]:
# for epoch in range(n_epochs):
#     model.train()
#     losses = []
#     for i in tqdm(range(0, len(train_tensor), batch_size)):
#         batch = train_tensor[i:i+batch_size]
#         users = batch[:, 0].long()
#         movies = batch[:, 1].long()
#         ratings = batch[:, 2]

#         # Improve this later
#         genres = torch.zeros_like(users).long() 

#         # Zero the gradients
#         optimizer.zero_grad()
        
#         # Forward pass
#         preds = model(users, movies, genres)
#         loss = loss_fn(preds.view(-1), ratings)  # Flatten predictions
        
#         # Backward pass and optimization
#         loss.backward()
#         optimizer.step()
        
#         losses.append(loss.item())

In [92]:
with torch.no_grad():
    users_test = test_tensor[:, 0].long()
    movies_test = test_tensor[:, 1].long()
    ratings_test = test_tensor[:, 2]
    
    genres_test = torch.zeros_like(users_test).long()
    
    preds_test = model(users_test, movies_test, genres_test)
    test_rmse = rmse(preds_test.view(-1), ratings_test)
    print(f'Test RMSE: {test_rmse}')

state_dict = torch.load('models/enhanced_movie_recommendation_model.pth')
model.load_state_dict(state_dict)

model.eval()
with torch.no_grad():
    users_test = test_tensor[:, 0].long()
    movies_test = test_tensor[:, 1].long()
    ratings_test = test_tensor[:, 2]
    
    # Use the same dummy genre input
    genres_test = torch.zeros_like(users_test).long()
    
    preds_test = model(users_test, movies_test, genres_test)
    test_rmse = rmse(preds_test.view(-1), ratings_test)
    print(f'Test RMSE: {test_rmse}')


Test RMSE: 5.968047142028809
Test RMSE: 0.9515255689620972


  state_dict = torch.load('models/new_movie_recommendation_model.pth')


In [93]:
# Example of new user's rated movies
new_user_ratings = [
    {'movieId': 1, 'rating': 4.0},  # Movie 1 with a rating of 4.0
    {'movieId': 100, 'rating': 3.5},
    {'movieId': 500, 'rating': 5.0}, 
    {'movieId': 5000, 'rating': 2.0}, 
]

new_user_ratings_df = pd.DataFrame(new_user_ratings)
new_user_ratings_df['movieId'] = new_user_ratings_df['movieId'].apply(lambda x: movie_ids[x])
new_user_ratings_tensor = torch.tensor(new_user_ratings_df.values, dtype=torch.float32).to('mps')


   movieId  rating                    title  Western  Musical  IMAX  Sci-Fi  \
0        1     5.0           Jumanji (1995)        0        0     0       0   
1        2     3.0  Grumpier Old Men (1995)        0        0     0       0   
2       50     4.5    Guardian Angel (1994)        0        0     0       0   
3      100     1.5         Mr. Wrong (1996)        0        0     0       0   

   War  Thriller  Film-Noir  ...  Animation  Comedy  Crime  Fantasy  Action  \
0    0         0          0  ...          0       0      0        1       0   
1    0         0          0  ...          0       1      0        0       0   
2    0         1          0  ...          0       0      0        0       1   
3    0         0          0  ...          0       1      0        0       0   

   Mystery  Children  Documentary  Horror  Drama  
0        0         1            0       0      0  
1        0         0            0       0      0  
2        0         0            0       0      1  
3   

Unnamed: 0,movieId,title,similarity
0,23063,Dragonheart 2: A New Beginning (2000),0.777778
1,34412,Christmas Town (2008),0.777778
2,8570,"Wonderful World of the Brothers Grimm, The (1962)",0.75
3,9175,Revolutionary Girl Utena: Adolescence of Utena...,0.736111
4,13897,Aelita: The Queen of Mars (Aelita) (1924),0.736111
5,4850,"Stunt Man, The (1980)",0.722222
6,31737,Joseph Andrews (1977),0.722222
7,15777,Kaho Naa... Pyaar Hai (2000),0.722222
8,32506,Once Upon a Time (2008),0.722222
9,14928,"Sorcerer's Apprentice, The (2010)",0.666667


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate genre vectors for all movies
movie_genres = df[['movieId'] + list(genres_set)].drop_duplicates().set_index('movieId')

# Create a profile vector for the new user based on their ratings
user_genre_profile = np.dot(new_user_ratings_df['rating'], movie_genres.loc[new_user_ratings_df['movieId']])
user_genre_profile = user_genre_profile.reshape(1, -1)

# Compute similarity between user's genre profile and all movies
similarities = cosine_similarity(user_genre_profile, movie_genres.values)[0]
movie_genres['similarity'] = similarities
recommended_movies_content = movie_genres.sort_values(by='similarity', ascending=False).head(10)

In [None]:
# Extract latent factors from the model
user_embedding_matrix = model.user_factors.weight.data.cpu().numpy()
movie_embedding_matrix = model.movie_factors.weight.data.cpu().numpy()

# Average latent factors based on new user’s rated movies
new_user_movie_ids = new_user_ratings_df['movieId'].values
user_profile = movie_embedding_matrix[new_user_movie_ids].mean(axis=0)

# Calculate cosine similarity with all other movie embeddings
similarities = cosine_similarity([user_profile], movie_embedding_matrix)[0]
latent_factor_recommendations = np.argsort(similarities)[::-1][:10]
latent_factor_recommendations = [movie_id_to_title[movie] for movie in latent_factor_recommendations]

In [94]:
# Combine genre-based and latent factor recommendations using a weighted average
content_weight = 0.5
latent_weight = 0.5

# Normalize both scores to make them comparable
genre_similarity_scores = movie_genres['similarity'] / movie_genres['similarity'].max()
latent_similarity_scores = pd.Series(similarities).rank(pct=True)

# Hybrid recommendation score
hybrid_scores = content_weight * genre_similarity_scores + latent_weight * latent_similarity_scores
recommended_movies_hybrid = hybrid_scores.sort_values(ascending=False).head(10).index
recommended_movies_hybrid = [movie_id_to_title[movie] for movie in recommended_movies_hybrid]

Unnamed: 0,movieId,title,similarity
0,46202,The White Meadows (2009),9.055716
1,33590,Reilly: Ace of Spies (1983),8.215811
2,27475,Up Your Anchor (1985),7.205197
3,43548,Looking for Kitty (2004),7.038386
4,19508,Night Across the Street (La noche de enfrente)...,6.97637
5,50,Guardian Angel (1994),6.911576
6,47379,Sombrero (1953),6.770538
7,55242,"Hannah, Queen of the Vampires (1973)",6.764743
8,36525,La Clé des Champs (2011),6.746008
9,35222,White Rage (2015),6.625898


In [None]:
print("Content-Based Recommendations:", recommended_movies_content.index[:10].map(movie_id_to_title))
print("Latent Factor Recommendations:", latent_factor_recommendations)
print("Hybrid Recommendations:", recommended_movies_hybrid)

In [95]:
def min_max_normalization(scores):
    min_score = scores.min()
    max_score = scores.max()
    normalized_scores = (scores - min_score) / (max_score - min_score)
    return normalized_scores



   movieId  rating                    title  Western  Musical  IMAX  Sci-Fi  \
0        1     5.0           Jumanji (1995)        0        0     0       0   
1        2     3.0  Grumpier Old Men (1995)        0        0     0       0   
2       50     4.5    Guardian Angel (1994)        0        0     0       0   
3      100     1.5         Mr. Wrong (1996)        0        0     0       0   

   War  Thriller  Film-Noir  ...  Animation  Comedy  Crime  Fantasy  Action  \
0    0         0          0  ...          0       0      0        1       0   
1    0         0          0  ...          0       1      0        0       0   
2    0         1          0  ...          0       0      0        0       1   
3    0         0          0  ...          0       1      0        0       0   

   Mystery  Children  Documentary  Horror  Drama  
0        0         1            0       0      0  
1        0         0            0       0      0  
2        0         0            0       0      1  
3   

Unnamed: 0,movieId,title,similarity,source
0,46202,The White Meadows (2009),1.0,latent
1,33590,Reilly: Ace of Spies (1983),0.654334,latent
0,23063,Dragonheart 2: A New Beginning (2000),0.25,genre
1,34412,Christmas Town (2008),0.25,genre
2,27475,Up Your Anchor (1985),0.238413,latent
2,8570,"Wonderful World of the Brothers Grimm, The (1962)",0.1875,genre
3,43548,Looking for Kitty (2004),0.169761,latent
4,13897,Aelita: The Queen of Mars (Aelita) (1924),0.15625,genre
3,9175,Revolutionary Girl Utena: Adolescence of Utena...,0.15625,genre
4,19508,Night Across the Street (La noche de enfrente)...,0.144238,latent
