In [3]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import csv
from tqdm import tqdm

In [4]:
movies = pd.read_csv('data/movies.csv')
ratings = pd.read_csv('data/ratings.csv', low_memory=False)

In [5]:
# Merge the datasets
df = pd.merge(movies, ratings, on='movieId')
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2,3.5,1141415820
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0,1439472215
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4,3.0,1573944252
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,858625949
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0,890492517


In [6]:
# Convert user and item IDs to integers (index-based)
user_ids = {id: i for i, id in enumerate(df['userId'].unique())}
movie_ids = {id: i for i, id in enumerate(df['movieId'].unique())}

df['userId'] = df['userId'].apply(lambda x: user_ids[x])
df['movieId'] = df['movieId'].apply(lambda x: movie_ids[x])

# One-hot encoding genres
df['genres'] = df['genres'].str.split('|')
genres_set = set(g for sublist in df['genres'] for g in sublist)
for genre in genres_set:
    df[genre] = df['genres'].apply(lambda x: int(genre in x))

df.head()['Adventure']

0    1
1    1
2    1
3    1
4    1
Name: Adventure, dtype: int64

In [23]:
ratings['new'] = (ratings['rating'] - ratings['rating'].mean()) ** 2
np.sqrt(ratings['new'].sum() / len(ratings['new']))

np.float64(1.0607439399275531)

In [12]:
df_2 = df.pivot(index='userId', columns='movieId', values='rating')

  df_2 = df.pivot(index='userId', columns='movieId', values='rating')


In [21]:
ur_vc = df_2.value_counts()
ur_vc

Series([], Name: count, dtype: int64)

In [22]:
movie_mapping = {index: [movieId, df[df['movieId'] == movieId]['title'].values[0]] for index, movieId in enumerate(df['movieId'].unique())}

KeyboardInterrupt: 

In [100]:
print(movie_mapping[0])

[np.int64(0), 'Toy Story (1995)']


: 

In [88]:
# Split into training and test sets
train_data, test_data = train_test_split(df[['userId', 'movieId', 'rating']], test_size=0.2)

# Convert to tensors
train_tensor = torch.tensor(train_data.values, dtype=torch.float32).to('mps')
test_tensor = torch.tensor(test_data.values, dtype=torch.float32).to('mps')

In [89]:
n_users = len(user_ids)
n_movies = len(movie_ids)

# Define the Matrix Factorization model
class MF(nn.Module):
    def __init__(self, n_users, n_movies, n_factors=20):
        super(MF, self).__init__()
        self.user_factors = nn.Embedding(n_users, n_factors)  # User latent factors, batch_size x n_factors
        self.movie_factors = nn.Embedding(n_movies, n_factors)  # Movie latent factors
        self.user_biases = nn.Embedding(n_users, 1)  # User biases, batch_size x 1
        self.movie_biases = nn.Embedding(n_movies, 1)  # Movie biases

    def forward(self, user, movie):
        # Matrix factorization: dot product of user and movie latent factors + bias
        pred = (self.user_factors(user) * self.movie_factors(movie)).sum(1)
        pred += self.user_biases(user).squeeze() + self.movie_biases(movie).squeeze()
        return pred
    
model = MF(n_users, n_movies).to('mps')

In [90]:
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
n_epochs = 10
batch_size = 1024
def rmse(predictions, targets):
    return np.sqrt(mean_squared_error(targets.cpu().detach().numpy(), predictions.cpu().detach().numpy()))

In [91]:
# Train the model
# for epoch in range(n_epochs):
#     model.train()
#     losses = []
#     for i in tqdm(range(0, len(train_tensor), batch_size)):
#         batch = train_tensor[i:i+batch_size]
#         users = batch[:, 0].long()
#         movies = batch[:, 1].long()
#         ratings = batch[:, 2]
        
#         # Zero the gradients
#         optimizer.zero_grad()
        
#         # Forward pass
#         preds = model(users, movies)
#         loss = loss_fn(preds, ratings)
        
#         # Backward pass and optimization
#         loss.backward()
#         optimizer.step()
        
#         losses.append(loss.item())
    
#     # Print loss at the end of each epoch
#     print(f'Epoch {epoch+1}/{n_epochs}, Loss: {np.mean(losses)}')

In [92]:
model.eval()
with torch.no_grad():
    users_test = test_tensor[:, 0].long()
    movies_test = test_tensor[:, 1].long()
    ratings_test = test_tensor[:, 2]
    
    preds_test = model(users_test, movies_test)
    test_rmse = rmse(preds_test, ratings_test)
    print(f'Test RMSE: {test_rmse}')

state_dict = torch.load('models/new_movie_recommendation_model.pth')
model.load_state_dict(state_dict)
model.eval()
with torch.no_grad():
    users_test = test_tensor[:, 0].long()
    movies_test = test_tensor[:, 1].long()
    ratings_test = test_tensor[:, 2]
    
    preds_test = model(users_test, movies_test)
    test_rmse = rmse(preds_test, ratings_test)
    print(f'Test RMSE: {test_rmse}')

Test RMSE: 5.968047142028809
Test RMSE: 0.9515255689620972


  state_dict = torch.load('models/new_movie_recommendation_model.pth')


In [93]:
# Example input for a new user
new_user_ratings = {
    1: 5.0,  # MovieId 1, rating 5.0
    2: 3.0,
    50: 4.5,
    100: 1.5
}

def recommend_genre_based(new_user_ratings, df, top_n=10):
    # Create a DataFrame for the movies the new user has rated
    rated_movies = pd.DataFrame(new_user_ratings.items(), columns=['movieId', 'rating'])
    
    # Merge the rated movies with the original dataframe to get genres
    rated_movies = pd.merge(rated_movies, df[['movieId', 'title'] + list(genres_set)].drop_duplicates(subset='movieId'), on='movieId')
    print(rated_movies)
    print(list(genres_set))
    # Get the genre profile by weighting the genres of the movies the user liked
    genre_profile = rated_movies[list(genres_set)].T.dot(rated_movies['rating'])
    
    # Normalize the genre profile to sum up to 1
    genre_profile /= genre_profile.sum()
    
    # Calculate the similarity of other movies to this genre profile
    all_movies_genres = df[['movieId', 'title'] + list(genres_set)].drop_duplicates(subset='movieId').copy()
    all_movies_genres['similarity'] = all_movies_genres[list(genres_set)].dot(genre_profile)
    
    # Exclude movies that the user has already rated
    all_movies_genres = all_movies_genres[~all_movies_genres['movieId'].isin(new_user_ratings.keys())]
    
    # Recommend the top N movies with the highest similarity scores
    recommendations = all_movies_genres[['movieId', 'title', 'similarity']].sort_values(by='similarity', ascending=False).head(top_n)
    
    return recommendations.reset_index(drop=True)

# Example usage:
recommendations = recommend_genre_based(new_user_ratings, df)
recommendations

   movieId  rating                    title  Western  Musical  IMAX  Sci-Fi  \
0        1     5.0           Jumanji (1995)        0        0     0       0   
1        2     3.0  Grumpier Old Men (1995)        0        0     0       0   
2       50     4.5    Guardian Angel (1994)        0        0     0       0   
3      100     1.5         Mr. Wrong (1996)        0        0     0       0   

   War  Thriller  Film-Noir  ...  Animation  Comedy  Crime  Fantasy  Action  \
0    0         0          0  ...          0       0      0        1       0   
1    0         0          0  ...          0       1      0        0       0   
2    0         1          0  ...          0       0      0        0       1   
3    0         0          0  ...          0       1      0        0       0   

   Mystery  Children  Documentary  Horror  Drama  
0        0         1            0       0      0  
1        0         0            0       0      0  
2        0         0            0       0      1  
3   

Unnamed: 0,movieId,title,similarity
0,23063,Dragonheart 2: A New Beginning (2000),0.777778
1,34412,Christmas Town (2008),0.777778
2,8570,"Wonderful World of the Brothers Grimm, The (1962)",0.75
3,9175,Revolutionary Girl Utena: Adolescence of Utena...,0.736111
4,13897,Aelita: The Queen of Mars (Aelita) (1924),0.736111
5,4850,"Stunt Man, The (1980)",0.722222
6,31737,Joseph Andrews (1977),0.722222
7,15777,Kaho Naa... Pyaar Hai (2000),0.722222
8,32506,Once Upon a Time (2008),0.722222
9,14928,"Sorcerer's Apprentice, The (2010)",0.666667


In [94]:
def recommend_latent_factor_based(model, new_user_ratings, movie_mapping, top_n=10):
    # Get the latent factors of the movies the new user rated
    rated_movie_ids = list(new_user_ratings.keys())

    rated_movie_ratings = torch.tensor(list(new_user_ratings.values()), dtype=torch.float32).to('mps')

    # Get movie latent factors from the model
    rated_movie_latents = model.movie_factors(torch.tensor(rated_movie_ids, dtype=torch.long).to('mps'))

    # Compute weighted average of movie latent factors, weighted by the user's rating
    user_latent_profile = (rated_movie_latents.T @ rated_movie_ratings).T / rated_movie_ratings.sum()

    # Compute similarity of all movies to the user's latent profile
    all_movie_latents = model.movie_factors.weight.data
    similarities = torch.matmul(all_movie_latents, user_latent_profile)

    # Get top N most similar movies
    _, top_movie_indices = torch.topk(similarities, top_n)

    # Map indices back to movie IDs and titles
    recommended_movies = pd.DataFrame([{'movieId': movie_mapping[movie_idx.item()][0], 
                           'title': movie_mapping[movie_idx.item()][1],
                           'similarity': similarities[movie_idx].item()}
                          for movie_idx in top_movie_indices])
    
    return recommended_movies

recommendedations = recommend_latent_factor_based(model, new_user_ratings, movie_mapping)
recommendedations

Unnamed: 0,movieId,title,similarity
0,46202,The White Meadows (2009),9.055716
1,33590,Reilly: Ace of Spies (1983),8.215811
2,27475,Up Your Anchor (1985),7.205197
3,43548,Looking for Kitty (2004),7.038386
4,19508,Night Across the Street (La noche de enfrente)...,6.97637
5,50,Guardian Angel (1994),6.911576
6,47379,Sombrero (1953),6.770538
7,55242,"Hannah, Queen of the Vampires (1973)",6.764743
8,36525,La Clé des Champs (2011),6.746008
9,35222,White Rage (2015),6.625898


In [95]:
def min_max_normalization(scores):
    min_score = scores.min()
    max_score = scores.max()
    normalized_scores = (scores - min_score) / (max_score - min_score)
    return normalized_scores

def recommend_combined(new_user_ratings, df, model, movie_mapping, top_n=10, weight_latent=0.8):
    # Get genre-based recommendations
    genre_recommendations = recommend_genre_based(new_user_ratings, df, top_n=top_n)
    genre_recommendations['source'] = 'genre'  # Mark source as genre

    # Get latent-factor-based recommendations
    latent_recommendations = recommend_latent_factor_based(model, new_user_ratings, movie_mapping, top_n=top_n)
    latent_recommendations['source'] = 'latent'  # Mark source as latent

    # Normalize similarity scores
    genre_recommendations['similarity'] = min_max_normalization(genre_recommendations['similarity']) * (1 - weight_latent)
    latent_recommendations['similarity'] = min_max_normalization(latent_recommendations['similarity']) * weight_latent

    # Combine both recommendations
    combined = pd.concat([genre_recommendations, latent_recommendations])
    combined['similarity'] = combined['similarity'] * 1 / weight_latent

    # Add a new column for the source of the recommendation
    combined['source'] = combined.apply(lambda row: 'both' if combined[combined['movieId'] == row['movieId']].shape[0] > 1 else row['source'], axis=1)

    # Sort by the combined score
    final_recommendations = combined[['movieId', 'title', 'similarity', 'source']].sort_values(by='similarity', ascending=False).head(top_n)

    return final_recommendations

combined_recommendations = recommend_combined(new_user_ratings, df, model, movie_mapping)
combined_recommendations

   movieId  rating                    title  Western  Musical  IMAX  Sci-Fi  \
0        1     5.0           Jumanji (1995)        0        0     0       0   
1        2     3.0  Grumpier Old Men (1995)        0        0     0       0   
2       50     4.5    Guardian Angel (1994)        0        0     0       0   
3      100     1.5         Mr. Wrong (1996)        0        0     0       0   

   War  Thriller  Film-Noir  ...  Animation  Comedy  Crime  Fantasy  Action  \
0    0         0          0  ...          0       0      0        1       0   
1    0         0          0  ...          0       1      0        0       0   
2    0         1          0  ...          0       0      0        0       1   
3    0         0          0  ...          0       1      0        0       0   

   Mystery  Children  Documentary  Horror  Drama  
0        0         1            0       0      0  
1        0         0            0       0      0  
2        0         0            0       0      1  
3   

Unnamed: 0,movieId,title,similarity,source
0,46202,The White Meadows (2009),1.0,latent
1,33590,Reilly: Ace of Spies (1983),0.654334,latent
0,23063,Dragonheart 2: A New Beginning (2000),0.25,genre
1,34412,Christmas Town (2008),0.25,genre
2,27475,Up Your Anchor (1985),0.238413,latent
2,8570,"Wonderful World of the Brothers Grimm, The (1962)",0.1875,genre
3,43548,Looking for Kitty (2004),0.169761,latent
4,13897,Aelita: The Queen of Mars (Aelita) (1924),0.15625,genre
3,9175,Revolutionary Girl Utena: Adolescence of Utena...,0.15625,genre
4,19508,Night Across the Street (La noche de enfrente)...,0.144238,latent
