In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import torch
from torch.utils.data import Dataset, DataLoader

import ast
import time
from sklearn.preprocessing import OneHotEncoder

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/the-movies-dataset/ratings.csv
/kaggle/input/the-movies-dataset/links_small.csv
/kaggle/input/the-movies-dataset/credits.csv
/kaggle/input/the-movies-dataset/keywords.csv
/kaggle/input/the-movies-dataset/movies_metadata.csv
/kaggle/input/the-movies-dataset/ratings_small.csv
/kaggle/input/the-movies-dataset/links.csv


# Data cleaning and preprocessing

In [2]:
df_movies = pd.read_csv('/kaggle/input/the-movies-dataset/movies_metadata.csv')
df_ratings = pd.read_csv('/kaggle/input/the-movies-dataset/ratings.csv')

df_ratings_counts = df_ratings.userId.value_counts()

  df_movies = pd.read_csv('/kaggle/input/the-movies-dataset/movies_metadata.csv')


In [3]:
df_keywords = pd.read_csv('/kaggle/input/the-movies-dataset/keywords.csv')

In [4]:
print(str(df_ratings.memory_usage(deep=True, index=True).sum() / 1000000) + " MB")
print(str(df_movies.memory_usage(deep=True, index=True).sum() / 1000000) + " MB")
print(str(df_keywords.memory_usage(deep=True, index=True).sum() / 1000000) + " MB")


832.777376 MB
83.324495 MB
8.863881 MB


In [5]:
df_keywords.keywords.iloc[0]

"[{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id': 6054, 'name': 'friendship'}, {'id': 9713, 'name': 'friends'}, {'id': 9823, 'name': 'rivalry'}, {'id': 165503, 'name': 'boy next door'}, {'id': 170722, 'name': 'new toy'}, {'id': 187065, 'name': 'toy comes to life'}]"

In [6]:
print(f"Users with only 1 rating: {100*df_ratings_counts[df_ratings_counts == 1].shape[0] / df_ratings_counts.shape[0]:.2f}%")
print(f"Movies with no ratings: {100*df_movies[(df_movies.vote_count.isnull()) | (df_movies.vote_count == 0)].shape[0] / df_movies.shape[0]:.2f}%")

Users with only 1 rating: 1.88%
Movies with no ratings: 6.39%


## Cleaning up dtypes

In [7]:
df_movies = df_movies[(df_movies.vote_count.notnull()) & (df_movies.vote_count != 0)]
df_movies["id"] = pd.to_numeric(df_movies["id"], errors='coerce')
df_movies["genres_expanded"] = df_movies["genres"].apply(lambda x: " ".join([y["name"] for y in ast.literal_eval(x)]))

In [8]:
df_ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

Clean out movies with less than 20 votes

In [9]:
N = 20
print("Before cleaning:", len(df_movies))
df_movies_clean = df_movies[df_movies.vote_count > N]
print("After cleaning:", len(df_movies_clean))

Before cleaning: 42561
After cleaning: 15297


In [10]:
N = 100
print("Before cleaning:", len(df_ratings))
users_with_more_than_n_ratings = df_ratings_counts[df_ratings_counts > N].index.tolist()
df_ratings_clean = df_ratings[(df_ratings.userId.isin(users_with_more_than_n_ratings)) & (df_ratings.movieId.isin(df_movies.id.unique()))]
print("After cleaning:", len(df_ratings_clean))

Before cleaning: 26024289
After cleaning: 8268273


In [11]:
df_user_movie_ratings = pd.merge(df_movies_clean, df_ratings_clean, left_on="id", right_on="movieId", how='inner')
df_user_movie_ratings = df_user_movie_ratings[["userId", "movieId", "rating"]].drop_duplicates()
df_user_movie_ratings.head()
print(df_user_movie_ratings.shape)

(6451781, 3)


In [12]:
print(str(df_user_movie_ratings.memory_usage(deep=True, index=True).sum() / 1000000) + " MB")

206.456992 MB


In [13]:
df_user_movie_rating = df_user_movie_ratings.pivot(index="userId", columns="movieId", values="rating")
print(df_user_movie_rating.shape)
df_user_movie_rating.iloc[:10, :10]

(63577, 3209)


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


movieId,2,3,5,6,11,12,13,14,15,16
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
8,,,,,,,,,,
11,,,,,,,,,,
12,,,,,,,,,,4.0
15,,,,4.0,,,,,,
16,,,,,,,,,,
20,,,,,,,,,,
24,3.0,,,4.0,,,,,,3.0
30,,,,,,,,,,
34,3.0,,,4.0,,,,,,5.0
37,,,,,,,,,,


In [14]:
print(str(df_user_movie_rating.memory_usage(deep=True, index=True).sum() / 1000000) + " MB")

1632.65736 MB


In [15]:
df_user_movie_rating = df_user_movie_rating.fillna(0)
df_user_movie_rating

movieId,2,3,5,6,11,12,13,14,15,16,...,173185,173495,173847,173897,174645,174671,174675,174751,175291,175555
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270885,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
270887,5.0,4.0,0.0,5.0,4.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
270893,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
270894,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
def expand_to_df(df: pd.DataFrame, colname: str) -> pd.DataFrame:
    all_names = [{movieid: ast.literal_eval(name)} for (movieid, name) in df[["id", colname]].values]
    
    dfs = []
    for item in all_names:
        movieid, name = list(item.items())[0]
        _df = pd.DataFrame(name)
        _df["movieId"] = movieid
        dfs.append(_df)
    
    return pd.concat(dfs)

In [17]:
# df_keywords_expanded = expand_to_df(df_keywords, "keywords")
# df_keywords_expanded

In [18]:
# df_keywords_expanded.id.nunique()

In [19]:
# df_genres = expand_to_df(df_movies, "genres")
# df_genres

In [20]:
# df_genres.name.nunique()

In [21]:
users = df_ratings.userId.unique()
users

array([     1,      2,      3, ..., 270894, 270895, 270896])

In [22]:
movies = df_movies["id"].unique()

# Matrix Factorisation using SGD

In [23]:
def initialise_matrices(users, items, num_factors):
    """Initializes user and item feature matrices with small random values."""
    num_users = len(users)
    num_items = len(items)
    
    U = pd.DataFrame(np.random.rand(num_users, num_factors), index=users, columns=range(num_factors))
    V = pd.DataFrame(np.random.rand(num_items, num_factors), index=items, columns=range(num_factors))
    return U, V

def predict_rating(user_vector, item_vector):
    """Predicts the rating using the dot product of user and item vectors."""
    return np.dot(user_vector, item_vector)

def SGD(ratings, U, V, learning_rate=0.01, num_epochs=100, regularization=0.02):
    """Trains the user and item feature matrices using Stochastic Gradient Descent."""
    num_users, num_factors = U.shape
    num_items, _ = V.shape

    for epoch in range(num_epochs):
        ratings = ratings.sample(frac=1)  # Shuffle data for each epoch
        total_loss = 0
        for ix, row in ratings.iterrows():
            u_id, i_id, rating = row["userId"], row["movieId"], row["rating"]
            
            # Predict rating
            prediction = predict_rating(U.loc[u_id, :], V.loc[i_id, :])
            error = rating - prediction

            # Update user and item features
            U.loc[u_id, :] += learning_rate * (error * V.loc[i_id, :] - regularization * U.loc[u_id, :])
            V.loc[i_id, :] += learning_rate * (error * U.loc[u_id, :] - regularization * V.loc[i_id, :])

            total_loss += error**2

        print(f"Epoch {epoch+1}, Loss: {total_loss / len(ratings)}")

    return U, V

Running this as-is would take a long time due to the fact we have >6M records to calculate gradients for in each epoch. So we sample using 10k ratings.

In [24]:
U, V = initialise_matrices(users=users, items=movies, num_factors=28)
n_samples = 50000
epochs = 10
s = time.time()
U_hat, V_hat = SGD(df_user_movie_ratings.sort_values("userId")[:n_samples], U, V, num_epochs=epochs)
f = time.time() - s
print(f"Training took {f:.2f} seconds with {n_samples} samples, {f / epochs:.2f}s per epoch.")

Epoch 1, Loss: 1.874861329771778
Epoch 2, Loss: 0.8691365609967394
Epoch 3, Loss: 0.7522798454841122
Epoch 4, Loss: 0.6892999306627201
Epoch 5, Loss: 0.6464494844713912
Epoch 6, Loss: 0.6124548973533518
Epoch 7, Loss: 0.5826836674625513
Epoch 8, Loss: 0.5567244443231918
Epoch 9, Loss: 0.5319720788707184
Epoch 10, Loss: 0.5096319565306624
Training took 875.49 seconds with 50000 samples, 87.55s per epoch.


In [25]:
def predict_for_user(userId: int, U: pd.DataFrame, V: pd.DataFrame, topn: int = 10) -> pd.DataFrame:
    
    preds = np.dot(U.loc[userId, :], V.T)
    preds = pd.Series(preds, index=V.index, name="pred_ratings").sort_values(ascending=False)[:topn]
    pred_df = preds.reset_index().rename(columns={"index":"movieId"})

    return pred_df

In [26]:
# Predict 10 movies for user 8
userId = 8
movie_cols = ["title", "release_date", "vote_average", "original_language", "genres_expanded"]
preds = predict_for_user(userId, U, V)

In [27]:
preds

Unnamed: 0,movieId,pred_ratings
0,56811,5.581428
1,18500,5.556256
2,120729,5.530602
3,31863,5.494405
4,140300,5.466098
5,139609,5.455499
6,35405,5.434637
7,47245,5.419922
8,20720,5.413979
9,55194,5.400439


In [28]:
df_predictions = pd.merge(df_movies, preds, left_on="id", right_on="movieId", how="inner")[movie_cols + ["pred_ratings"]]
df_predictions

Unnamed: 0,title,release_date,vote_average,original_language,genres_expanded,pred_ratings
0,True Believer,1989-02-17,6.3,en,Crime Drama,5.494405
1,Cropsey,2010-04-29,6.1,en,Documentary Horror,5.581428
2,S*P*Y*S,1974-09-05,5.9,en,Action Comedy,5.434637
3,Jedi Junkies,2010-05-25,6.3,en,Documentary,5.400439
4,Journey with Papa,1982-01-01,6.3,it,Comedy,5.419922
5,Gabriel Iglesias: Hot and Fluffy,2007-08-04,6.9,en,Comedy,5.413979
6,Les quatre vérités,1962-12-21,4.0,en,,5.455499
7,Kung Fu Panda 3,2016-01-23,6.7,en,Action Adventure Animation Comedy Family,5.466098
8,An Englishman Abroad,1983-11-29,6.0,en,Drama TV Movie Comedy,5.530602
9,The Sandlot 2,2005-05-03,5.2,en,Comedy Family,5.556256


In [103]:
df_predictions = pd.merge(df_movies, preds, left_on="id", right_on="movieId", how="inner")[movie_cols + ["pred_ratings"]]
df_predictions

Unnamed: 0,title,release_date,vote_average,original_language,genres_expanded,pred_ratings
0,The Outlaw,1943-02-05,4.9,en,Action Adventure Western,5.427951
1,Worth Winning,1989-10-27,6.0,en,Comedy Romance,5.466124
2,Lord Jim,1965-02-15,6.8,en,Action Adventure Drama,5.421635
3,Half Nelson,2006-08-11,6.5,en,Drama,5.456646
4,Heartbeat Detector,2007-09-12,6.5,fr,Drama Thriller Foreign,5.413094
5,Tropical Fish,1995-08-01,7.0,zh,Comedy Drama,5.283605
6,Bitter Feast,2010-07-08,5.5,en,Horror Thriller,5.478699
7,Bang Bang!,2014-10-02,6.3,hi,Action Adventure Comedy Romance Thriller,5.292882
8,Secrets of Life,1956-11-06,8.0,en,,5.425788
9,The Gerson Miracle,2004-01-01,6.8,en,Documentary,5.390791


In [104]:
df_user_movie_ratings[df_user_movie_ratings.userId == userId].merge(df_movies, how="inner", left_on="movieId", right_on="id").sort_values("rating", ascending=False)[movie_cols + ["rating"]]

Unnamed: 0,title,release_date,vote_average,original_language,genres_expanded,rating
2,Hard Target,1993-08-20,6.1,en,Action Adventure Crime Thriller,5.0
39,Street Kings,2008-04-10,6.3,en,Action Crime Drama Thriller,5.0
16,The Million Dollar Hotel,2000-02-09,5.9,en,Drama Thriller,5.0
32,Don't Come Knocking,2005-05-19,6.4,en,Drama Music,5.0
11,Barry Lyndon,1975-12-18,7.7,en,Drama Romance War,5.0
29,Greed,1924-12-04,7.5,en,Drama History,4.0
24,Dogville,2003-05-19,7.6,da,Crime Drama Thriller,4.0
6,Nosferatu,1922-03-15,7.7,de,Fantasy Horror,4.0
31,Final Fantasy VII: Advent Children,2005-07-14,6.7,ja,Action Adventure Animation Fantasy,4.0
20,Nowhere in Africa,2001-12-11,7.0,de,Drama,4.0


Therefore we refactor using PyTorch to attach GPU resources and increase the number of samples that we use to train. 

## Matrix Factorisation SGD (with PyTorch)

In [27]:
class RatingDatasetWithMapping(Dataset):
    """Dataset of user-item-rating triples with explicit ID mapping."""
    def __init__(self, ratings):
        self.ratings = [(u_id, i_id, rating) for (u_id, i_id), rating in ratings.items()]
        self.user_ids = sorted(list(set(u_id for u_id, _, _ in self.ratings)))
        self.item_ids = sorted(list(set(i_id for _, i_id, _ in self.ratings)))

        self.user_id_to_index = {user_id: index for index, user_id in enumerate(self.user_ids)}
        self.item_id_to_index = {item_id: index for index, item_id in enumerate(self.item_ids)}

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        user_id, item_id, rating = self.ratings[idx]
        user_index = self.user_id_to_index[user_id]
        item_index = self.item_id_to_index[item_id]
        return torch.tensor([user_index], dtype=torch.long), \
               torch.tensor([item_index], dtype=torch.long), \
               torch.tensor([rating], dtype=torch.float32)

def initialise_matrices_torch(users, items, num_factors, device):
    """Initializes user and item feature matrices as PyTorch tensors."""
    num_users = len(users)
    num_items = len(items)
    
    U = torch.randn(num_users, num_factors, requires_grad=True, device=device)
    V = torch.randn(num_items, num_factors, requires_grad=True, device=device)

    return U, V


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def predict_rating_torch(user_vector, item_vector):
    """Predicts the rating using the dot product of user and item tensors."""
    return torch.dot(user_vector, item_vector)

def mse_loss_torch(predictions, targets):
    """Calculates the Mean Squared Error loss."""
    return torch.mean((predictions - targets)**2)

def SGD_torch(ratings, U, V, users, items, learning_rate=0.01, num_epochs=100, regularization=0.02):
    """Trains the user and item feature matrices using PyTorch and SGD."""
    optimizer = torch.optim.SGD([U, V], lr=learning_rate, weight_decay=regularization)
    known_ratings = [(torch.tensor([row["userId"]], dtype=torch.long, device=device),
                      torch.tensor([row["movieId"]], dtype=torch.long, device=device),
                      torch.tensor([row["rating"]], dtype=torch.float32, device=device))
                     for ix, row in ratings.iterrows()]

    for epoch in range(num_epochs):
        np.random.shuffle(known_ratings)
        total_loss = 0
        for user, item, target_rating in known_ratings:
            # Zero the gradients from the previous iteration
            optimizer.zero_grad()

            user_idx = np.where(users == user)
            item_idx = np.where(items == item)

            # Get the user and item vectors
            user_vector = U[user_idx].flatten()
            item_vector = V[item_idx].flatten()

            # Predict the rating
            prediction = predict_rating_torch(user_vector, item_vector)

            # Calculate the loss
            loss = mse_loss_torch(prediction, target_rating)

            # Backpropagate the gradients
            loss.backward()

            # Update the parameters
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}, Loss: {total_loss / len(known_ratings)}")

    return U, V

In [29]:
U_torch_init, V_torch_init = initialise_matrices_torch(users=users, items=movies, num_factors=28, device=device)
U_torch_trained, V_torch_trained = SGD_torch(df_ratings, U_torch_init, V_torch_init, users, movies, learning_rate=0.05, num_epochs=5)

print("\nTrained User Matrix (U_torch_trained):\n", U_torch_trained.cpu().detach().numpy())
print("\nTrained Item Matrix (V_torch_trained):\n", V_torch_trained.cpu().detach().numpy())

KeyboardInterrupt: 