### Colab Setup (Don't run this cell if you're not using Colab)

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
movie_titles_path = '/content/drive/MyDrive/CSC 422/CSC422 Class Project/codes/prize_dataset/movie_titles.csv'
movie_metadata_path = '/content/drive/MyDrive/CSC 422/CSC422 Class Project/codes/checkpoints/merge4.csv'
combined_data_1_path = '/content/drive/MyDrive/CSC 422/CSC422 Class Project/codes/prize_dataset/combined_data_1.txt'
bellkor_requirements_path = './BellkorAlgorithm/requirements.txt'
bellkor_import_path = 'BellkorAlgorithm'

### Non-Colab Setup

In [19]:
movie_titles_path = '../prize_dataset/movie_titles.csv'
movie_metadata_path = '../IMDB_data/merge4.csv'
combined_data_1_path = '../prize_dataset/combined_data_1.txt'
bellkor_requirements_path = './BellkorAlgorithm/requirements.txt'
bellkor_import_path = 'BellkorAlgorithm/Bellkor'

In [3]:
# See the GPU specs
!nvidia-smi

Tue Apr 11 17:51:08 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# To store the data
#import pandas as pd

# To do linear algebra
#import numpy as np

# To create plots
#import matplotlib.pyplot as plt

# To create interactive plots
#import nbformat
#from plotly.offline import init_notebook_mode, plot, iplot
#import plotly.graph_objs as go
#init_notebook_mode(connected=True)

# To compute similarities between vectors
#from sklearn.metrics import mean_squared_error
#from sklearn.metrics.pairwise import cosine_similarity
##from sklearn.feature_extraction.text import TfidfVectorizer

# To create sparse matrices
#from scipy.sparse import coo_matrix

# To stack sparse matrices
#from scipy.sparse import vstack

In [4]:
import pandas as pd
import numpy as np

## Load Data

### Load Movie Tiles w/o metadata

In [5]:
from io import StringIO
import re

for_pd = StringIO()
with open(movie_titles_path, encoding = 'ISO-8859-1') as movie_titles:
    for line in movie_titles:
        new_line = re.sub(r',', '|', line.rstrip(), count=2)
        print (new_line, file=for_pd)

for_pd.seek(0)

movie_titles = pd.read_csv(for_pd, sep='|', header=None, names=['Id', 'Year', 'Name']).set_index('Id')
del for_pd

print('Shape Movie-Titles:\t{}'.format(movie_titles.shape))
movie_titles.sample(5)

Shape Movie-Titles:	(17770, 2)


Unnamed: 0_level_0,Year,Name
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
17619,1947.0,Out of the Past
15245,1996.0,Hetty Wainthropp Investigates: Series 1
3031,1996.0,Pompatus of Love
4492,2004.0,Club Dread
3351,1987.0,Gospel According to Al Green


### Load Movie Titles w/ metadata

In [6]:
movie_metadata_raw = pd.read_csv(movie_metadata_path)

movie_metadata = movie_metadata_raw[movie_metadata_raw['imdbID'].notnull()][['Name', 'description', 'NumRating']].set_index('Name').dropna()
del movie_metadata_raw
movie_metadata.sample(5)

Unnamed: 0_level_0,description,NumRating
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Legend of Crystania: The Motion Picture,Dreams begin to haunt Lady Sheru -- dreams of ...,252.0
America's Most Haunted Inns,Have you ever seen a ghost? After seeing this ...,40.0
Sacred Silence,A young priest crusades against organized crim...,407.0
Moonlight and Valentino,A young widow still grieving over the death of...,3815.0
Bullet,"Paroled after 8 years in prison, Bullet's pick...",8617.0


### Load user-data structure (1/4 to save memory + speed up compute) and preprocess to extract all rating to form a matrix. File structure is messy mix of json and csv.

In [7]:
from collections import deque

# Load single data-file
df_raw = pd.read_csv(combined_data_1_path, header=None, names=['User', 'Rating', 'Date'], usecols=[0, 1, 2])


# Find empty rows to slice dataframe for each movie
tmp_movies = df_raw[df_raw['Rating'].isna()]['User'].reset_index()
movie_indices = [[index, int(movie[:-1])] for index, movie in tmp_movies.values]

# Shift the movie_indices by one to get start and endpoints of all movies
shifted_movie_indices = deque(movie_indices)
shifted_movie_indices.rotate(-1)


# Gather all dataframes
user_data = []

# Iterate over all movies
for [df_id_1, movie_id], [df_id_2, next_movie_id] in zip(movie_indices, shifted_movie_indices):
    
    # Check if it is the last movie in the file
    if df_id_1<df_id_2:
        tmp_df = df_raw.loc[df_id_1+1:df_id_2-1].copy()
    else:
        tmp_df = df_raw.loc[df_id_1+1:].copy()
        
    # Create movie_id column
    tmp_df['Movie'] = movie_id
    
    # Append dataframe to list
    user_data.append(tmp_df)

# Combine all dataframes
df = pd.concat(user_data)
del user_data, df_raw, tmp_movies, tmp_df, shifted_movie_indices, movie_indices, df_id_1, movie_id, df_id_2, next_movie_id
print('Shape User-Ratings:\t{}'.format(df.shape))
df.sample(5)

Shape User-Ratings:	(24053764, 4)


Unnamed: 0,User,Rating,Date,Movie
20836965,1692256,4.0,2003-11-03,3925
22314237,1130409,5.0,2004-11-10,4227
20785876,2473613,1.0,2005-06-23,3925
23622086,743002,4.0,2004-11-30,4420
10908647,2230283,5.0,2004-10-26,2122


#### More formatting for user-data and only use X of the users (choose users with the most ratings) from (1/4) of the total data. Number subject to change.

In [8]:
unique_movies = df['Movie'].nunique()
unique_users = df['User'].nunique()

print(f'Number of unique users:\t{unique_users}')
print(f'Number of unique movies:\t{unique_movies}')

pct_movies = unique_movies
pct_users = int(unique_users * 0.3)

filter_movies = df['Movie'].value_counts().sort_values(ascending=False)[:pct_movies].index

filter_users = df['User'].value_counts().sort_values(ascending=False)[:pct_users].index

df_filtered = df[df["Movie"].isin(filter_movies) & df["User"].isin(filter_users)]
del filter_movies, filter_users, df

# rename the users and movies with new ids start from 0
df_filtered['User'] = df_filtered['User'].astype("category")
df_filtered['Movie'] = df_filtered['Movie'].astype("category")
df_filtered['User'] = df_filtered['User'].cat.codes.values
df_filtered['Movie'] = df_filtered['Movie'].cat.codes.values

# make user the index and sort the index
df_filtered.set_index('User', inplace=True)
df_filtered.sort_index(inplace=True)

print(f'Number users: {df_filtered.index.nunique()}')
print(f'Number movies: {df_filtered["Movie"].nunique()}')
print(f'Shape: {df_filtered.shape}')
df_filtered.head(5)

Number of unique users:	470758
Number of unique movies:	4499
Number users: 141227
Number movies: 4499
Shape: (18338551, 3)


Unnamed: 0_level_0,Rating,Date,Movie
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4.0,2004-04-29,808
0,2.0,2005-10-04,4091
0,3.0,2005-05-03,2455
0,3.0,2004-08-03,110
0,4.0,2004-11-09,1743


### Shuffle the filtered dataframe and split into train and test set

In [9]:
# Shuffle DataFrame
df_filtered = df_filtered.sample(frac=1).reset_index()

percent_test = .2

# create random seed
import random
seed = random.seed(42)


# Split train and set set based on percentage
df_train = df_filtered.sample(frac=1-percent_test, random_state=seed).reset_index(drop=True)
df_test = df_filtered.drop(df_train.index).reset_index(drop=True)

# split into X and y
X_train = df_train.drop('Rating', axis=1)
y_train = df_train['Rating']

X_test = df_test.drop('Rating', axis=1)
y_test = df_test['Rating']

df_train.head(10)

Unnamed: 0,User,Rating,Date,Movie
0,33535,4.0,2003-05-30,758
1,47919,5.0,2005-07-16,3863
2,82542,4.0,2004-11-01,4392
3,98082,5.0,2005-07-21,850
4,100949,3.0,2004-05-27,1598
5,68264,5.0,2005-09-27,3017
6,23654,4.0,2005-06-01,2432
7,127163,3.0,2004-06-21,4261
8,8324,1.0,2004-08-06,3611
9,7247,2.0,2005-08-26,2557


## Baseline Algorithms

### Bellkor Algorithm 
Uses library from https://github.com/dandxy89/BellkorAlgorithm<br>
Based on paper https://www2.seas.gwu.edu/~simhaweb/champalg/cf/papers/KorenBellKor2009.pdf

##### Setup

In [10]:
!git clone https://github.com/dandxy89/BellkorAlgorithm
#!pip install -r {bellkor_requirements_path}

fatal: destination path 'BellkorAlgorithm' already exists and is not an empty directory.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ipython==8.10.0
  Using cached ipython-8.10.0-py3-none-any.whl (784 kB)
Collecting notebook==6.4.12
  Using cached notebook-6.4.12-py3-none-any.whl (9.9 MB)
Collecting numpy==1.22.0
  Using cached numpy-1.22.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB)
Collecting pandas==1.0.3
  Using cached pandas-1.0.3.tar.gz (5.0 MB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpip subprocess to install build dependencies[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Installing build dependencies ... [?25l[?25herror
[1;31merror[0m: [1msubprocess-exited-with-error[0m

[31m×[0m [32mp

In [11]:
import sys
sys.path.append(f'/content/{bellkor_import_path}')
from Bellkor.Algorithm import BellkorAlgorithm

##### Train Data

In [12]:
import time
import datetime

df_train_timestamp = df_train.copy().rename(columns={'Date': 'timestamp'})
df_train_timestamp['timestamp'] = pd.to_datetime(df_train['Date']).apply(lambda x: datetime.datetime.timestamp(x)).astype(int)
start_time = df_train_timestamp["timestamp"].min()
end_time = df_train_timestamp["timestamp"].max()


adjusted_start_day = int(time.mktime(datetime.datetime.fromtimestamp(start_time).date().timetuple()))
adjusted_end_day = int(time.mktime(datetime.datetime.fromtimestamp(end_time).date().timetuple())) + 86400
movie_count = df_train_timestamp["Movie"].nunique()
user_count = df_train_timestamp["User"].nunique()
global_mean = df_train_timestamp["Rating"].mean()
average_df = df_train_timestamp.groupby("User")["timestamp"].mean().reset_index()
average_times = pd.Series(average_df.timestamp.values, index=average_df.User).to_dict()

In [13]:
calibrator = BellkorAlgorithm(n_items=movie_count, 
                              n_users=user_count, 
                              global_mean=global_mean,
                              time_setting=dict(Start=adjusted_start_day, 
                                                End=adjusted_end_day))

In [14]:
#rename the columns
X = df_train_timestamp.rename(columns={'User': 'UserId', 'Movie': 'MovieId', 'Rating': 'rating'}, inplace=False)
indices = X.index.values
X = X.loc[:, ["timestamp", "UserId", "MovieId", "rating"]].to_numpy()

# add index to the front of x
X = np.insert(X, 0, indices, axis=1)

In [15]:
# TODO: increase sample size and #iterations for algorithm
# TODO: Probably need to run a script to auto optimize parameters on this
cost, error = calibrator.train(x=X, average_times=average_times, sample_size=100, iterations=1000)

##### Test Data

In [16]:
# calc average_times and X for test data
import time
import datetime

df_test_timestamp = df_test.copy().rename(columns={'Date': 'timestamp'})
df_test_timestamp['timestamp'] = pd.to_datetime(df_filtered['Date']).apply(lambda x: datetime.datetime.timestamp(x)).astype(int)
start_time = df_test_timestamp["timestamp"].min()
end_time = df_test_timestamp["timestamp"].max()


adjusted_start_day = int(time.mktime(datetime.datetime.fromtimestamp(start_time).date().timetuple()))
adjusted_end_day = int(time.mktime(datetime.datetime.fromtimestamp(end_time).date().timetuple())) + 86400
movie_count = df_test_timestamp["Movie"].nunique()
user_count = df_test_timestamp["User"].nunique()
global_mean = df_test_timestamp["Rating"].mean()
average_df = df_test_timestamp.groupby("User")["timestamp"].mean().reset_index()
average_times = pd.Series(average_df.timestamp.values, index=average_df.User).to_dict()

In [17]:
#rename the columns
X = df_test_timestamp.rename(columns={'User': 'UserId', 'Movie': 'MovieId', 'Rating': 'rating'}, inplace=False)
indices = X.index.values
X = X.loc[:, ["timestamp", "UserId", "MovieId", "rating"]].to_numpy()

# add index to the front of x
X = np.insert(X, 0, indices, axis=1)

In [18]:
preds = calibrator.predict(x=X, average_times=average_times)

In [19]:
# Evaluation
from sklearn.metrics import mean_squared_error

def evaluate(preds, y_test):
    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(preds.values, y_test.values))
    print("RMSE: {}".format(rmse))

In [20]:
predictions = pd.DataFrame(data=preds, columns=["Index", "Prediction"])
predictions.head(n=10)

# convert predictions to series
predictions = pd.Series(data=predictions["Prediction"].values, index=predictions["Index"].values)

In [21]:
evaluate(predictions, y_test)

RMSE: 1.0279828962725628


### Weighted Mean Rating (!!!Needs conversion for RMSE testing)

##### Setup
Create sparse matrix. Each row represents a user and its ratings and the columns are the movies. We're interested in finding the empty values (unrated movies for that user).

In [None]:
# Create a user-movie matrix with empty values
df_p = df_train.pivot_table(index='User', columns='Movie', values='Rating')
print('Shape User-Movie-Matrix:\t{}'.format(df_p.shape))
df_p.sample(3)

##### Training

In [None]:
# Number of minimum votes to be considered
m = 1000

# Mean rating for all movies
C = df_p.stack().mean()

# Mean rating for all movies separatly
R = df_p.mean(axis=0).values

# Rating count for all movies separatly
v = df_p.count().values


# Weighted formula to compute the weighted rating
weighted_score = (v/ (v+m) *R) + (m/ (v+m) *C)
# Sort ids to ranking
weighted_ranking = np.argsort(weighted_score)[::-1]
# Sort scores to ranking
weighted_score = np.sort(weighted_score)[::-1]
# Get movie ids
weighted_movie_ids = df_p.columns[weighted_ranking]


# Join labels and predictions
df_prediction = df_test.set_index('Movie').join(pd.DataFrame(weighted_score, index=weighted_movie_ids, columns=['Prediction']))[['Rating', 'Prediction']]
y_true = df_prediction['Rating']
y_pred = df_prediction['Prediction']

# Compute RMSE
rmse = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))

### Cosine User-User Similarity (!!!Needs conversion for RMSE testing) (prob not use) (Don't use midterm paper)
Interpreting each row of the matrix as a vector, a similarity between all user-vectors can be computed. This enables us to find all similar users and to work on user-specific recommendations. **Recommending high rated movies of similar users** to a specific user seems reasonable.<br>
Since there are still empty values left in the matrix, we have to use a reliable way to impute a decent value. A simple first approach is to **fill in the mean of each user into the empty values.**<br>
Afterwards the **ratings of all similar users will be weighted with their similarity score and the mean will be computed.** Filtering for the unrated movies of a user reveals the best recommendations.<br>
You can easily adapt this process to find similar items by computing the item-item similarity the same way. Since the matrix is mostly sparse and there are more users than items, this could be better for the RMSE score.

In [None]:
#from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
##from sklearn.feature_extraction.text import TfidfVectorizer

# User index for recommendation
user_index = 0

# Number of similar users for recommendation
n_recommendation = 100

# Plot top n recommendations
n_plot = 10


# Fill in missing values
df_p_imputed = df_p.T.fillna(df_p.mean(axis=1)).T

# Compute similarity between all users
similarity = cosine_similarity(df_p_imputed.values)

# Remove self-similarity from similarity-matrix
similarity -= np.eye(similarity.shape[0])


# Sort similar users by index
similar_user_index = np.argsort(similarity[user_index])[::-1]
# Sort similar users by score
similar_user_score = np.sort(similarity[user_index])[::-1]


# Get unrated movies
unrated_movies = df_p.iloc[user_index][df_p.iloc[user_index].isna()].index

# Weight ratings of the top n most similar users with their rating and compute the mean for each movie
mean_movie_recommendations = (df_p_imputed.iloc[similar_user_index[:n_recommendation]].T * similar_user_score[:n_recommendation]).T.mean(axis=0)

# Filter for unrated movies and sort results
best_movie_recommendations = mean_movie_recommendations[unrated_movies].sort_values(ascending=False).to_frame().join(movie_titles)


# Create user-id mapping
user_id_mapping = {id:i for i, id in enumerate(df_p_imputed.index)}

prediction = []
# Iterate over all testset items
for user_id in df_test['User'].unique():
    
    # Sort similar users by index
    similar_user_index = np.argsort(similarity[user_id_mapping[user_id]])[::-1]
    # Sort similar users by score
    similar_user_score = np.sort(similarity[user_id_mapping[user_id]])[::-1]
    
    for movie_id in df_test[df_test['User']==user_id]['Movie'].values:

        # Compute predicted score
        score = (df_p_imputed.iloc[similar_user_index[:n_recommendation]][movie_id] * similar_user_score[:n_recommendation]).values.sum() / similar_user_score[:n_recommendation].sum()
        prediction.append([user_id, movie_id, score])
        

# Create prediction DataFrame
df_pred = pd.DataFrame(prediction, columns=['User', 'Movie', 'Prediction']).set_index(['User', 'Movie'])
df_pred = df_test.set_index(['User', 'Movie']).join(df_pred)


# Get labels and predictions
y_true = df_pred['Rating'].values
y_pred = df_pred['Prediction'].values

# Compute RMSE
rmse = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))


# Create trace
trace = go.Bar(x = best_movie_recommendations.iloc[:n_plot, 0],
               text = best_movie_recommendations['Name'],
               textposition = 'inside',
               textfont = dict(color = '#000000'),
               orientation = 'h',
               y = list(range(1, n_plot+1)),
               marker = dict(color = '#db0000'))
# Create layout
layout = dict(title = 'Ranking Of Top {} Recommended Movies For A User Based On Similarity: {:.4f} RMSE'.format(n_plot, rmse),
              xaxis = dict(title = 'Recommendation-Rating',
                           range = (4.1, 4.5)),
              yaxis = dict(title = 'Movie'))
# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

## Machine Learning Models

### Matrix Factorization (Dot Product) w/ hidden layers
Uses embeddings to represent users and movies. The dot product of user embeddings (n_users x e_dims) and movie embedding matrix (n_movies x e_dims) is a good approx of rating from user to movie.

##### Setup

In [None]:
df_filtered = df_filtered.drop('Date', axis=1)
X_train = X_train.drop('Date', axis=1)
X_test = X_test.drop('Date', axis=1)

NameError: name 'df_filtered' is not defined

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


# TODO: Figure out if we the code for correct mapping or can just use the original data

# # Create user- & movie-id mapping
# user_id_mapping = {id:i for i, id in enumerate(df_filtered['User'].unique())}
# movie_id_mapping = {id:i for i, id in enumerate(df_filtered['Movie'].unique())}

# # Create correctly mapped train- & testset
# train_user_data = df_train['User'].map(user_id_mapping)
# train_movie_data = df_train['Movie'].map(movie_id_mapping)

# test_user_data = df_test['User'].map(user_id_mapping)
# test_movie_data = df_test['Movie'].map(movie_id_mapping)

# # Get input variable-sizes
# users = len(user_id_mapping)
# movies = len(movie_id_mapping)
# embedding_size = 10

class MovieDataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x[0])

    def __getitem__(self, idx):
        return self.x[0][idx], self.x[1][idx], self.y[idx]

class RecommenderModel(nn.Module):
    def __init__(self, n_users, n_movies, e_dimension):
        super(RecommenderModel, self).__init__()
        self.user_embedding = nn.Embedding(n_users, e_dimension)
        self.movie_embedding = nn.Embedding(n_movies, e_dimension)
        self.dropout = nn.Dropout(0.2)
        self.fc1 = nn.Linear(2 * e_dimension + 1, 256)
        self.fc2 = nn.Linear(256, 1)

    def forward(self, user, movie):
        u = self.user_embedding(user).squeeze(1)
        m = self.movie_embedding(movie).squeeze(1)
        x = torch.mul(u, m).sum(1).unsqueeze(1)
        x = torch.cat([u, m, x], dim=1)
        x = self.dropout(x)
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x.flatten()

def train(model, dataloader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    for user, movie, rating in dataloader:
        user, movie, rating = user.to(device), movie.to(device), rating.to(device)
        optimizer.zero_grad()
        outputs = model(user, movie)
        loss = criterion(outputs, rating)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * user.size(0)
    return running_loss / len(dataloader.dataset)

def validate(model, dataloader, criterion, device):
    with torch.no_grad():
        model.eval()
        running_loss = 0.0
        with torch.no_grad():
            for user, movie, rating in dataloader:
                user, movie, rating = user.to(device), movie.to(device), rating.to(device)
                outputs = model(user, movie)
                loss = criterion(outputs, rating)
                running_loss += loss.item() * user.size(0)
        return running_loss / len(dataloader.dataset)

e_dimension = 10

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RecommenderModel(n_users, n_movies, e_dimension).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)

train_dataset = MovieDataset(torch.tensor(X_train.values, dtype=torch.long), torch.tensor(y_train.values, dtype=torch.float))
val_dataset = MovieDataset(torch.tensor(X_test.values, dtype=torch.long), torch.tensor(y_test.values, dtype=torch.float))
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False)

n_epochs = 50
best_val_loss = float('inf')
bad_epochs = 0
for epoch in range(n_epochs):
    train_loss = train(model, train_dataloader, criterion, optimizer, device)
    val_loss = validate(model, val_dataloader, criterion, device)
    print(f"Epoch {epoch+1}/{n_epochs} - Train Loss: {train_loss}, Val Loss: {val_loss}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "Model_1.pt")
    else:
        bad_epochs += 1
        if bad_epochs >= 5:
            print("Early stopping")
            break

# Evaluation
from sklearn.metrics import mean_squared_error
import numpy as np

def evaluate(model, dataloader, device):
    with torch.no_grad():
        model.eval()
        predictions = []
        ground_truth = []
        with torch.no_grad():
            for user, movie, rating in dataloader:
                user, movie, rating = user.to(device), movie.to(device), rating.to(device)
                outputs = model(user, movie)
                predictions.extend(outputs.view(-1).cpu().numpy())
                ground_truth.extend(rating.view(-1).cpu().numpy())
        return np.sqrt(mean_squared_error(ground_truth, predictions))

test_dataset = MovieDataset(torch.tensor(X_test.values, dtype=torch.long), torch.tensor(y_test.values, dtype=torch.float))
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Load the best model
model.load_state_dict(torch.load("model_mf_dot_product_w_hidden.pt"))

# Evaluate the model on the test set
rmse = evaluate(model, test_dataloader, device)
print(f"Test RMSE: {rmse}")

# Making predictions
def predict(model, user, movie, device):
    with torch.no_grad():
        model.eval()
        user_tensor = torch.tensor([user], dtype=torch.long, device=device).unsqueeze(0)
        movie_tensor = torch.tensor([movie], dtype=torch.long, device=device).unsqueeze(0)
        output = model(user_tensor, movie_tensor)
        return output.item()

user_id = 1
movie_id = 100

#prediction = predict(model, user_id, movie_id, device)
#print(f"Predicted rating for user {user_id} and movie {movie_id}: {prediction}")

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.05 GiB (GPU 0; 10.76 GiB total capacity; 9.94 GiB already allocated; 57.44 MiB free; 9.96 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

#### Matrix Factorization + Gradient Descent
Reduces dimensionality to represent data in dense form using embeddings. Then calculates dot product of user and movie embeddings to get rating prediction. Uses gradient descent to optimize embeddings.

##### Setup

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_squared_error


# Create user- & movie-id mapping
user_id_mapping = {id:i for i, id in enumerate(df_filtered['User'].unique())}
movie_id_mapping = {id:i for i, id in enumerate(df_filtered['Movie'].unique())}

# Create correctly mapped train- & testset
train_user_data = X_train['User'].map(user_id_mapping)
train_movie_data = X_train['Movie'].map(movie_id_mapping)

test_user_data = X_test['User'].map(user_id_mapping)
test_movie_data = X_test['Movie'].map(movie_id_mapping)

# Get input variable-sizes
users = len(user_id_mapping)
movies = len(movie_id_mapping)
embedding_size = 10

##### Train/Test

In [None]:
class MatrixFactorizationDataset(Dataset):
    def __init__(self, users, movies, ratings):
        self.users = users
        self.movies = movies
        self.ratings = ratings

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.movies[idx], self.ratings[idx]


class MatrixFactorization(nn.Module):
    def __init__(self, users, movies, embedding_size):
        super(MatrixFactorization, self).__init__()
        self.user_embedding = nn.Embedding(users, embedding_size)
        self.movie_embedding = nn.Embedding(movies, embedding_size)

    def forward(self, user_ids, movie_ids):
        user_vectors = self.user_embedding(user_ids)
        movie_vectors = self.movie_embedding(movie_ids)
        y = torch.sum(user_vectors * movie_vectors, dim=1)
        return y


# Create Dataset and DataLoader
train_dataset = MatrixFactorizationDataset(train_user_data.values, train_movie_data.values, df_train['Rating'].values)
train_dataloader = DataLoader(train_dataset, batch_size=256, shuffle=True)

# Initialize the model, loss function, and optimizer
model = MatrixFactorization(users, movies, embedding_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters())

epochs = 10

# Train the model
model.train()
for epoch in range(epochs):
    for user_ids, movie_ids, ratings in train_dataloader:
        user_ids = user_ids.long()
        movie_ids = movie_ids.long()
        ratings = ratings.float()
        
        optimizer.zero_grad()
        y_pred = model(user_ids, movie_ids)
        loss = criterion(y_pred, ratings)
        loss.backward()
        optimizer.step()
    print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, epochs, loss.item()))

# Test the model
model.eval()
with torch.no_grad():
    test_user_data_tensor = torch.tensor(test_user_data.values).long()
    test_movie_data_tensor = torch.tensor(test_movie_data.values).long()
    y_pred = model(test_user_data_tensor, test_movie_data_tensor).numpy()
y_true = df_test['Rating'].values

# Compute RMSE
rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('\n\nTesting Result With PyTorch Matrix-Factorization: {:.4f} RMSE'.format(rmse))


Epoch [1/10], Loss: 1.0346
Epoch [2/10], Loss: 0.9365
Epoch [3/10], Loss: 0.7360


KeyboardInterrupt: 

In [None]:
# save model to disk
torch.save(model.state_dict(), "model_nn_matrix_w_gradient.pt")

NameError: name 'torch' is not defined

### Matrix Factorization (hidden layers)

##### Setup

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_squared_error
import numpy as np

# Setup variables
user_embedding_size = 20
movie_embedding_size = 10

# Create user- & movie-id mapping
user_id_mapping = {id:i for i, id in enumerate(df_filterd['User'].unique())}
movie_id_mapping = {id:i for i, id in enumerate(df_filterd['Movie'].unique())}

# Create correctly mapped train- & testset
train_user_data = df_train['User'].map(user_id_mapping)
train_movie_data = df_train['Movie'].map(movie_id_mapping)

test_user_data = df_test['User'].map(user_id_mapping)
test_movie_data = df_test['Movie'].map(movie_id_mapping)

# Get input variable-sizes
users = len(user_id_mapping)
movies = len(movie_id_mapping)

##### Train/Test

In [None]:
class Recommender(nn.Module):
    def __init__(self, user_embedding_size, movie_embedding_size, users, movies):
        super(Recommender, self).__init__()
        self.user_embedding = nn.Embedding(users, user_embedding_size)
        self.movie_embedding = nn.Embedding(movies, movie_embedding_size)
        self.fc1 = nn.Linear(user_embedding_size + movie_embedding_size, 256)
        self.fc2 = nn.Linear(256, 1)

    def forward(self, user_ids, movie_ids):
        user_vector = self.user_embedding(user_ids)
        movie_vector = self.movie_embedding(movie_ids)
        concat = torch.cat((user_vector, movie_vector), dim=-1)
        dense = self.fc1(concat)
        y = self.fc2(dense)
        return y

model = Recommender(user_embedding_size, movie_embedding_size, users, movies)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters())

# Prepare data
train_data = TensorDataset(torch.tensor(train_user_data.values, dtype=torch.int32),
                            torch.tensor(train_movie_data.values, dtype=torch.int32),
                            torch.tensor(df_train['Rating'].values, dtype=torch.float))
train_loader = DataLoader(train_data, batch_size=256, shuffle=True)

epochs = 10

# Fit model
model.train()
for epoch in range(epochs):
    for _, (user_ids, movie_ids, ratings) in enumerate(train_loader):
        optimizer.zero_grad()
        predictions = model(user_ids, movie_ids).squeeze()
        loss = criterion(predictions, ratings)
        loss.backward()
        optimizer.step()

# Test model
test_data = TensorDataset(torch.tensor(test_user_data.values, dtype=torch.int32),
                          torch.tensor(test_movie_data.values, dtype=torch.int32),
                          torch.tensor(df_test['Rating'].values, dtype=torch.float))
test_loader = DataLoader(test_data, batch_size=256)

model.eval()
y_pred = []
y_true = []
with torch.no_grad():
    for _, (user_ids, movie_ids, ratings) in enumerate(test_loader):
        predictions = model(user_ids, movie_ids).squeeze().tolist()
        y_pred.extend(predictions)
        y_true.extend(ratings.tolist())

# Compute RMSE
rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('\n\nTesting Result With PyTorch Deep Learning: {:.4f} RMSE'.format(rmse))




Testing Result With PyTorch Deep Learning: 0.9142 RMSE


In [None]:
# save model to disk
torch.save(model.state_dict(), "model_nn_matrix_w_gradient.pt")

### The Deep Hybrid System with Metadata
Uses movie metadata to improve recommendations. Currently only uses tf-idf vectorizations of descriptions (might swap/add tf-idf keywords in future). Metadata is combined with embeddings of user-id and movie-id. Reduces cold-start problem.

##### Setup

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from scipy.sparse import vstack
import numpy as np

A faster tensor data loader to batch load and speed up tabular data loading. Modified from source to support sparse tensors.

Source: https://github.com/hcarlens/pytorch-tabular/blob/master/fast_tensor_data_loader.py

In [None]:
# Create user- & movie-id mapping
user_id_mapping = {id:i for i, id in enumerate(df_filterd['User'].unique())}
movie_id_mapping = {id:i for i, id in enumerate(df_filterd['Movie'].unique())}

# Use mapping to get better ids
df_filterd['User'] = df_filterd['User'].map(user_id_mapping)
df_filterd['Movie'] = df_filterd['Movie'].map(movie_id_mapping)


##### Combine both datasets to get movies with metadata
# Preprocess metadata
tmp_metadata = movie_metadata.copy()
tmp_metadata.index = tmp_metadata.index.str.lower()

# Preprocess titles
tmp_titles = movie_titles.drop('Year', axis=1).copy()
tmp_titles = tmp_titles.reset_index().set_index('Name')
tmp_titles.index = tmp_titles.index.str.lower()

# Combine titles and metadata
df_id_descriptions = tmp_titles.join(tmp_metadata).dropna().set_index('Id')
df_id_descriptions['description'] = df_id_descriptions['description'].str.lower()
del tmp_metadata,tmp_titles

# Filter all ratings with metadata
df_hybrid = df_filterd.set_index('Movie').join(df_id_descriptions).dropna().drop('description', axis=1).reset_index().rename({'index':'Movie'}, axis=1)

NameError: name 'df_filterd' is not defined

In [None]:
# Split train- & testset
n = 100000
df_hybrid = df_hybrid.sample(frac=1).reset_index(drop=True)
df_hybrid_train = df_hybrid[:1500000]
df_hybrid_test = df_hybrid[-n:]

In [None]:
# Create tf-idf matrix for text comparison
tfidf = TfidfVectorizer(stop_words='english')
tfidf_hybrid = tfidf.fit_transform(df_id_descriptions['description'])

# Get mapping from movie-ids to indices in tfidf-matrix
mapping = {id:i for i, id in enumerate(df_id_descriptions.index)}

train_tfidf = []
# Iterate over all movie-ids and save the tfidf-vector
for id in df_hybrid_train['Movie'].values:
    index = mapping[id]
    train_tfidf.append(tfidf_hybrid[index])
    
test_tfidf = []
# Iterate over all movie-ids and save the tfidf-vector
for id in df_hybrid_test['Movie'].values:
    index = mapping[id]
    test_tfidf.append(tfidf_hybrid[index])

# Stack the sparse matrices
train_tfidf = vstack(train_tfidf)
test_tfidf = vstack(test_tfidf)

# TODO: Test if can remove below
# Create dense numpy arrays (might not need)
# tfidf_hybrid = tfidf_hybrid.toarray() # Convert to dense array


# # Get mapping from movie-ids to indices in tfidf-matrix
# mapping = {id:i for i, id in enumerate(df_id_descriptions.index)}

# train_tfidf = np.array([tfidf_hybrid[mapping[id]] for id in df_hybrid_train['Movie'].values]) # Dense tensor array
# test_tfidf = np.array([tfidf_hybrid[mapping[id]] for id in df_hybrid_test['Movie'].values]) # Dense tensor array

Create Datasets and dataloaders

In [None]:
# TODO: Remove commented


#train_dataset = HybridDataset(df_hybrid_train['User'].values, df_hybrid_train['Movie'].values, sparse_to_torch_sparse(train_tfidf), df_hybrid_train['Rating'].values)
#train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
#test_dataset = HybridDataset(df_hybrid_test['User'].values, df_hybrid_test['Movie'].values, sparse_to_torch_sparse(test_tfidf), df_hybrid_test['Rating'].values)
#test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
#train_dataset = TensorDataset(torch.tensor(df_hybrid_train['User'].values, dtype=torch.long), 
                              # torch.tensor(df_hybrid_train['Movie'].values, dtype=torch.long), 
                              # sparse_to_torch_sparse(train_tfidf), 
                              # torch.FloatTensor(df_hybrid_train['Rating'].values))
train_dataloader = FastTensorDataLoader(torch.tensor(df_hybrid_train['User'].values, dtype=torch.long), 
                              torch.tensor(df_hybrid_train['Movie'].values, dtype=torch.long), 
                              sparse_to_torch_sparse(train_tfidf), 
                              torch.FloatTensor(df_hybrid_train['Rating'].values), batch_size=batch_size, shuffle=True)
#test_dataset = TensorDataset(torch.tensor(df_hybrid_test['User'].values, dtype=torch.long), 
                            #  torch.tensor(df_hybrid_test['Movie'].values, dtype=torch.long), 
                            #  sparse_to_torch_sparse(test_tfidf), 
                            #  torch.FloatTensor(df_hybrid_test['Rating'].values))
test_dataloader = FastTensorDataLoader(torch.tensor(df_hybrid_test['User'].values, dtype=torch.long), 
                             torch.tensor(df_hybrid_test['Movie'].values, dtype=torch.long), 
                             sparse_to_torch_sparse(test_tfidf), 
                             torch.FloatTensor(df_hybrid_test['Rating'].values), batch_size=batch_size, shuffle=False)

NameError: name 'FastTensorDataLoader' is not defined

##### Train/Test

In [None]:
# Hyperparameters
user_embed_dim = 10
movie_embed_dim = 10
tfidf_dim = train_tfidf.shape[1]
num_users = len(user_id_mapping)
num_movies = len(movie_id_mapping)
epochs = 100
batch_size = 256

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
from torch.utils.data import DataLoader, TensorDataset

# Network definition
class HybridModel(nn.Module):
    def __init__(self, num_users, num_movies, user_embed_dim, movie_embed_dim, tfidf_dim):
        super(HybridModel, self).__init__()
        self.user_embedding = nn.Embedding(num_users, user_embed_dim)
        self.movie_embedding = nn.Embedding(num_movies, movie_embed_dim)
        self.fc1 = nn.Linear(tfidf_dim, 128)
        self.fc2 = nn.Linear(128, 32)
        self.fc3 = nn.Linear(user_embed_dim + movie_embed_dim + 32, 512)
        self.dropout = nn.Dropout(0.2)
        self.fc4 = nn.Linear(512, 1)

    def forward(self, user_ids, movie_ids, tfidf_vectors):
        user_embed = self.user_embedding(user_ids)
        movie_embed = self.movie_embedding(movie_ids)
        x = torch.relu(self.fc1(tfidf_vectors))
        x = torch.relu(self.fc2(x))
        x = torch.cat((user_embed, movie_embed, x), dim=-1)
        x = torch.relu(self.fc3(x))
        x = self.dropout(x)
        out = self.fc4(x)
        return out.squeeze()

# Training and testing
def train(model, dataloader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    for batch in dataloader:
        user_ids, movie_ids, tfidf_vectors, ratings = batch
        user_ids = user_ids.to(device)
        movie_ids = movie_ids.to(device)
        tfidf_vectors = tfidf_vectors.to(device)
        ratings = ratings.to(device)

        optimizer.zero_grad()
        predictions = model(user_ids, movie_ids, tfidf_vectors)
        loss = criterion(predictions, ratings)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(dataloader)

def test(model, dataloader, device):
    model.eval()
    y_pred = []
    y_true = []

    with torch.no_grad():
        for batch in dataloader:
            user_ids, movie_ids, tfidf_vectors, ratings = batch
            user_ids = user_ids.to(device)
            movie_ids = movie_ids.to(device)
            tfidf_vectors = tfidf_vectors.to(device)

            predictions = model(user_ids, movie_ids, tfidf_vectors)
            y_pred.extend(predictions.tolist())
            y_true.extend(ratings.tolist())
    
    return y_pred, y_true

# Convert sparse matrix to a PyTorch sparse tensor
def sparse_to_torch_sparse(data):
    values = data.data
    indices = np.vstack((data.nonzero()[0], data.nonzero()[1]))

    i = torch.LongTensor(indices)
    v = torch.FloatTensor(values)
    shape = data.shape

    return torch.sparse.FloatTensor(i, v, torch.Size(shape))

# Initialize model, criterion, and optimizer
model = HybridModel(num_users, num_movies, user_embed_dim, movie_embed_dim, tfidf_dim).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters())

# Train the model
for epoch in range(epochs):
    train_loss = train(model, train_dataloader, criterion, optimizer, device)
    y_pred, y_true = test(model, test_dataloader, device)
    rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
    print(f"Epoch {epoch+1}/{epochs}, Loss: {train_loss:.4f}, RMSE: {rmse:.4f}")

# Test the model
y_pred, y_true = test(model, test_dataloader, device)
rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print(f'\nTesting Result With PyTorch Hybrid Deep Learning: {rmse:.4f} RMSE')


Using a target size (torch.Size([1])) that is different to the input size (torch.Size([])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.



In [None]:
# save model to disk
torch.save(model.state_dict(), "/content/drive/MyDrive/CSC 422/CSC422 Class Project/codes/checkpoints/model_nn_hybrid_w_metadata.pt")

#### TPU Version (not currently using) (only works with dense tensors (need a tonnnnnnn of memory))

##### Setup

In [None]:
!pip install cloud-tpu-client==0.10 torch==2.0.0 torchvision==0.15.1 https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-2.0-cp39-cp39-linux_x86_64.whl

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch-xla==2.0
  Downloading https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-2.0-cp39-cp39-linux_x86_64.whl (115.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.7/115.7 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cloud-tpu-client==0.10
  Downloading cloud_tpu_client-0.10-py3-none-any.whl (7.4 kB)
Collecting google-api-python-client==1.8.0
  Downloading google_api_python_client-1.8.0-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.7/57.7 KB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting google-api-core<2dev,>=1.13.0
  Downloading google_api_core-1.34.0-py3-none-any.whl (120 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m120.2/120.2 KB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting uritemplate<4dev,>=3.0.0
  Downloading uritemplate-3.0.1-py2.

##### Train/Test

In [None]:
import torch_xla
import torch_xla.core.xla_model as xm

# Create a custom dataset for DataLoader
class HybridDataset(Dataset):
    def __init__(self, users, movies, tfidf, ratings):
        self.users = torch.tensor(users, dtype=torch.long)
        self.movies = torch.tensor(movies, dtype=torch.long)
        self.tfidf = tfidf
        self.ratings = torch.FloatTensor(ratings)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        tfidf_vector = self.tfidf[idx].squeeze()
        return self.users[idx], self.movies[idx], tfidf_vector, self.ratings[idx]

# Network definition
class HybridModel(nn.Module):
    def __init__(self, num_users, num_movies, user_embed_dim, movie_embed_dim, tfidf_dim):
        super(HybridModel, self).__init__()
        self.user_embedding = nn.Embedding(num_users, user_embed_dim)
        self.movie_embedding = nn.Embedding(num_movies, movie_embed_dim)
        self.fc1 = nn.Linear(tfidf_dim, 128)
        self.fc2 = nn.Linear(128, 32)
        self.fc3 = nn.Linear(user_embed_dim + movie_embed_dim + 32, 512)
        self.dropout = nn.Dropout(0.2)
        self.fc4 = nn.Linear(512, 1)

    def forward(self, user_ids, movie_ids, tfidf_vectors):
        user_embed = self.user_embedding(user_ids)
        movie_embed = self.movie_embedding(movie_ids)
        x = torch.relu(self.fc1(tfidf_vectors))
        x = torch.relu(self.fc2(x))
        x = torch.cat((user_embed, movie_embed, x), dim=-1)
        x = torch.relu(self.fc3(x))
        x = self.dropout(x)
        out = self.fc4(x)
        return out.squeeze()

# Training and testing
def train(model, dataloader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    for batch in dataloader:
        user_ids, movie_ids, tfidf_vectors, ratings = batch
        user_ids = user_ids.to(device)
        movie_ids = movie_ids.to(device)
        tfidf_vectors = tfidf_vectors.to(device)
        ratings = ratings.to(device)

        optimizer.zero_grad()
        predictions = model(user_ids, movie_ids, tfidf_vectors)
        loss = criterion(predictions, ratings)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(dataloader)

def test(model, dataloader, device):
    model.eval()
    y_pred = []
    y_true = []

    with torch.no_grad():
        for batch in dataloader:
            user_ids, movie_ids, tfidf_vectors, ratings = batch
            user_ids = user_ids.to(device)
            movie_ids = movie_ids.to(device)
            tfidf_vectors = tfidf_vectors.to(device)

            predictions = model(user_ids, movie_ids, tfidf_vectors)
            y_pred.extend(predictions.tolist())
            y_true.extend(ratings.tolist())
    
    return y_pred, y_true

# Convert sparse matrix to a PyTorch sparse tensor
def sparse_to_torch_sparse(data):
    values = data.data
    indices = np.vstack((data.nonzero()[0], data.nonzero()[1]))

    i = torch.LongTensor(indices)
    v = torch.FloatTensor(values)
    shape = data.shape

    return torch.sparse.FloatTensor(i, v, torch.Size(shape))

# Set device
device = xm.xla_device()

# Hyperparameters
user_embed_dim = 10
movie_embed_dim = 10
tfidf_dim = train_tfidf.shape[1]
num_users = len(user_id_mapping)
num_movies = len(movie_id_mapping)
epochs = 10
batch_size = 8

# Create datasets and dataloaders
train_dataset = HybridDataset(df_hybrid_train['User'].values, df_hybrid_train['Movie'].values, sparse_to_torch_sparse(train_tfidf), df_hybrid_train['Rating'].values)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataset = HybridDataset(df_hybrid_test['User'].values, df_hybrid_test['Movie'].values, sparse_to_torch_sparse(test_tfidf), df_hybrid_test['Rating'].values)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Initialize model, criterion, and optimizer
model = HybridModel(num_users, num_movies, user_embed_dim, movie_embed_dim, tfidf_dim).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters())

# Train the model
for epoch in range(epochs):
    train_loss = train(model, train_dataloader, criterion, optimizer, device)
    y_pred, y_true = test(model, test_dataloader, device)
    rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
    print(f"Epoch {epoch+1}/{epochs}, Loss: {train_loss:.4f}, RMSE: {rmse:.4f}")

    xm.rendezvous('sync_epoch')
    xm.mark_step()

# Test the model
y_pred, y_true = test(model, test_dataloader, device)
rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print(f'\nTesting Result With PyTorch Hybrid Deep Learning: {rmse:.4f} RMSE')

ModuleNotFoundError: ignored

##### Utils

In [None]:
import torch

# Create a dense PyTorch tensor
dense_tensor = torch.from_numpy(train_tfidf.toarray())

# Calculate the number of non-zero elements in the tensor
num_nonzero = torch.nonzero(dense_tensor).size(0)

# Calculate the total number of elements in the tensor
total_elements = dense_tensor.numel()

# Calculate the sparsity ratio
sparsity_ratio = 1.0 - (num_nonzero / total_elements)

# Print the sparsity ratio
print("Sparsity ratio: {:.2f}%".format(sparsity_ratio * 100))

Sparsity ratio: 99.91%
