In [None]:
from pathlib import Path
import sys
import matplotlib.pyplot as plt

In [None]:
# Locate the project root relative to the kernel's working directory so the
# project-local `src` package can be imported by later cells.
# NOTE(review): assumes the kernel is started in the notebook's own directory,
# one level below the project root -- confirm for your setup.
NB_DIR = Path.cwd()
PROJ_ROOT = NB_DIR.parent
# Register the path only once: re-running this cell must not keep appending
# duplicate entries to sys.path.
if str(PROJ_ROOT) not in sys.path:
    sys.path.append(str(PROJ_ROOT))

In [None]:
# `train` lives in the project-local src.movie_train module; the import works
# because PROJ_ROOT was appended to sys.path in an earlier cell.
from src.movie_train import train
# The two returned sequences are plotted below against an 'Epochs' axis as
# training and validation loss, and their last entries are printed.
# NOTE(review): this presumably runs the full training loop and may be slow --
# confirm before doing a Restart & Run All.
train_history, val_history = train()

In [None]:
import torch

In [None]:
# Learning curve: training vs. validation loss, one marker per history entry.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(train_history, label='Training Loss (MSE)', marker='o')
ax.plot(val_history, label='Validation Loss (MSE)', marker='x')
ax.set_title('Learning Curve')
ax.set_xlabel('Epochs')
ax.set_ylabel('Mean Squared Error (MSE)')
ax.legend()
plt.show()

In [None]:
# Report the final entry of each loss history, training first.
for label, history in (
    ("Last Training Loss (MSE):", train_history),
    ("Last Validation Loss (MSE):", val_history),
):
    print(label, history[-1])

### Experiment 1
##### Baseline model trained on 1M rows of the dataset with the following hyperparameters:
EPOCHS = 20 <br>
BATCH_SIZE = 512 <br>
LR = 0.001 <br>
Decay = 1e-4 <br>
Dropout = 0.2 <br>
Embedding Dimensions = 50

##### This yielded a loss of: 
Last Training Loss (MSE): 0.6919749808936873 <br>
Last Validation Loss (MSE): 0.7188025415705903

### Experiment 2
##### Baseline model trained on 25M rows with the same hyperparameters as Experiment 1:
EPOCHS = 20 <br>
BATCH_SIZE = 512 <br>
LR = 0.001 <br>
Decay = 1e-4 <br>
Dropout = 0.2 <br>
Embedding Dimensions = 50

##### This yielded a loss of:
Last Training Loss (MSE): 0.8682332674888099 <br>
Last Validation Loss (MSE): 0.8681166282130782

The training and validation losses are nearly identical (0.8682 vs. 0.8681), so the model is clearly not overfitting. We can therefore reduce the regularization (Dropout and Decay) and try to lower the loss by increasing the embedding dimensions from 50 to 128.

### Experiment 3
##### Updated model trained on 25M rows with the following hyperparameters:
EPOCHS = 20 <br>
BATCH_SIZE = 512 <br>
LR = 0.001 <br>
Decay = 1e-5 <br>
Dropout = 0.05 <br>
Embedding Dimensions = 128

##### This yielded a loss of:
Last Training Loss (MSE): 0.7058186896250819 <br>
Last Validation Loss (MSE): 0.7072422166974628

In [None]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import numpy as np
import torch
from src.movie_model import MatrixFactorization
from src.movie_dataset import MovieRatingsDataset

In [None]:
# Instantiate the project's ratings dataset with its default arguments. Later
# cells read its `movie_user_encoder` and `movie_encoder` attributes to size
# the model and to translate between raw and encoded movie ids.
# NOTE(review): presumably this loads and encodes the raw ratings data and may
# be slow -- confirm in src/movie_dataset.py.
dataset = MovieRatingsDataset()

In [None]:
# Size the embedding tables from the number of classes each encoder knows.
user_classes = dataset.movie_user_encoder.classes_
item_classes = dataset.movie_encoder.classes_
num_users = len(user_classes)
num_items = len(item_classes)

In [None]:
# Load the saved training checkpoint. map_location="cpu" remaps tensors that
# were saved from a GPU onto the CPU, so the notebook also runs on a machine
# without CUDA (the model is moved to the CPU for inference anyway).
checkpoint_path = PROJ_ROOT / "models" / "movie_matrix_factorization_checkpoint.pth"
checkpoint = torch.load(checkpoint_path, map_location="cpu")

In [None]:
# Rebuild the network with the user/item counts taken from the dataset, then
# restore the trained weights stored under 'model_state_dict'.
model = MatrixFactorization(num_users, num_items)
trained_weights = checkpoint['model_state_dict']
model.load_state_dict(trained_weights)
# Inference only: keep the model on the CPU and switch to evaluation mode.
# NOTE(review): chaining assumes MatrixFactorization is a torch.nn.Module,
# whose .to() returns the module itself.
model.to('cpu').eval()

In [None]:
# Export the learned item-embedding table as a NumPy array; later cells index
# it by encoded movie id. detach() is the supported replacement for the legacy
# `.data` attribute, and cpu() keeps the conversion valid even if the model is
# ever left on an accelerator.
movie_matrix = model.item_embedding.weight.detach().cpu().numpy()

In [None]:
# Exact (brute-force) cosine-distance neighbour search over the movie
# embeddings; each query returns 10 rows of movie_matrix.
knn = NearestNeighbors(
    algorithm='brute',
    metric='cosine',
    n_neighbors=10,
)
knn.fit(movie_matrix)

In [None]:
# Movie metadata; the cells below use its 'movieId' and 'title' columns.
movies_csv_path = PROJ_ROOT / "data" / "raw" / "Movies.csv"
movie_df = pd.read_csv(movies_csv_path)

In [None]:
# Raw movieId (as it appears in Movies.csv) to find similar movies for.
query_movie_id = 858
# Title of the first row whose movieId matches the query.
is_query_movie = movie_df['movieId'] == query_movie_id
title = movie_df.loc[is_query_movie, 'title'].values[0]

In [None]:
# Translate the raw movieId into the encoder's index space; that index is
# used below to select the movie's row in movie_matrix.
encoded_ids = dataset.movie_encoder.transform([query_movie_id])
encoded_id = encoded_ids[0]

In [None]:
# Ask the fitted KNN index for the nearest neighbours of the query movie's
# embedding. Both results have one row per query (a single row here).
distances, indices = knn.kneighbors([movie_matrix[encoded_id]])

# Drop the query movie by id rather than by position: with tied distances
# (e.g. duplicate embeddings) the query is not guaranteed to be returned
# first. The slice keeps the list length at "neighbours requested minus one"
# even in the unusual case where the query is missing from the results.
returned_encoded_ids = indices[0]
neighbor_encoded_ids = returned_encoded_ids[returned_encoded_ids != encoded_id]
neighbor_encoded_ids = neighbor_encoded_ids[:len(returned_encoded_ids) - 1]

# Decode all neighbours back to raw movieIds in one call instead of one
# inverse_transform call per neighbour.
neighbor_movie_ids = dataset.movie_encoder.inverse_transform(neighbor_encoded_ids)

print(f"Movies similar to '{title}':")
for rank, similar_movie_id in enumerate(neighbor_movie_ids, start=1):
    # Title of the first Movies.csv row with this movieId.
    similar_movie_title = movie_df.loc[movie_df['movieId'] == similar_movie_id, 'title'].values[0]
    print(f"{rank}. {similar_movie_title} (Movie ID: {similar_movie_id})")