In [1]:
# Data manipulation
import numpy as np
import pandas as pd
pd.options.display.max_rows = 100

# Modeling
# from matrix_factorization import BaselineModel, KernelMF, train_update_test_split
# from sklearn.metrics import mean_squared_error
# from sklearn.model_selection import train_test_split

import pickle

# Other
import os
import random
import sys

from letterboxdpy import user
from letterboxdpy import movie

import sqlite3
import pandas as pd

from surprise.model_selection import train_test_split
from surprise import SVD
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate
from surprise import Dataset
from surprise import Reader
from surprise import accuracy

import sklearn

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Get the data (database shit - oud)

In [None]:
%%time

user1 = "sverlaan"
user1_inst = user.User(user1)
wl1 = {movie['slug'] for movie in user1_inst.get_watchlist()['data'].values()}
get_movies1 = user1_inst.get_films()
allmovies1 = [movie for movie in get_movies1['movies'].keys()]

user2 = "liannehr"
user2_inst = user.User(user2)
wl2 = {movie['slug'] for movie in user2_inst.get_watchlist()['data'].values()}
get_movies2 = user2_inst.get_films()
allmovies2 = [movie for movie in get_movies2['movies'].keys()]

allmovies = list(wl2.union(wl1).union(allmovies2).union(allmovies1))

In [None]:
len(allmovies)

In [None]:
# Path to your SQLite database file
db_path = "bootstrap_project/ratingsdb/ratings.db"  # Change this to your actual file path
# List of movie IDs you want to filter
movie_ids = allmovies

# Connect to the database
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

In [None]:
%%time
# Create a dynamic SQL query using placeholders
placeholders = ", ".join(["?"] * len(movie_ids))  # Creates "?, ?, ?" for each movie_id
query = f"SELECT * FROM ratings WHERE movie_id IN ({placeholders})"

# Execute query with movie_ids as parameters
cursor.execute(query, movie_ids)

# Fetch all matching rows
results = cursor.fetchall()

# Close the connection when done
conn.close()

In [None]:
%%time
# Define column names
columns = ["userId", "movieId", "rating"]

# Create DataFrame
df = pd.DataFrame(results, columns=columns)

# Print DataFrame
df.head()

In [None]:
unique_counts = df.nunique()
print(unique_counts)


# Get the data (database shit - oud)

# Setup data

In [25]:
df = pd.read_csv("data/ratings.csv", dtype={
    'user_name': 'string',
    'film_id': 'string',
    'rating': 'float64'
})

df = df.rename(columns={'user_name': 'userId', 'film_id': 'movieId'})
df

Unnamed: 0,userId,movieId,rating
0,kurstboy,spider-man-2,4.5
1,kurstboy,tetsuo-the-iron-man,4.0
2,kurstboy,bram-stokers-dracula,4.5
3,kurstboy,poison-2023,4.5
4,kurstboy,the-rat-catcher-2023,3.5
...,...,...,...
18175540,beef,parasite-2019,4.0
18175541,beef,the-paper,2.5
18175542,beef,the-farewell-2019,3.5
18175543,beef,ready-or-not-2019,4.0


In [None]:
df[df["movieId"] == "pickpocket"]

In [26]:
df = df[df.groupby("movieId")["movieId"].transform("count") >= 1000]   # only movies that more than 1000 users have seen
df = df[df.groupby("userId")["userId"].transform("count") >= 10]       # only users with more than 10 watched movies
print("Num users:", len(df["userId"].unique()))
print("Num movies:", len(df["movieId"].unique()))

#df.to_csv('data/ratings_filtered.csv', index=False)

df

Num users: 10989
Num movies: 4111


Unnamed: 0,userId,movieId,rating
0,kurstboy,spider-man-2,4.5
1,kurstboy,tetsuo-the-iron-man,4.0
2,kurstboy,bram-stokers-dracula,4.5
3,kurstboy,poison-2023,4.5
4,kurstboy,the-rat-catcher-2023,3.5
...,...,...,...
18175532,beef,hustlers-2019,3.0
18175538,beef,a-matter-of-life-and-death,4.0
18175540,beef,parasite-2019,4.0
18175542,beef,the-farewell-2019,3.5


In [27]:
# For faster model training
random_users = random_users = df['userId'].drop_duplicates().sample(n=5000, replace=False)
df = df[df['userId'].isin(random_users)]

In [28]:
%%time
username1 = "liannehr"
user_inst1 = user.User(username1)
watchlist1 = {movie['slug'] for movie in user_inst1.get_watchlist()['data'].values()}
ratings1 = user_inst1.get_films()

username2 = "sverlaan"
user_inst2 = user.User(username2)
watchlist2 = {movie['slug'] for movie in user_inst2.get_watchlist()['data'].values()}
ratings2 = user_inst2.get_films()

username3 = "flrz"
user_inst3 = user.User(username3)
watchlist3 = {movie['slug'] for movie in user_inst3.get_watchlist()['data'].values()}
ratings3 = user_inst3.get_films()

CPU times: user 3.36 s, sys: 133 ms, total: 3.49 s
Wall time: 31.5 s


In [29]:
user_rows1 = [(username1, slug, info['rating'] / 2.0) for slug, info in ratings1['movies'].items() if info['rating'] is not None]
user_rows2 = [(username2, slug, info['rating'] / 2.0) for slug, info in ratings2['movies'].items() if info['rating'] is not None]
user_rows3 = [(username3, slug, info['rating'] / 2.0) for slug, info in ratings3['movies'].items() if info['rating'] is not None]
combined_rows = user_rows1+user_rows2+user_rows3

In [None]:
# Add high star ratings multiple times?
high_star_rows = [(u, m, r) for (u, m, r) in user_rows1 if r >= 4.5] * 3
combined_rows = user_rows1+high_star_rows

In [30]:
new_df = pd.DataFrame(combined_rows, columns=df.columns)
df = pd.concat([df, new_df], ignore_index=True)
df.tail()

Unnamed: 0,userId,movieId,rating
4429751,flrz,the-cabinet-of-dr-caligari-1920,4.0
4429752,flrz,utrecht,4.0
4429753,flrz,the-great-train-robbery,2.0
4429754,flrz,a-trip-to-the-moon,4.5
4429755,flrz,the-arrival-of-a-train-at-la-ciotat,2.5


In [31]:
# Encode user_id and item_id
df["userId"], user_mapping = pd.factorize(df["userId"])
df["movieId"], item_mapping = pd.factorize(df["movieId"])

# Function to get user ID from username
def get_user_id(username):
    try:
        return user_mapping.tolist().index(username)
    except:
        return None

def get_user_name(id):
    try:
        return user_mapping[id]
    except:
        return None

def get_movie_id(slug):
    try:
        return item_mapping.tolist().index(slug)
    except:
        return None
        
def get_movie_name(id):
    try:
        return item_mapping[id]
    except:
        return None

In [24]:
print(get_user_id("liannehr"))
print(get_user_name(11033))

print(get_movie_id("metropolis"))
print(get_movie_name(537))

5000
None
5218
2322


In [32]:
# Get minimum and maximum rating from the dataset
min_rating = df.rating.min()
max_rating = df.rating.max()

# Find best params

In [None]:
%%time
 
svd = SVD(n_epochs=30, n_factors=100)
results = cross_validate(svd, data, measures=['mse', 'rmse'], cv=3, verbose=True)

In [None]:
%%time

param_grid = {
  'n_factors': [100, 150],
  'n_epochs': [20, 35]
}
 
gs = GridSearchCV(SVD, param_grid, measures=['mse', 'rmse'], cv=5)
gs.fit(data)
 
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])
print(gs.best_score['mse'])
print(gs.best_params['mse'])

# Train

In [33]:
%%time

reader = Reader(rating_scale=(min_rating, max_rating))
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)

trainset, testset = train_test_split(data, test_size=.1)
 
svd = SVD(n_factors=50, n_epochs=10, verbose=True)
svd.fit(trainset)

predictions = svd.test(testset)
rmse = accuracy.rmse(predictions)
print(f"Test RMSE: {rmse}")

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
RMSE: 0.6860
Test RMSE: 0.6859560145303532
CPU times: user 15 s, sys: 122 ms, total: 15.1 s
Wall time: 15.1 s


# Recommend

In [10]:
watchlist1_ids = [get_movie_id(slug) for slug in watchlist1 if get_movie_id(slug) is not None]
watchlist2_ids = [get_movie_id(slug) for slug in watchlist2 if get_movie_id(slug) is not None]
common_watchlist = list(set(watchlist1_ids).intersection(set(watchlist2_ids)))
union_watchlist = list(set(watchlist1_ids).union(set(watchlist2_ids)))

In [34]:
def generate_recommendation(model, user_id, ratings_df, n_items):
    movie_ids = ratings_df["movieId"].unique()
    movie_ids_user = ratings_df.loc[ratings_df["userId"] == user_id, "movieId"]
    movie_ids_to_pred = np.setdiff1d(movie_ids, movie_ids_user)
     
    # Apply a rating of 4 to all interactions (only to match the Surprise dataset format)
    test_set = [[user_id, movie_id, 4] for movie_id in movie_ids_to_pred]
    
    predictions = model.test(test_set)
    
    pred_ratings = np.array([pred.est for pred in predictions])
    index_max = (-pred_ratings).argsort()[:n_items]

    print("Top {0} item recommendations for user {1} ({2}):".format(n_items, get_user_name(user_id), user_id))
    for i in index_max:
       movie_id = movie_ids_to_pred[i]
       print(f"{get_movie_name(movie_id):<50} {round(pred_ratings[i], 3):>6}")
 
 
userID = get_user_id("liannehr")
n_items = 100

generate_recommendation(svd, userID, df, n_items)

Top 100 item recommendations for user liannehr (5000):
black-swan                                           4.56
over-the-garden-wall-2014                           4.546
the-lord-of-the-rings-the-return-of-the-king        4.545
portrait-of-a-lady-on-fire                          4.543
12-angry-men                                        4.531
big-little-lies                                     4.514
chernobyl                                           4.512
oj-made-in-america                                  4.505
the-empire-strikes-back                               4.5
the-dark-knight                                     4.496
come-and-see                                        4.495
schindlers-list                                     4.494
twin-peaks-the-return                                4.46
the-lord-of-the-rings-the-two-towers                4.458
twin-peaks                                          4.455
the-handmaiden                                      4.449
moonlight-2016   

In [35]:
userID = get_user_id("sverlaan")
n_items = 100

generate_recommendation(svd, userID, df, n_items)

Top 100 item recommendations for user sverlaan (5001):
over-the-garden-wall-2014                           4.477
big-little-lies                                     4.446
twin-peaks-the-return                               4.437
singin-in-the-rain                                   4.41
a-separation                                        4.395
mommy-2014                                          4.394
oj-made-in-america                                   4.37
night-and-fog                                       4.366
twin-peaks                                          4.365
sharp-objects                                       4.365
secrets-lies                                        4.354
do-the-right-thing                                  4.344
stop-making-sense                                   4.342
normal-people-2020                                  4.329
woman-in-the-dunes                                  4.324
autumn-sonata                                       4.317
seven-samurai    

In [None]:
# TODO: maak de similarity hier beneden onderdeel van de specifieke user (bijv. top-3 meest similar met je 5/4.5/4 ster gerated films)




def get_similar_movies(movie_id, model, trainset, top_n=5):
    movie_id_mapping = {inner_id: trainset.to_raw_iid(inner_id) for inner_id in trainset.all_items()}
    
    if movie_id not in movie_id_mapping.values():
        return "Movie ID not in training set"

    inner_id = trainset.to_inner_iid(movie_id)

    movie_embeddings = model.qi
    
    similarities = cosine_similarity([movie_embeddings[inner_id]], movie_embeddings)[0]
    similar_indices = similarities.argsort()[::-1][1:top_n+1]
    
    similar_movie_ids = [movie_id_mapping[idx] for idx in similar_indices]

    return similar_movie_ids


movie_id = get_movie_id("dead-man")

similar_movies = get_similar_movies(movie_id, svd, trainset, 5)
print(f"Movies similar to {get_movie_name(movie_id)}")
for movie in similar_movies:
    print(get_movie_name(movie))

# store and load model

In [11]:
# Save the model to a file
with open('model/new_model.pkl', 'wb') as file:
    pickle.dump(svd, file)

with open('model/new_trainset.pkl', 'wb') as file:
    pickle.dump(trainset, file)

with open('model/new_item_mapping.pkl', 'wb') as file:
    pickle.dump(item_mapping, file)

with open('model/new_user_mapping.pkl', 'wb') as file:
    pickle.dump(user_mapping, file)

with open('model/new_ratingsdf.pkl', 'wb') as file:
    pickle.dump(df, file)

In [None]:
userID = get_user_id("sverlaan")
n_items = 10

# You need the loaded model and df with ratings to get new recommendations. df should contain the user for which to make a prediction
# Also need the user and movie ID mapping
# all this can be computed locally when personalized recommendations is requested
generate_recommendation(loaded_model, userID, df, n_items)

In [2]:
with open('model/pre_model.pkl', 'rb') as file:
    pre_model = pickle.load(file)

with open('model/pre_trainset.pkl', 'rb') as file:
    trainset = pickle.load(file)

with open('model/pre_item_mapping.pkl', 'rb') as file:
    item_mapping = pickle.load(file)

In [3]:
def get_movie_id(slug):
    try:
        return item_mapping.tolist().index(slug)
    except:
        return None
        
def get_movie_name(id):
    try:
        return item_mapping[id]
    except:
        return None


def get_similar_movies(movie_id, model, trainset, top_n=5):
    movie_id_mapping = {inner_id: trainset.to_raw_iid(inner_id) for inner_id in trainset.all_items()}
    
    if movie_id not in movie_id_mapping.values():
        return "Movie ID not in training set"

    inner_id = trainset.to_inner_iid(movie_id)

    movie_embeddings = model.qi
    
    similarities = cosine_similarity([movie_embeddings[inner_id]], movie_embeddings)[0]
    similar_indices = similarities.argsort()[::-1][1:top_n+1]
    
    similar_movie_ids = [movie_id_mapping[idx] for idx in similar_indices]

    return similar_movie_ids

In [4]:
%%time
movie_id = get_movie_id("minari")

similar_movies = get_similar_movies(movie_id, pre_model, trainset, 5)
print(f"Movies similar to {get_movie_name(movie_id)}")
for movie in similar_movies:
    print(get_movie_name(movie))

Movies similar to minari
sound-of-metal
the-farewell-2019
judas-and-the-black-messiah
moonlight-2016
cmon-cmon
CPU times: user 41.6 ms, sys: 17.1 ms, total: 58.7 ms
Wall time: 21.2 ms


In [None]:
from matrix_factorization import BaselineModel, KernelMF, train_update_test_split

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

X = movie_data[["user_id", "item_id"]]
y = movie_data["rating"]


#Prepare data for online learning
(
    X_train_initial,
    y_train_initial,
    X_train_update,
    y_train_update,
    X_test_update,
    y_test_update,
) = train_update_test_split(movie_data, frac_new_users=0.2)

# Prepare data
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# Initial training
matrix_fact = KernelMF(n_epochs=20, n_factors=100, verbose=1, lr=0.001, reg=0.005)
matrix_fact.fit(X_train_initial, y_train_initial)

In [None]:
baseline_model = BaselineModel(method='sgd', n_epochs = 20, reg = 0.005, lr = 0.01, verbose=1)
baseline_model.fit(X_train.to_numpy(), y_train)

pred = baseline_model.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared = False)

print(f'\nTest RMSE: {rmse:.4f}')

In [None]:
# Initial training
matrix_fact = KernelMF(n_epochs=20, n_factors=100, verbose=1, lr=0.001, reg=0.005)
matrix_fact.fit(X_train_initial, y_train_initial)
