In [3]:
# Data manipulation
import numpy as np
import pandas as pd
pd.options.display.max_rows = 100

# Modeling
# from matrix_factorization import BaselineModel, KernelMF, train_update_test_split
# from sklearn.metrics import mean_squared_error
# from sklearn.model_selection import train_test_split

import pickle

# Other
import os
import random
import sys

from letterboxdpy import user
from letterboxdpy import movie

import sqlite3
import pandas as pd

from surprise.model_selection import train_test_split
from surprise import SVD
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate
from surprise import Dataset
from surprise import Reader
from surprise import accuracy

import sklearn

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Get the data (old database approach)

In [None]:
%%time

# First user: slugs on the watchlist plus the keys of the get_films() 'movies' payload.
user1 = "sverlaan"
user1_inst = user.User(user1)
wl1 = set(entry['slug'] for entry in user1_inst.get_watchlist()['data'].values())
get_movies1 = user1_inst.get_films()
allmovies1 = list(get_movies1['movies'].keys())

# Second user: the same two lookups.
user2 = "liannehr"
user2_inst = user.User(user2)
wl2 = set(entry['slug'] for entry in user2_inst.get_watchlist()['data'].values())
get_movies2 = user2_inst.get_films()
allmovies2 = list(get_movies2['movies'].keys())

# De-duplicated pool of identifiers across both users (watchlists and films).
allmovies = list(wl2.union(wl1).union(allmovies2).union(allmovies1))

In [None]:
# Size of the combined, de-duplicated pool built from both users
len(allmovies)

In [None]:
# Location of the SQLite ratings database (relative to the working directory)
db_path = "bootstrap_project/ratingsdb/ratings.db"  # Change this to your actual file path
# Movie identifiers to filter on: the pool collected from both users
movie_ids = allmovies

# Open the database; the connection stays open across cells until conn.close() is called
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

In [None]:
%%time
# SQLite caps the number of bound "?" parameters per statement
# (SQLITE_MAX_VARIABLE_NUMBER, 999 by default before SQLite 3.32), so the IN (...)
# filter is issued in batches instead of one statement with len(movie_ids) placeholders.
SQL_BATCH_SIZE = 900

results = []
try:
    for batch_start in range(0, len(movie_ids), SQL_BATCH_SIZE):
        batch = movie_ids[batch_start:batch_start + SQL_BATCH_SIZE]
        # One "?" per id keeps the values parameterized rather than interpolated
        placeholders = ", ".join(["?"] * len(batch))
        query = f"SELECT * FROM ratings WHERE movie_id IN ({placeholders})"
        cursor.execute(query, batch)
        # Collect the matching rows of every batch into one list
        results.extend(cursor.fetchall())
finally:
    # Release the connection even when a query fails
    conn.close()

In [None]:
%%time
# Column labels for the rows fetched from the ratings table
# NOTE(review): assumes SELECT * yields exactly (user, movie, rating) in this order — confirm against the schema
columns = ["userId", "movieId", "rating"]

# Create DataFrame
df = pd.DataFrame(results, columns=columns)

# Show the first rows as the cell output
df.head()

In [None]:
# Number of distinct values in each column
unique_counts = df.nunique()
print(unique_counts)


# Get the data (old database approach)

# Setup data

In [None]:
# Read the ratings export with explicit dtypes and switch straight to the
# userId / movieId column names used by the rest of the notebook.
df = pd.read_csv(
    "data/ratings.csv",
    dtype={'user_name': 'string', 'film_id': 'string', 'rating': 'float64'},
).rename(columns={'user_name': 'userId', 'film_id': 'movieId'})
df

In [None]:
# Spot check: every rating row whose movieId is "pickpocket"
df[df["movieId"] == "pickpocket"]

In [None]:
# Sparsity filter in two passes. Both thresholds are inclusive (>=).
# The user pass runs on the already movie-filtered frame and the movie filter is not
# re-applied afterwards, so a movie can end up with fewer than 1000 rows.
df = df[df.groupby("movieId")["movieId"].transform("count") >= 1000]   # keep movies with at least 1000 rating rows
df = df[df.groupby("userId")["userId"].transform("count") >= 10]       # keep users with at least 10 remaining rating rows
print("Num users:", len(df["userId"].unique()))
print("Num movies:", len(df["movieId"].unique()))

# Optional export of the filtered frame (disabled)
#df.to_csv('data/ratings_filtered.csv', index=False)

df

In [None]:
# For faster model training: keep only a random subset of users.
MAX_SAMPLED_USERS = 5000
unique_users = df['userId'].drop_duplicates()
# Series.sample raises ValueError when n exceeds the population (with replace=False),
# so cap n at the number of users actually available.
random_users = unique_users.sample(
    n=min(MAX_SAMPLED_USERS, len(unique_users)),
    replace=False,
    random_state=42,  # fixed seed so the subset, and everything trained on it, is reproducible
)
df = df[df['userId'].isin(random_users)]

In [None]:
%%time
# First user: set of watchlist slugs and the raw get_films() payload
username1 = "liannehr"
user_inst1 = user.User(username1)
watchlist1 = set(entry['slug'] for entry in user_inst1.get_watchlist()['data'].values())
ratings1 = user_inst1.get_films()

# Second user: same lookups
username2 = "sverlaan"
user_inst2 = user.User(username2)
watchlist2 = set(entry['slug'] for entry in user_inst2.get_watchlist()['data'].values())
ratings2 = user_inst2.get_films()

In [None]:
# One (user, slug, rating) tuple per rated film; entries without a rating are skipped
# and the stored rating is halved.
user_rows1 = [
    (username1, film_slug, details['rating'] / 2.0)
    for film_slug, details in ratings1['movies'].items()
    if details['rating'] is not None
]
user_rows2 = [
    (username2, film_slug, details['rating'] / 2.0)
    for film_slug, details in ratings2['movies'].items()
    if details['rating'] is not None
]
combined_rows = user_rows1 + user_rows2

In [None]:
# Up-weight the first user's favourites: every rating >= 4.5 is appended three extra times.
high_star_rows = [row for row in user_rows1 if row[2] >= 4.5] * 3
# Keep the second user's rows too. Rebuilding the list from user_rows1 alone would drop
# them, and that user would then have no ratings in the training frame.
combined_rows = user_rows1 + user_rows2 + high_star_rows

In [None]:
# Append the scraped rows to the ratings frame.
# The 3-tuples are labelled with df's own column names, so this relies on df having
# exactly three columns in (user, movie, rating) order at this point.
# Re-running this cell appends the same rows again.
new_df = pd.DataFrame(combined_rows, columns=df.columns)
df = pd.concat([df, new_df], ignore_index=True)
df.tail()

In [None]:
# Replace the userId / movieId values with integer codes.
# pd.factorize returns (codes, uniques): the codes overwrite the column, and the uniques
# (user_mapping / item_mapping) give the original value for a code by position.
# Not idempotent: re-running this cell factorizes the integer codes and loses the original values.
df["userId"], user_mapping = pd.factorize(df["userId"])
df["movieId"], item_mapping = pd.factorize(df["movieId"])

def get_user_id(username):
    """Return the integer code of ``username`` in the module-level ``user_mapping``.

    Args:
        username: user name to look up.

    Returns:
        The position of ``username`` in ``user_mapping``, or ``None`` when the
        name is not present.
    """
    try:
        return user_mapping.tolist().index(username)
    except ValueError:
        # list.index raises ValueError for a missing value. Anything else
        # (e.g. user_mapping not defined yet) is a real error and should surface.
        return None

def get_user_name(id):
    """Return the user name stored at position ``id`` of the module-level ``user_mapping``.

    Args:
        id: integer code of the user.

    Returns:
        The mapped user name, or ``None`` when ``id`` is ``None``, out of range,
        or not usable as an index.
    """
    if id is None:
        # A failed get_user_id lookup yields None; pass that through explicitly
        return None
    try:
        return user_mapping[id]
    except (IndexError, TypeError):
        # Out-of-range or non-integer key; other exceptions are left to propagate
        return None

def get_movie_id(slug):
    """Return the integer code of ``slug`` in the module-level ``item_mapping``.

    Args:
        slug: movie slug to look up.

    Returns:
        The position of ``slug`` in ``item_mapping``, or ``None`` when the slug
        is not present.
    """
    try:
        return item_mapping.tolist().index(slug)
    except ValueError:
        # list.index raises ValueError for a missing value. Anything else
        # (e.g. item_mapping not defined yet) is a real error and should surface.
        return None
        
def get_movie_name(id):
    """Return the movie slug stored at position ``id`` of the module-level ``item_mapping``.

    Args:
        id: integer code of the movie.

    Returns:
        The mapped slug, or ``None`` when ``id`` is ``None``, out of range,
        or not usable as an index.
    """
    if id is None:
        # A failed get_movie_id lookup yields None; pass that through explicitly
        return None
    try:
        return item_mapping[id]
    except (IndexError, TypeError):
        # Out-of-range or non-integer key; other exceptions are left to propagate
        return None

In [None]:
# Sanity checks for the lookup helpers (name -> code and code -> name).
# NOTE(review): 11033 and 537 are hard-coded codes; which user / movie they denote depends on
# the factorize order of the current df, so the printed names change when the data changes.
print(get_user_id("liannehr"))
print(get_user_name(11033))

print(get_movie_id("metropolis"))
print(get_movie_name(537))

In [None]:
# Smallest and largest rating present in df; passed to the Surprise Reader as rating_scale
min_rating = df.rating.min()
max_rating = df.rating.max()

# Find best params

In [None]:
%%time
 
# Build the Surprise dataset in this cell: `data` is otherwise only created in the
# Train section further down, so a fresh top-to-bottom run fails here with NameError.
reader = Reader(rating_scale=(min_rating, max_rating))
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)

# 3-fold cross-validation of a single SVD configuration, reporting MSE and RMSE
svd = SVD(n_epochs=30, n_factors=100)
results = cross_validate(svd, data, measures=['mse', 'rmse'], cv=3, verbose=True)

In [None]:
%%time

# 2 x 2 hyper-parameter grid for SVD, each combination scored with 5-fold CV
param_grid = {'n_factors': [100, 150], 'n_epochs': [20, 35]}

gs = GridSearchCV(SVD, param_grid, measures=['mse', 'rmse'], cv=5)
gs.fit(data)

# Best score followed by the parameters that produced it: first RMSE, then MSE
for metric in ('rmse', 'mse'):
    print(gs.best_score[metric])
    print(gs.best_params[metric])

# Train

In [None]:
%%time

# The train/test split and the SVD factor initialisation are both random;
# pin them so the reported RMSE and the saved model are repeatable.
SEED = 42

reader = Reader(rating_scale=(min_rating, max_rating))
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)

# Hold out 10% of the ratings for evaluation
trainset, testset = train_test_split(data, test_size=.1, random_state=SEED)

svd = SVD(n_factors=100, n_epochs=30, random_state=SEED, verbose=True)
svd.fit(trainset)

predictions = svd.test(testset)
# accuracy.rmse prints the value itself by default; silence it so the score is reported once
rmse = accuracy.rmse(predictions, verbose=False)
print(f"Test RMSE: {rmse}")

# Recommend

In [None]:
# Translate watchlist slugs into encoded movie ids, dropping slugs the mapping does not know.
# Each slug is resolved once; the previous form called get_movie_id twice per slug.
watchlist1_ids = [mid for mid in map(get_movie_id, watchlist1) if mid is not None]
watchlist2_ids = [mid for mid in map(get_movie_id, watchlist2) if mid is not None]
# Movies on both watchlists, and movies on at least one of them
common_watchlist = list(set(watchlist1_ids).intersection(set(watchlist2_ids)))
union_watchlist = list(set(watchlist1_ids).union(set(watchlist2_ids)))

In [None]:
def generate_recommendation(model, user_id, ratings_df, n_items):
    """Print the ``n_items`` highest predicted ratings among movies the user has no row for.

    Args:
        model: fitted model whose ``test(rows)`` returns objects with an ``est``
            attribute (e.g. a Surprise ``SVD``).
        user_id: encoded user id as stored in ``ratings_df["userId"]``.
        ratings_df: DataFrame with ``userId`` and ``movieId`` columns.
        n_items: maximum number of recommendations to print.

    Returns:
        None. The ranking is printed.

    Raises:
        ValueError: if ``user_id`` is ``None`` (the result of a failed
            ``get_user_id`` lookup). Without this check the model would be
            queried for an unknown user and the output would look like a
            personalised ranking although it is not.
    """
    if user_id is None:
        raise ValueError(
            "user_id is None: the user is not present in the ratings data, "
            "so no personalised recommendations can be generated"
        )

    movie_ids = ratings_df["movieId"].unique()
    movie_ids_user = ratings_df.loc[ratings_df["userId"] == user_id, "movieId"]
    # Candidates: every movie in the frame minus the ones this user already has a row for
    movie_ids_to_pred = np.setdiff1d(movie_ids, movie_ids_user)

    # model.test expects (user, item, rating) rows; the rating of 4 is a placeholder
    # that only satisfies that format
    test_set = [[user_id, movie_id, 4] for movie_id in movie_ids_to_pred]
    predictions = model.test(test_set)

    pred_ratings = np.array([pred.est for pred in predictions])
    # Positions of the n_items largest estimates, best first
    index_max = (-pred_ratings).argsort()[:n_items]

    print(f"Top {n_items} item recommendations for user {get_user_name(user_id)} ({user_id}):")
    for i in index_max:
        movie_id = movie_ids_to_pred[i]
        # str() keeps the width format spec valid even if the name lookup returns None
        print(f"{str(get_movie_name(movie_id)):<50} {round(pred_ratings[i], 3):>6}")
 
 
# Print the top 100 recommendations for user "liannehr" using the freshly trained model
userID = get_user_id("liannehr")
n_items = 100

generate_recommendation(svd, userID, df, n_items)

In [None]:
# Same report for user "sverlaan".
# NOTE(review): get_user_id returns None when this user has no rows in df — confirm their ratings were added.
userID = get_user_id("sverlaan")
n_items = 100

generate_recommendation(svd, userID, df, n_items)

In [None]:
# TODO: make the similarity lookup below user-specific (e.g. the top-3 most similar films to the ones you rated 5 / 4.5 / 4 stars)




def get_similar_movies(movie_id, model, trainset, top_n=5):
    """Return the raw ids of the ``top_n`` items whose factors are closest to ``movie_id``.

    Similarity is the cosine similarity between rows of ``model.qi``.

    Args:
        movie_id: raw item id as used when the trainset was built.
        model: fitted model exposing the item-factor matrix as ``qi``, indexed
            by trainset inner id.
        trainset: trainset providing ``to_inner_iid`` and ``to_raw_iid``.
        top_n: number of neighbours to return.

    Returns:
        A list of raw item ids, most similar first. When ``movie_id`` is not
        part of the trainset the string ``"Movie ID not in training set"`` is
        returned instead (kept for backward compatibility; callers must check
        for it before iterating).
    """
    try:
        # to_inner_iid raises ValueError for unknown raw ids, so no full
        # inner->raw dictionary has to be built just to test membership
        inner_id = trainset.to_inner_iid(movie_id)
    except ValueError:
        return "Movie ID not in training set"

    movie_embeddings = model.qi
    similarities = cosine_similarity([movie_embeddings[inner_id]], movie_embeddings)[0]

    # Remove the query item by id instead of assuming it sorts first: with tied
    # similarities (e.g. identical embeddings) it is not guaranteed to be at position 0.
    ranked = [int(idx) for idx in similarities.argsort()[::-1] if idx != inner_id]

    return [trainset.to_raw_iid(idx) for idx in ranked[:top_n]]


movie_id = get_movie_id("dead-man")

similar_movies = get_similar_movies(movie_id, svd, trainset, 5)
if isinstance(similar_movies, str):
    # get_similar_movies reports an unknown movie with a message string;
    # iterating over it would loop over single characters
    print(similar_movies)
else:
    print(f"Movies similar to {get_movie_name(movie_id)}")
    # The loop variable is not called `movie`: that name is the letterboxdpy module imported at the top
    for similar_id in similar_movies:
        print(get_movie_name(similar_id))

# store and load model

In [None]:
# Persist the fitted model, its trainset and the item mapping, one pickle file each
artifacts = (
    ('model/pre_model.pkl', svd),
    ('model/pre_trainset.pkl', trainset),
    ('model/pre_item_mapping.pkl', item_mapping),
)
for path, payload in artifacts:
    with open(path, 'wb') as file:
        pickle.dump(payload, file)

In [None]:
# Load the model using Pickle
# NOTE(review): pickle.load can execute arbitrary code from the file — only load artifacts written by this notebook
with open('model/pre_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

In [None]:
# Top 10 recommendations for "sverlaan" from the reloaded model
userID = get_user_id("sverlaan")
n_items = 10

# Besides the loaded model, new recommendations need df with the ratings, and df must contain
# the user to predict for. The user and movie ID mappings are needed as well.
# All of this can be computed locally when personalized recommendations are requested.
generate_recommendation(loaded_model, userID, df, n_items)

In [4]:
# Reload the three pickled artifacts: model, trainset and item mapping.
# NOTE(review): pickle.load can execute arbitrary code from the file — only load trusted artifacts
with open('model/pre_model.pkl', 'rb') as file:
    pre_model = pickle.load(file)

with open('model/pre_trainset.pkl', 'rb') as file:
    trainset = pickle.load(file)

with open('model/pre_item_mapping.pkl', 'rb') as file:
    item_mapping = pickle.load(file)

In [7]:
def get_movie_id(slug):
    """Return the integer code of ``slug`` in the module-level ``item_mapping``.

    Args:
        slug: movie slug to look up.

    Returns:
        The position of ``slug`` in ``item_mapping``, or ``None`` when the slug
        is not present.
    """
    try:
        return item_mapping.tolist().index(slug)
    except ValueError:
        # list.index raises ValueError for a missing value. Anything else
        # (e.g. item_mapping not loaded yet) is a real error and should surface.
        return None
        
def get_movie_name(id):
    """Return the movie slug stored at position ``id`` of the module-level ``item_mapping``.

    Args:
        id: integer code of the movie.

    Returns:
        The mapped slug, or ``None`` when ``id`` is ``None``, out of range,
        or not usable as an index.
    """
    if id is None:
        # A failed get_movie_id lookup yields None; pass that through explicitly
        return None
    try:
        return item_mapping[id]
    except (IndexError, TypeError):
        # Out-of-range or non-integer key; other exceptions are left to propagate
        return None


def get_similar_movies(movie_id, model, trainset, top_n=5):
    """Return the raw ids of the ``top_n`` items whose factors are closest to ``movie_id``.

    Similarity is the cosine similarity between rows of ``model.qi``.

    Args:
        movie_id: raw item id as used when the trainset was built.
        model: fitted model exposing the item-factor matrix as ``qi``, indexed
            by trainset inner id.
        trainset: trainset providing ``to_inner_iid`` and ``to_raw_iid``.
        top_n: number of neighbours to return.

    Returns:
        A list of raw item ids, most similar first. When ``movie_id`` is not
        part of the trainset the string ``"Movie ID not in training set"`` is
        returned instead (kept for backward compatibility; callers must check
        for it before iterating).
    """
    try:
        # to_inner_iid raises ValueError for unknown raw ids, so no full
        # inner->raw dictionary has to be built just to test membership
        inner_id = trainset.to_inner_iid(movie_id)
    except ValueError:
        return "Movie ID not in training set"

    movie_embeddings = model.qi
    similarities = cosine_similarity([movie_embeddings[inner_id]], movie_embeddings)[0]

    # Remove the query item by id instead of assuming it sorts first: with tied
    # similarities (e.g. identical embeddings) it is not guaranteed to be at position 0.
    ranked = [int(idx) for idx in similarities.argsort()[::-1] if idx != inner_id]

    return [trainset.to_raw_iid(idx) for idx in ranked[:top_n]]


movie_id = get_movie_id("the-worst-person-in-the-world")

similar_movies = get_similar_movies(movie_id, pre_model, trainset, 5)
if isinstance(similar_movies, str):
    # get_similar_movies reports an unknown movie with a message string;
    # iterating over it would loop over single characters
    print(similar_movies)
else:
    print(f"Movies similar to {get_movie_name(movie_id)}")
    # The loop variable is not called `movie`, so it cannot overwrite a module imported under that name
    for similar_id in similar_movies:
        print(get_movie_name(similar_id))

Movies similar to the-worst-person-in-the-world
past-lives
minari
tar-2022
aftersun
shiva-baby-2020


In [None]:
from matrix_factorization import BaselineModel, KernelMF, train_update_test_split

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

# NOTE(review): movie_data is not defined anywhere in the visible notebook — confirm where it
# comes from. The code below reads its user_id, item_id and rating columns.
X = movie_data[["user_id", "item_id"]]
y = movie_data["rating"]


# Prepare data for online learning: the split yields an initial training set plus
# separate update-train and update-test sets (frac_new_users=0.2)
(
    X_train_initial,
    y_train_initial,
    X_train_update,
    y_train_update,
    X_test_update,
    y_test_update,
) = train_update_test_split(movie_data, frac_new_users=0.2)

# Plain train/test split (disabled); X_train, X_test, y_train and y_test are therefore not created here
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# Initial training: fit KernelMF on the initial split produced by train_update_test_split
matrix_fact = KernelMF(n_epochs=20, n_factors=100, verbose=1, lr=0.001, reg=0.005)
matrix_fact.fit(X_train_initial, y_train_initial)

In [None]:
# Baseline model trained with SGD and scored by RMSE on the test split.
# NOTE(review): X_train / y_train / X_test / y_test are not defined in the visible notebook
# (the train_test_split call that would create them is commented out) — confirm their source.
baseline_model = BaselineModel(method='sgd', n_epochs = 20, reg = 0.005, lr = 0.01, verbose=1)
baseline_model.fit(X_train.to_numpy(), y_train)

pred = baseline_model.predict(X_test)
# Take the square root explicitly: the `squared=False` keyword of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test, pred))

print(f'\nTest RMSE: {rmse:.4f}')

In [None]:
# Initial training (same statements as the earlier KernelMF cell): fit on the initial split
matrix_fact = KernelMF(n_epochs=20, n_factors=100, verbose=1, lr=0.001, reg=0.005)
matrix_fact.fit(X_train_initial, y_train_initial)
