In [1]:
import numpy as np
import pymongo
from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import mlflow


## Link the database to the notebook

In [3]:
client = pymongo.MongoClient('localhost:27017')

#db = client['Movielens']
db = client['movie-db']

movies = db['movies']
users = db['users']

## Open MLFLOW in the URL and create the experiment project

In [7]:
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
mlflow.set_experiment("MLflow_reco_movies")

2024/02/14 15:23:57 INFO mlflow.tracking.fluent: Experiment with name 'MLflow_reco_movies' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/447492215528966918', creation_time=1707920637560, experiment_id='447492215528966918', last_update_time=1707920637560, lifecycle_stage='active', name='MLflow_reco_movies', tags={}>

## Create the dataframe with all datas usefull to run the NMF

In [8]:
# Extract the data from MongoDB
data = list(users.find())

# Create the empty DataFrame with the right columns
user_list = []
movieid_list = []
rating_list = []
timestamp_list = []

# Parcourir les données et extraire les informations nécessaires
for entry in data:
    user_id = entry['_id']
    for movie in entry['movies']:
        user_list.append(user_id)
        movieid_list.append(movie['movieid'])
        rating_list.append(movie['rating'])
        timestamp_list.append(movie['timestamp'])

# Créer le DataFrame
df = pd.DataFrame({
    'user': user_list,
    'movieid': movieid_list,
    'rating': rating_list,
    'timestamp': timestamp_list
})

# Afficher les premières lignes du DataFrame
print(df.head())

Empty DataFrame
Columns: [user, movieid, rating, timestamp]
Index: []


## split the database in train and test df

In [9]:
# Sort the dataframe based on the timestamp column
df_sorted = df.sort_values(by='timestamp')

# pd.to_datetime(df_sorted['timestamp'], unit='s')) => to convert timestamp in date

# delete all raws for users who have rated no movies
df_sorted = df_sorted[df_sorted.sum(axis=1) > 0]
df_sorted

# Calculate the index where 80% of the data ends
train_size = int(len(df_sorted) * 0.8)

# Split the dataframe
train_df = df_sorted[:train_size]
test_df = df_sorted[train_size:]

# Optionally, you can reset the index of the split dataframes
train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

# train_df will contain 80% of the data based on timestamp for training
# test_df will contain 20% of the data based on timestamp for testing
print(train_df.head())
print(test_df.head())

Empty DataFrame
Columns: [user, movieid, rating, timestamp]
Index: []
Empty DataFrame
Columns: [user, movieid, rating, timestamp]
Index: []


## clean test_df

### Find all the common_movies

In [10]:
# movies for each df
movies_train = set(train_df['movieid'])
movies_test = set(test_df['movieid'])

# Find movies communs
movies_common = movies_train.intersection(movies_test)

# Convert in list (option)
movies_common_list = list(movies_common)
print("Nombre de film communs aux deux DataFrames :", len(movies_common))

# Number of movies in train and not in test
movies_train_not_in_test = movies_train - movies_test
num_movies_train_not_in_test = len(movies_train_not_in_test)

# Number of movies in test and not in train
movies_train_not_in_train = movies_test - movies_train
num_movies_test_not_in_train = len(movies_train_not_in_train)

print("Movies in train but not in test :", num_movies_train_not_in_test)
print("Movies in test but not in train :", num_movies_test_not_in_train)

Nombre de film communs aux deux DataFrames : 0
Movies in train but not in test : 0
Movies in test but not in train : 0


### find the common users

In [11]:
# Users for each df
users_train = set(train_df['user'])
users_test = set(test_df['user'])

# Find the common users
users_common = users_train.intersection(users_test)

# Convert in list (optional)
users_common_list = list(users_common)
print("Common users train and df :", len(users_common))

# Number of users in train and not in test
users_train_not_in_test = users_train - users_test
num_users_train_not_in_test = len(users_train_not_in_test)

# Number of users in test and not in train
users_test_not_in_train = users_test - users_train
num_users_test_not_in_train = len(users_test_not_in_train)

print("Users in train but not in test :", num_users_train_not_in_test)
print("Users in test but not in train :", num_users_test_not_in_train)


Common users train and df : 0
Users in train but not in test : 0
Users in test but not in train : 0


In [12]:
# Filtrer les lignes de df_test pour lesquelles 'user' est dans users_common_list
df_test_filtered_user = test_df[test_df['user'].isin(users_common_list)]

# Filtrer les lignes de df_test pour lesquelles 'movies' est dans users_common_list
df_test_filtered = df_test_filtered_user[df_test_filtered_user['movieid'].isin(movies_common_list)]

# Afficher le DataFrame filtré
print(df_test_filtered)

Empty DataFrame
Columns: [user, movieid, rating, timestamp]
Index: []


## Look for the user in df_test_filtered who has seen the max of movies

In [15]:

# Number of movies seen by each users
movies_seen_by_users = df_test_filtered.groupby('users')['movieid'].count()

# Filtrer les films vus par les utilisateurs communs
movies_seen_by_users_common = movies_seen_by_users[users_common_list]

# Trouver l'utilisateur avec le plus grand nombre de films vus
id_user_max_movies_test_df = movies_seen_by_users_common.idxmax()

# Afficher l'utilisateur avec le plus grand nombre de films vus
print("The user common with train_df who has seen the most of movies in df_test_filtered is : ", id_user_max_movies_test_df)

KeyError: 'users'

## fast check if user is common

In [10]:
user_to_check = 1088
if user_to_check in users_common_list:
    print("L'utilisateur est dans la liste des utilisateurs communs.")
else:
    print("L'utilisateur n'est pas dans la liste des utilisateurs communs.")

L'utilisateur est dans la liste des utilisateurs communs.


## train_df has to be pivoted to run the Model properly => the empty values are turned to 0 to fill the pivot 

In [11]:
train_pivot = pd.pivot_table(train_df, values='rating', index='user', columns='movieid', fill_value=0)
train_pivot

movieid,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
635,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
636,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
637,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
638,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0.0,0.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Sparse the pivot table to make sure the 0 are considered as no value

In [12]:
# all elements of the pivot
total_elements = train_pivot.size

# Number of zero
number_zero = np.count_nonzero(train_pivot == 0)

# Sparse calculation
sparsity = (number_zero / total_elements) * 100

print("number of zeros : ", number_zero)
print("total elements : ", total_elements)
print("Sparsity of the train_pivot : {:.0f}%".format(sparsity))

number of zeros :  18974633
total elements :  19774800
Sparsity of the train_pivot : 96%


In [14]:
df_sparse = train_pivot.astype(pd.SparseDtype("float", 0))

## CHANGE THE NUMBER OF COMPONENTS AND RUN ALL

In [15]:
x = 25

## Here you run the model .fit / you create the pred_matrix with the LLM => for each users it gives the rating predicted 

In [16]:

# Define the model hyperparameters

params = {
    'n_components' : x,
    'max_iter' : 200
}

# Train the model
nmf = NMF(**params)
nmf.fit(df_sparse)

# you can skip the fit if you have only few datas to add (exemple new rating on existing movies with the same users) 
U = nmf.transform(df_sparse)
M = nmf.components_ #this are the movies and can not change from the database predict to the database test or transform the perimeter has to be the same as the fit datatbase
pred_matrix = np.dot(U, M) #this is giving for each user a rating for ALL the movies
# we transform the pred_matrix into a dataframe in columns to make sure we can compare training database with the predict results
pred_matrix = pd.DataFrame(pred_matrix, index=df_sparse.index, columns=df_sparse.columns)
pred_matrix




movieid,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
635,0.213345,0.029223,0.076125,0.017034,0.015314,0.043657,0.037748,0.002648,0.000056,0.084372,...,0.020305,0.001105,0.005590,0.021847,0.010017,0.290206,0.059160,0.007989,0.015712,0.157030
636,0.491941,0.400792,0.142929,0.001258,0.004559,0.217658,0.003200,0.008994,0.067322,0.504205,...,0.003741,0.000608,0.002739,0.020863,0.022659,0.085315,0.040788,0.001399,0.002640,0.039671
637,3.480889,1.570954,0.476978,0.129611,0.200510,0.731362,0.160145,0.220817,0.001195,0.738098,...,0.013742,0.003383,0.031042,0.020323,0.100411,0.197797,0.085944,0.005071,0.005513,0.109076
638,0.485974,0.081996,0.065674,0.001015,0.006339,0.211263,0.057645,0.007105,0.000000,0.189848,...,0.051642,0.002657,0.016022,0.070081,0.010973,0.873144,0.125355,0.023520,0.032511,0.414194
639,0.895167,0.266029,0.002268,0.000000,0.005652,1.186454,0.000000,0.012844,0.000000,1.254606,...,0.001561,0.000024,0.001356,0.000684,0.007877,0.008469,0.001078,0.000233,0.000318,0.006099
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,3.849340,1.069201,0.148205,0.305397,0.073540,1.891173,0.470137,0.112127,0.002196,0.813604,...,0.096285,0.005862,0.014283,0.075372,0.095145,0.307834,0.348189,0.036480,0.160405,0.459934
6037,0.849193,0.118567,0.000000,0.007723,0.012081,0.785246,0.024171,0.000710,0.000000,0.149047,...,0.016009,0.003731,0.000269,0.000623,0.051873,0.000286,0.013362,0.017532,0.000182,0.052183
6038,0.826037,0.023868,0.041838,0.024934,0.016387,0.000734,0.143815,0.001385,0.000000,0.089828,...,0.007651,0.000379,0.000199,0.000173,0.001436,0.000000,0.000014,0.000000,0.000000,0.000379
6039,1.688477,0.539945,0.098881,0.059517,0.039116,0.000000,0.343288,0.087478,0.000000,0.143868,...,0.025883,0.004641,0.010939,0.000000,0.081480,0.000000,0.020937,0.027210,0.000000,0.012632


## To calculate the mse we have to merge the train table with the predict matrix : 2 steps 

### step 1 : change the predict table into a df with only columns 

In [17]:
df_depivoted = pred_matrix.stack().reset_index()
df_depivoted.columns = ['user', 'movieid', 'predict']
df_depivoted

Unnamed: 0,user,movieid,predict
0,635,1,0.213345
1,635,2,0.029223
2,635,3,0.076125
3,635,4,0.017034
4,635,5,0.015314
...,...,...,...
19774795,6040,3948,0.155392
19774796,6040,3949,0.208701
19774797,6040,3950,0.023026
19774798,6040,3951,0.098693


### step 2 : merge both tables to add a column predict /!\ a simple merge won't run with 20 000 000 raws ! inner merge is mandatory 

In [18]:
train_df
train_df.describe()

Unnamed: 0,user,movieid,rating,timestamp
count,800167.0,800167.0,800167.0,800167.0
mean,3402.733303,1849.283538,3.590508,968393700.0
std,1546.650235,1086.856401,1.120326,5821299.0
min,635.0,1.0,1.0,956703900.0
25%,2032.0,1028.0,3.0,964152800.0
50%,3507.0,1788.0,4.0,967588100.0
75%,4694.0,2750.0,4.0,974688000.0
max,6040.0,3952.0,5.0,975768700.0


## let's compare rating with movies already seen with the predict of our model

In [25]:
df_compare = pd.merge(df_depivoted,train_df, how='inner', on=['user', 'movieid'])
df_compare['delta'] = df_compare['rating'] / df_compare['predict']
df_compare 
#we understand that we compare 2 different things, a position of a movie on a scale (predict) with the value of the rating already done by the user

Unnamed: 0,user,movieid,predict,rating,timestamp,delta
0,635,296,0.165710,4,975768620,24.138541
1,635,480,0.363514,5,975767861,13.754623
2,635,858,0.912633,4,975768664,4.382921
3,635,920,0.313617,4,975767911,12.754406
4,635,1172,0.216053,5,975768620,23.142429
...,...,...,...,...,...,...
800162,6040,3683,2.693478,4,960971696,1.485069
800163,6040,3703,1.005015,4,964828575,3.980042
800164,6040,3735,1.753684,4,960971654,2.280913
800165,6040,3751,1.223400,4,964828782,3.269576


### step 3 : calculate the mse 

In [26]:
#we understand that we compare 2 different things, a position of a movie on a scale (predict) with the value of the rating already done by the user

In [None]:
mse = mean_squared_error(df_mse['rating'], df_mse['predict'])
delta_mse = np.sqrt(mean_squared_error(df_mse['rating'], df_mse['predict']))
print('delta mse train: ', delta_mse)
print('mse train : ',mse)

In [30]:
df_predict_notseen_movies = df_depivoted.drop(df_compare.index)
df_predict_notseen_movies.describe()

Unnamed: 0,user,movieid,predict
count,18974630.0,18974630.0,18974630.0
mean,3448.184,1996.304,0.1695313
std,1497.552,1150.483,0.455613
min,853.0,1.0,0.0
25%,2151.0,993.0,0.001357686
50%,3450.0,2037.0,0.01597528
75%,4745.0,2989.0,0.1084402
max,6040.0,3952.0,11.22631


In [29]:
df_predict_notseen_movies

Unnamed: 0,user,movieid,predict
800167,853,2057,0.079728
800168,853,2058,1.293442
800169,853,2059,0.143627
800170,853,2060,0.000000
800171,853,2061,0.003817
...,...,...,...
19774795,6040,3948,0.155392
19774796,6040,3949,0.208701
19774797,6040,3950,0.023026
19774798,6040,3951,0.098693


## Creation of the MLFLOW run

In [None]:
# Start an MLflow run
with mlflow.start_run():
    # Log the hyperparameters
    mlflow.log_params(params)

    # Log the loss metric
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("delta mse", delta_mse)

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("On training datas", f"On the training database, components {x} ")

    # Log the model
    model_info = mlflow.sklearn.log_model(
        sk_model=NMF,
        artifact_path="NMF_Model",
        signature=None,
        input_example=train_pivot,
        registered_model_name=f"NMF_on_train_movies_users {x} components",
    )

In [None]:
# Filtrer les lignes où la colonne 'user' est égale à 1088
df_test_user_1088 = df_test_filtered.loc[df_test_filtered['user'] == 1088]

# Afficher les lignes filtrées
print(df_test_user_1088)

## df_test has to be pivoted to run the Model properly => the empty values are turned to 0 to fill the pivot 

In [None]:
test_pivot = pd.pivot_table(test_df, values='rating', index='user', columns='movieid', fill_value=0)
print(test_pivot)