# My recommandation system model training

In [1]:
from pymongo import MongoClient
from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
import datetime

### MLFlow imports and settings

In [2]:
import mlflow
from mlflow.sklearn import log_model

In [3]:
mlflow.set_tracking_uri('http://127.0.0.1:8080')

### MongoDB connection and settings

In [4]:
client = MongoClient('localhost', 27017)

In [5]:
db = client.Movielens

In [6]:
# Access the 'users' collection
users_col = db.Users

# Access the 'movies' collection
movies_col = db.Movies

In [7]:
# Retrieve all movies
movies = movies_col.find()

# Retrieve all users
users = users_col.find()

### Data spliting using timestamp values

In [10]:
# Query MongoDB to retrieve data
data = []
for user in users:
    user_id = user['_id']  # Extracting user_id from the '_id' field
    for movie in user['movies']:
        data.append({
            'user_id': user_id,
            'movie_id': movie['movieid'],
            'rating': movie['rating'],
            'timestamp': movie['timestamp']
        })

# Convert data to a pandas DataFrame
df = pd.DataFrame(data)

In [11]:
def data_split(data, sample_size, train_size):
    # Sort the DataFrame based on timestamp
    full_df = data.sort_values(by='timestamp', ascending=False)

    split_index = int(train_size * len(full_df))

    train_df = full_df[:split_index]
    test_df = full_df[split_index:]

    # Reduce the DataFrame to a given percent portion
    sample_df = data.sample(frac=sample_size, random_state=42)
    
    sample_df = sample_df.sort_values(by='timestamp', ascending=False)

    split_index_min = int(train_size * len(sample_df))

    train_df_min = sample_df[:split_index_min]
    test_df_min = sample_df[split_index_min:]

    return train_df, test_df, train_df_min, test_df_min

In [12]:
train_df, test_df, train_df_min, test_df_min = data_split(df, 0.1, 0.8)

In [13]:
test_df

Unnamed: 0,user_id,movie_id,rating,timestamp
226331,4611,3466,2,964152800
226523,4611,1682,4,964152800
226342,4611,3499,4,964152800
226478,4611,1280,3,964152753
226430,4611,34,4,964152753
...,...,...,...,...
324,6040,2019,5,956703977
139,6040,1961,4,956703977
5,6040,593,5,956703954
285,6040,2384,4,956703954


In [14]:
# Create a pivot table for my train_df
train_table = train_df.pivot_table(index='user_id', columns='movie_id', values='rating', fill_value=0)

# Display the pivot table
train_table

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,4,0,0,0,0,0
6002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6016,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6028,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Matching my train and test tables

In [15]:
# Get unique movie IDs from train and test sets
train_movie_ids = train_df['movie_id'].unique()
test_movie_ids = test_df['movie_id'].unique()

# Concatenate movie IDs from train and test sets
all_movie_ids = np.unique(np.concatenate([train_movie_ids, test_movie_ids]))

In [16]:
# Create a pivot table for the test set
test_table = test_df.pivot_table(index='user_id', columns='movie_id', values='rating', fill_value=0)

# Reindex test_table to include all movie_ids from both train and test sets
test_table = test_table.reindex(columns=all_movie_ids, fill_value=0)

test_table = test_table[train_table.columns]

# Display the pivot table
test_table

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4611,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4612,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4613,0,0,0,0,0,0,0,0,0,4,...,0,0,0,0,0,0,0,0,0,0
4614,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4615,0,0,0,0,0,4,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0,0,0,2,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6037,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6038,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6039,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### First real model training

In [17]:
# Parameters inputs
n_components = 10
max_iter = 500

# Train model on the filled training set
nmf = NMF(n_components=n_components, max_iter=max_iter)
nmf.fit(train_table.values)

# Generate predictions for train and test sets
pred_matrix = nmf.transform(test_table.values) @ nmf.components_

# Exporting my prediction data
df_pred_matrix = pd.DataFrame(pred_matrix, index=test_table.index, columns=test_table.columns)

# Unpivot df_pred_matrix
df_pred = df_pred_matrix.reset_index().melt(id_vars='user_id', var_name='movie_id', value_name='score')

# Join unpivoted predictions with the original train_df
df_pred = df_pred.merge(test_df, on=['user_id', 'movie_id'])

df_pred

Unnamed: 0,user_id,movie_id,score,rating,timestamp
0,4626,1,2.789796,2,964117136
1,4631,1,1.783436,3,964040271
2,4633,1,1.952819,3,964036615
3,4635,1,4.538729,4,964034339
4,4637,1,4.348211,3,964026538
...,...,...,...,...,...
200001,4951,3819,0.840643,4,963435368
200002,4979,3819,0.880817,5,963544577
200003,5034,3819,0.038418,5,963676163
200004,6016,3819,1.539739,4,963617607


In [18]:
# Sorting my df_pred by ascending users and descending scores
df_pred.sort_values(by=['user_id', 'score'], ascending=[True, False], inplace=True)

df_pred

Unnamed: 0,user_id,movie_id,score,rating,timestamp
13376,4611,260,2.687950,5,964151569
61600,4611,1196,2.575612,4,964151918
143638,4611,2571,2.234112,3,964152545
65360,4611,1210,2.157635,4,964151959
16980,4611,318,2.119146,5,964152415
...,...,...,...,...,...
111728,6040,1974,0.005568,1,956716478
167885,6040,3016,0.002306,2,956716157
111771,6040,1975,0.000000,1,956715569
112145,6040,1991,0.000000,1,956716294


In [19]:
# Filtering to only keep the 10 best scores for each user
top_10_per_user = df_pred.sort_values(by=['user_id', 'score'], ascending=[True, False]) \
                         .groupby('user_id').head(10)

top_10_per_user

Unnamed: 0,user_id,movie_id,score,rating,timestamp
13376,4611,260,2.687950,5,964151569
61600,4611,1196,2.575612,4,964151918
143638,4611,2571,2.234112,3,964152545
65360,4611,1210,2.157635,4,964151959
16980,4611,318,2.119146,5,964152415
...,...,...,...,...,...
66567,6040,1213,3.883037,4,957716861
28970,6040,527,3.858994,5,956704219
5236,6040,50,3.765114,4,956704911
14069,6040,260,3.753589,4,956716873


In [20]:
# Using my top 10 movies to check my model performance
average_rating = top_10_per_user['rating'].mean()
diff_rating = 5 - average_rating
print("Average rating:", average_rating)
print("Difference from 5:", diff_rating)

Average rating: 4.240258826839218
Difference from 5: 0.7597411731607817


## 3) Model training with MLFlow logs

In [24]:
%%time

# Start MLflow run
with mlflow.start_run(experiment_id=315400400869642160):

    # Tag settings
    current_date = datetime.datetime.now().strftime("%Y-%m-%d")
    type = 'model_training'
    
    # Parameters inputs
    n_components = 100
    max_iter = 500
    
    # Log parameters
    mlflow.log_params({
        "n_components": n_components,
        "max_iter": max_iter
    })

    # Train model on the training set
    nmf = NMF(n_components=n_components, max_iter=max_iter)
    nmf.fit(train_table.values)  # Training on the values of the aligned train_table

    # Generate predictions for train and test sets
    pred_matrix = nmf.transform(test_table.values) @ nmf.components_
    
    # Exporting my prediction data
    df_pred_matrix = pd.DataFrame(pred_matrix, index=test_table.index, columns=test_table.columns)
    df_pred = df_pred_matrix.reset_index().melt(id_vars='user_id', var_name='movie_id', value_name='score')
    df_pred = df_pred.merge(test_df, on=['user_id', 'movie_id'])
    
    top_10_per_user = df_pred.sort_values(by=['user_id', 'score'], ascending=[True, False]) \
                         .groupby('user_id').head(10)
    worst_10_per_user = df_pred.sort_values(by=['user_id', 'score'], ascending=[True, False]) \
                         .groupby('user_id').tail(10)

    # Calculating metrics
    average_top_rating = top_10_per_user['rating'].mean()
    diff_top_rating = 5 - average_top_rating

    average_worst_rating = worst_10_per_user['rating'].mean()
    diff_worst_rating = average_worst_rating - 1

    # Log metrics
    mlflow.log_metric("average_top_rating", average_top_rating)
    mlflow.log_metric("top_rating_diff", diff_top_rating)
    mlflow.log_metric("average_worst_rating", average_worst_rating)
    mlflow.log_metric("worst_rating_diff", diff_worst_rating)
    

    # Log model
    mlflow.sklearn.log_model(nmf, "nmf_model")

    # Add tags
    mlflow.set_tag("type", type)
    mlflow.set_tag("date", current_date)



CPU times: user 4min 7s, sys: 10 s, total: 4min 17s
Wall time: 1min 6s
