In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


In [None]:
# Locations of the CSV inputs, relative to the notebook's working directory.
# The shared directory is defined once so that moving the data is a one-line change;
# every resulting path string is identical to the previous hard-coded value.
DATA_DIR = "../data"

movies_data_path = DATA_DIR + "/movies.csv"
ratings_data_path = DATA_DIR + "/ratings.csv"
movies_1_data_path = DATA_DIR + "/movies1.csv"
ratings_1_data_path = DATA_DIR + "/ratings1.csv"
movie_matrix_path = DATA_DIR + "/movie_matrix.csv"
# "apprentissage" is French for "training".
ratings_apprentissage_path = DATA_DIR + "/ratings_apprentissage.csv"

In [None]:

# Load the "1" variant of the movies and ratings files through the project helper
# (presumably reads and pre-processes the CSV — confirm in utils.init_traitement_DB).
df_movies_1 = utils.init_traitement_DB(movies_1_data_path)
# The ratings' 'timestamp' column is dropped immediately after loading.
df_ratings_1 = utils.init_traitement_DB(ratings_1_data_path).drop(columns=['timestamp'])
# NOTE(review): 50 looks like a minimum-count threshold used to trim both tables —
# confirm its exact meaning in utils.filter_dataframes_by_threshold.
df_movies1_trunc, df_ratings1_trunc = utils.filter_dataframes_by_threshold(50, df_movies_1, df_ratings_1)

# Hold out 20% of the filtered rating rows for validation ("apprentissage" = training);
# random_state=42 makes the split reproducible. The split is over rating rows, so a
# given user's ratings can land in both parts.
df_ratings_apprentissage, df_ratings_validation = train_test_split(df_ratings1_trunc, test_size=0.2, random_state=42)

#df_ratings_apprentissage = utils.init_traitement_DB(ratings_apprentissage_path).drop(columns=['timestamp'])

In [None]:
# Build the movie matrix that is later passed to predict() as its content matrix.
# NOTE(review): this is built from the unfiltered df_movies_1, not df_movies1_trunc —
# confirm that is intended.
df_movie_matrix = utils.create_db_content_movie(df_movies_1)
print("shape : ", df_movie_matrix.shape)
# Show the first rows as the cell output.
df_movie_matrix.head()

In [None]:
# Derive user profiles from the training ratings and the movie matrix.
# predict() looks rows up with user_profiles.loc[user_id], so the result is expected
# to be indexed by user id — confirm in utils.calculate_user_profiles.
user_profiles = utils.calculate_user_profiles(df_ratings_apprentissage, df_movie_matrix)

In [None]:
# Preview the first rows of the user profiles.
user_profiles.head()

In [None]:
# NOTE(review): a later cell stores the result of this same helper in
# `optimal_clusters_Kmeans` and treats it as a cluster *count*, so the name
# `cluster_labels` is probably misleading — confirm what
# utils.determine_optimal_clusters_para returns.
# No visible cell reads this variable (predict() receives its own `cluster_labels`
# argument), so this call may be redundant.
cluster_labels = utils.determine_optimal_clusters_para(user_profiles, cluster_method='K_means', max_clusters=10)

In [None]:
# Predict ratings for the validation set and compute the prediction error.
def predict(user_profiles, content_matrix, cluster_labels, df_ratings_validation, user_id_to_index,
            df_ratings_train=None):
    """Score the cluster-based rating predictor on held-out ratings and return its MSE.

    For every validation row the predicted rating is the average of
    (a) the movie's mean rating in the training data and
    (b) the dot product between the movie's row in ``content_matrix`` and the
        mean profile of the cluster the user belongs to.

    A validation row is skipped (not scored) when the user has no entry in
    ``user_id_to_index``, when the movie is missing from ``content_matrix``,
    or when the movie has no rating in the training data.

    Parameters
    ----------
    user_profiles : pandas.DataFrame
        One row per user; its rows must be positionally aligned with
        ``cluster_labels``.
    content_matrix : pandas.DataFrame
        Indexed by movie id; its columns must match the columns of
        ``user_profiles`` so the dot product is defined.
    cluster_labels : array-like
        Cluster label of each row of ``user_profiles``, in row order.
    df_ratings_validation : pandas.DataFrame
        Held-out ratings with ``userId``, ``movieId`` and ``rating`` columns.
    user_id_to_index : dict
        Maps a user id to the position of that user's row in ``user_profiles``.
    df_ratings_train : pandas.DataFrame, optional
        Training ratings with ``movieId`` and ``rating`` columns. When omitted,
        the notebook-level ``df_ratings_apprentissage`` is used, which keeps
        existing five-argument calls working.

    Returns
    -------
    float
        Mean squared error over the rows that could be scored.

    Raises
    ------
    ValueError
        If no validation row could be scored.
    """
    if df_ratings_train is None:
        # Backward compatibility: fall back to the notebook's global training split.
        df_ratings_train = df_ratings_apprentissage

    # Guarantees element-wise comparison below even if a plain list is passed.
    cluster_labels = np.asarray(cluster_labels)

    # Loop-invariant work, computed once instead of once per validation row.
    movie_mean_ratings = df_ratings_train.groupby('movieId')['rating'].mean()
    cluster_profiles = {}  # cluster label -> mean profile of that cluster's users

    predictions = []
    true_ratings = []

    rows = df_ratings_validation[['userId', 'movieId', 'rating']].itertuples(index=False, name=None)
    for user_id, movie_id, true_rating in rows:
        # A user without a profile cannot be assigned to a cluster.
        if user_id not in user_id_to_index:
            continue
        # Both a content vector and a training average are needed for the movie.
        if movie_id not in content_matrix.index or movie_id not in movie_mean_ratings.index:
            continue

        user_cluster = cluster_labels[user_id_to_index[user_id]]
        if user_cluster not in cluster_profiles:
            cluster_profiles[user_cluster] = user_profiles[cluster_labels == user_cluster].mean()

        similarity_score = content_matrix.loc[movie_id].dot(cluster_profiles[user_cluster])
        # Blend the movie's average rating with the cluster/content affinity.
        predicted_rating = (movie_mean_ratings.loc[movie_id] + similarity_score) / 2

        predictions.append(predicted_rating)
        true_ratings.append(true_rating)

    if not predictions:
        raise ValueError(
            "No validation rating could be predicted: no row had a known user, "
            "a movie present in the content matrix and a movie rated in the training data."
        )

    mse = mean_squared_error(true_ratings, predictions)
    print(f"Mean Squared Error: {mse}")
    return mse


In [None]:
# Preview the first rows of the user profiles again (same call as the earlier preview cell).
user_profiles.head()

In [None]:
# Map each user id (a row label of user_profiles) to that row's 0-based position,
# so positional arrays such as cluster labels can be looked up by user id.
user_id_to_index = dict(zip(user_profiles.index, range(len(user_profiles.index))))



In [None]:
# Disabled experiment: pick the cluster count with the silhouette method.
#optimal_clusters_silhouette = utils.determine_optimal_clusters_para(user_profiles, cluster_method='silhouette', max_clusters=10, n_jobs=-1)
#print(f"The optimal number of clusters for silhouette is: {optimal_clusters_silhouette}")

# Apply spectral clustering (disabled)
#cluster_labels_silhouette = utils.apply_spectral_clustering(user_profiles, optimal_clusters_silhouette)

#optimal_clusters_Kmeans = determine_optimal_clusters(user_profiles, cluster_method='K_means', max_clusters=10, n_jobs=-1)
# Choose the number of K-means clusters; max_clusters=5 presumably caps the search —
# confirm in utils.determine_optimal_clusters_para.
optimal_clusters_Kmeans = utils.determine_optimal_clusters_para(user_profiles, cluster_method='K_means', max_clusters=5)
print(f"The optimal number of clusters for Kmeans is: {optimal_clusters_Kmeans}")

# Apply K-means clustering with the selected number of clusters
cluster_labels_Kmeans = utils.apply_kmeans_clustering(user_profiles, optimal_clusters_Kmeans)

#print("Shape of cluster_labels_silhouette: ", cluster_labels_silhouette.shape)
print("Shape of cluster_labels_Kmeans: ", cluster_labels_Kmeans.shape)


In [None]:
# Compare the two clustering methods (the spectral-clustering evaluation below is currently disabled)
#mse_spectral = predict(user_profiles, df_movie_matrix, cluster_labels_silhouette, df_ratings_validation, user_id_to_index)

#print(f"MSE for Spectral Clustering: {mse_spectral}")


In [None]:
# Evaluate the K-means clustering on the validation ratings.
# predict() already prints the MSE itself, so the value appears twice in the cell output.
mse_kmeans = predict(user_profiles, df_movie_matrix, cluster_labels_Kmeans, df_ratings_validation, user_id_to_index)

print(f"MSE for Kmeans Clustering: {mse_kmeans}")

In [None]:
# Repeat the evaluation with a hard-coded 200 instead of the selected optimum
# (presumably the number of clusters, as in the previous K-means call — confirm in
# utils.apply_kmeans_clustering).
cluster_labels_Kmeans_200 = utils.apply_kmeans_clustering(user_profiles, 200)

mse_kmeans_200 = predict(user_profiles, df_movie_matrix, cluster_labels_Kmeans_200, df_ratings_validation, user_id_to_index)

print(f"MSE for Kmeans Clustering 200: {mse_kmeans_200}")

In [None]:
        #user_profile = user_profiles.loc[user_id]
        #user_cluster = cluster_labels[user_id_to_index[user_id]] 
        #cluster_users = user_profiles[cluster_labels == user_cluster]
        #cluster_profile = cluster_users.mean()
#
        #movie_ratings = df_ratings_apprentissage[df_ratings_apprentissage['movieId'] == movie_id]['rating']
        #movie_ratings = movie_ratings[movie_ratings['userId'].isin(cluster_users.index)]