# 0 - Installation of modules

#Run this cell to have the progress bar
!pip install tqdm
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension

# 1 - Importing modules

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

# 2 - Get all files needed

In [2]:
# Folder holding the CSV inputs/outputs of this notebook. File names are
# appended directly to this string, so the trailing slash is required.
data_folder = "../../Dataset/Movie/"

In [None]:
# Load the ratings (timestamp column discarded) and the movie catalogue.
rating = pd.read_csv(f"{data_folder}ratings.csv").drop(columns=["timestamp"])
movie = pd.read_csv(f"{data_folder}movies.csv")

# One row per (title, userId, rating): join on movieId, then drop the join key
# and the genres column.
data = movie.merge(rating, on="movieId").drop(columns=["genres", "movieId"])
data.shape

# 3 - Transforming data

In [None]:
# Keep only the NB_FILM most-rated films.
NB_FILM = 500
keep_title = (
    data["title"]
    .value_counts()
    .sort_values(ascending=False)
    .iloc[:NB_FILM]
    .index
)
data_title = data.loc[data["title"].isin(keep_title)]
data_title.shape

In [None]:
# Keep the users whose number of ratings, divided by NB_FILM, is at least
# THRESH_USER (a fraction, not a percentage).
THRESH_USER = 0.3
keep_user = data_title["userId"].value_counts().div(NB_FILM).ge(THRESH_USER)
data_title_user = data_title.loc[
    data_title["userId"].isin(keep_user.loc[keep_user].index)
]

In [None]:
# Build the user x title rating matrix and save it to disk.
user_title_df = pd.pivot_table(
    data_title_user,
    values="rating",
    index="userId",
    columns="title",
)
user_title_df.to_csv(f"{data_folder}user_title.csv")
user_title_df.shape

# 4 - Collaborative filtering

In [3]:
# Reload the user x title matrix from the local CSV written in section 3
# (the first CSV column is used as the index).
user_title_df = pd.read_csv(f"{data_folder}user_title.csv", index_col=0)

In [4]:
# Row centring used to remove each user's rating bias.
def standardize(row):
    """Return ``row`` shifted by its own mean (mean-centring, no rescaling)."""
    row_mean = np.mean(row)
    return row - row_mean

# Centre every user row, then treat films the user did not rate as 0.
user_title_std = user_title_df.apply(standardize, axis="columns").fillna(value=0)
user_title_std.shape

(18391, 500)

In [5]:
# Build the user x user cosine-similarity matrix, labelled with the user ids
# on both axes. The raw array is never bound to a name of its own.
cos_df = pd.DataFrame(
    cosine_similarity(user_title_std),
    index=user_title_std.index,
    columns=user_title_std.index,
)

In [None]:
# Quick look at the first rows of the user x title rating matrix.
user_title_df.head()

In [None]:
# Quick look at the first rows of the user x user similarity matrix.
cos_df.head()

In [6]:
# Keep similarities of at least 0.7; every other cell becomes NaN.
thresh = cos_df[cos_df.ge(0.7)]

In [7]:
# Drop the columns, then the rows, that contain no value at all.
test = thresh.dropna(axis="columns", how="all").dropna(axis="index", how="all")

userId,3,12,13,18,23,31,43,72,75,80,...,162484,162495,162508,162512,162516,162519,162521,162529,162533,162534
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,1.0,,,,,,,,,,...,,,,,,,,,,
12,,1.0,,,,,,,,,...,,,,,,,,,,
13,,,1.0,,,,,,,,...,,,,,,,,,,
18,,,,1.0,,,,,,,...,,,,,,,,,,
23,,,,,1.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162519,,,,,,,,,,,...,,,,,,1.0,,,,
162521,,,,,,,,,,,...,,,,,,,1.0,,,
162529,,,,,,,,,,,...,,,,,,,,1.0,,
162533,,,,,,,,,,,...,,,,,,,,,1.0,


In [None]:
# Preview the thresholded similarity row of the first few users only.
# Printing one full-length row for every user floods the notebook output.
PREVIEW_USERS = 5
users = thresh.index
for user in users[:PREVIEW_USERS]:
    print(thresh.loc[user])

In [None]:
# Quick look at the first rows of the thresholded similarity matrix.
thresh.head()

In [None]:
# Users that have at least one *other* user with cosine similarity >= SIM_EPS,
# i.e. the users whose "+" list in `cluster` is non-empty.
# This is computed directly from cos_df because `cluster` is only built in a
# later cell, so reading it here fails on a fresh kernel (Restart & Run All).
SIM_EPS = 0.7  # same value as `eps` in the clustering cell
similar = cos_df.to_numpy() >= SIM_EPS  # the comparison yields a fresh boolean array
np.fill_diagonal(similar, False)  # a user is not their own neighbour
cluster_non_vide = cos_df.index[similar.any(axis=1)].tolist()
cluster_non_vide

In [None]:
# For every user, collect the strongly similar ("+", similarity >= eps) and the
# strongly opposite ("-", similarity <= -eps) other users.
#
# The comparison is done one matrix row at a time with NumPy rather than with
# one scalar `.loc` lookup per user pair. The resulting dict is the same as the
# pairwise loop would build: each list holds the other users in index order.

all_users = user_title_df.index
eps = 0.7

# The positional access below relies on cos_df being laid out in the same
# order as user_title_df on both axes.
assert cos_df.index.equals(all_users) and cos_df.columns.equals(all_users)

user_ids = all_users.tolist()  # plain Python scalars, as yielded by iterating the index
sim = cos_df.to_numpy()
cluster = {}

for pos, user in enumerate(tqdm(user_ids)):
    # Read exactly the cells a pairwise (user1 before user2) loop reads:
    # the column above the diagonal for earlier users, the row to the right of
    # it for later users. The user's own slot is NaN so it matches neither test.
    scores = np.concatenate((sim[:pos, pos], [np.nan], sim[pos, pos + 1:]))
    cluster[user] = {
        "+": [user_ids[j] for j in np.flatnonzero(scores >= eps)],
        "-": [user_ids[j] for j in np.flatnonzero(scores <= -eps)],
    }

In [None]:
# Save the clusters as CSV: after the transpose there is one row per user and
# one column per list ("+" and "-").
# NOTE(review): each cell holds a Python list, so it is presumably written as
# its string form — confirm how this file is meant to be read back.
pd.DataFrame.from_dict(cluster).T.to_csv(f"{data_folder}cluster.csv")

In [None]:
# Predict a rating from the user's similar ("+") neighbours only.
def guess_rate_cluster(user, title):
    """Predict the rating of ``user`` for ``title`` from the "+" cluster.

    The prediction is the average of the ratings that the users listed in
    ``cluster[user]["+"]`` gave to ``title``, weighted by their similarity to
    ``user`` in ``cos_df``. Neighbours without a rating for ``title`` are
    ignored.

    Returns 2.5 when the accumulated weight is 0 (for instance when no
    neighbour rated the film).
    """
    weighted_sum = 0
    weight_total = 0

    for neighbour in cluster[user]["+"]:
        neighbour_rating = user_title_df.loc[neighbour, title]
        if pd.isnull(neighbour_rating):
            continue
        similarity = cos_df.loc[user, neighbour]
        weighted_sum += similarity * neighbour_rating
        weight_total += similarity

    if weight_total == 0:
        return 2.5
    return weighted_sum / weight_total

In [None]:
# Predict a rating from every other user.
def guess_rate_all(user, title):
    """Predict the rating of ``user`` for ``title`` from all other users.

    The prediction is the average of the ratings that every user in
    ``user_title_df`` (except ``user``) gave to ``title``, weighted by their
    similarity to ``user`` in ``cos_df``. Users without a rating for ``title``
    are ignored.

    Returns 2.5 when the accumulated weight is exactly 0.

    NOTE(review): the weights are signed similarities, so the total can be
    negative or close to zero and push the result outside the rating range —
    confirm whether that is intended.
    """
    weighted_sum = 0
    weight_total = 0

    for other in user_title_df.index:
        if other == user:
            continue
        other_rating = user_title_df.loc[other, title]
        if pd.isnull(other_rating):
            continue
        similarity = cos_df.loc[user, other]
        weighted_sum += similarity * other_rating
        weight_total += similarity

    if weight_total == 0:
        return 2.5
    return weighted_sum / weight_total

In [None]:
# Compare the two prediction methods (speed and predicted rating) for one
# user, over every film.
from time import perf_counter  # monotonic, high-resolution clock for timing

# guess_rate_cluster / guess_rate_all take a single user id (they index
# cluster[user] and cos_df.loc[user, ...]). Passing the whole cluster_non_vide
# list fails with "unhashable type: 'list'", so one user is picked here.
user = cluster_non_vide[0]
titles = user_title_df.columns

deltaT = []      # |t_all - t_cluster| for each film
t_cluster = []   # run time of guess_rate_cluster for each film
t_all = []       # run time of guess_rate_all for each film
positif = 0      # number of films where the cluster method was faster

deltaN = []      # |n_cluster - n_all| for each film
n_cluster = []   # rating predicted by guess_rate_cluster
n_all = []       # rating predicted by guess_rate_all
sup = 0          # number of films where the cluster prediction was higher

for title in tqdm(titles):
    start = perf_counter()
    n_cluster.append(guess_rate_cluster(user, title))
    t_cluster.append(perf_counter() - start)

    start = perf_counter()
    n_all.append(guess_rate_all(user, title))
    t_all.append(perf_counter() - start)

    dN = n_cluster[-1] - n_all[-1]
    deltaN.append(np.abs(dN))
    if dN > 0:
        sup += 1

    dT = t_all[-1] - t_cluster[-1]
    deltaT.append(np.abs(dT))
    if dT > 0:
        positif += 1

In [None]:
# Draw the comparison on a 2x3 grid: each row starts with a pie chart
# (positions 1 and 4) followed by two box plots.
plt.figure(figsize=(25,10))

plots_pie = [[positif, len(t_cluster)-positif], [sup, len(n_cluster)-sup]]
labels_pie = [["+ Rapide", "- Rapide"], ["+ Haute", "- Haute"]]
titles_pie = ["Nb de fois on Cluster est plus rapide que All", "Nb de fois que Cluster à données une note plus haute"]

plots_box = [[t_cluster, t_all], deltaT, [n_cluster, n_all], deltaN]
labels_box = [["Cluster", "All"], ["Delta"], ["Cluster", "All"], ["Delta"]]
titles_box = ["Temps d'éxécution", "Différence de temps", "Notes", "Différence de note"]

# Each iterator hands out the next unused pie / box panel in order.
pie_panels = iter(zip(plots_pie, labels_pie, titles_pie))
box_panels = iter(zip(plots_box, labels_box, titles_box))

for position in range(1, 7):
    plt.subplot(2, 3, position)

    if position in (1, 4):
        values, names, heading = next(pie_panels)
        plt.pie(values, labels=names, autopct='%.0f%%')
    else:
        values, names, heading = next(box_panels)
        plt.boxplot(values, labels=names)
    plt.title(heading)

plt.show()

In [None]:
# Users whose "+" list has between 1 and 4 entries.
cluster_non_vide = [
    user_id
    for user_id, groups in cluster.items()
    if 0 < len(groups["+"]) < 5
]
len(cluster_non_vide)

In [None]:
# For the first 10 selected users, predict every film they actually rated with
# both methods, and keep the true ratings for comparison.
users = cluster_non_vide[:10]

n_cluster = []   # predictions from guess_rate_cluster
n_all = []       # predictions from guess_rate_all
delta = []       # |guess_rate_all - guess_rate_cluster| for each prediction
n_true = []      # true ratings, in the same order as the predictions

for user in tqdm(users):
    all_rated_film = user_title_df.loc[user].dropna(axis=0)
    n_true += all_rated_film.values.tolist()

    # NOTE: the loop variable `movie` rebinds the `movie` DataFrame loaded in
    # section 2. The name is kept because a later cell reads its final value.
    for movie in tqdm(all_rated_film.index, leave=False):
        pred_cluster = guess_rate_cluster(user, movie)
        # Named pred_all (not `all`) so the builtin all() is not rebound for
        # the rest of the session.
        pred_all = guess_rate_all(user, movie)

        delta.append(np.abs(pred_all - pred_cluster))
        n_cluster.append(pred_cluster)
        n_all.append(pred_all)

In [None]:
# Signed error of each method against the true ratings.
delta_cluster = [n_cluster[idx] - n_true[idx] for idx in range(len(n_true))]
delta_all = [n_all[idx] - n_true[idx] for idx in range(len(n_true))]

plots = [[delta_cluster, delta_all], delta]
labels = [["Cluster", "All"], ["Delta"]]
titles = ["Différence des notes avec les vrais notes", "Diférence des notes entre la méthodes Cluster et All"]

# Two box plots side by side: errors against the true ratings, then the gap
# between the two methods.
plt.figure(figsize=(15,6))
for position, (values, names, heading) in enumerate(zip(plots, labels, titles), start=1):
    plt.subplot(1, 2, position)
    plt.boxplot(values, labels=names)
    plt.title(heading)

plt.show()

In [None]:
# Show the second user id in cluster_non_vide.
cluster_non_vide[1]

In [None]:
# Show the "+" / "-" neighbour lists of that second user.
cluster[cluster_non_vide[1]]

In [None]:
# Similarity row of a hard-coded user id.
# NOTE(review): 48508 is presumably one of the users inspected above — confirm,
# or index through cluster_non_vide instead of a literal id.
cos_df.loc[48508]

In [None]:
# Cluster-based prediction for the second selected user.
# NOTE(review): `movie` is whatever the last executed film loop left bound, so
# this cell depends on earlier execution state.
guess_rate_cluster(cluster_non_vide[1], movie)

In [None]:
# Show the "+" / "-" neighbour lists of the first user in cluster_non_vide.
cluster[cluster_non_vide[0]]