# Importing modules

In [1]:
import matplotlib.pyplot as plt
import seaborn as sbn

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

from skfuzzy.cluster import cmeans
from sklearn.metrics.pairwise import cosine_similarity

# Fuzzy C means Class

In [2]:
class FCmeans():
    #Constructor
    def __init__(self, n_clusters : int, m : int = 2, error : float = 10**-9, max_iter : int = 300, init : np.ndarray = None, seed : int = None):
        self.n_clusters = n_clusters    #Number of cluster
        self.m = m                      #Array exponentiation applied to the membership function u_old at each iteration, where U_new = u_old ** m.
        self.error = error              #Stopping criterion; stop early if the norm of (u[p] - u[p-1]) < error
        self.max_iter = max_iter        #Maximum number of iterations allowed.
        self.init = init                #Initial fuzzy c-partitioned matrix. If none provided, algorithm is randomly initialized.
        self.seed = seed                #Sets random seed of init


    def _dist(A, B):
        """Compute the euclidean distance between two matrices"""
        return np.sqrt(np.einsum("ijk->ij", (A[:, None, :] - B) ** 2))


    def fit(self, data : np.ndarray):
        """
        Train the fuzzy-c-means model

        Parameters
        ----------
        data : array-like, shape = [n_samples, n_features]
            Training instances to cluster.
        """
        fcm = cmeans(data.T, self.n_clusters, self.m, self.error, self.max_iter, init=self.init, seed=self.seed)
        self.centers = fcm[0]               #Cluster centers
        self.membership = fcm[1].T          #Final fuzzy c-partitioned matrix
        self.init_membership = fcm[2].T     #Initial guess at fuzzy c-partitioned matrix
        self.dist = fcm[3]                  #Final Euclidian distance matrix
        self.inertia = fcm[4]               #Objective function history
        self.n_iter = fcm[5]+1              #Number of iterations run
        self.coeff = fcm[6]                 #Final fuzzy partition coefficient


    def soft_predict(self, X: np.ndarray) -> np.ndarray:
        """
        Soft predict of FCM 

        Parameters
        ----------
        data : array, shape = [n_samples, n_features]
            New data to predict.

        Returns
        -------
        array, shape = [n_samples, n_clusters]
            Fuzzy partition array, returned as an array with n_samples rows
            and n_clusters columns.
        """
        temp = FCmeans._dist(X, self.centers) ** float(2 / (self.m - 1))
        denominator_ = temp.reshape((X.shape[0], 1, -1)).repeat(temp.shape[-1], axis=1)
        denominator_ = temp[:, :, np.newaxis] / denominator_
        return 1 / denominator_.sum(2)


    def predict(self, data : np.ndarray):
        """
        Predict the closest cluster each sample in X belongs to.

        Parameters
        ----------
        data : array, shape = [n_samples, n_features]
            New data to predict.

        Returns
        -------
        labels : array, shape = [n_samples,]
            Index of the cluster each sample belongs to.
        """
        return self.soft_predict(data).argmax(axis=-1)

# 2 - Get all files needed

In [3]:
data_folder = "../../Dataset/Movie/"

In [4]:
rating = pd.read_csv(f"{data_folder}ratings.csv").drop(["timestamp"], axis=1)
movie = pd.read_csv(f"{data_folder}movies.csv")

data = pd.merge(movie, rating, on="movieId").drop(["genres", "movieId"], axis=1)
data.shape

(25000095, 3)

# 3 - Transforming data

In [5]:
#On garde les NB_FILM films les plus notés
NB_FILM = 500
keep_title = data["title"].value_counts().sort_values(ascending=False)[:NB_FILM].index
data_title = data[data["title"].isin(keep_title)]
data_title.shape

(10944751, 3)

In [6]:
#On garde les users qui ont données une note sur au moins THRESH_USER % des films
THRESH_USER = 0.40
keep_user = data_title["userId"].value_counts()/NB_FILM >= THRESH_USER
data_title_user = data_title[data_title["userId"].isin(keep_user[keep_user].index)]

In [7]:
#On fait notre pivot table
user_title_df = data_title_user.pivot_table(index="userId", columns="title", values="rating")
user_title_df.to_csv(f"{data_folder}test_for_users.csv")
user_title_df.shape

(10103, 500)

# Algo

In [16]:
def normalization_min_max(row):
    return (row - row.mean())/(row.max() - row.min())

In [17]:
for_users_data = user_title_df.fillna(0).apply(normalization_min_max, axis=1)
for_movies_data = user_title_df.fillna(0).apply(normalization_min_max, axis=0)

In [18]:
#Création de la matrice de cosinus distance
all_users = user_title_df.index
cos = cosine_similarity(for_users_data)
similarity_users = pd.DataFrame(cos, index=all_users, columns=all_users)
np.fill_diagonal(similarity_users.values, 0)


#Création de la matrice de cosinus distance
all_movies = user_title_df.columns
cos = cosine_similarity(for_movies_data.T)
similarity_movies= pd.DataFrame(cos, index=all_movies, columns=all_movies)
np.fill_diagonal(similarity_movies.values, 0)


del(cos)

In [19]:
#On devine la note en fonction des candidtats similaires
def guess_rate(user, title, N):
    similar_mat = similarity_users
    all_similar_users = similar_mat.loc[user].sort_values(ascending=False).index
    user_title_mat = for_users_data
    
    rate = 0
    weight = 0
    k = 0
    i = 0

    while (k < N):
        similar_user = all_similar_users[i]
        r = user_title_mat.loc[similar_user, title]

        if not (pd.isnull(r)):
            w = similar_mat.loc[similar_user, user]
            rate += w*r
            weight += w
            k += 1

        i += 1

    return rate/weight

In [20]:
def get_similar_movies_from_users(user_id, N):
    all_rates = []
    keep_movies = []

    for movie in tqdm(user_title_df.columns):
        if (pd.isnull(user_title_df.loc[user_id, movie])):
            all_rates.append(guess_rate(user_id, movie, N))
            keep_movies.append(movie)

    return pd.Series(all_rates, keep_movies).sort_values(ascending=False)

In [21]:
def get_similar_movies_from_movies(user_id):
    similar_movies = pd.DataFrame()

    movies_rates = user_title_df.loc[user_id].dropna()
    movies = movies_rates.index
    rates = movies_rates.values
    user = [(movies[i], rates[i]) for i in range(len(rates))]

    for movie, rate in tqdm(user):
        movie_list = (similarity_movies.loc[:, movie]*(rate-2.5)).sort_values(ascending=False)
        similar_movies = similar_movies.append(movie_list)

    
    similar_movies = similar_movies.T.drop([movie for movie, rate in user], axis=0)
    return similar_movies.sum(axis=1).sort_values(ascending=False)

In [22]:
def recommanded_movies(user_id):
    get_users = get_similar_movies_from_users(user_id, 15)
    get_movies = get_similar_movies_from_movies(user_id)

    arg_movies = []
    for title in get_users.index:
        arg_movies.append(int(np.argwhere(get_movies.index == title)))

    recommanded = pd.DataFrame(index=get_users.index, columns=["Arg Users", "Arg Movies"], data=np.array([range(len(arg_movies)), arg_movies]).T)
    recommanded["Mean"] = recommanded.mean(axis=1)

    return recommanded.sort_values(by="Mean")

In [23]:
recommanded_movies(12)

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/252 [00:00<?, ?it/s]

Unnamed: 0,Arg Users,Arg Movies,Mean
"Graduate, The (1967)",9,0,4.5
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981),1,16,8.5
Quiz Show (1994),20,1,10.5
One Flew Over the Cuckoo's Nest (1975),8,18,13.0
Close Encounters of the Third Kind (1977),31,2,16.5
...,...,...,...
Edge of Tomorrow (2014),232,244,238.0
Taken (2008),246,233,239.5
Shutter Island (2010),245,239,242.0
Pirates of the Caribbean: At World's End (2007),243,243,243.0
