In [1]:
%reload_ext autoreload
%autoreload 2

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tabulate import tabulate

## Entraînement du modèle

In [None]:
# Import du jeu de données sur les films
df = pd.read_csv('../data/movies.csv')

# Affichage des 10 premières lignes
df.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [None]:
df['title'].iloc[0]

'Toy Story (1995)'

In [None]:
df['genres'].iloc[0]

'Adventure|Animation|Children|Comedy|Fantasy'

In [None]:
print(df['genres'].iloc[0].split('|'))

['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']


In [None]:
list_genres = df['genres'].iloc[0].split('|')

In [None]:
genres = ' '.join(list_genres)
print(genres)

Adventure Animation Children Comedy Fantasy


In [None]:
df['Genres'] = df['genres'].apply(lambda x: ' '.join(x.split('|')))
df.head()

Unnamed: 0,movieId,title,genres,Genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy,Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance,Comedy Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy,Comedy


In [None]:
example = df["title"].iloc[0]
title = example[:-7]
print(title)

Toy Story


In [None]:
df['Title'] = df["title"].apply(lambda x:x[:-6])
df['Title'].head()

0                      Toy Story 
1                        Jumanji 
2               Grumpier Old Men 
3              Waiting to Exhale 
4    Father of the Bride Part II 
Name: Title, dtype: object

In [None]:
df['Description'] = df['Title'] + df['Genres']
df['Description'].head()

0    Toy Story Adventure Animation Children Comedy ...
1                   Jumanji Adventure Children Fantasy
2                      Grumpier Old Men Comedy Romance
3               Waiting to Exhale Comedy Drama Romance
4                   Father of the Bride Part II Comedy
Name: Description, dtype: object

In [None]:
# Créer un TfidfVectorizer et supprimer les mots vides
tfidf = TfidfVectorizer(stop_words='english')

# Adapter et transformer les données en une matrice tfidf
matrice_tfidf = tfidf.fit_transform(df['Description'])

# Afficher la forme de la matrice tfidf
print(matrice_tfidf.shape)

(27278, 21639)


In [None]:
# On calcule la similarité cosinus
sim_cosinus = cosine_similarity(matrice_tfidf, matrice_tfidf)

## Recommandation à partir d'un film donné

In [None]:
# Créer une série d'indices en utilisant la colonne 'title' comme index
indices = pd.Series(range(0,len(df)), index=df['title'])

In [None]:
indices 

title
Toy Story (1995)                          0
Jumanji (1995)                            1
Grumpier Old Men (1995)                   2
Waiting to Exhale (1995)                  3
Father of the Bride Part II (1995)        4
                                      ...  
Kein Bund für's Leben (2007)          27273
Feuer, Eis & Dosenbier (2002)         27274
The Pirates (2014)                    27275
Rentun Ruusu (2001)                   27276
Innocence (2014)                      27277
Length: 27278, dtype: int64

In [None]:
def recommandations(titre, mat_sim, num_recommendations = 10):
    # On récupère l'indice associé au titre qui servira à identifier le livre dans la matrice de similarité
    idx = indices[titre]

    # On obtient les scores de similarité de tous les livres avec le livre donée et on les garde les tuples d'indice du livre et score dans une liste
    scores_similarite = list(enumerate(mat_sim[idx]))

    # On trie les livres en fonction des scores de similarité
    scores_similarite = sorted(scores_similarite, key=lambda x: x[1], reverse=True)

    # Obtenir les scores des 10 livres les plus similaires
    top_similair = scores_similarite[1:num_recommendations+1]

    # Obtenir les indices des livres
    res = [(indices.index[idx], score) for idx, score in top_similair]

    # Renvoyer les titres des livres les plus similaires
    return tabulate(res, headers=["Titre", "Score de similarité"], tablefmt="pretty")

In [None]:
print("\n Recommandations pour 'Toy Story (1995)' similarité cosinus: \n",recommandations('Toy Story (1995)', sim_cosinus))


 Recommandations pour 'Toy Story (1995)' similarité cosinus: 
 +-------------------------------------------+---------------------+
|                   Titre                   | Score de similarité |
+-------------------------------------------+---------------------+
|            Toy Story 2 (1999)             | 1.0000000000000002  |
|            Toy Story 3 (2010)             | 0.9113859172333056  |
|        Toy Story of Terror (2013)         | 0.7799092699536356  |
|              Toy, The (1982)              | 0.6685923268757011  |
|     Toy Story That Time Forgot (2014)     | 0.6499940292482791  |
|     Toy Story Toons: Small Fry (2011)     | 0.6339633655169195  |
| Toy Story Toons: Hawaiian Vacation (2011) | 0.6299998199532924  |
|              Tin Toy (1988)               | 0.6067693062477115  |
|         Christmas Toy, The (1986)         | 0.5547788491819647  |
|   We're Back! A Dinosaur's Story (1993)   | 0.5351749833344537  |
+-------------------------------------------+-------

## Essai de recommandation pour un user donné à partir du film qu'il a regardé en dernier et de son film préféré

In [None]:
# Import du jeu de données de rating par les users
df = pd.read_csv('../data/ratings.csv')
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [None]:
user_id = 1

In [None]:
df_user = df[df["userId"] == user_id]
df_user.shape

(175, 4)

In [None]:
df_last_movie = df_user[df_user["timestamp"] == df_user["timestamp"].max()]
df_favorite_movie = df_user[df_user["rating"] == df_user["rating"].max()].iloc[0]

In [None]:
df_last_movie

Unnamed: 0,userId,movieId,rating,timestamp
62,1,1750,3.5,1112486201


In [None]:
last_movie = indices.index[df_last_movie['movieId'].iloc[0]]
last_movie

'Lost in Space (1998)'

In [None]:
df_favorite_movie

userId       1.000000e+00
movieId      4.993000e+03
rating       5.000000e+00
timestamp    1.112485e+09
Name: 131, dtype: float64

In [None]:
favorite_movie = indices.index[int(df_favorite_movie['movieId'])]
favorite_movie

'Honky Tonk Freeway (1981)'

#### A partir du film regardé en dernier

In [None]:
print("\n Recommandations pour {} similarité cosinus: \n".format(last_movie),recommandations(last_movie, sim_cosinus))


 Recommandations pour Lost in Space (1998) similarité cosinus: 
 +-----------------------------+---------------------+
|            Titre            | Score de similarité |
+-----------------------------+---------------------+
|     All Is Lost (2013)      |  0.656127983156011  |
|   Land of the Lost (2009)   | 0.6394915857680674  |
|   Lost World, The (1925)    | 0.6333715363502492  |
|    Space Cowboys (2000)     | 0.6092701678548771  |
| First Man Into Space (1959) | 0.6055234948925605  |
|  Attack from Space (1965)   | 0.6019934047155743  |
|    Space Raiders (1983)     |  0.591531886876905  |
|        Space (1985)         | 0.5793810513098876  |
| Invaders from Space (1965)  | 0.5624220066315129  |
|  Space Is The Place (1974)  | 0.5541816904744011  |
+-----------------------------+---------------------+


#### A partir du film préféré

In [None]:
print("\n Recommandations pour {} similarité cosinus: \n".format(favorite_movie),recommandations(favorite_movie, sim_cosinus))


 Recommandations pour Honky Tonk Freeway (1981) similarité cosinus: 
 +-----------------------------------------------+---------------------+
|                     Titre                     | Score de similarité |
+-----------------------------------------------+---------------------+
|                Freeway (1988)                 | 0.5156516291820995  |
|                Freeway (1996)                 | 0.4991891124414969  |
| Freeway II: Confessions of a Trickbaby (1999) | 0.2988404653926198  |
|                B*A*P*S (1997)                 | 0.12222530172715305 |
|                In & Out (1997)                | 0.12222530172715305 |
|                H.O.T.S. (1979)                | 0.12222530172715305 |
|                  Made (2001)                  | 0.12222530172715305 |
|                 S.O.B. (1981)                 | 0.12222530172715305 |
|              Another You (1991)               | 0.12222530172715305 |
|            One, Two, Three (1961)             | 0.1222253017271