<h1 style="color:red">Movielens  - Recommandation system</h1>

***
1. [Chargement des fichiers csv](#fich_csv) <br>
2. [Dataframe tags](#dataframe_tags) <br>
3. [Dataframe genres](#dataframe_genres) <br>
4. [Dataframe movies](#dataframe_movies) <br>
5. [Content based filtering](#sklearn_neighbors) <br>
    - [Random movies selection](#random_rec) <br>
    - [User movies selection](#user_rec) <br>
6. [Collaborative filtering](#collaborative_filtering) <br>
    - [Matrice des utilisateurs](#matrice_utilisateurs) <br>
    - [Matrice des films](#matrice_films) <br>
***

<h2 style="color:green"><a id="fich_csv">Chargement des fichiers csv<a></h2>

In [1]:
import pandas as pd
import numpy as np

In [2]:
%%time
mvl_movies = pd.read_csv('datasets\ml-20m\movies.csv',
                         index_col='movieId',
                         dtype={'title':'string',
                                'genres':'string'})

mvl_ratings = pd.read_csv(r'datasets\ml-20m\ratings.csv',
                          dtype={'userId':'int32',
                                 'movieId':'int32',
                                 'rating':'float32',
                                 'timestamp':'int64'})

mvl_tags = pd.read_csv(r'datasets\ml-20m\tags.csv',
                       dtype={'userId':'int32',
                              'movieId':'int32',
                              'tag':'string',
                              'timestamp':'int64'})

mvl_genome_tags = pd.read_csv('datasets\ml-20m\genome-tags.csv',
                              dtype={'tagId':'int32',
                                     'tag':'string'})

mvl_genome_scores = pd.read_csv('datasets\ml-20m\genome-scores.csv', 
                                dtype={'movieId':'int32',
                                       'tagId':'int32',
                                       'relevance':'float16'})

mvl_links = pd.read_csv('datasets\ml-20m\links.csv',
                        dtype={'imdbId':'int32',
                               'movieId':'int32',
                               'tmdbId':'string'})

Wall time: 11 s


In [3]:
mvl_movies.shape

(27278, 2)

In [4]:
%%time
imdb_movies = pd.read_csv(r'datasets\imdb\title.basics.tsv.gz',
                          compression='gzip',
                          sep='\t',
                          dtype={'tconst':'string',
                                 'titleType':'string',
                                 'primaryTitle':'string',
                                 'originalTitle':'string',
                                 'isAdult':'string',
                                 'startYear':'object',
                                 'endYear':'object',
                                 'runtimeMinutes':'string',
                                 'genres':'string'},
                          na_values="\\N")
imdb_movies['imdbId'] = imdb_movies['tconst'].str[2:].astype('int32')

Wall time: 30.2 s


In [5]:
imdb_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8268702 entries, 0 to 8268701
Data columns (total 10 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          string
 1   titleType       string
 2   primaryTitle    string
 3   originalTitle   string
 4   isAdult         string
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  string
 8   genres          string
 9   imdbId          int32 
dtypes: int32(1), object(2), string(7)
memory usage: 599.3+ MB


<h2 style="color:green"><a id="dataframe_tags">Création du dataframe tags</a></h2>
<p>Objectif: obtenir un dataframe de la forme</p>
<table align="left">
  <tr>
    <th>movieId</th>
    <th>tag1</th>
    <th>tag2</th>
    <th>...</th>
  </tr>
  <tr>
    <td>1</td>
    <td>score1</td>
    <td>score2</td>
    <th>...</th>
  </tr>
  <tr>
    <td>2 </td>
    <td>score1</td>
    <td>score2</td>
    <th>...</th>
  </tr>
  <tr>
    <td>3 </td>
    <td>score1</td>
    <td>score2</td>
    <th>...</th>
  </tr>
</table>

In [6]:
mvl_genome_scores.loc[:,'relevance'] = pd.cut(mvl_genome_scores['relevance'], bins=10, labels=np.arange(10))
mvl_genome_scores.loc[:,'relevance'] = mvl_genome_scores['relevance'].astype('uint8')
mvl_genome_scores.head()

Unnamed: 0,movieId,tagId,relevance
0,1,1,0
1,1,2,0
2,1,3,0
3,1,4,0
4,1,5,1


In [7]:
mvl_genome_scores['relevance'].value_counts()

0    7760502
1    1942627
2     848245
3     439959
4     258863
5     163853
6     112435
7      80397
8      56123
9      46764
Name: relevance, dtype: int64

In [8]:
tags = mvl_genome_scores.merge(mvl_genome_tags, on='tagId').drop('tagId', axis=1)
tags = tags[['movieId', 'tag', 'relevance']]
tags = pd.pivot_table(tags, index='movieId', columns='tag', values='relevance')
tags.head()

tag,007,007 (series),18th century,1920s,1930s,1950s,1960s,1970s,1980s,19th century,...,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,1,2,0,2,2,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [9]:
tags.shape

(10381, 1128)

In [10]:
#for i, name in enumerate(tags.columns):
#    print(i, name)

In [11]:
from textdistance import levenshtein
from sklearn.neighbors import DistanceMetric 
d = DistanceMetric.get_metric('euclidean')

tags_cols = tags.columns.to_list()[14:] #On ignore les tags avec des chiffres (14 premiers tags)

prob_list = ['australia','australian',
             'god', 'good',
             'german', 'germany',
             'india', 'indie',
             'iran', 'iraq',
             'race', 'rape',
             'remade', 'remake',
             'russia', 'russian',
             'world war i', 'world war ii']

tags_cols = [elt for elt in tags_cols if elt not in prob_list] #On ignore les tags dont la synthaxe est trop similaire et avec une signification trop différente
nb_tags = len(tags_cols)
remove_list = list()

for i in range(nb_tags):
    for j in range(i+1, nb_tags):
        tag = tags_cols[i]
        other_tag = tags_cols[j]
        if tag[0] == other_tag[0]:
            if levenshtein.distance(tag, other_tag) == 1:
                #dist = d.pairwise(tags[tag],  tags[other_tag])
                print (tag, other_tag)
                #print(dist)
                remove_list.append(other_tag)
        else:
            break
                   
remove_list.append('witches')            

alien aliens
assassin assassins
blood bloody
book books
boring boring!
boring boxing
camp campy
comic comics
cute cute!
distopia dystopia
dog dogs
dragon dragons
fairy tale fairy tales
father son relationship father-son relationship
gangster gangsters
geek geeks
gore gory
heroin heroine
hilarious hillarious
lawyer lawyers
money monkey
monster monsters
nazi nazis
non-linear nonlinear
nostalgia nostalgic
paranoia paranoid
post apocalyptic post-apocalyptic
robot robots
sci fi sci-fi
sci fi scifi
sci-fi scifi
sequel sequels
sex sexy
stop motion stop-motion
super hero super-hero
super hero superhero
super-hero superhero
teen teens
teenager teenagers
train trains
vampire vampires
video game video games
video game videogame
visual visuals
zombie zombies


In [12]:
len(remove_list)

46

In [13]:
tags = tags.drop(remove_list, axis=1)

In [14]:
tags.shape

(10381, 1084)

<h2 style="color:green"><a id="dataframe_genres">Création du DataFrame genres</a></h2>

In [15]:
genres = mvl_movies['genres'].str.split('|').explode()
ind_genres = pd.get_dummies(genres)
genres = pd.concat([genres, ind_genres], axis=1)
genres = genres.drop('genres', axis=1)
genres = genres.groupby('movieId').sum()
genres = genres.reset_index()
genres.head()

Unnamed: 0,movieId,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,5,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
genres.shape

(27278, 21)

<h2 style="color:green"><a id="dataframe_movies">Création du DataFrame movies</a></h2>

In [17]:
movies = mvl_movies.merge(mvl_links[['movieId', 'imdbId']], on='movieId')
movies= movies.merge(imdb_movies[['imdbId', 'primaryTitle', 'startYear']], on='imdbId')
movies = movies[['movieId', 'primaryTitle', 'startYear']]
movies = movies.merge(genres, on='movieId')
movies = movies.merge(tags, on='movieId')
movies = movies.rename(columns={'primaryTitle':'title', 'startYear':'year'})
movies.head()

Unnamed: 0,movieId,title,year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,workplace,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie
0,1,Toy Story,1995,0,0,1,1,1,1,0,...,1,0,0,0,0,1,0,0,0,0
1,2,Jumanji,1995,0,0,1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
2,3,Grumpier Old Men,1995,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,1995,0,0,0,0,0,1,0,...,2,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II,1995,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,1


In [18]:
movies.set_index('movieId')

Unnamed: 0_level_0,title,year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,workplace,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story,1995,0,0,1,1,1,1,0,0,...,1,0,0,0,0,1,0,0,0,0
2,Jumanji,1995,0,0,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,Grumpier Old Men,1995,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,Waiting to Exhale,1995,0,0,0,0,0,1,0,0,...,2,0,0,0,0,1,0,0,0,0
5,Father of the Bride Part II,1995,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130578,The Gunman,2015,0,1,0,0,0,0,0,0,...,1,0,0,0,0,2,0,0,0,0
130840,Spring,2014,0,0,0,0,0,0,0,0,...,0,0,0,0,2,4,1,0,0,2
131013,Get Hard,2015,0,0,0,0,0,1,0,0,...,2,0,0,0,0,1,1,0,0,0
131168,Phoenix,2014,0,0,0,0,0,0,0,0,...,0,2,1,8,0,1,0,0,5,0


In [19]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10366 entries, 0 to 10365
Columns: 1107 entries, movieId to zombie
dtypes: int64(1), object(1), string(1), uint8(1104)
memory usage: 11.2+ MB


In [20]:
movies.tail()

Unnamed: 0,movieId,title,year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,workplace,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie
10361,130578,The Gunman,2015,0,1,0,0,0,0,0,...,1,0,0,0,0,2,0,0,0,0
10362,130840,Spring,2014,0,0,0,0,0,0,0,...,0,0,0,0,2,4,1,0,0,2
10363,131013,Get Hard,2015,0,0,0,0,0,1,0,...,2,0,0,0,0,1,1,0,0,0
10364,131168,Phoenix,2014,0,0,0,0,0,0,0,...,0,2,1,8,0,1,0,0,5,0
10365,131170,Parallels,2015,0,0,0,0,0,0,0,...,0,0,0,0,0,6,5,0,0,1


In [21]:
movies.to_csv('mvl_movies.csv')

## Création du DataFrame ratings

In [22]:

ratings = mvl_ratings.drop('timestamp', axis=1)
ratings = ratings[ratings['movieId'].isin(movies['movieId'].unique())]
ratings.head()


Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [23]:
ratings.to_csv('mvl_ratings.csv')

## Création du DataFrame reviews (travaux en cours/non utilisé)

In [24]:
#reviews = ratings.merge(movies, on='movieId')

In [25]:
#reviews.head(5)

In [26]:
#reviews.info()

<h2 style="color:green"><a id="sklearn_neighbors">Content based filtering with sklearn.neighbors</a></h2>

In [27]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

In [28]:
def content_based_rec(user_data_raw, n_rec=10, scaler=StandardScaler(), metric='euclidean'):
    user_data = user_data_raw.drop(['movieId', 'title', 'year'], axis=1)
    data = movies.drop(['movieId', 'title', 'year'], axis=1)
    
    scaler = StandardScaler()
    data = scaler.fit_transform(data)
    user_data = scaler.transform(user_data)
    
    n_neighbors = NearestNeighbors(n_neighbors=n_rec+1, algorithm='auto', metric=metric)
    n_neighbors.fit(data)
    
    dist, results = n_neighbors.kneighbors(user_data, n_rec+1, return_distance=True)
    rec = [movies.iloc[results[i][1:],:] for i in range(len(results))]

    return user_data_raw, rec, dist

<h3 style="color:blue"><a id="random_rec"> Recommandation à partir d'une liste de films choisis au hasard</a></h3>
Création de la fonction select_random_movies() pour selectionner 1 ou plusieurs films au hasard

In [29]:
def select_random_movies(n=1):
    rand_movies = list()
    rand_ids = np.random.choice(movies['movieId'].unique(), size=n, replace=False)
    rand_movies_data = movies[movies['movieId'].isin(rand_ids)]
    return rand_movies_data

In [30]:
def random_content_based_recommandation(n=1, n_rec=10, scaler=StandardScaler(), metric='euclidean'):
    user_data_raw = select_random_movies(n=n)
    return content_based_rec(user_data_raw, n_rec=n_rec, scaler=scaler, metric=metric)

In [31]:
user, rec, dist = random_content_based_recommandation(n=1, n_rec=10)

In [32]:
user

Unnamed: 0,movieId,title,year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,workplace,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie
5579,6584,"What's Up, Tiger Lily?",1966,0,0,1,0,0,1,1,...,1,0,0,0,0,3,1,0,0,1


In [33]:
rec[0]

Unnamed: 0,movieId,title,year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,workplace,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie
6550,8512,Silent Movie,1976,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4110,4649,Wet Hot American Summer,2001,0,0,0,0,0,1,0,...,0,0,0,0,0,3,0,0,0,1
3600,4079,Amazon Women on the Moon,1987,0,0,0,0,0,1,0,...,0,0,0,0,0,2,0,0,0,1
1846,2111,The Man with Two Brains,1983,0,0,0,0,0,1,0,...,1,0,0,0,0,2,0,0,0,0
1005,1125,The Return of the Pink Panther,1975,0,0,0,0,0,1,1,...,0,0,0,0,0,1,0,0,0,0
4137,4678,UHF,1989,0,0,0,0,0,1,0,...,1,0,0,0,0,1,0,0,0,0
8483,61394,The Onion Movie,2008,0,0,0,0,0,1,0,...,2,1,0,0,0,3,1,0,0,1
10122,112450,They Came Together,2014,0,0,0,0,0,1,0,...,0,0,0,0,0,4,2,0,0,1
2080,2372,Fletch Lives,1989,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
495,520,Robin Hood: Men in Tights,1993,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0


<h3 style="color:blue"><a id="user_rec"> Recommandation à partir d'une liste de films déterminés</a></h3>
Création de la fonction select_movies() pour selectionner 1 ou plusieurs films en fonction de leur Id

In [34]:
def select_movies(movies_id=[1198]):
    return movies[movies['movieId'].isin(movies_id)]

In [35]:
def user_content_based_recommandation(movies_id=[1198], n_rec=10, scaler=StandardScaler(), metric='euclidean'):
    user_data_raw = select_movies(movies_id)
    return content_based_rec(user_data_raw, n_rec=n_rec, scaler=scaler, metric=metric)
    

In [36]:
def find_movie_id(title):
    return movies[movies['title'].str.contains(" ".join(title.split()), case=False)]
    
find_movie_id(str(input()))

 hello


Unnamed: 0,movieId,title,year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,workplace,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie
25,26,Othello,1995,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
2518,2848,Othello,1951,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5679,6732,"Hello, Dolly!",1969,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
6510,8382,Hello Again,1987,0,0,0,0,0,1,0,...,1,0,0,0,0,2,0,0,0,2
10257,117511,Hello Ladies: The Movie,2014,0,0,0,0,0,1,0,...,1,0,0,0,0,5,2,0,0,1


In [37]:
user, rec, dist = user_content_based_recommandation(movies_id=[int(input())], n_rec=20)

 26


In [38]:
user

Unnamed: 0,movieId,title,year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,workplace,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie
25,26,Othello,1995,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0


In [47]:
def movie_tags_info(movie, level=3):
    print(f"=== {movie['title'].item()} ===")
    tags = movie.iloc[:,3:].columns
    selected_tags = list()
    for tag in tags:
        relevance = movie[tag].item()
        if relevance>level:
            selected_tags.append((tag, relevance))
    return sorted(selected_tags, key=lambda tag:tag[1], reverse=True)

movie_tags_info(user)

=== Othello ===


[('based on a play', 9),
 ('shakespeare', 9),
 ('adaptation', 8),
 ('betrayal', 8),
 ('good', 7),
 ('great acting', 7),
 ('great ending', 7),
 ('literature', 7),
 ('original', 7),
 ('tragedy', 7),
 ('biographical', 6),
 ('corruption', 6),
 ('dramatic', 6),
 ('great movie', 6),
 ('interesting', 6),
 ('oscar (best actress)', 6),
 ('adapted from:book', 5),
 ('british', 5),
 ('complex characters', 5),
 ('dialogue', 5),
 ('excellent', 5),
 ('good acting', 5),
 ('great', 5),
 ('greed', 5),
 ('mentor', 5),
 ('oscar (best supporting actress)', 5),
 ('passionate', 5),
 ('story', 5),
 ('very good', 5),
 ('very interesting', 5),
 ('witty', 5),
 ('classic', 4),
 ('clever', 4),
 ('complex', 4),
 ('costume drama', 4),
 ('drama', 4),
 ('england', 4),
 ('good soundtrack', 4),
 ('manipulation', 4),
 ('oscar (best directing)', 4),
 ('revenge', 4),
 ('runaway', 4),
 ('secrets', 4),
 ('segregation', 4),
 ('talky', 4),
 ('vengeance', 4)]

In [40]:
dist = dist[0,1:]

In [41]:
dist = dist/np.linalg.norm(dist)

In [42]:
rec[0].loc[:,'dist'] = dist
rec[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


Unnamed: 0,movieId,title,year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,dist
3278,3723,Hamlet,1990,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0.172455
2684,3028,The Taming of The Shrew,1967,0,0,0,0,0,1,0,...,1,0,0,0,1,0,0,0,0,0.178389
941,1050,Looking for Richard,1996,0,0,0,0,0,0,0,...,2,0,0,0,1,0,0,0,0,0.18768
472,497,Much Ado About Nothing,1993,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0.191972
3165,3598,Hamlet,2000,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,1,0.194603
1258,1411,Hamlet,1996,0,0,0,0,0,0,1,...,1,0,0,0,1,0,0,0,0,0.199971
2308,2622,A Midsummer Night's Dream,1999,0,0,0,0,0,1,0,...,0,0,0,0,2,0,0,0,0,0.210491
949,1059,Romeo + Juliet,1996,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0.218444
4198,4745,O,2001,0,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0.229948
3224,3668,Romeo and Juliet,1968,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0.234169


<h2 style="color:green"><a id="collaborative_filtering">Collaborative filtering</a></h2>

<h3 style="color:blue"><a id="matrice_utilisateurs">Création de la matrice des utilisateurs</a></h3>

In [43]:
user_info = ratings.groupby('userId').agg(nbReviews=('movieId', 'count'),
                                          averageRating=('rating', 'mean')).sort_values(by='nbReviews', ascending=False)

In [44]:
user_genre_info = ratings.merge(genres, on='movieId').drop(['movieId', 'rating'], axis=1).groupby('userId').sum()
user_genre_info['favoriteGenre'] = user_genre_info.apply(lambda x: genres.iloc[:,1:].columns[x.argmax()], axis=1)

KeyboardInterrupt: 

In [None]:
user_info = pd.concat([user_info, user_genre_info['favoriteGenre']], axis=1)

In [None]:
user_info.to_csv('mvl_user_info.csv')

In [None]:
user_list = user_info.sort_values(by='nbReviews', ascending=False).head(1000)
user_list

In [None]:
users = ratings[ratings['userId'].isin(user_list.index)]
users = users.merge(movies[['movieId', 'title']], on='movieId')

In [None]:
users

In [None]:
users.info()

In [None]:
users_movies = users.pivot(index='userId', columns=['movieId', 'title'], values='rating')

In [None]:
not_viewed = users_movies.isna()
users_movies = users_movies.fillna(users_movies.mean().round(3))
#changer les 0 pour les NaN
users_movies

In [None]:
users_movies.to_csv('mvl_top_1000_users_movies.csv')

In [None]:
not_viewed.to_csv('mvl_top_1000_users_movies_not_seen.csv')

In [None]:
not_viewed

<h3 style="color:blue"><a id="recommandation_users">Recommandation à partir d'un utilisateur au hasard</a></h3>

In [None]:
def select_random_user():
    return np.random.choice(users_movies.index)


def collab_rec(user_id, n_rec=10, metric='euclidean'):
    user = users_movies[users_movies.index == user_id]
    n_neighbors = NearestNeighbors(n_neighbors=n_rec+1, algorithm='auto', metric=metric)
    n_neighbors.fit(users_movies)
    
    dist, results = n_neighbors.kneighbors(user, n_rec+1, return_distance=True)
    return user, dist, results

In [None]:
def random_collab_rec(n_rec=10, metric='euclidean'):
    return collab_rec(select_random_user(), n_rec, metric)

In [None]:
user, dist, rec = random_collab_rec()

In [None]:
user

In [None]:
not_viewed.loc[user.index,:]

In [None]:
print('=== Favoris === \n', user.squeeze(axis=0).sort_values(ascending=False).head(50))

In [None]:
prop = list()
for i in range(user.shape[1]):
    if (not_viewed.loc[user.index,:].iloc[0,i]) == True:
        prop.append(not_viewed.columns[i][0])

In [None]:
similar_users = users_movies.iloc[rec[0]]
similar_users

In [None]:
rec_final = similar_users.mean().sort_values(ascending=False)
rec_final = rec_final.to_frame()
rec_final = rec_final.reset_index()

In [None]:
rec_final = rec_final[rec_final['movieId'].isin(prop)]
print(f"=== Films recommandés ===\n{rec_final.head(10)}")

<h3 style="color:blue"><a id="matrice_films">Création de la matrice des films</a></h3>

In [None]:
movies_list = ratings.groupby('movieId').agg(nbReviews=('userId', 'count')).sort_values(by='nbReviews', ascending=False).head(1000)
movies_list

In [None]:
most_rated_movies = ratings[ratings['movieId'].isin(movies_list.index)]
most_rated_movies

In [None]:
movies_users = pd.pivot_table(data=most_rated_movies, index=['movieId'], columns=['userId'], values=['rating'])

In [None]:
movies_users = movies_users.fillna(0)
movies_users

In [None]:
movies_users.to_csv('mvl_movies_users.csv')

In [None]:
#essayer reco `a partir d un film plus facile `a verifier