https://www.datacamp.com/community/tutorials/recommender-systems-python

https://blog.christianperone.com/2013/09/machine-learning-cosine-similarity-for-vector-space-models-part-iii/

https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a

    

https://github.com/dtrckd/simplon_datai_2020/blob/master/brief_7/brief.md


In [33]:
# import module time

from time import time

In [34]:
# Import Pandas
import pandas as pd

#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer


# Load Movies Metadata
data = pd.read_csv('movies_metadata.csv', low_memory=False)

# Print the first three rows
data.head(3)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [4]:
data.shape

(45466, 24)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

Weighted Randing WR

WR =( v / v + m) * R + (m / v + m) * C

WR = (v * R + R * C) / v + m

WR = R * (v + C ) / v + m

v = vote_count
R = vote_average
m : is a hyperparameter ,chosse  as the 90th percentile . it must have more votes than at least 90% of the movies on the list.

As a first step, let's calculate the value of C, the mean rating across all movies using the pandas .mean() function:


In [35]:
# Calculate mean of vote average column
C = round(data['vote_average'].mean(),3)
print(C)


# you can observe that the average rating of a movie 
#on IMDB is around 5.6 on a scale of 10.

5.618


calculate the number of votes, m, received by a movie in the 90th percentile

In [36]:
# Calculate the minimum number of votes required to be in the chart, m
m = data['vote_count'].quantile(0.90)
print(m)


160.0


In [37]:
# Filter out all qualified movies into a new DataFrame
q_movies = data.copy().loc[data['vote_count'] >= m]
q_movies.shape


(4555, 24)

In [39]:
q_movies.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [40]:
# Function that computes the weighted rating of each movie
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

In [41]:
# Define a new feature 'score' and calculate its value with `weighted_rating()`
q_movies['score'] = q_movies.apply(weighted_rating, axis=1)


In [42]:
#Sort movies based on score calculated above
q_movies = q_movies.sort_values('score', ascending=False)

#Print the top 15 movies
q_movies[['title', 'vote_count', 'vote_average', 'score']].head(10)


Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.445865
834,The Godfather,6024.0,8.5,8.425433
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421413
12481,The Dark Knight,12269.0,8.3,8.265474
2843,Fight Club,9678.0,8.3,8.256381
292,Pulp Fiction,8670.0,8.3,8.251402
522,Schindler's List,4436.0,8.3,8.206632
23673,Whiplash,4376.0,8.3,8.205397
5481,Spirited Away,3968.0,8.3,8.196047
2211,Life Is Beautiful,3643.0,8.3,8.187163


In [43]:
#Print plot overviews of the first 5 movies.
data['overview'].head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [44]:
# import de NLK
import nltk

# stop words multi languages
#search for other languages used in the tfidf.get_feature_names()
#and make a corresponding stop_words
 
stop_words_eng = nltk.corpus.stopwords.words("english")
stop_words_fr = nltk.corpus.stopwords.words("french")


stop_words = stop_words_fr + stop_words_eng
#stop_words =[]

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words = stop_words_eng)

#Replace NaN with an empty string
data['overview'] = data['overview'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(data['overview'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape
# fr + eng = (45466, 75924)
# fr = 45466, 76062)

(45466, 75990)

In [15]:
#Array mapping from feature integer indices to feature name.
#tfidf.get_feature_names()[890:900]

In [45]:
len(tfidf.get_feature_names())

75990

### step2 : content based recommander

In [46]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
#cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

def cosine_sim (i):
    cosine_sim = linear_kernel(tfidf_matrix,tfidf_matrix[i])
    return cosine_sim



In [47]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(data.index, index=data['title']).drop_duplicates()


In [48]:


# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(linear_kernel(tfidf_matrix,tfidf_matrix[idx])))

    
    
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return data['title'].iloc[movie_indices]

In [49]:
from time import time
start = time()
l = get_recommendations('The Dark Knight Rises')
t = time() - start
l.shape[0]
print("Time:", round(t,3))
for i,j in enumerate(l):
    print(i+1,j)

Time: 0.301
1 The Dark Knight
2 Batman Forever
3 Batman Returns
4 Batman: Under the Red Hood
5 Batman
6 Batman Unmasked: The Psychology of the Dark Knight
7 Batman Beyond: Return of the Joker
8 Batman: Year One
9 Batman: The Dark Knight Returns, Part 1
10 Batman: Bad Blood


In [50]:
def get_recommendations(title, n):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(linear_kernel(tfidf_matrix,tfidf_matrix[idx])))

    
    
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:n+1]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return data['title'].iloc[movie_indices]

In [51]:
#get_recommendations(title, n)

from time import time
start = time()
l = get_recommendations('The Dark Knight Rises',15)
t = time() - start
l.shape[0]
print("Time:", round(t,3))
for i,j in enumerate(l):
    print(i+1,j)

Time: 0.297
1 The Dark Knight
2 Batman Forever
3 Batman Returns
4 Batman: Under the Red Hood
5 Batman
6 Batman Unmasked: The Psychology of the Dark Knight
7 Batman Beyond: Return of the Joker
8 Batman: Year One
9 Batman: The Dark Knight Returns, Part 1
10 Batman: Bad Blood
11 Batman: Mask of the Phantasm
12 LEGO DC Comics Super Heroes: Batman: Be-Leaguered
13 Batman: The Dark Knight Returns, Part 2
14 Batman Begins
15 Batman Unlimited: Animal Instincts


In [52]:
# Reduction de dimension
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=100, n_iter=7, random_state=42)
tfidf_matrix_reduit= svd.fit_transform(tfidf_matrix)

In [53]:
tfidf_matrix_reduit.shape

(45466, 100)

In [54]:
tfidf_matrix_reduit[2].reshape(-1,1).shape

(100, 1)

In [55]:

def get_recommendations_reduit(title, n , tfidf_matrix_reduit ):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(linear_kernel(tfidf_matrix_reduit,tfidf_matrix_reduit[idx].reshape(1,-1))))

    
    
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:n+1]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return data['title'].iloc[movie_indices]

In [56]:
#get_recommendations(title, n)

from time import time
start = time()
l = get_recommendations_reduit('GoldenEye',15,tfidf_matrix_reduit)
t1 = time() - start
l.shape[0]
print("Time:", round(t1,3))
for i,j in enumerate(l):
    print(i+1,j)

Time: 0.456
1 The Package
2 Phantasm
3 3 Ninjas Knuckle Up
4 A Lawless Street
5 Death Valley
6 Stone
7 GoldenEye
8 The Executioner
9 The Man from U.N.C.L.E.
10 The Loves of Hercules
11 The Offering
12 The Transformers: The Movie
13 The Secret Fury
14 R.O.D - Read or Die
15 Quarantine 2: Terminal


In [57]:
#get_recommendations(title, n)

from time import time
start = time()
l = get_recommendations_reduit('GoldenEye',15,tfidf_matrix_reduit)
t = time() - start
l.shape[0]
print("Time:", round(t,3))
for i,j in enumerate(l):
    print(i+1,j)

Time: 0.49
1 The Package
2 Phantasm
3 3 Ninjas Knuckle Up
4 A Lawless Street
5 Death Valley
6 Stone
7 GoldenEye
8 The Executioner
9 The Man from U.N.C.L.E.
10 The Loves of Hercules
11 The Offering
12 The Transformers: The Movie
13 The Secret Fury
14 R.O.D - Read or Die
15 Quarantine 2: Terminal


step3 : amélioration

coder les 2 améliorations :

    Introduce a popularity filter: this recommender would take the 30 most similar movies, calculate the weighted ratings (using the IMDB formula from above), sort movies based on this rating, and return the top 10 movies.
    
    Use the PCA to improve the speed of your similarity search with 100 components. Does the result are coherent.


Les résultats d'une PCA (TruncatedSVD) avec un n_components = 100 et même en changeant le nombre d'itération, ne convergent pas. Il faudrait utiliser un n_components = 1000, voir 2000 pourque les résults soient cohérents.
La PCA augmente le temps de traitement et n'améliore pas les résultats.

In [58]:
# SVD iterration à 15 , composants = 100, title =  GoldenEye
from sklearn.decomposition import TruncatedSVD
svd1 = TruncatedSVD(n_components=100, n_iter=15, random_state=42)
tfidf_matrix_reduit15= svd1.fit_transform(tfidf_matrix)

In [59]:
#get_recommendations(title, n)

from time import time
start = time()
l = get_recommendations_reduit('GoldenEye',15, tfidf_matrix_reduit15 )
t1 = time() - start
l.shape[0]
print("Time:", round(t1,3))
for i,j in enumerate(l):
    print(i+1,j)

Time: 0.465
1 Phantasm
2 The Package
3 Stone
4 A Lawless Street
5 Death Valley
6 Genius Within: The Inner Life of Glenn Gould
7 Tonight She Comes
8 3 Ninjas Knuckle Up
9 The Lost Missile
10 I Love Trouble
11 A Million Ways to Die in the West
12 Batman: Assault on Arkham
13 Death in Buenos Aires
14 Troll Hunter
15 F


In [60]:
# SVD iterration à 15 , composants = 100 , title = The Dark Knight Rises
from time import time
start = time()
l = get_recommendations_reduit('The Dark Knight Rises',15, tfidf_matrix_reduit15 )
t1 = time() - start
l.shape[0]
print("Time:", round(t1,3))
for i,j in enumerate(l):
    print(i+1,j)

Time: 0.46
1 NY77: The Coolest Year in Hell
2 Pixels
3 Hungry Hearts
4 Serpico
5 Max Payne
6 Multi-Facial
7 Rubble Kings
8 Percentage
9 Hobo with a Shotgun
10 The City
11 Dead Man Down
12 Bad Lieutenant
13 Amsterdamned
14 Alphabet City
15 Texas Killing Fields


In [61]:
# Reduction de dimension à 1000
from sklearn.decomposition import TruncatedSVD
svd1000 = TruncatedSVD(n_components=1000, n_iter=7, random_state=42)
tfidf_matrix_reduit1000= svd1000.fit_transform(tfidf_matrix)

In [62]:
from time import time
start = time()
l = get_recommendations_reduit('The Dark Knight Rises',15, tfidf_matrix_reduit1000 )
t1 = time() - start
l.shape[0]
print("Time:", round(t1,3))
for i,j in enumerate(l):
    print(i+1,j)

Time: 0.494
1 Q & A
2 Slow Burn
3 Criminal Law
4 The Payoff
5 Rage of Angels
6 Accused
7 JFK
8 The File on Thelma Jordon
9 Night Falls on Manhattan
10 Batman Forever
11 Ricochet
12 The People Against O'Hara
13 The Dark Knight
14 Jigsaw
15 Batman: Year One


In [63]:
# Reduction de dimension à 2000
from sklearn.decomposition import TruncatedSVD
svd2000 = TruncatedSVD(n_components=2000, n_iter=7, random_state=42)
tfidf_matrix_reduit2000= svd2000.fit_transform(tfidf_matrix)

In [64]:
from time import time
start = time()
l = get_recommendations_reduit('The Dark Knight Rises',15, tfidf_matrix_reduit2000 )
t1 = time() - start
l.shape[0]
print("Time:", round(t1,3))
for i,j in enumerate(l):
    print(i+1,j)

Time: 1.339
1 Batman Forever
2 The Dark Knight
3 Batman Returns
4 Batman: Under the Red Hood
5 Batman Unmasked: The Psychology of the Dark Knight
6 Batman
7 Batman: Year One
8 Batman: Bad Blood
9 Slow Burn
10 Batman: The Dark Knight Returns, Part 1
11 LEGO DC Comics Super Heroes: Batman: Be-Leaguered
12 Q & A
13 Batman: Mask of the Phantasm
14 Criminal Law
15 Batman Beyond: Return of the Joker


step3 : Amélioration

coder les 2 améliorations :

   Introduce a popularity filter: this recommender would take the 30 most similar movies, calculate the weighted ratings (using the IMDB formula from above), sort movies based on this rating, and return the top 10 movies.


In [65]:
data['score'] = data.apply(weighted_rating, axis=1)

In [66]:
def get_recommendations_f(title, n):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(linear_kernel(tfidf_matrix,tfidf_matrix[idx])))

    
    
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:n+1]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    result = data['title'].iloc[movie_indices]
    
    return result, movie_indices

In [67]:
result, movie_indices = get_recommendations_f('The Dark Knight Rises', 30)
data4 = data.iloc[movie_indices]

#Sort movies based on score calculated above
data4 = data4.sort_values('score', ascending=False)

#Print the top 10 movies
data4[['title', 'score']].head(10)


Unnamed: 0,title,score
12481,The Dark Knight,8.265474
10122,Batman Begins,7.460746
20232,"Batman: The Dark Knight Returns, Part 2",7.276928
19792,"Batman: The Dark Knight Returns, Part 1",7.115579
15511,Batman: Under the Red Hood,7.08769
41976,The Lego Batman Movie,7.044997
585,Batman,6.904069
3095,Batman: Mask of the Phantasm,6.645714
9230,Batman Beyond: Return of the Joker,6.534872
18035,Batman: Year One,6.528627


In [68]:
result, movie_indices = get_recommendations_f('The Dark Knight Rises', 30)
data4 = data.iloc[movie_indices]
data4[['title', 'score']].head(10)

Unnamed: 0,title,score
12481,The Dark Knight,8.265474
150,Batman Forever,5.239597
1328,Batman Returns,6.515798
15511,Batman: Under the Red Hood,7.08769
585,Batman,6.904069
21194,Batman Unmasked: The Psychology of the Dark Kn...,5.84678
9230,Batman Beyond: Return of the Joker,6.534872
18035,Batman: Year One,6.528627
19792,"Batman: The Dark Knight Returns, Part 1",7.115579
35983,Batman: Bad Blood,6.25341
