In [1]:
# Import Pandas
import pandas as pd
from time import time

# Load Movies Metadata
metadata = pd.read_csv('movies_metadata.csv', low_memory=False)

# Print the first three rows
metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [2]:
# Calculate mean of vote average column
C = metadata['vote_average'].mean()
print(C)

5.618207215134185


In [3]:
# Calculate the minimum number of votes required to be in the chart, m
m = metadata['vote_count'].quantile(0.90)       #90% des films on recus entre + de 160 votes
print(m)

160.0


In [4]:
# Filter out all qualified movies into a new DataFrame
q_movies = metadata.copy().loc[metadata['vote_count'] >= m]

print(metadata.shape)
print(q_movies.shape)

(45466, 24)
(4555, 24)


In [5]:
metadata.shape

(45466, 24)

In [6]:
# Function that computes the weighted rating of each movie
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

In [7]:
# Define a new feature 'score' and calculate its value with `weighted_rating()`
q_movies['score'] = q_movies.apply(weighted_rating, axis=1)                                     #axis?

In [8]:
#Sort movies based on score calculated above
q_movies = q_movies.sort_values('score', ascending=False)

#Print the top 15 movies
q_movies[['title', 'vote_count', 'vote_average', 'score']].head(20)

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.445869
834,The Godfather,6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
12481,The Dark Knight,12269.0,8.3,8.265477
2843,Fight Club,9678.0,8.3,8.256385
292,Pulp Fiction,8670.0,8.3,8.251406
522,Schindler's List,4436.0,8.3,8.206639
23673,Whiplash,4376.0,8.3,8.205404
5481,Spirited Away,3968.0,8.3,8.196055
2211,Life Is Beautiful,3643.0,8.3,8.187171


In [9]:
#Print plot overviews of the first 5 movies.
metadata['overview'].head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [10]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
metadata['overview'] = metadata['overview'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(metadata['overview'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape


(45466, 75827)

In [11]:
tfidf_matrix[:2]

<2x75827 sparse matrix of type '<class 'numpy.float64'>'
	with 58 stored elements in Compressed Sparse Row format>

In [12]:
#Array mapping from feature integer indices to feature name.
tfidf.get_feature_names()[5000:5010]

['avails',
 'avaks',
 'avalanche',
 'avalanches',
 'avallone',
 'avalon',
 'avant',
 'avanthika',
 'avanti',
 'avaracious']

In [13]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
#cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
#cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [14]:
tfidf_matrix[0]

<1x75827 sparse matrix of type '<class 'numpy.float64'>'
	with 25 stored elements in Compressed Sparse Row format>

In [15]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix[0]) # le cosine pour 1 film
#print(tfidf_matrix[0])
list(enumerate(cosine_sim))

[(0, array([1.])),
 (1, array([0.01504121])),
 (2, array([0.])),
 (3, array([0.])),
 (4, array([0.])),
 (5, array([0.])),
 (6, array([0.])),
 (7, array([0.])),
 (8, array([0.])),
 (9, array([0.])),
 (10, array([0.])),
 (11, array([0.])),
 (12, array([0.])),
 (13, array([0.])),
 (14, array([0.])),
 (15, array([0.])),
 (16, array([0.])),
 (17, array([0.03851752])),
 (18, array([0.])),
 (19, array([0.])),
 (20, array([0.00974568])),
 (21, array([0.])),
 (22, array([0.])),
 (23, array([0.])),
 (24, array([0.])),
 (25, array([0.])),
 (26, array([0.])),
 (27, array([0.])),
 (28, array([0.])),
 (29, array([0.])),
 (30, array([0.])),
 (31, array([0.])),
 (32, array([0.])),
 (33, array([0.01862691])),
 (34, array([0.])),
 (35, array([0.])),
 (36, array([0.])),
 (37, array([0.])),
 (38, array([0.])),
 (39, array([0.])),
 (40, array([0.])),
 (41, array([0.])),
 (42, array([0.00634876])),
 (43, array([0.])),
 (44, array([0.])),
 (45, array([0.0089095])),
 (46, array([0.])),
 (47, array([0.])),
 (4

In [16]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(metadata.index, index=metadata['title']).drop_duplicates()

In [17]:
indices[:10]          

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
Heat                           5
Sabrina                        6
Tom and Huck                   7
Sudden Death                   8
GoldenEye                      9
dtype: int64

In [18]:
#from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD


In [71]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    #sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = list(enumerate(linear_kernel(tfidf_matrix, tfidf_matrix[idx])))
    

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:5]
    movie_indices = [i[0] for i in sim_scores]
    
    title = metadata['title'].iloc[movie_indices]
    score = weighted_rating(q_movies['score'].iloc[movie_indices]

    # Return the top 10 most similar movies
    return title ,score

In [77]:

def get_recommendations(title):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    #sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = list(enumerate(linear_kernel(tfidf_matrix, tfidf_matrix[idx])))
    

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:5]
    movie_indices = [i[0] for i in sim_scores]
    
    title = metadata['title'].iloc[movie_indices]
    #score = q_movies['score'].iloc[movie_indices]

    # Return the top 10 most similar movies
    return title#,score

Toy Story 3


In [73]:
#metadata[:1]

In [74]:
a = get_recommendations('Toy Story')
#l = get_recommendations('GoldenEye')
#print(result.shape[0])
#for i,j in enumerate(result):
    #print(i+1,j)

In [75]:
a

15348               Toy Story 3
2997                Toy Story 2
10301    The 40 Year Old Virgin
24523                 Small Fry
Name: title, dtype: object

In [22]:
pca = TruncatedSVD(n_components=2000, n_iter=7, random_state=42)


In [23]:
#svd=pca.fit_transform(tfidf_matrix) #un grand n'importe quoi!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

In [24]:
#svd.shape

In [25]:
#svd[1]

In [26]:
#print(metadata.head)

In [39]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendationsPCA(title, nb):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    #sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = list(enumerate(linear_kernel(svd, svd[idx].reshape(1,-1))))
    

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:nb+1]
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return metadata['title'].iloc[movie_indices]

In [38]:
#result1 = get_recommendationsPCA('The Godfather',15) #un grand n'importe quoi!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

In [29]:

#for i,j in enumerate(result1):
#   print(i+1,j)

In [35]:
list(metadata)

['adult',
 'belongs_to_collection',
 'budget',
 'genres',
 'homepage',
 'id',
 'imdb_id',
 'original_language',
 'original_title',
 'overview',
 'popularity',
 'poster_path',
 'production_companies',
 'production_countries',
 'release_date',
 'revenue',
 'runtime',
 'spoken_languages',
 'status',
 'tagline',
 'title',
 'video',
 'vote_average',
 'vote_count']

In [36]:
list(q_movies)

['adult',
 'belongs_to_collection',
 'budget',
 'genres',
 'homepage',
 'id',
 'imdb_id',
 'original_language',
 'original_title',
 'overview',
 'popularity',
 'poster_path',
 'production_companies',
 'production_countries',
 'release_date',
 'revenue',
 'runtime',
 'spoken_languages',
 'status',
 'tagline',
 'title',
 'video',
 'vote_average',
 'vote_count',
 'score']