In [2]:
import pandas as pd
import numpy as np

# Read the cleaned metadata CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../Movies/Data/metadata_clean.csv')

# Preview the first five rows of the cleaned DataFrame.
df.head(n=5)

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"['animation', 'comedy', 'family']",81.0,7.7,5415.0,1995
1,Jumanji,"['adventure', 'fantasy', 'family']",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"['romance', 'comedy']",101.0,6.5,92.0,1995
3,Waiting to Exhale,"['comedy', 'drama', 'romance']",127.0,6.1,34.0,1995
4,Father of the Bride Part II,['comedy'],106.0,5.7,173.0,1995


In [4]:
# Read the original metadata file.
original_df = pd.read_csv('../Movies/Data/movies_metadata.csv', low_memory=False)

# Copy the extra features into the cleaned DataFrame, one column at a time.
# Column assignment aligns on the row index, so this relies on both frames
# sharing the same index.
# NOTE(review): confirm the cleaned file kept every row of the original, in order.
df['overview'] = original_df['overview']
df['id'] = original_df['id']

df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year,overview,id
0,Toy Story,"['animation', 'comedy', 'family']",81.0,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his ...",862
1,Jumanji,"['adventure', 'fantasy', 'family']",104.0,6.9,2413.0,1995,When siblings Judy and Peter discover an encha...,8844
2,Grumpier Old Men,"['romance', 'comedy']",101.0,6.5,92.0,1995,A family wedding reignites the ancient feud be...,15602
3,Waiting to Exhale,"['comedy', 'drama', 'romance']",127.0,6.1,34.0,1995,"Cheated on, mistreated and stepped on, the wom...",31357
4,Father of the Bride Part II,['comedy'],106.0,5.7,173.0,1995,Just when George Banks has recovered from his ...,11862


### Creating the TF-IDF matrix


In [8]:
#import TfIdfVectorizer from the scikit-learn library
from sklearn.feature_extraction.text import TfidfVectorizer

# Missing overviews become empty strings so the vectorizer only ever sees text.
df['overview'] = df['overview'].fillna('')

# TF-IDF vectorizer that discards English stop words.
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the overviews and build the TF-IDF matrix
# (one row per movie, one column per vocabulary term).
tfidf_matrix = tfidf.fit_transform(df['overview'])

# Show the (rows, columns) shape of the TF-IDF matrix.
tfidf_matrix.shape

(45466, 75827)

### Computing the cosine similarity score
Calculate the pairwise cosine similarity score between every pair of movies.

In [12]:
#import linear_kernel to compute the dot product
from sklearn.metrics.pairwise import linear_kernel

# Compute the pairwise cosine similarity matrix. With a single argument,
# linear_kernel takes the dot product of the matrix with itself; TfidfVectorizer
# L2-normalises each row by default, so these dot products are cosine similarities.
# NOTE(review): the result is a dense n_movies x n_movies array, which needs
# several GB of RAM for the matrix shape shown above.
consine_sim = linear_kernel(tfidf_matrix)

### Building the recommender function

In [14]:
# Construct a reverse mapping from movie title to row index.
# Series.drop_duplicates() de-duplicates the *values* (the row labels, which are
# already unique), so it would leave repeated titles in place. De-duplicate on
# the index instead, keeping the first occurrence of each title, so that
# indices[title] always yields a single scalar.
indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [31]:
# functions that takes in movie title as input and gives recommendation

def content_recommender(title, consine_sim=consine_sim, df=df, indices=indices, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in ``indices``.
    consine_sim : 2-D array-like
        Pairwise similarity scores; row ``i`` holds the scores of movie ``i``
        against every movie.
    df : pandas.DataFrame
        Frame with a ``'title'`` column. Its rows are looked up by position,
        so it must be in the same row order as ``consine_sim``.
    indices : pandas.Series
        Maps a title to the row position of that movie.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of up to ``top_n`` movies, most similar first. The queried
        movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Obtain the index of the movie that matches the title.
    idx = indices[title]

    # When several movies share the title, the lookup yields a Series rather
    # than a scalar; fall back to the first match so the row lookup stays 1-D.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Pair every movie position with its similarity to the queried movie.
    sim_scores = list(enumerate(consine_sim[idx]))

    # Sort the movies by cosine similarity score, best first.
    sim_scores = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)

    # Drop the queried movie by position instead of assuming it sorts first:
    # with tied scores (e.g. identical or empty overviews) another movie can
    # occupy the top slot.
    sim_scores = [pair for pair in sim_scores if pair[0] != idx][:top_n]

    # Get the row positions of the selected movies.
    movie_indices = [position for position, _ in sim_scores]

    # Return the titles of the most similar movies.
    return df['title'].iloc[movie_indices]
    

In [32]:
# Recommendations for "The Lion King", using the default similarity matrix,
# DataFrame and title mapping.
content_recommender(title='The Lion King')

34682    How the Lion Cub and the Turtle Sang a Song
9353                                The Lion King 1½
9115                  The Lion King 2: Simba's Pride
42829                                           Prey
25654                                 Fearless Fagan
17041                                   African Cats
27933              Massaï, les guerriers de la pluie
6094                                       Born Free
37409                                     Sour Grape
3203                                The Waiting Game
Name: title, dtype: object