In [127]:
%%HTML
<h2>Feature : Developing a Movie Recommendation System using Content based Filtering Algorithm </h2>
    
    <p><font size="4">Description -:</font>
        <br>
        <br>
       1) Display the top 10 Movies  <br>
        &nbsp;&nbsp;&nbsp;    a) Select movies present in the 98th percentile <br>
        &nbsp;&nbsp;&nbsp;     b) Calculate weighted rating using IMDB's weighted rating formula <br>
        &nbsp;&nbsp;&nbsp;     c) Display the top movies. <br>
            <br>
       2) Recommend similar movies to the movie taken as input from the user depending upon the top three actors present, genre, the director and the plot of the movie. <br> 
        &nbsp;&nbsp;&nbsp;   a) Use Cosine Similarity to calculate similarity between two movies <br>
        &nbsp;&nbsp;&nbsp;   b) Display 10 similar movies
            
    </p>


In [128]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.snowball import SnowballStemmer
import warnings; warnings.simplefilter('ignore')

In [129]:
# Reads the movies_metadata.csv file and displays the first 5 rows
movie_data = pd.read_csv('movies_metadata_small.csv')
movie_data.head()

# Extracting the data from dictionary to a list using list comprehension containing the genres of each movie in the dataset
movie_data['genres'] = movie_data['genres'].apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])


In [130]:
''' Using IMDB's weighted rating formula to find the rating for each movie to find top movies based upon the following variables -:
    v = number of votes for the movie
    m = minimum votes required to be listed in the chart
    R = average rating of the movie
    C = mean vote across the entire dataset
'''
# storing the count of votes for each movie (v)
vote_counts = movie_data[movie_data['vote_count'].notnull()]['vote_count'].astype('int')

# storing the vote average for each movie  (R)
vote_averages = movie_data[movie_data['vote_average'].notnull()]['vote_average'].astype('int')

# selecting movies present in the 98 th percentile for finding the top 10 all time movies.
m = vote_counts.quantile(0.98) 

# Finding mean of votes across the dataset
C = vote_averages.mean()


In [131]:
# extracting only the year from the movie release date
movie_data['year'] = pd.to_datetime(movie_data['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

# Filtering out movies that qualify for the chart based on the vote count
top_movies = movie_data[(movie_data['vote_count'] >= m) & (movie_data['vote_count'].notnull()) & (movie_data['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average','genres']]
top_movies['vote_count'] = top_movies['vote_count'].astype('int') 
top_movies['vote_average'] = top_movies['vote_average'].astype('int')


In [132]:
# Function to calculate the weighted rating using IMDB's formula
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    weighted_rating = ((v/(v+m) * R) + (m/(m+v) * C))
    return weighted_rating

In [133]:
# calculates and stores the weighted rating for all the qualified movies
top_movies['wr'] = top_movies.apply(weighted_rating, axis=1)

# sorting based on the calculated weighted rating
top_movies = top_movies.sort_values('wr', ascending=False).head(250)

# Displays the top 10 all time movies
top_movies.head(10) 

Unnamed: 0,title,year,vote_count,vote_average,genres,wr
15480,Inception,2010,14075,8,"[Action, Thriller, Science Fiction, Mystery, A...",7.777455
12481,The Dark Knight,2008,12269,8,"[Drama, Action, Crime, Thriller]",7.747696
22879,Interstellar,2014,11187,8,"[Adventure, Drama, Science Fiction]",7.725723
2843,Fight Club,1999,9678,8,[Drama],7.687804
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,"[Adventure, Fantasy, Action]",7.663577
292,Pulp Fiction,1994,8670,8,"[Thriller, Crime]",7.656038
314,The Shawshank Redemption,1994,8358,8,"[Drama, Crime]",7.644853
7000,The Lord of the Rings: The Return of the King,2003,8226,8,"[Adventure, Fantasy, Action]",7.639899
351,Forrest Gump,1994,8147,8,"[Comedy, Drama, Romance]",7.636868
5814,The Lord of the Rings: The Two Towers,2002,7641,8,"[Adventure, Fantasy, Action]",7.616171


In [134]:
# Load keywords and credits. Keywords.csv contains certain keywords associated with the movie 
# and credits.csv contains the cast and crew information
credits = pd.read_csv('credits_small.csv')
keywords = pd.read_csv('keywords.csv')


# Converting movie IDs to int in order to merge with the movies dataset.
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
movie_data['id'] = movie_data['id'].astype('int')

# Merging keywords and credits into the movies dataset
movie_data = movie_data.merge(credits, on='id')
movie_data = movie_data.merge(keywords, on='id')

# Printing the first two movies of the newly merged movie_data
movie_data.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,status,tagline,title,video,vote_average,vote_count,year,cast,crew,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,Released,,Toy Story,False,7.7,5415.0,1995,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."


In [135]:
# extracting data stored in form of strings to python objects. 
movie_data['cast'] = movie_data['cast'].apply(literal_eval)
movie_data['crew'] = movie_data['crew'].apply(literal_eval)
movie_data['keywords'] = movie_data['keywords'].apply(literal_eval)
movie_data['cast_size'] = movie_data['cast'].apply(lambda x: len(x))
movie_data['crew_size'] = movie_data['crew'].apply(lambda x: len(x))

In [136]:
# Function to extract the director's name from the crew coloumn. If director is not listed, return NaN
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

movie_data['director'] = movie_data['crew'].apply(get_director)



In [137]:
# Extracting the top 3 actors from cast data.
movie_data['cast'] = movie_data['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
movie_data['cast'] = movie_data['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

In [138]:
# Extracting keywords from keywords data
movie_data['keywords'] = movie_data['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [139]:
# removing spaces and converting cast data to lowercase.
movie_data['cast'] = movie_data['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

# removing spaces and converting director names to lowercase.
movie_data['director'] = movie_data['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))

# Appending the director name 3 times to give the director more weightage compared to the entire of the cast
movie_data['director'] = movie_data['director'].apply(lambda x: [x,x,x])


In [140]:
# converting every word to its stem word
word_stem = SnowballStemmer('english')
movie_data['keywords'] = movie_data['keywords'].apply(lambda x: [word_stem.stem(i) for i in x])
# to convert all strings to lower case and strip names of spaces
movie_data['keywords'] = movie_data['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [141]:
# creating a coloumn all_info which contains a string that contains all the data that has to be sent to the vectorizer
movie_data['all_info'] = movie_data['keywords'] + movie_data['cast'] + movie_data['director'] + movie_data['genres']
movie_data['all_info'] = movie_data['all_info'].apply(lambda x: ' '.join(x))

In [142]:
#generating a count matrix depending on data present in all_info field 
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(movie_data['all_info'])

In [143]:
# passing the count matrix generated from the vectorizer to the cosine similarity function for calculating cosine similarity
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [144]:
# Resetting the index of movie_data and constructing reverse mapping
movie_data = movie_data.reset_index()
titles = movie_data['title']
indices = pd.Series(movie_data.index, index=movie_data['title'])

In [145]:
# Function to recommend the top 10 movies. The input is the name of movie and 
# the output is the 10 most similar movies to the movie name provided by the user

def get_recommendations(movie_name, cosine_sim=cosine_sim):
    # Storing the index of the movie name
    movie_index = indices[movie_name]

    # Storing the pairwise similarity scores for all movies with respect to the movie name provided.
    similarity_scores = list(enumerate(cosine_sim[movie_index]))

    # Sorting the movies based on cosine similarity scores
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    similarity_scores = similarity_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in similarity_scores]
    print(" Since you watched",movie_name,",You may also like to watch -:")
    # Return the top 10 most similar movies
    return movie_data['title'].iloc[movie_indices]

In [146]:
# top 10 similar movies to Forrest Gump
recommendations = get_recommendations('Forrest Gump').head(10)
print(recommendations)

 Since you watched Forrest Gump ,You may also like to watch -:
39091                  The Lift
8280     I Wanna Hold Your Hand
3924                  Cast Away
42080                    Allied
3167          Death Becomes Her
5035                  Used Cars
3703          What Lies Beneath
31694                  The Walk
14566         A Christmas Carol
8321          The Polar Express
Name: title, dtype: object
