Build a metadata-based recommendation system using the credits and keywords datasets.

In [1]:
#importing required libraries
import numpy as np
import pandas as pd
from ast import literal_eval

import warnings
# NOTE(review): "ignore" with no category silences every warning for the rest of
# the session, so any warning raised by a later cell is never shown.
warnings.filterwarnings("ignore")

In [2]:
# Load the three raw CSV files; the bare file names are resolved against the
# current working directory.
credits, keywords, movies_metadata = (
    pd.read_csv(csv_name)
    for csv_name in ('credits.csv', 'keywords.csv', 'movies_metadata.csv')
)

In [3]:
# Preview the first rows of the raw credits table
credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [4]:
# Preview the first rows of the raw keywords table
keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [5]:
# Report the (rows, columns) shape of each raw table
print('Shape of credits dataset: ',credits.shape)
print('Shape of keywords dataset: ',keywords.shape)

Shape of credits dataset:  (45476, 3)
Shape of keywords dataset:  (46419, 2)


In [6]:
# Count the null values in each column of credits
credits.isna().sum()

cast    0
crew    0
id      0
dtype: int64

In [7]:
# Count the null values in each column of keywords
keywords.isna().sum()

id          0
keywords    0
dtype: int64

Neither dataset contains any null values.

In [8]:
# Preview the first rows of the movie metadata table
movies_metadata.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [9]:
# Keep only the join key, the title and the rating from the metadata table
movies_metadata = movies_metadata.loc[:, ['id', 'original_title', 'vote_average']]

# Preprocessing

In [10]:
#defining a function to extract the key details from the column
def unpack(x):
    """Return the 'name' value of every record in a stringified list of dicts.

    Parameters
    ----------
    x : str, list, None or NaN
        Normally the raw cell text, e.g. "[{'id': 931, 'name': 'jealousy'}]".
        A list is accepted as well (records that are already plain names are
        passed through), so the cell that applies this function can be re-run
        on an already-unpacked column. None and NaN mean "no records".

    Returns
    -------
    list or None
        The names in their original order, or None when there are no records.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``literal_eval`` when the input is not a valid literal.
    """
    # NaN is the only float that is not equal to itself
    if x is None or (isinstance(x, float) and x != x):
        return None

    # literal_eval only parses Python literals; it never executes code
    records = x if isinstance(x, list) else literal_eval(x)
    if len(records) == 0:
        return None
    return [rec['name'] if isinstance(rec, dict) else rec for rec in records]

In [11]:
# Show the keywords column while it still holds the stringified records
keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [12]:
# Replace each stringified record list with the plain list of keyword names
keywords['keywords'] = keywords['keywords'].apply(unpack)
keywords.head()

Unnamed: 0,id,keywords
0,862,"[jealousy, toy, boy, friendship, friends, riva..."
1,8844,"[board game, disappearance, based on children'..."
2,15602,"[fishing, best friend, duringcreditsstinger, o..."
3,31357,"[based on novel, interracial relationship, sin..."
4,11862,"[baby, midlife crisis, confidence, aging, daug..."


In [13]:
# Show the cast and crew columns while they still hold the stringified records
credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [14]:
# Reduce the cast and crew records to plain lists of person names
for col in ("cast", "crew"):
    credits[col] = credits[col].apply(unpack)
credits.head()

Unnamed: 0,cast,crew,id
0,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[John Lasseter, Joss Whedon, Andrew Stanton, J...",862
1,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[Larry J. Franco, Jonathan Hensleigh, James Ho...",8844
2,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[Howard Deutch, Mark Steven Johnson, Mark Stev...",15602
3,"[Whitney Houston, Angela Bassett, Loretta Devi...","[Forest Whitaker, Ronald Bass, Ronald Bass, Ez...",31357
4,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[Alan Silvestri, Elliot Davis, Nancy Meyers, N...",11862


In [15]:
# Cast the join key of both tables to string.
# NOTE(review): presumably so the key dtype matches movies_metadata['id'] in the
# merge below - confirm against the metadata file.
for frame in (credits, keywords):
    frame['id'] = frame['id'].astype('str')

In [16]:
# Join keywords, credits and the trimmed metadata on the shared 'id' key
# (merge's default inner join, so only ids present in all three tables survive).
# NOTE(review): key uniqueness is not checked here; an id repeated in any input
# would be repeated in the result as well.
movies = (
    keywords
    .merge(credits, on='id')
    .merge(movies_metadata, on='id')
)
movies.head()

Unnamed: 0,id,keywords,cast,crew,original_title,vote_average
0,862,"[jealousy, toy, boy, friendship, friends, riva...","[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[John Lasseter, Joss Whedon, Andrew Stanton, J...",Toy Story,7.7
1,8844,"[board game, disappearance, based on children'...","[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[Larry J. Franco, Jonathan Hensleigh, James Ho...",Jumanji,6.9
2,15602,"[fishing, best friend, duringcreditsstinger, o...","[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[Howard Deutch, Mark Steven Johnson, Mark Stev...",Grumpier Old Men,6.5
3,31357,"[based on novel, interracial relationship, sin...","[Whitney Houston, Angela Bassett, Loretta Devi...","[Forest Whitaker, Ronald Bass, Ronald Bass, Ez...",Waiting to Exhale,6.1
4,11862,"[baby, midlife crisis, confidence, aging, daug...","[Steve Martin, Diane Keaton, Martin Short, Kim...","[Alan Silvestri, Elliot Davis, Nancy Meyers, N...",Father of the Bride Part II,5.7


In [17]:
# Count the null values per column after the merge (unpack returns None for empty lists)
movies.isnull().sum()

id                    0
keywords          14889
cast               2491
crew                798
original_title        0
vote_average          4
dtype: int64

In [18]:
# Replace the null values with empty lists: fillna("") turns each null into an
# empty string, then list() maps "" to [] and shallow-copies every existing list.
for col in ['keywords','cast','crew']:
    movies[col]=movies[col].fillna("").apply(list)

In [19]:
# Re-count the null values to confirm the three list columns no longer contain any
movies.isnull().sum()

id                0
keywords          0
cast              0
crew              0
original_title    0
vote_average      4
dtype: int64

In [20]:
# Concatenate the keyword, cast and crew lists of every movie row by row ...
combined = movies['keywords'] + movies['cast'] + movies['crew']
# ... and flatten each combined list into one comma-separated string
movies['keywords'] = combined.apply(lambda tokens: ', '.join(map(str, tokens)))
movies['keywords'].head()

0    jealousy, toy, boy, friendship, friends, rival...
1    board game, disappearance, based on children's...
2    fishing, best friend, duringcreditsstinger, ol...
3    based on novel, interracial relationship, sing...
4    baby, midlife crisis, confidence, aging, daugh...
Name: keywords, dtype: object

In [21]:
# Keep only the movies whose average vote is strictly greater than 7
# (rows with a missing vote_average compare as False and are dropped too)
high_rated = movies['vote_average'].gt(7)
top_movies = movies[high_rated]
top_movies.shape

(7885, 6)

In [22]:
# Drop the columns that are no longer needed, leaving the combined text and the title.
# A new frame is assigned instead of dropping in place: top_movies was produced by
# filtering `movies`, and an in-place drop on such a frame can raise pandas'
# SettingWithCopyWarning. errors='ignore' lets the cell be re-run after the
# columns are already gone.
top_movies = top_movies.drop(columns=['id','cast','crew','vote_average'], errors='ignore')

In [23]:
# Vectorise the combined metadata text with TF-IDF, using the vectoriser's default settings
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer()
# One row per movie in top_movies, one column per vocabulary term
tfidf_matrix = tf.fit_transform(top_movies['keywords'])
tfidf_matrix.shape

(7885, 73974)

In [24]:
# NOTE(review): cosine_similarity is imported but not used in the visible cells
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
# Pairwise dot products between all movie vectors. TfidfVectorizer L2-normalises
# rows by default, so these dot products serve as cosine similarity scores.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Renumber the rows 0..n-1 so that row positions line up with the rows of cosine_sim
# (the previous index is kept as a regular column)
top_movies = top_movies.reset_index()
titles = top_movies['original_title']
# Lookup from title to row position.
# NOTE(review): a title that occurs more than once maps to several positions.
indices = pd.Series(top_movies.index, index=top_movies['original_title'])

In [25]:
#function that returns the most similar movies (5 by default) based on the cosine similarity score.
def get_recommendations(title, top_n=5):
    """Return the titles of the movies most similar to ``title``.

    Reads the notebook-level objects ``indices`` (title -> row position),
    ``cosine_sim`` (pairwise similarity matrix) and ``titles``.

    Parameters
    ----------
    title : str
        A title present in ``indices``.
    top_n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` titles ordered from most to least similar; the query
        movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not in ``indices``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    idx = indices[title]   # row position of the query movie
    # A title that occurs more than once makes the lookup return a Series of
    # positions; use the first occurrence so cosine_sim[idx] is a single row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every other movie's position with its similarity to the query movie.
    # The query is skipped by position instead of assuming it sorts first, so a
    # tie for the top score can never put the query itself in the results.
    sim_scores = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[idx])
        if pos != idx
    ]

    # Most similar first; the sort is stable, so ties keep their row order
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    movie_indices = [pos for pos, _ in sim_scores[:top_n]]
    return titles.iloc[movie_indices]

In [26]:
# Example: recommendations for 'The Usual Suspects'
get_recommendations('The Usual Suspects')

4332    X-Men: Days of Future Past
3552         The Dark Knight Rises
4152       The Wolf of Wall Street
65                    Forrest Gump
28                       Apollo 13
Name: original_title, dtype: object

In [28]:
# Example: recommendations for 'Toy Story'
get_recommendations('Toy Story')

723        Toy Story 2
3033       Toy Story 3
1011    Monsters, Inc.
2590            WALL·E
1255      Finding Nemo
Name: original_title, dtype: object