Content Based Recommender

In [30]:
import pandas as pd
import numpy as np


In [31]:
metadata=pd.read_csv('movies_metadata.csv',low_memory=False)
metadata=metadata[:10000]

In [32]:
metadata['overview'].head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

To perform similarity ,first we should compute word vectors of each overview by doind Term Frequency-Inverse Document Frequency vectors for each document 

In [33]:
#import TfIdfVectorizer from scikit-learn
#It is built in class
from sklearn.feature_extraction.text import TfidfVectorizer

#should all english stop words as 'the','a' and create TF-IDF vectrizer object
tfidf=TfidfVectorizer(stop_words='english')

#We should replace empty 
metadata['overview']=metadata['overview'].fillna('')

#Constructing TF-IDF vector matrix 
tfidf_matrix=tfidf.fit_transform(metadata['overview'])

tfidf_matrix.shape

(10000, 32350)

Now we can calculate similarity score , by using cosine similarity as it fits the give text modal 

In [34]:
tfidf.get_feature_names()[5000:5010]

['cellisten',
 'cellmate',
 'cellmates',
 'cello',
 'cellular',
 'celtics',
 'cement',
 'cemetary',
 'cemetery',
 'cenobite']

In [35]:
from sklearn.metrics.pairwise import linear_kernel

#compute cosine similarity matrix

cosine_sim=linear_kernel(tfidf_matrix,Y=None)


In [36]:
cosine_sim.shape

(10000, 10000)

In [37]:
cosine_sim[0]

array([1.        , 0.01682915, 0.        , ..., 0.        , 0.        ,
       0.        ])

We have to find top ten movies similar to given movie title.

In [38]:
indices=pd.Series(metadata.index,index=metadata['title']).drop_duplicates()

In [39]:
indices

title
Toy Story                          0
Jumanji                            1
Grumpier Old Men                   2
Waiting to Exhale                  3
Father of the Bride Part II        4
                                ... 
Miracle in Milan                9995
Before the Fall                 9996
The Frisco Kid                  9997
Onmyoji: The Yin Yang Master    9998
State Property 2                9999
Length: 10000, dtype: int64

In [40]:
'''Creating a fuction that takes movir title as input and outputs
most similar movies'''
def get_recom(title,cosine_sim=cosine_sim):
    index=indices[title]
    sim_scores=list(enumerate(cosine_sim[index]))
    sim_scores=sorted(sim_scores,key=lambda x:x[1],reverse=True)
    sim_scores=sim_scores[1:11]
    movies_indices=[i[0] for i in sim_scores]
    return metadata['title'].iloc[movies_indices]

In [41]:
#lets check for Green Dragon
get_recom('Green Dragon')

5453                                        Indian Summer
7461                                             The Rack
2924                                    Meatballs Part II
6476                                         Camp Nowhere
3978                                  Ernest Goes to Camp
2926                                          Meatballs 4
1865                                      Friday the 13th
3783                               Prince of Central Park
9804                                     The Green Berets
4639    Into the Arms of Strangers: Stories of the Kin...
Name: title, dtype: object

In [42]:
#Let us optimize our system by considering factors like director,keywords,cast,genre 
#load credits and f=keywords file
credits=pd.read_csv('credits.csv')
keywords=pd.read_csv('keywords.csv')

In [43]:
credits

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862
...,...,...,...
45471,"[{'cast_id': 0, 'character': '', 'credit_id': ...","[{'credit_id': '5894a97d925141426c00818c', 'de...",439050
45472,"[{'cast_id': 1002, 'character': 'Sister Angela...","[{'credit_id': '52fe4af1c3a36847f81e9b15', 'de...",111109
45473,"[{'cast_id': 6, 'character': 'Emily Shaw', 'cr...","[{'credit_id': '52fe4776c3a368484e0c8387', 'de...",67758
45474,"[{'cast_id': 2, 'character': '', 'credit_id': ...","[{'credit_id': '533bccebc3a36844cf0011a7', 'de...",227506


In [44]:
metadata=pd.read_csv('movies_metadata.csv',low_memory=False)
metadata=metadata[:5000]

In [45]:
keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [46]:
credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [47]:
# Converting all id's in int ,so we can merge in into metadata file
keywords['id']=keywords['id'].astype('int')
credits['id']=credits['id'].astype('int')
metadata['id']=metadata['id'].astype('int')
metadata=metadata.merge(credits,on='id')
metadata=metadata.merge(keywords,on='id')
metadata.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."


In [48]:
#literal_evalution is done to remove unnecessary characters 
from ast import literal_eval
features=['cast','crew','keywords','genres']
for feature in features:
    metadata[feature]=metadata[feature].apply(literal_eval)

metadata[features].head(2)

Unnamed: 0,cast,crew,keywords,genres
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."


In [49]:
#function to return director from crew column of every row
def get_director(x):
    for i in x:
        if i['job']=='Director':
            return i['name']
    return np.nan

In [50]:
#fuction to write only three names 
def get_list(x):
    if isinstance(x,list):
        names=[i['name'] for i in x]
        if len(names)>3:
            return names[:3]
        return names
    return []

In [51]:
metadata['Director']=metadata['crew'].apply(get_director)
features=['cast','keywords','genres']
for feature in features:
    metadata[feature]=metadata[feature].apply(get_list)

In [52]:
metadata[['title','cast','Director','keywords','genres']].head(3)

Unnamed: 0,title,cast,Director,keywords,genres
0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,"[jealousy, toy, boy]","[Animation, Comedy, Family]"
1,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",Joe Johnston,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]"
2,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]",Howard Deutch,"[fishing, best friend, duringcreditsstinger]","[Romance, Comedy]"


In [53]:
metadata['cast']

0                     [Tom Hanks, Tim Allen, Don Rickles]
1          [Robin Williams, Jonathan Hyde, Kirsten Dunst]
2              [Walter Matthau, Jack Lemmon, Ann-Margret]
3       [Whitney Houston, Angela Bassett, Loretta Devine]
4              [Steve Martin, Diane Keaton, Martin Short]
                              ...                        
5016         [Craig Stevens, William Hopper, Alix Talton]
5017              [Kevin Costner, Joe Morton, Ron Rifkin]
5018        [Stuart Townsend, Aaliyah, Marguerite Moreau]
5019           [Arliss Howard, Debra Winger, Paul Le Mat]
5020         [Patrick Swayze, Forest Whitaker, Đơn Dương]
Name: cast, Length: 5021, dtype: object

In [54]:
#function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x,list):
        return [str.lower(i.replace(' ','')) for i in x]
    else:
        if isinstance(x,str):
            return str.lower(x.replace(' ',''))
        else:
            return ''

In [55]:
features=['cast','keywords','genres','Director']
for feature in features:
    metadata[feature]=metadata[feature].apply(clean_data)

In [56]:
def soup(x):
    return ' '.join(x['keywords'])+' '+' '.join(x['cast'])+' '+x['Director']+' '+' '.join(x['genres'])+' '+x['Director']

In [57]:
metadata['soup']=metadata.apply(soup,axis=1)

In [58]:
metadata.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,tagline,title,video,vote_average,vote_count,cast,crew,keywords,Director,soup
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[animation, comedy, family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,,Toy Story,False,7.7,5415.0,"[tomhanks, timallen, donrickles]","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[jealousy, toy, boy]",johnlasseter,jealousy toy boy tomhanks timallen donrickles ...
1,False,,65000000,"[adventure, fantasy, family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[robinwilliams, jonathanhyde, kirstendunst]","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[boardgame, disappearance, basedonchildren'sbook]",joejohnston,boardgame disappearance basedonchildren'sbook ...


In [59]:
#Here count vectorizer is used as weighted sum is unnecssary 
from sklearn.feature_extraction.text import CountVectorizer
count=CountVectorizer(stop_words='english')
count_matrix=count.fit_transform(metadata['soup'])


In [60]:
count_matrix.shape

(5021, 12184)

In [61]:
#To perform cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim2=cosine_similarity(count_matrix,count_matrix)


In [62]:
indices=pd.Series(metadata.index,index=metadata['title'])
indices

title
Toy Story                         0
Jumanji                           1
Grumpier Old Men                  2
Waiting to Exhale                 3
Father of the Bride Part II       4
                               ... 
The Deadly Mantis              5016
Dragonfly                      5017
Queen of the Damned            5018
Big Bad Love                   5019
Green Dragon                   5020
Length: 5021, dtype: int64

In [63]:
get_recom('Green Dragon',cosine_sim2)

3778    Went to Coney Island on a Mission from God... ...
4521                                          Next of Kin
1640                                     Ill Gotten Gains
3475                           Jails, Hospitals & Hip-Hop
7                                            Tom and Huck
1097                                           North Star
1982                                            Kidnapped
2794                                Napoleon and Samantha
479                                                Lassie
924                                     The Mark of Zorro
Name: title, dtype: object

In [64]:
get_recom('The Godfather',cosine_sim2)

1926          The Godfather: Part III
1191           The Godfather: Part II
4450    Tucker: The Man and His Dream
1178                   Apocalypse Now
4000                 Gardens of Stone
1606                    The Rainmaker
2017                    The Outsiders
3623                 The Conversation
4582                  The Cotton Club
2368            Peggy Sue Got Married
Name: title, dtype: object