In [1]:
import warnings; 
warnings.simplefilter('ignore')
import pandas as pd
from ast import literal_eval

In [2]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.snowball import SnowballStemmer

In [3]:
path_to_dataset = 'movies_metadata_fixed.csv'
dataset = pd.read_csv(path_to_dataset)

In [4]:
dataset['genres'] = dataset['genres']\
              .fillna('[]')\
              .apply(literal_eval)\
              .apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [5]:
dataset['year'] = pd.to_datetime(dataset['release_date']).dt.year

In [6]:
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')

In [7]:
dataset = pd.merge(dataset, credits, on='id')
dataset = pd.merge(dataset, keywords, on='id')
dataset.shape

(46628, 28)

In [9]:
links_small = pd.read_csv('links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [10]:
smd = dataset[dataset['id'].isin(links_small)]
smd.shape

(9219, 28)

In [11]:
smd['cast'] = smd['cast'].apply(literal_eval)
smd.cast.head()

0    [{'cast_id': 14, 'character': 'Woody (voice)',...
1    [{'cast_id': 1, 'character': 'Alan Parrish', '...
2    [{'cast_id': 2, 'character': 'Max Goldman', 'c...
3    [{'cast_id': 1, 'character': 'Savannah 'Vannah...
4    [{'cast_id': 1, 'character': 'George Banks', '...
Name: cast, dtype: object

In [12]:
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd.cast.head()

0    [Tom Hanks, Tim Allen, Don Rickles, Jim Varney...
1    [Robin Williams, Jonathan Hyde, Kirsten Dunst,...
2    [Walter Matthau, Jack Lemmon, Ann-Margret, Sop...
3    [Whitney Houston, Angela Bassett, Loretta Devi...
4    [Steve Martin, Diane Keaton, Martin Short, Kim...
Name: cast, dtype: object

In [13]:
smd['cast'] = smd['cast'].apply(lambda x: x[:5] if len(x) >= 5 else x)
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
smd['cast_str'] = smd['cast'].apply(lambda x: ' '.join(x))
smd.cast_str.head()

0    tomhanks timallen donrickles jimvarney wallace...
1    robinwilliams jonathanhyde kirstendunst bradle...
2    waltermatthau jacklemmon ann-margret sophialor...
3    whitneyhouston angelabassett lorettadevine lel...
4    stevemartin dianekeaton martinshort kimberlywi...
Name: cast_str, dtype: object

In [14]:
smd['crew'] = smd['crew'].apply(literal_eval)
smd.crew.head()

0    [{'credit_id': '52fe4284c3a36847f8024f49', 'de...
1    [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...
2    [{'credit_id': '52fe466a9251416c75077a89', 'de...
3    [{'credit_id': '52fe44779251416c91011acb', 'de...
4    [{'credit_id': '52fe44959251416c75039ed7', 'de...
Name: crew, dtype: object

In [15]:
smd['crew'].iloc[0][0]

{'credit_id': '52fe4284c3a36847f8024f49',
 'department': 'Directing',
 'gender': 2,
 'id': 7879,
 'job': 'Director',
 'name': 'John Lasseter',
 'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'}

In [16]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan
smd['director'] = smd['crew'].apply(get_director) 
smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
smd.director.head()

0      johnlasseter
1       joejohnston
2      howarddeutch
3    forestwhitaker
4      charlesshyer
Name: director, dtype: object

In [17]:
smd['keywords'].head().apply(literal_eval).iloc[0]

[{'id': 931, 'name': 'jealousy'},
 {'id': 4290, 'name': 'toy'},
 {'id': 5202, 'name': 'boy'},
 {'id': 6054, 'name': 'friendship'},
 {'id': 9713, 'name': 'friends'},
 {'id': 9823, 'name': 'rivalry'},
 {'id': 165503, 'name': 'boy next door'},
 {'id': 170722, 'name': 'new toy'},
 {'id': 187065, 'name': 'toy comes to life'}]

In [18]:
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd.keywords.head()

0    [jealousy, toy, boy, friendship, friends, riva...
1    [board game, disappearance, based on children'...
2    [fishing, best friend, duringcreditsstinger, o...
3    [based on novel, interracial relationship, sin...
4    [baby, midlife crisis, confidence, aging, daug...
Name: keywords, dtype: object

In [19]:
s = smd.apply(lambda x: pd.Series(x['keywords']),axis=1)\
       .stack()\
       .reset_index(level=1, drop=True)
s.name = 'keyword'
s = s.value_counts()
s = s[s > 1]

In [20]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words
smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd.keywords.head()

0    [jealousy, toy, boy, friendship, friends, riva...
1    [board game, disappearance, based on children'...
2         [fishing, best friend, duringcreditsstinger]
3    [based on novel, interracial relationship, sin...
4    [baby, midlife crisis, confidence, aging, daug...
Name: keywords, dtype: object

In [21]:
def stem_keywords(x):
    stemmed_tokens = []
    for token in x:
        try:
            new_token = stemmer.stem(token)
            stemmed_tokens.append(new_token)
        except:
            stemmed_tokens.append(token)
    return stemmed_tokens

smd['keywords'] = smd['keywords'].apply(lambda x: stem_keywords(x))
smd['keywords'] = smd['keywords'].apply(lambda x: [i.replace(" ", "").lower() for i in x])

In [22]:
smd['keywords_str'] = smd['keywords'].apply(lambda x: ' '.join([str(i) for i in x]))
smd.keywords_str.head()

0    jealousy toy boy friendship friends rivalry bo...
1    boardgame disappearance basedonchildren'sbook ...
2              fishing bestfriend duringcreditsstinger
3    basedonnovel interracialrelationship singlemot...
4    baby midlifecrisis confidence aging daughter m...
Name: keywords_str, dtype: object

In [23]:
def concat_fields(data):
    concat = data['keywords'] + data['cast'] + [data['director']] + data['genres']
    result = ' '.join([str(i).lower() for i in concat])
    return result
smd['soup'] = smd.apply(lambda x: concat_fields(x), axis=1)

In [24]:
smd['title'].iloc[0], smd['soup'].iloc[0]

('Toy Story',
 'jealousy toy boy friendship friends rivalry boynextdoor newtoy toycomestolife tomhanks timallen donrickles jimvarney wallaceshawn johnlasseter animation comedy family')

In [25]:
count = CountVectorizer(ngram_range=(1, 2), min_df=2)
count_matrix = count.fit_transform(smd['soup'])
count_matrix.shape

(9219, 21542)

In [26]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim.shape

(9219, 9219)

In [27]:
smd = smd.reset_index()
titles = index=smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [28]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [29]:
get_recommendations('The Hobbit: An Unexpected Journey').head(10)

3899    The Lord of the Rings: The Fellowship of the Ring
8833            The Hobbit: The Battle of the Five Armies
4436                The Lord of the Rings: The Two Towers
8537                  The Hobbit: The Desolation of Smaug
5074        The Lord of the Rings: The Return of the King
1693                                The Lord of the Rings
8867                                             Warcraft
477                                            The Shadow
5852                                           The Hobbit
2730                      Baby: Secret of the Lost Legend
Name: title, dtype: object