In [1]:
import numpy as np
import pandas as pd
from ast import literal_eval
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
metadata = pd.read_csv('/content/TheMoviesDataset/movies_metadata.csv', low_memory=False)
credits = pd.read_csv('/content/TheMoviesDataset/credits.csv')
keywords = pd.read_csv('/content/TheMoviesDataset/keywords.csv')
metadata = metadata.drop([19730, 29503, 35587])

In [5]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
metadata['id'] = metadata['id'].astype('int')
metadata = metadata.merge(credits, on='id')
metadata = metadata.merge(keywords, on='id')

In [6]:
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(literal_eval)
    
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing/malformed data
    return []

metadata['director'] = metadata['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(get_list)
    
metadata['cast'] = metadata['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

In [7]:
metadata[['title', 'cast', 'director', 'keywords', 'genres']].head(10)

Unnamed: 0,title,cast,director,keywords,genres
0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,"[jealousy, toy, boy]","[Animation, Comedy, Family]"
1,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",Joe Johnston,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]"
2,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]",Howard Deutch,"[fishing, best friend, duringcreditsstinger]","[Romance, Comedy]"
3,Waiting to Exhale,"[Whitney Houston, Angela Bassett, Loretta Devine]",Forest Whitaker,"[based on novel, interracial relationship, sin...","[Comedy, Drama, Romance]"
4,Father of the Bride Part II,"[Steve Martin, Diane Keaton, Martin Short]",Charles Shyer,"[baby, midlife crisis, confidence]",[Comedy]
5,Heat,"[Al Pacino, Robert De Niro, Val Kilmer]",Michael Mann,"[robbery, detective, bank]","[Action, Crime, Drama]"
6,Sabrina,"[Harrison Ford, Julia Ormond, Greg Kinnear]",Sydney Pollack,"[paris, brother brother relationship, chauffeur]","[Comedy, Romance]"
7,Tom and Huck,"[Jonathan Taylor Thomas, Brad Renfro, Rachael ...",Peter Hewitt,[],"[Action, Adventure, Drama]"
8,Sudden Death,"[Jean-Claude Van Damme, Powers Boothe, Dorian ...",Peter Hyams,"[terrorist, hostage, explosive]","[Action, Adventure, Thriller]"
9,GoldenEye,"[Pierce Brosnan, Sean Bean, Izabella Scorupco]",Martin Campbell,"[cuba, falsely accused, secret identity]","[Adventure, Action, Thriller]"


In [8]:
stemmer = SnowballStemmer('english')
metadata['keywords'] = metadata['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])

In [9]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    metadata[feature] = metadata[feature].apply(clean_data)

metadata['director'] = metadata['director'].apply(lambda x: [x])

relevant_columns = ['title','cast', 'keywords', 'genres', 'director']
metadata = metadata[relevant_columns]

In [10]:
metadata.head(10)

Unnamed: 0,title,cast,keywords,genres,director
0,Toy Story,"[tomhanks, timallen, donrickles]","[jealousi, toy, boy]","[animation, comedy, family]",[johnlasseter]
1,Jumanji,"[robinwilliams, jonathanhyde, kirstendunst]","[boardgam, disappear, basedonchildren'sbook]","[adventure, fantasy, family]",[joejohnston]
2,Grumpier Old Men,"[waltermatthau, jacklemmon, ann-margret]","[fish, bestfriend, duringcreditssting]","[romance, comedy]",[howarddeutch]
3,Waiting to Exhale,"[whitneyhouston, angelabassett, lorettadevine]","[basedonnovel, interracialrelationship, single...","[comedy, drama, romance]",[forestwhitaker]
4,Father of the Bride Part II,"[stevemartin, dianekeaton, martinshort]","[babi, midlifecrisi, confid]",[comedy],[charlesshyer]
5,Heat,"[alpacino, robertdeniro, valkilmer]","[robberi, detect, bank]","[action, crime, drama]",[michaelmann]
6,Sabrina,"[harrisonford, juliaormond, gregkinnear]","[pari, brotherbrotherrelationship, chauffeur]","[comedy, romance]",[sydneypollack]
7,Tom and Huck,"[jonathantaylorthomas, bradrenfro, rachaelleig...",[],"[action, adventure, drama]",[peterhewitt]
8,Sudden Death,"[jean-claudevandamme, powersboothe, dorianhare...","[terrorist, hostag, explos]","[action, adventure, thriller]",[peterhyams]
9,GoldenEye,"[piercebrosnan, seanbean, izabellascorupco]","[cuba, falselyaccus, secretident]","[adventure, action, thriller]",[martincampbell]


In [11]:
metadata['combined'] =  metadata['keywords'] + metadata['cast'] + metadata['director'] + metadata['genres']
metadata['combined'] = metadata['combined'].apply(lambda x: ' '.join(x))
metadata['combined'].head()


0    jealousi toy boy tomhanks timallen donrickles ...
1    boardgam disappear basedonchildren'sbook robin...
2    fish bestfriend duringcreditssting waltermatth...
3    basedonnovel interracialrelationship singlemot...
4    babi midlifecrisi confid stevemartin dianekeat...
Name: combined, dtype: object

In [12]:
metadata = metadata.reset_index()
metadata[['index','title','combined']].to_csv("/content/movies_metadata.csv",index = False)

In [14]:
def get_recommendations(title):
    data = pd.read_csv("/content/movies_metadata.csv")
    count = CountVectorizer(stop_words='english')
    indices = pd.Series(data.index, index=data['title'])
    count_matrix = count.fit_transform(data['combined'].astype('U'))
    cosine_sim = cosine_similarity(count_matrix[indices[title]], count_matrix)
    sim_scores = list(enumerate(cosine_sim[0]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]

    return data['title'].iloc[movie_indices]



In [15]:
get_recommendations('Batman')

1349                     Batman Returns
1511                     Batman & Robin
18480    G.I. Joe: The Revenge of Cobra
150                      Batman Forever
1363                      Mars Attacks!
2546                           Superman
2547                        Superman II
11162                  Superman Returns
11889                      Spider-Man 3
21269                      Man of Steel
Name: title, dtype: object