In [23]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')
# Directory holding the three input CSV files. Change this single constant to run
# the notebook on another machine.
DATA_DIR = "C:/Users/srisa/Desktop/project/AIandDA/code"
movies = pd.read_csv(f"{DATA_DIR}/movies_metadata.csv")
credits = pd.read_csv(f"{DATA_DIR}/credits.csv")
# Keywords (e.g. jealousy, fishing) attached to particular movies are also part of the metadata.
keywords = pd.read_csv(f"{DATA_DIR}/keywords.csv")
# Keep only the columns used below. .copy() makes the result an independent frame,
# so later column assignments do not write into a slice of the raw data.
movies = movies[['id','title','genres']].copy()
#clean movie_id function
def clean_id(x):
    """Convert a raw id value to ``int``, or return ``np.nan`` if it cannot be converted.

    Parameters
    ----------
    x : object
        Raw value from an ``id`` column (string, number, None, ...).

    Returns
    -------
    int or float
        ``int(x)`` when the conversion succeeds, otherwise ``np.nan`` so the
        caller can filter the row out with ``notnull()``.
    """
    try:
        return int(x)
    # Catch only the errors int() raises for unconvertible input (bad strings,
    # None, NaN, infinity). A bare ``except`` would also swallow
    # KeyboardInterrupt / SystemExit.
    except (TypeError, ValueError, OverflowError):
        return np.nan
# Coerce movie ids to integers; values clean_id cannot convert become NaN.
# .assign() builds a new frame instead of assigning into one that may be a slice.
movies = movies.assign(id=movies['id'].apply(clean_id))
# Drop rows whose id could not be parsed. .copy() keeps the filtered result
# independent, so the astype() below does not write into a slice.
movies = movies[movies['id'].notnull()].copy()
# Convert every id column to integer so the merge keys share one dtype.
movies['id'] = movies['id'].astype('int')
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
# Merge the 3 dataframes (inner joins on 'id') so all required data lives in `movies`.
# NOTE(review): if credits/keywords contain repeated ids, these joins multiply the
# matching movie rows - confirm, and de-duplicate on 'id' first if that is unwanted.
movies = movies.merge(credits, on='id')
movies = movies.merge(keywords, on='id')
movies.head()

Unnamed: 0,id,title,genres,cast,crew,keywords
0,862,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]","[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [24]:
# The four metadata columns arrive as strings; parse each one into Python
# objects (a list of dictionaries per movie).
for _col in ('genres', 'cast', 'crew', 'keywords'):
    movies[_col] = movies[_col].apply(literal_eval)


def _lowercase_names(entries):
    """Return the lower-cased 'name' of every dictionary in `entries`."""
    return [entry['name'].lower() for entry in entries]


def _lowercase_director_names(crew_entries):
    """Return the lower-cased names of crew members whose 'job' is 'Director'."""
    return [member['name'].lower() for member in crew_entries if member['job']=='Director']


# Genres, cast and keywords: keep just the names.
movies['genres'] = movies['genres'].apply(_lowercase_names)
# Crew: only directors are used from this column.
movies['crew'] = movies['crew'].apply(_lowercase_director_names)
movies['cast'] = movies['cast'].apply(_lowercase_names)
movies['keywords'] = movies['keywords'].apply(_lowercase_names)

# Keep at most 3 genres / cast members / keywords per movie (crew is not truncated).
for _col in ('genres', 'cast', 'keywords'):
    movies[_col] = movies[_col].apply(lambda names: names[:3])
movies.head()

Unnamed: 0,id,title,genres,cast,crew,keywords
0,862,Toy Story,"[animation, comedy, family]","[tom hanks, tim allen, don rickles]",[john lasseter],"[jealousy, toy, boy]"
1,8844,Jumanji,"[adventure, fantasy, family]","[robin williams, jonathan hyde, kirsten dunst]",[joe johnston],"[board game, disappearance, based on children'..."
2,15602,Grumpier Old Men,"[romance, comedy]","[walter matthau, jack lemmon, ann-margret]",[howard deutch],"[fishing, best friend, duringcreditsstinger]"
3,31357,Waiting to Exhale,"[comedy, drama, romance]","[whitney houston, angela bassett, loretta devine]",[forest whitaker],"[based on novel, interracial relationship, sin..."
4,11862,Father of the Bride Part II,[comedy],"[steve martin, diane keaton, martin short]",[charles shyer],"[baby, midlife crisis, confidence]"


In [26]:
def _strip_spaces(tokens):
    """Return `tokens` with every space character removed from each string."""
    return [token.replace(' ','') for token in tokens]


# Collapse multi-word names into single tokens (e.g. "tom hanks" -> "tomhanks").
# cast, crew and keywords are overwritten in place.
for _col in ('cast', 'crew', 'keywords'):
    movies[_col] = movies[_col].apply(_strip_spaces)
# The space-free genres go to a separate `genre` column; `genres` keeps its spaces.
movies['genre'] = movies['genres'].apply(_strip_spaces)

In [27]:
# Preview the first rows after space removal: cast, crew and keywords are now
# space-free, and the new `genre` column sits alongside `genres`.
movies.head()

Unnamed: 0,id,title,genres,cast,crew,keywords,genre
0,862,Toy Story,"[animation, comedy, family]","[tomhanks, timallen, donrickles]",[johnlasseter],"[jealousy, toy, boy]","[animation, comedy, family]"
1,8844,Jumanji,"[adventure, fantasy, family]","[robinwilliams, jonathanhyde, kirstendunst]",[joejohnston],"[boardgame, disappearance, basedonchildren'sbook]","[adventure, fantasy, family]"
2,15602,Grumpier Old Men,"[romance, comedy]","[waltermatthau, jacklemmon, ann-margret]",[howarddeutch],"[fishing, bestfriend, duringcreditsstinger]","[romance, comedy]"
3,31357,Waiting to Exhale,"[comedy, drama, romance]","[whitneyhouston, angelabassett, lorettadevine]",[forestwhitaker],"[basedonnovel, interracialrelationship, single...","[comedy, drama, romance]"
4,11862,Father of the Bride Part II,[comedy],"[stevemartin, dianekeaton, martinshort]",[charlesshyer],"[baby, midlifecrisis, confidence]",[comedy]


In [32]:
# X holds every column except 'title'; y holds the titles.
# drop(columns=...) replaces the earlier double-indexing and selects the same columns.
X = movies.drop(columns='title')
y = movies['title']
X.head()

Unnamed: 0,id,genres,cast,crew,keywords,genre
0,862,"[animation, comedy, family]","[tomhanks, timallen, donrickles]",[johnlasseter],"[jealousy, toy, boy]","[animation, comedy, family]"
1,8844,"[adventure, fantasy, family]","[robinwilliams, jonathanhyde, kirstendunst]",[joejohnston],"[boardgame, disappearance, basedonchildren'sbook]","[adventure, fantasy, family]"
2,15602,"[romance, comedy]","[waltermatthau, jacklemmon, ann-margret]",[howarddeutch],"[fishing, bestfriend, duringcreditsstinger]","[romance, comedy]"
3,31357,"[comedy, drama, romance]","[whitneyhouston, angelabassett, lorettadevine]",[forestwhitaker],"[basedonnovel, interracialrelationship, single...","[comedy, drama, romance]"
4,11862,[comedy],"[stevemartin, dianekeaton, martinshort]",[charlesshyer],"[baby, midlifecrisis, confidence]",[comedy]


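Note: the next three cells are a discarded experiment. They were an attempt to convert the string columns to floats for feature selection, but it did not work: the one-hot encoding is applied to the column *names*, not to the values in those columns. They can be skipped.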

In [34]:
# Take the column labels of X (an Index of names, not the column data) and display them.
a=X.columns
a

Index(['id', 'genres', 'cast', 'crew', 'keywords', 'genre'], dtype='object')

In [35]:
# One-hot encode the labels held in `a`. This encodes the column *names* (one row
# per name), not the values stored in those columns.
a = pd.get_dummies(a, dtype=float)
a

Unnamed: 0,cast,crew,genre,genres,id,keywords
0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0
5,0.0,0.0,1.0,0.0,0.0,0.0


In [36]:
# Remove the 'genre' indicator column from the one-hot frame and display the result.
a = a.drop('genre',axis=1)
a

Unnamed: 0,cast,crew,genres,id,keywords
0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0
5,0.0,0.0,0.0,0.0,0.0


In [None]:
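CONCEPT: BAG-OF-WORDS IN NLP. Each movie's genres, cast, director(s) and keywords are joined into one space-separated string ("metadata") so that it can be vectorised as word counts.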

In [38]:
# Build one space-separated "document" per movie from its token lists.
# The space-stripped `genre` column is used rather than `genres`, so a multi-word
# genre stays a single token, consistent with how cast, crew and keywords were
# normalised. With `genres`, such a genre would split into separate words.
METADATA_COLUMNS = ['genre', 'cast', 'crew', 'keywords']
movies['metadata'] = movies.apply(
    # Join the tokens of each column, then join the four column strings with single spaces.
    lambda row: ' '.join(' '.join(row[col]) for col in METADATA_COLUMNS),
    axis=1,
)
movies.head()

Unnamed: 0,id,title,genres,cast,crew,keywords,genre,metadata
0,862,Toy Story,"[animation, comedy, family]","[tomhanks, timallen, donrickles]",[johnlasseter],"[jealousy, toy, boy]","[animation, comedy, family]",animation comedy family tomhanks timallen donr...
1,8844,Jumanji,"[adventure, fantasy, family]","[robinwilliams, jonathanhyde, kirstendunst]",[joejohnston],"[boardgame, disappearance, basedonchildren'sbook]","[adventure, fantasy, family]",adventure fantasy family robinwilliams jonatha...
2,15602,Grumpier Old Men,"[romance, comedy]","[waltermatthau, jacklemmon, ann-margret]",[howarddeutch],"[fishing, bestfriend, duringcreditsstinger]","[romance, comedy]",romance comedy waltermatthau jacklemmon ann-ma...
3,31357,Waiting to Exhale,"[comedy, drama, romance]","[whitneyhouston, angelabassett, lorettadevine]",[forestwhitaker],"[basedonnovel, interracialrelationship, single...","[comedy, drama, romance]",comedy drama romance whitneyhouston angelabass...
4,11862,Father of the Bride Part II,[comedy],"[stevemartin, dianekeaton, martinshort]",[charlesshyer],"[baby, midlifecrisis, confidence]",[comedy],comedy stevemartin dianekeaton martinshort cha...


In [None]:
DUE TO MEMORY CONSTRAINTS, ONLY THE FIRST 10,000 ROWS ARE USED (the pairwise similarity matrix grows quadratically with the number of movies).

In [47]:
# Cap the working set: only the first MAX_MOVIES rows are carried forward.
MAX_MOVIES = 10000
movies_df = movies.head(MAX_MOVIES)
movies_df

Unnamed: 0,id,title,genres,cast,crew,keywords,genre,metadata
0,862,Toy Story,"[animation, comedy, family]","[tomhanks, timallen, donrickles]",[johnlasseter],"[jealousy, toy, boy]","[animation, comedy, family]",animation comedy family tomhanks timallen donr...
1,8844,Jumanji,"[adventure, fantasy, family]","[robinwilliams, jonathanhyde, kirstendunst]",[joejohnston],"[boardgame, disappearance, basedonchildren'sbook]","[adventure, fantasy, family]",adventure fantasy family robinwilliams jonatha...
2,15602,Grumpier Old Men,"[romance, comedy]","[waltermatthau, jacklemmon, ann-margret]",[howarddeutch],"[fishing, bestfriend, duringcreditsstinger]","[romance, comedy]",romance comedy waltermatthau jacklemmon ann-ma...
3,31357,Waiting to Exhale,"[comedy, drama, romance]","[whitneyhouston, angelabassett, lorettadevine]",[forestwhitaker],"[basedonnovel, interracialrelationship, single...","[comedy, drama, romance]",comedy drama romance whitneyhouston angelabass...
4,11862,Father of the Bride Part II,[comedy],"[stevemartin, dianekeaton, martinshort]",[charlesshyer],"[baby, midlifecrisis, confidence]",[comedy],comedy stevemartin dianekeaton martinshort cha...
...,...,...,...,...,...,...,...,...
9995,44181,National Lampoon's Gold Diggers,[comedy],"[willfriedle, chrisowen, louiselasser]",[garypreisler],[],[comedy],comedy willfriedle chrisowen louiselasser gary...
9996,24619,Blind Horizon,"[drama, thriller]","[valkilmer, nevecampbell, samshepard]",[michaelhaussman],[],"[drama, thriller]",drama thriller valkilmer nevecampbell samshepa...
9997,70926,Islands in the Stream,[drama],"[georgec.scott, davidhemmings, gilbertroland]",[franklinj.schaffner],[],[drama],drama georgec.scott davidhemmings gilbertrolan...
9998,37214,Go for Broke!,"[action, drama, war]","[vanjohnson, lanenakano, georgemiki]",[robertpirosh],"[worldwarii, army]","[action, drama, war]",action drama war vanjohnson lanenakano georgem...


FINDING THE SIMILARITY BETWEEN MOVIES USING COSINE SIMILARITY

In [48]:
# Turn each movie's metadata string into a vector of token counts,
# ignoring English stop words.
count_vec = CountVectorizer(stop_words='english')
count_vec_matrix = count_vec.fit_transform(movies_df['metadata'])
# With a single argument, cosine_similarity compares every row with every other row.
cosine_sim_matrix = cosine_similarity(count_vec_matrix)
# Lookup from movie title to its index label in movies_df, used as the row
# number in cosine_sim_matrix.
mapping = pd.Series(data=movies_df.index, index=movies_df['title'])

Recommender function: given a movie title, return the titles of the movies whose metadata is most similar.

In [42]:
def recommend_movies_based_on_metadata(movie_input, top_n=14):
    """Return the titles of the movies most similar to `movie_input`.

    Reads the module-level `mapping` (title -> row index), `cosine_sim_matrix`
    (pairwise similarity scores) and `movies_df` (source of the titles).

    Parameters
    ----------
    movie_input : str
        Title of the movie to find recommendations for.
    top_n : int, default 14
        Maximum number of recommendations to return. The default matches the
        number of results this function has always returned.

    Returns
    -------
    pandas.Series
        Titles of the recommended movies, most similar first. The queried
        movie itself is never included.

    Raises
    ------
    KeyError
        If `movie_input` is not a title present in `mapping`.
    ValueError
        If `top_n` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    movie_index = mapping[movie_input]
    # Several movies can share a title; the lookup then yields a Series of
    # indices instead of a scalar. Use the first match.
    if isinstance(movie_index, pd.Series):
        movie_index = movie_index.iloc[0]
    movie_index = int(movie_index)

    # Similarity of the queried movie to every movie (including itself).
    scores = np.asarray(cosine_sim_matrix[movie_index])
    # Rank by descending score; the stable sort keeps ties in index order.
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the queried movie by index, not by assuming it is ranked first.
    # A tied score could otherwise drop a real match and recommend the query itself.
    ranked = ranked[ranked != movie_index][:top_n]
    return movies_df['title'].iloc[ranked]

In [49]:
# Example query: list the movies whose metadata is most similar to 'Blind Horizon'.
recommend_movies_based_on_metadata('Blind Horizon')

1648              Ill Gotten Gains
3487    Jails, Hospitals & Hip-Hop
8331                        Fabled
1801               Little Boy Blue
3940                 Kill Me Again
1391              Hearts and Minds
2715             The Pelican Brief
9506                        Trauma
3020                       Country
5763                   Raggedy Man
9586          When Will I Be Loved
111               Before and After
458                  Guilty as Sin
627                          Frisk
Name: title, dtype: object