In [None]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import warnings; warnings.simplefilter('ignore')

In [None]:
#loading the four CSV tables straight from the course GitHub repository
#(order of the URLs matches the order of the names on the left-hand side)
metadata, credits_db, keywords_db, small = map(
    pd.read_csv,
    (
        "https://raw.githubusercontent.com/Rohit01-zoey/CS419M/main/movies_metadata_clean.csv",
        "https://raw.githubusercontent.com/Rohit01-zoey/CS419M/main/credits_clean.csv",
        "https://raw.githubusercontent.com/Rohit01-zoey/CS419M/main/keywords_clean.csv",
        "https://raw.githubusercontent.com/Rohit01-zoey/CS419M/main/links_small.csv",
    ),
)

In [None]:
#dropping useless columns
#Re-assigning instead of inplace=True, and errors="ignore", keeps this cell
#re-runnable: a second run finds the column already gone and is a no-op
#instead of raising KeyError.
credits_db = credits_db.drop(columns="Unnamed: 0", errors="ignore")
keywords_db = keywords_db.drop(columns="Unnamed: 0", errors="ignore")

In [None]:
#restricting metadata to the five columns used later in the notebook
metadata = metadata[["genres", "id", "original_title", "vote_count", "vote_average"]]

In [None]:
#removing the three rows (selected by index label) that hold invalid data
metadata = metadata.drop(index=[19730, 29503, 35587])

**keywords_db**: Contains keywords corresponding to each movie

In [None]:
#previewing the keywords table (one row per movie id)
display(keywords_db)

Unnamed: 0,id,keywords
0,862,"['toy', 'boy', 'friendship', 'friend']"
1,8844,"[""based on children's book"", 'giant insect']"
2,15602,"['fish', 'best friend']"
3,31357,"['based on novel', 'interracial relationship']"
4,11862,"['daughter', 'mother daughter relationship']"
...,...,...
46414,439050,[]
46415,111109,"['artist', 'play', 'pinoy']"
46416,67758,[]
46417,227506,[]


**credits_db**: Contains cast and director corresponding to each movie

In [None]:
#previewing the credits table (cast list, movie id and director)
display(credits_db)

Unnamed: 0,cast,id,director
0,"['woody (voice)', 'buzz lightyear (voice)', 'm...",862,JohnLasseter
1,"['alan parrish', 'samuel alan parrish / van pe...",8844,
2,"['max goldman', 'john gustafson', 'ariel gusta...",15602,HowardDeutch
3,"[""savannah 'vannah' jackson"", ""bernadine 'bern...",31357,ForestWhitaker
4,"['george banks', 'nina banks', 'franck eggelho...",11862,
...,...,...,...
45471,"['', '', '']",439050,HamidNematollah
45472,"['sister angela', 'homer', 'crazy woman/virgin...",111109,LavDiaz
45473,"['emily shaw', 'det. mark winston', 'jayne fer...",67758,MarkL.Lester
45474,"['', '', '', '', '']",227506,YakovProtazanov


**metadata**: Contains id, movie title, IMDB vote counts, IMDB vote averages and genres for each movie 

In [None]:
#previewing the trimmed metadata table (genres, id, title and vote columns)
display(metadata)

Unnamed: 0,genres,id,original_title,vote_count,vote_average
0,"['Animation', 'Comedy', 'Family']",862,Toy Story,5415,7
1,"['Adventure', 'Fantasy', 'Family']",8844,Jumanji,2413,6
2,"['Romance', 'Comedy']",15602,Grumpier Old Men,92,6
3,"['Comedy', 'Drama', 'Romance']",31357,Waiting to Exhale,34,6
4,['Comedy'],11862,Father of the Bride Part II,173,5
...,...,...,...,...,...
45461,"['Drama', 'Family']",439050,رگ خواب,1,4
45462,['Drama'],111109,Siglo ng Pagluluwal,3,9
45463,"['Action', 'Drama', 'Thriller']",67758,Betrayal,6,3
45464,[],227506,Satana likuyushchiy,0,0


Repeating each director's name 3 times (an arbitrary choice) so that it carries more weight in the word counts

In [None]:
def _repeat_director(name, times=3):
    """Return the director's name repeated `times` times as a list.

    A missing director (NaN, i.e. anything that is not a string) yields an
    empty list. Without this, NaN would be repeated and later stringified,
    so the literal token "nan" would end up in the features of every movie
    with an unknown director and make unrelated movies look similar.
    A value that is already a list is returned unchanged, so re-running the
    cell does not nest or wipe the weighted names.
    """
    if isinstance(name, list):
        return name
    if isinstance(name, str):
        return [name] * times
    return []

#repeating the director's name to give it more weight than a single cast/keyword entry
credits_db["director"] = credits_db["director"].apply(_repeat_director)

In [None]:
#casting the "id" key of every table to int so the merges compare like with like
for _table in (keywords_db, credits_db, metadata):
    _table["id"] = _table["id"].astype("int")

In [None]:
#joining metadata with credits_db and then keywords_db on the shared "id" key
metadata_2 = (
    metadata
    .merge(credits_db, on = "id")
    .merge(keywords_db, on = "id")
)

Since we have limited computational power, we use only a fraction of this dataset, i.e. the movies whose id is present in the dataset "small"

**small**: Contains only a fraction of the movie IDs

In [None]:
#previewing the links table; its tmdbId column is used below to pick the subset of movies
display(small)

Unnamed: 0.1,Unnamed: 0,movieId,imdbId,tmdbId
0,0,1,114709,862.0
1,1,2,113497,8844.0
2,2,3,113228,15602.0
3,3,4,114885,31357.0
4,4,5,113041,11862.0
...,...,...,...,...
9120,9120,162672,3859980,402672.0
9121,9121,163056,4262980,315011.0
9122,9122,163949,2531318,391698.0
9123,9123,164977,27660,137608.0


In [None]:
#reducing small to its non-null tmdbId values, cast to int
small = small["tmdbId"].dropna().astype("int")

In [None]:
#keeping only the merged rows whose id appears among the tmdbId values in small
_in_small = metadata_2["id"].isin(small)
metadata_2 = metadata_2.loc[_in_small]

We will now use IMDB's weighted rating formula

**Weighted Rating** =  $\frac{v}{v+m} \cdot R + \frac{m}{v+m} \cdot C$

v: number of votes for the movie

R: average rating of the movie
 
C: mean vote across all movies

m: minimum votes required to be listed

We choose m to be a suitable threshold. In our case, we set $m$ to the 75th percentile (the 0.75 quantile) of the vote counts, which essentially means that we keep only the top 25% of movies by number of votes. This removes the movies that have received too few votes.

In [None]:
#converting non null values in vote_count and vote_average to int
vote_counts = metadata_2[metadata_2["vote_count"].notnull()]["vote_count"].astype("int")
vote_averages = metadata_2[metadata_2["vote_average"].notnull()]["vote_average"].astype("int")

#computing C (mean of the vote averages) and m (75th percentile of the vote counts)
#NOTE: in the cells visible here only m is used, as a minimum-votes cut-off;
#C is computed for the weighted-rating formula but is not applied below.
C = vote_averages.mean()
m = vote_counts.quantile(0.75)

#taking the movies with non null values and above the required threshold
#.copy() makes metadata_2 an independent frame, so the two column assignments
#below do not write into a slice of the previous frame (the chained-assignment
#situation pandas reports as SettingWithCopyWarning, which is silenced globally
#in the imports cell).
metadata_2 = metadata_2[
    (metadata_2["vote_count"] >= m)
    & (metadata_2["vote_count"].notnull())
    & (metadata_2["vote_average"].notnull())
].copy()

#converting to int
metadata_2["vote_count"] = metadata_2["vote_count"].astype("int")
metadata_2["vote_average"] = metadata_2["vote_average"].astype("int")

In [None]:
#parsing the stringified Python lists in these columns back into list objects
for _col in ("keywords", "genres", "cast"):
    metadata_2[_col] = metadata_2[_col].apply(literal_eval)

#trimming each cast list to its first 3 entries (shorter lists are kept whole)
metadata_2["cast"] = metadata_2["cast"].apply(lambda members: members[:3])

In [None]:
#turning each feature column into a string with all spaces stripped and everything lower-cased,
#so that a multi-word name or keyword collapses into a single token
for _col in ("director", "cast", "genres", "keywords"):
    metadata_2[_col] = (
        metadata_2[_col]
        .astype("str")
        .apply(lambda text: text.replace(" ", "").lower())
    )

In [None]:
#previewing the filtered table after the text columns were stripped of spaces and lower-cased
display(metadata_2)

Unnamed: 0,genres,id,original_title,vote_count,vote_average,cast,director,keywords
0,"['animation','comedy','family']",862,Toy Story,5415,7,"['woody(voice)','buzzlightyear(voice)','mr.pot...","['johnlasseter','johnlasseter','johnlasseter']","['toy','boy','friendship','friend']"
1,"['adventure','fantasy','family']",8844,Jumanji,2413,6,"['alanparrish','samuelalanparrish/vanpelt','ju...","[nan,nan,nan]","[""basedonchildren'sbook"",'giantinsect']"
5,"['action','crime','drama','thriller']",949,Heat,1886,7,"['lt.vincenthanna','neilmccauley','chrisshiher...","['michaelmann','michaelmann','michaelmann']","['bank','chase','thief','honor','murder','heis..."
9,"['adventure','action','thriller']",710,GoldenEye,1194,6,"['jamesbond','alectrevelyan','natalyafyodorovn...","['martincampbell','martincampbell','martincamp...","['cuba','computervirus','kgb']"
12,"['family','animation','adventure']",21032,Balto,423,7,"['balto(voice)','boristhegoose(voice)','jenna(...","[nan,nan,nan]","['wolf','alaska','dog']"
...,...,...,...,...,...,...,...,...
40500,"['action','horror','thriller']",316727,The Purge: Election Year,1356,6,"['senatorcharlieroan','leobarnes','joedixon']","['jamesdemonaco','jamesdemonaco','jamesdemonaco']","['dystopia','sequel']"
40501,['comedy'],316023,Mike and Dave Need Wedding Dates,901,5,"['davestangle','mikestangle','alice']","[nan,nan,nan]",['hawaii']
40554,"['family','animation']",399106,Piper,487,8,[],"['alanbarillaro','alanbarillaro','alanbarillaro']","['bird','fear','short']"
40631,"['mystery','adventure','crime']",328387,Nerve,2262,7,"['veedelmonico','ian','sydneysloane']","['henryjoost','henryjoost','henryjoost']","['basedonnovel','internet','game','basedonyoun..."


In [None]:
#building the "features" text of every movie by joining, in this order,
#its genres, cast, director and keywords strings (no separator in between)
metadata_2["features"] = metadata_2["genres"].str.cat(
    [metadata_2["cast"], metadata_2["director"], metadata_2["keywords"]]
)

In [None]:
#displaying final features we work on
#(each value is the concatenation of the genres, cast, director and keywords strings)
metadata_2["features"]

0        ['animation','comedy','family']['woody(voice)'...
1        ['adventure','fantasy','family']['alanparrish'...
5        ['action','crime','drama','thriller']['lt.vinc...
9        ['adventure','action','thriller']['jamesbond',...
12       ['family','animation','adventure']['balto(voic...
                               ...                        
40500    ['action','horror','thriller']['senatorcharlie...
40501    ['comedy']['davestangle','mikestangle','alice'...
40554    ['family','animation'][]['alanbarillaro','alan...
40631    ['mystery','adventure','crime']['veedelmonico'...
40831    ['crime','drama','thriller','western']['marcus...
Name: features, Length: 2305, dtype: object

In [None]:
#converting these words into a matrix of unigram and bigram counts
#min_df=1 keeps every term that occurs in at least one movie, which is the same
#vocabulary the previous integer 0 produced; recent scikit-learn releases validate
#parameters and reject an integer min_df below 1, so 0 makes this cell fail there.
words = CountVectorizer(analyzer = "word", ngram_range = (1, 2), min_df = 1, stop_words = 'english')
word_matrix = words.fit_transform(metadata_2["features"])

$cosine(x,y) = \frac{x \cdot y}{\lVert x \rVert \, \lVert y \rVert}$

We use this cosine similarity to compute the similarity between every pair of rows of the word-count matrix, and therefore between every pair of movies.

In [None]:
#pairwise cosine similarity between all rows of word_matrix
#(with a single argument the matrix is compared against itself)
cos_sim = cosine_similarity(word_matrix)

In [None]:
#renumbering the rows 0..n-1 so that row positions line up with the rows of cos_sim
metadata_2 = metadata_2.reset_index()

#titles: the title of each row; indices: lookup from a title to its row number
titles = metadata_2.loc[:, "original_title"]
indices = pd.Series(data = metadata_2.index, index = titles)

In [None]:
#defining a function to get recommendations
def recommendations(title, n=30):
    """Return the titles of the movies most similar to `title`.

    Reads the notebook-level objects `indices` (title -> row number),
    `cos_sim` (pairwise similarity matrix) and `titles` (row number -> title).

    Parameters
    ----------
    title : str
        Exact value of "original_title" of the movie to look up. If several
        movies share this title, the first one in the table is used.
    n : int, default 30
        Maximum number of recommendations to return (must not be negative).

    Returns
    -------
    pandas.Series
        Up to `n` titles, ordered from most to least similar. The looked-up
        movie itself is never part of the result.

    Raises
    ------
    KeyError
        If no movie with this title exists.
    ValueError
        If `n` is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative")

    # A boolean mask always yields a Series, even for duplicated titles;
    # plain indices[title] would return a Series only in the duplicate case
    # and break the row lookup below.
    matches = indices[indices.index == title]
    if matches.empty:
        raise KeyError(title)
    index = int(matches.iloc[0])

    scores = cos_sim[index]
    # Exclude the query movie by position instead of dropping the top-ranked
    # entry: another movie with similarity 1.0 and a lower row number would
    # otherwise sort first and be dropped in its place.
    # sorted() is stable, so ties keep their original row order.
    ranked = sorted(
        (pos for pos in range(len(scores)) if pos != index),
        key = lambda pos: scores[pos],
        reverse = True,
    )
    return titles.iloc[ranked[:n]]

In [None]:
#change movie name accordingly to get recommendations
#(the name must match a value of "original_title" exactly)
#change param in head to get top n movies
recommendations("2012").head(10)

1924                              After Earth
1937                          The World's End
622                Mad Max Beyond Thunderdome
861           Charlie's Angels: Full Throttle
1737                             Killer Elite
1986                              All Is Lost
2076                                 Hercules
2204    The Hunger Games: Mockingjay - Part 2
766                          The Time Machine
1134                                16 Blocks
Name: original_title, dtype: object