In [1]:
## importing libraries
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")


from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
## functions used:

## converst to int type.
def convert_int(x):
    try:
        return int(x)
    except:
        return 0
    
    
## function to extract director of the movie from its respective column
def find_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan



## extract the leads of the movie from its respective column
def lead_cast(x):
    if isinstance(x, list):
        leads = []
        for i in x:
            leads.append(i['name'])
        return leads
    return []
            
    
    
## extract frequent occuring keyword from its respective column
def keyword_extractor(x):
    if isinstance(x, list):
        keywords = []
        for i in x:
            keywords.append(i['name'])
        
        return keywords
    
    return []
    


def freq_words(x):
    frequeunt_words = {}
    for i in x:
        for word in i:
            if word not in frequeunt_words:
                frequeunt_words[word] = 1
            else:
                frequeunt_words[word] += 1
                
    
                
    return frequeunt_words



## filters commonly occuring keywords.
def filtered_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
            
    return words


## function that filters types of genre a particular movie belongs to. A movie can have more than 1 genre.
def get_genres(x):
    if isinstance(x, list):
        genres = []
        for i in x:
            genres.append(i['name'])
        
        return genres
        
    return []


In [3]:
## reading the movie data file
movie_df = pd.read_csv('movies_metadata.csv')
movie_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [4]:
## check for the shape of the movie file
movie_df.shape

(45466, 24)

In [5]:
## convert datetime column to year column
movie_df['year'] = movie_df['release_date'].apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
movie_df['year'] = movie_df['year'].apply(convert_int)
movie_df['year']

0        1995
1        1995
2        1995
3        1995
4        1995
         ... 
45461       0
45462    2011
45463    2003
45464    1917
45465    2017
Name: year, Length: 45466, dtype: int64

In [6]:
## data types of movie dataset.
movie_df.dtypes

adult                     object
belongs_to_collection     object
budget                    object
genres                    object
homepage                  object
id                        object
imdb_id                   object
original_language         object
original_title            object
overview                  object
popularity                object
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                  float64
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                     object
vote_average             float64
vote_count               float64
year                       int64
dtype: object

In [7]:
## take only relevant columns for processing.
## it is found that 'id','genres','title','vote_average','vote_count','year' are relevant for this particular problem set.
relevant_cols = ['id','genres','title','vote_average','vote_count','year']
filter_df = movie_df[relevant_cols]
filter_df.dtypes

id               object
genres           object
title            object
vote_average    float64
vote_count      float64
year              int64
dtype: object

In [8]:
## convert id from string to int.
filter_df['id'] = filter_df['id'].apply(convert_int)

In [9]:
## meta data 1 : credits data set has character of the movie and crew information including the director
credits_df = pd.read_csv('credits.csv')
credits_df.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [10]:
## meta data 2: keywords are important while choosing a movie, it gives an abstract overview of the tone of the movie
keyW_df = pd.read_csv('keywords.csv')
keyW_df.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


##### merge the meta data 1 and 2 with our filtered movie dataset , merging is done on id

In [11]:
## merging credits and keyword dataframes to filtered movie dataset
filter_df = filter_df.merge(credits_df, on='id')
filter_df = filter_df.merge(keyW_df, on='id')

In [12]:
## merged data set
filter_df

Unnamed: 0,id,genres,title,vote_average,vote_count,year,cast,crew,keywords
0,862,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",Toy Story,7.7,5415.0,1995,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",Jumanji,6.9,2413.0,1995,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",Grumpier Old Men,6.5,92.0,1995,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",Waiting to Exhale,6.1,34.0,1995,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 35, 'name': 'Comedy'}]",Father of the Bride Part II,5.7,173.0,1995,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."
...,...,...,...,...,...,...,...,...,...
46623,439050,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",Subdue,4.0,1.0,0,"[{'cast_id': 0, 'character': '', 'credit_id': ...","[{'credit_id': '5894a97d925141426c00818c', 'de...","[{'id': 10703, 'name': 'tragic love'}]"
46624,111109,"[{'id': 18, 'name': 'Drama'}]",Century of Birthing,9.0,3.0,2011,"[{'cast_id': 1002, 'character': 'Sister Angela...","[{'credit_id': '52fe4af1c3a36847f81e9b15', 'de...","[{'id': 2679, 'name': 'artist'}, {'id': 14531,..."
46625,67758,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",Betrayal,3.8,6.0,2003,"[{'cast_id': 6, 'character': 'Emily Shaw', 'cr...","[{'credit_id': '52fe4776c3a368484e0c8387', 'de...",[]
46626,227506,[],Satan Triumphant,0.0,0.0,1917,"[{'cast_id': 2, 'character': '', 'credit_id': ...","[{'credit_id': '533bccebc3a36844cf0011a7', 'de...",[]


In [13]:
## convert the string to its corresponding type of evaluation purpose.
from ast import literal_eval

In [14]:
filter_df['cast'] = filter_df['cast'].apply(literal_eval)

In [15]:
filter_df['crew'] = filter_df['crew'].apply(literal_eval)

In [16]:
filter_df['keywords'] = filter_df['keywords'].apply(literal_eval)

#### using the crew column to extract out the director of the movie.
#### using the cast column to get lead actors.


In [17]:
filter_df['director'] = filter_df['crew'].apply(find_director) 
filter_df['director']

0           John Lasseter
1            Joe Johnston
2           Howard Deutch
3         Forest Whitaker
4           Charles Shyer
               ...       
46623    Hamid Nematollah
46624            Lav Diaz
46625      Mark L. Lester
46626    Yakov Protazanov
46627       Daisy Asquith
Name: director, Length: 46628, dtype: object

In [18]:
filter_df['cast'] = filter_df['cast'].apply(lead_cast)
filter_df['cast']

0        [Tom Hanks, Tim Allen, Don Rickles, Jim Varney...
1        [Robin Williams, Jonathan Hyde, Kirsten Dunst,...
2        [Walter Matthau, Jack Lemmon, Ann-Margret, Sop...
3        [Whitney Houston, Angela Bassett, Loretta Devi...
4        [Steve Martin, Diane Keaton, Martin Short, Kim...
                               ...                        
46623          [Leila Hatami, Kourosh Tahami, Elham Korda]
46624    [Angel Aquino, Perry Dizon, Hazel Orencio, Joe...
46625    [Erika Eleniak, Adam Baldwin, Julie du Page, J...
46626    [Iwan Mosschuchin, Nathalie Lissenko, Pavel Pa...
46627                                                   []
Name: cast, Length: 46628, dtype: object

##### the side roles do not impact the overall experince of the movie, we can avoid side roles and focus on the main characters, which in most probable would be played by A-list actors.

In [19]:
filter_df['cast'] = filter_df['cast'].apply(lambda x : x[:2] if len(x) > 2 else x)

In [20]:
filter_df['cast']

0                       [Tom Hanks, Tim Allen]
1              [Robin Williams, Jonathan Hyde]
2                [Walter Matthau, Jack Lemmon]
3            [Whitney Houston, Angela Bassett]
4                 [Steve Martin, Diane Keaton]
                         ...                  
46623           [Leila Hatami, Kourosh Tahami]
46624              [Angel Aquino, Perry Dizon]
46625            [Erika Eleniak, Adam Baldwin]
46626    [Iwan Mosschuchin, Nathalie Lissenko]
46627                                       []
Name: cast, Length: 46628, dtype: object

In [21]:
filter_df['keywords'] = filter_df['keywords'].apply(keyword_extractor)
filter_df['keywords']

0        [jealousy, toy, boy, friendship, friends, riva...
1        [board game, disappearance, based on children'...
2        [fishing, best friend, duringcreditsstinger, o...
3        [based on novel, interracial relationship, sin...
4        [baby, midlife crisis, confidence, aging, daug...
                               ...                        
46623                                        [tragic love]
46624                                [artist, play, pinoy]
46625                                                   []
46626                                                   []
46627                                                   []
Name: keywords, Length: 46628, dtype: object

In [22]:
freq_word_dict =  freq_words(filter_df['keywords'])

In [23]:
s = filter_df.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'

In [24]:
s = s.value_counts()

In [25]:
s = s[s > 1]

In [26]:
s.describe()

count    11397.000000
mean        13.238747
std         49.614615
min          2.000000
25%          2.000000
50%          4.000000
75%         10.000000
max       3128.000000
Name: keyword, dtype: float64

In [27]:
filter_df['keywords'] = filter_df['keywords'].apply(filtered_keywords)

In [28]:
filter_df

Unnamed: 0,id,genres,title,vote_average,vote_count,year,cast,crew,keywords,director
0,862,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",Toy Story,7.7,5415.0,1995,"[Tom Hanks, Tim Allen]","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[jealousy, toy, boy, friendship, friends, riva...",John Lasseter
1,8844,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",Jumanji,6.9,2413.0,1995,"[Robin Williams, Jonathan Hyde]","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[board game, disappearance, based on children'...",Joe Johnston
2,15602,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",Grumpier Old Men,6.5,92.0,1995,"[Walter Matthau, Jack Lemmon]","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[fishing, best friend, duringcreditsstinger, o...",Howard Deutch
3,31357,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",Waiting to Exhale,6.1,34.0,1995,"[Whitney Houston, Angela Bassett]","[{'credit_id': '52fe44779251416c91011acb', 'de...","[based on novel, interracial relationship, sin...",Forest Whitaker
4,11862,"[{'id': 35, 'name': 'Comedy'}]",Father of the Bride Part II,5.7,173.0,1995,"[Steve Martin, Diane Keaton]","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[baby, midlife crisis, confidence, aging, daug...",Charles Shyer
...,...,...,...,...,...,...,...,...,...,...
46623,439050,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",Subdue,4.0,1.0,0,"[Leila Hatami, Kourosh Tahami]","[{'credit_id': '5894a97d925141426c00818c', 'de...",[tragic love],Hamid Nematollah
46624,111109,"[{'id': 18, 'name': 'Drama'}]",Century of Birthing,9.0,3.0,2011,"[Angel Aquino, Perry Dizon]","[{'credit_id': '52fe4af1c3a36847f81e9b15', 'de...","[artist, play, pinoy]",Lav Diaz
46625,67758,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",Betrayal,3.8,6.0,2003,"[Erika Eleniak, Adam Baldwin]","[{'credit_id': '52fe4776c3a368484e0c8387', 'de...",[],Mark L. Lester
46626,227506,[],Satan Triumphant,0.0,0.0,1917,"[Iwan Mosschuchin, Nathalie Lissenko]","[{'credit_id': '533bccebc3a36844cf0011a7', 'de...",[],Yakov Protazanov


In [29]:
filter_df['genres'] = filter_df['genres'].apply(literal_eval).apply(get_genres)

In [30]:
filter_df['genres']

0         [Animation, Comedy, Family]
1        [Adventure, Fantasy, Family]
2                   [Romance, Comedy]
3            [Comedy, Drama, Romance]
4                            [Comedy]
                     ...             
46623                 [Drama, Family]
46624                         [Drama]
46625       [Action, Drama, Thriller]
46626                              []
46627                              []
Name: genres, Length: 46628, dtype: object

In [31]:
filter_df['cast'] = filter_df['cast'].apply(lambda x: [str.lower(i.replace(" ","")) for i in x])

In [32]:
## check for missing values.
filter_df['director'].isna().sum()

917

In [33]:
## fill the missing values in director columns with unknown
filter_df['director'] = filter_df['director'].fillna('unknown')

In [34]:
## make the names to lower case
filter_df['director'] = filter_df['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
filter_df['director']

0           johnlasseter
1            joejohnston
2           howarddeutch
3         forestwhitaker
4           charlesshyer
              ...       
46623    hamidnematollah
46624            lavdiaz
46625       markl.lester
46626    yakovprotazanov
46627       daisyasquith
Name: director, Length: 46628, dtype: object

In [35]:
## drop the cre columns
filter_df.drop('crew', axis = 1, inplace = True)

In [36]:
## extract the genres
filter_df['genres'] = filter_df['genres'].apply(lambda x : [str.lower(i.replace(" ","")) for i in x])
filter_df['genres']

0         [animation, comedy, family]
1        [adventure, fantasy, family]
2                   [romance, comedy]
3            [comedy, drama, romance]
4                            [comedy]
                     ...             
46623                 [drama, family]
46624                         [drama]
46625       [action, drama, thriller]
46626                              []
46627                              []
Name: genres, Length: 46628, dtype: object

### "keyword" pre-processing


In [37]:
## making a copy of the filtered dataframe for processing.
copy_df = filter_df.copy(deep = True)

In [38]:
copy_df['mix'] = ""

In [39]:
## making a column by mising keywords, cast, genres and director
for i in range(len(copy_df)):
    copy_df['mix'][i] = copy_df['keywords'][i] + copy_df['cast'][i] + copy_df['genres'][i]
    copy_df['mix'][i].append(copy_df['director'][i])
    

In [40]:
copy_df['mix'] = copy_df['mix'].apply(lambda x: ' '.join(x))

In [41]:
copy_df['mix']

0        jealousy toy boy friendship friends rivalry bo...
1        board game disappearance based on children's b...
2        fishing best friend duringcreditsstinger old m...
3        based on novel interracial relationship single...
4        baby midlife crisis confidence aging daughter ...
                               ...                        
46623    tragic love leilahatami kouroshtahami drama fa...
46624    artist play pinoy angelaquino perrydizon drama...
46625    erikaeleniak adambaldwin action drama thriller...
46626     iwanmosschuchin nathalielissenko yakovprotazanov
46627                                         daisyasquith
Name: mix, Length: 46628, dtype: object

In [42]:
##since the dataframe is large, we will cut down the size in order vecotrization. 
part_df = copy_df.iloc[:28000,:]

In [43]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(part_df['mix'])

In [None]:
### cosine similarity to transform text used to transform a given text into a vector on the basis of the frequency (count) of each word that occurs in the entire text.
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [None]:
part_df = part_df.reset_index()
titles = part_df['title']
indices = pd.Series(part_df.index, index = part_df['title'])

In [None]:
# def get_recommendations(title):
#     idx = indices[title]
#     sim_scores = list(enumerate(cosine_sim[idx]))
#     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
#     sim_scores = sim_scores[1:31]
#     movie_indices = [i[0] for i in sim_scores]
#     return titles.iloc[movie_indices]

In [None]:
# get_recommendations('Braveheart').head(10)

In [None]:
vote_counts = movie_df[movie_df['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = movie_df[movie_df['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
C

In [None]:
m = vote_counts.quantile(0.95)
m

In [None]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

## recommendation function
def recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = movie_df.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [None]:
recommendations('Braveheart').head(5)

#### title similar to "Braveheart" are 
1. Bicycle Thieves
2. The Skin I Live In	
3. Sudden Impact	
4. Drunken Angel	
5. La Grande Bouffe