# Problem: IMDB Dataset Recommender
### Problem class: Metadata-Based recommender system
### Problem dataset link: https://bit.ly/33IAohl
### Problem description:
     Create a metadata-based recommendation system using the IMDB movies dataset.

### Problem Task:
     Recommend the top 10 most similar movies.

# Importing libraries

In [2]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set()

# Load the dataset into a pandas dataframe

In [5]:
# Load the raw movies metadata file
original_df = pd.read_csv("data/movies_metadata.csv", low_memory=False)

# Preview the raw frame. `df` is only created in a later cell, so referencing
# it here would raise a NameError on a fresh kernel.
original_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [7]:
# Read the pre-cleaned metadata file
df = pd.read_csv("data/metadata_clean.csv", low_memory=False)

# Bring the overview text and the movie id over from the raw frame.
# NOTE(review): the assignment aligns rows on the index, so it assumes the
# cleaned file keeps the raw file's row order — confirm.
df['overview'] = original_df['overview']
df['id'] = original_df['id']


# Supporting tables: credits (cast and crew) and keywords
# credits source: https://www.kaggle.com/rounakbanik/the-movies-dataset/downloads/credits.csv/7
credit_df = pd.read_csv("data/credits.csv", low_memory=False)
keywords_df = pd.read_csv("data/keywords.csv", low_memory=False)

# Preview the cleaned frame with the two added columns
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year,overview,id
0,Toy Story,"['Animation', 'Comedy', 'Family']",81.0,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his ...",862
1,Jumanji,"['Adventure', 'Fantasy', 'Family']",104.0,6.9,2413.0,1995,When siblings Judy and Peter discover an encha...,8844
2,Grumpier Old Men,"['Romance', 'Comedy']",101.0,6.5,92.0,1995,A family wedding reignites the ancient feud be...,15602
3,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']",127.0,6.1,34.0,1995,"Cheated on, mistreated and stepped on, the wom...",31357
4,Father of the Bride Part II,['Comedy'],106.0,5.7,173.0,1995,Just when George Banks has recovered from his ...,11862


In [21]:
df.dtypes

title            object
genres           object
runtime         float64
vote_average    float64
vote_count      float64
year              int64
overview         object
id               object
dtype: object

In [None]:
# Convert the id of the df into int
# df['id'] = df['id'].astype('int') ## will arise value error

In [27]:
# Function to convert all non-integer IDs to NaN
def clean_ids(x):
    """Convert a raw ID value to ``int``, or return ``np.nan`` if it cannot be converted.

    Parameters
    ----------
    x : any
        Raw value from the ``id`` column (for example ``"862"`` or a stray
        non-numeric string).

    Returns
    -------
    int or float
        ``int(x)`` when the conversion succeeds, otherwise ``np.nan``.
    """
    try:
        return int(x)
    # Catch only conversion failures: ValueError (non-numeric text, NaN),
    # TypeError (None and other unsupported types), OverflowError (infinity).
    # A bare ``except`` would also swallow KeyboardInterrupt and SystemExit.
    except (TypeError, ValueError, OverflowError):
        return np.nan

In [28]:
# Coerce every id to an int; values that cannot be converted become NaN
df['id'] = df['id'].apply(clean_ids)

# Keep only the rows that have a valid id. The explicit .copy() makes the
# result an independent frame, so later column assignments on `df` are not
# chained assignments on a slice of the previous frame.
df = df[df['id'].notnull()].copy()

In [30]:
# Give the join key the same integer dtype in all three tables
credit_df['id'] = credit_df['id'].astype('int')
keywords_df['id'] = keywords_df['id'].astype('int')
df['id'] = df['id'].astype('int')

In [31]:
# Attach the credits and then the keywords to the metadata, joining on the movie id
df = df.merge(credit_df, on='id').merge(keywords_df, on='id')

df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year,overview,id,cast,crew,keywords
0,Toy Story,"['Animation', 'Comedy', 'Family']",81.0,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his ...",862,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,Jumanji,"['Adventure', 'Fantasy', 'Family']",104.0,6.9,2413.0,1995,When siblings Judy and Peter discover an encha...,8844,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,Grumpier Old Men,"['Romance', 'Comedy']",101.0,6.5,92.0,1995,A family wedding reignites the ancient feud be...,15602,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']",127.0,6.1,34.0,1995,"Cheated on, mistreated and stepped on, the wom...",31357,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,Father of the Bride Part II,['Comedy'],106.0,5.7,173.0,1995,Just when George Banks has recovered from his ...,11862,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


## Wrangling Keywords, cast and crew column

In [None]:
# TODO: 
    ## include only the top three keywords
    ## include the top three stars in our cast.
    ## Convert crew into director.extract only the director of the movie and ignore all other crew members.
    
# The list/dict columns are stored as strings; parse each one back into a
# native Python object. literal_eval only evaluates literals, not arbitrary code.
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    stringified = df[feature]
    df[feature] = stringified.apply(literal_eval)

In [34]:
# Show the first crew member of the first movie in df
df.iloc[0]['crew'][0]

{'credit_id': '52fe4284c3a36847f8024f49',
 'department': 'Directing',
 'gender': 2,
 'id': 7879,
 'job': 'Director',
 'name': 'John Lasseter',
 'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'}

In [35]:
# Extract the director's name. if director is not listed, return  NaN
def get_director(x):
    """Return the name of the first crew member whose job is ``'Director'``.

    Parameters
    ----------
    x : list of dict
        Crew entries; each entry is expected to carry ``'job'`` and ``'name'``
        keys.

    Returns
    -------
    str or float
        The director's name, or ``np.nan`` when no director is listed or when
        ``x`` is not a list (missing or malformed data).
    """
    # Treat missing/malformed crew data as "no director" instead of crashing,
    # the same way generate_list tolerates non-list input.
    if not isinstance(x, list):
        return np.nan
    for crew_member in x:
        # .get() skips entries that have no 'job' key instead of raising KeyError
        if crew_member.get('job') == 'Director':
            return crew_member.get('name', np.nan)
    return np.nan

In [36]:
# Define the new director feature
df['director'] = df['crew'].apply(get_director)

# Print the directors of the first 5 movies
df['director'].head()

0      John Lasseter
1       Joe Johnston
2      Howard Deutch
3    Forest Whitaker
4      Charles Shyer
Name: director, dtype: object

In [37]:
# Return the names of the first 3 elements, or of the entire list if it has 3 or fewer.
def generate_list(x:list):
    """Return up to the first three ``'name'`` values from a list of dicts.

    Anything that is not a list (missing or malformed data) yields an empty
    list.
    """
    # Guard clause: missing/malformed data has no names to extract
    if not isinstance(x, list):
        return []
    # Collect every name first, then keep at most the first three
    all_names = [entry['name'] for entry in x]
    return all_names[:3]

In [38]:
# Genres are already plain lists of names: keep at most the first three
df['genres'] = df['genres'].apply(lambda genre_names: genre_names[:3])

# Cast and keywords are lists of dicts: reduce each to at most three names
df['keywords'] = df['keywords'].apply(generate_list)
df['cast'] = df['cast'].apply(generate_list)

In [39]:
# Print the new features of the first 5 movies along with title
df[['title', 'cast', 'director', 'keywords', 'genres']].head(5)

Unnamed: 0,title,cast,director,keywords,genres
0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,"[jealousy, toy, boy]","[Animation, Comedy, Family]"
1,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",Joe Johnston,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]"
2,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]",Howard Deutch,"[fishing, best friend, duringcreditsstinger]","[Romance, Comedy]"
3,Waiting to Exhale,"[Whitney Houston, Angela Bassett, Loretta Devine]",Forest Whitaker,"[based on novel, interracial relationship, sin...","[Comedy, Drama, Romance]"
4,Father of the Bride Part II,"[Steve Martin, Diane Keaton, Martin Short]",Charles Shyer,"[baby, midlife crisis, confidence]",[Comedy]


In [40]:
# Building a vectorizer to build document vectors.
## Reason: 
# If two actors had the same first name (say, Ryan Reynolds and Ryan Gosling), the vectorizer will treat 
# both Ryans as the same, although they are clearly different entities. This will impact the quality of 
# the recommendations we receive. If a person likes Ryan Reynolds' movies, it doesn't imply that they like
# movies by all Ryans.

# Function to sanitize data to prevent ambiguity
# Removes spaces and converts to lowercase
def sanitize(x):
    """Remove spaces and lowercase a string or every string in a list.

    A list returns a list of cleaned strings, a string returns one cleaned
    string, and any other value (for example a missing director) returns ``''``.
    """
    if isinstance(x, list):
        # e.g. ['Tom Hanks'] -> ['tomhanks']
        return [item.replace(" ", '').lower() for item in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    # Neither a list nor a string: treat as missing
    return ''

In [41]:
# Normalise every metadata column that feeds the soup: spaces removed, lowercased
features = ['cast', 'director', 'keywords', 'genres']
for feature in features:
    raw_values = df[feature]
    df[feature] = raw_values.apply(sanitize)

### Creating the metadata soup

In [45]:
# Function that creates a soup out of the desired metadata
def create_soup(x):
    """Build one space-separated string from a row's keywords, cast, director and genres.

    ``x`` is indexed by ``'keywords'``, ``'cast'``, ``'director'`` and
    ``'genres'``; the three list fields are joined with spaces and the director
    is used as a single string.
    """
    parts = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(parts)

In [46]:
# Create the new soup feature
df['soup'] = df.apply(create_soup, axis=1)

In [47]:
# Display the soup of the first movie
df.iloc[0]['soup']

'jealousy toy boy tomhanks timallen donrickles johnlasseter animation comedy family'

# CountVectorizer

In [48]:
# CountVectorizer is the simplest type of vectorizer
# Example: 
#     we have three documents, A, B, and C, which are as follows:
#     A: The sun is a star.
#     B: My love is like a red, red rose
#     C: Mary had a little lamb

# step 1: convert these documents into their vector forms using CountVectorizer
# step 2: compute the size of the vocabulary. The vocabulary is the set of unique words present across all documents
#         The vocabulary for this set of three documents is as follows: the, sun, is, a, star, my, love, like, 
#         red, rose, mary, had, little, lamb. Consequently, the size of the vocabulary is 14
# step 3: eliminating the stop words
#         V: like, little, lamb, love, mary, red, rose, sun, star
# The size of our vocabulary is now nine. Therefore, our documents will be represented as nine-dimensional vectors,
# and each dimension here will represent the number of times a particular word occurs in a document

## Applying CountVectorizer in A B C
#     A: (0, 0, 0, 0, 0, 0, 0, 1, 1)
#     B: (1, 0, 0, 1, 0, 2, 1, 0, 0)
#     C: (0, 1, 1, 0, 1, 0, 0, 0, 0)

In [None]:
# Reason to use CountVectorizer
# We use CountVectorizer instead of TfidfVectorizer. This is because TfidfVectorizer would give less weight to actors and directors
# who have acted in or directed a relatively large number of movies

In [49]:
# Turn every soup string into a vector of token counts, ignoring English stop words
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
soup_documents = df['soup']
count_matrix = count.fit_transform(soup_documents)

In [None]:
# Pairwise cosine similarity between all movies' count vectors
from sklearn.metrics.pairwise import cosine_similarity

# With a single argument the matrix is compared against itself, which is the
# same as passing it twice.
cosine_sim = cosine_similarity(count_matrix)

In [None]:
# Renumber the rows 0..n-1 so positions match the rows of the similarity matrix.
# drop=True discards the old index instead of inserting it as a stray 'index'
# column, which also keeps this cell safe to re-run.
df = df.reset_index(drop=True)

# Reverse mapping: movie title -> row position in df
indices = pd.Series(df.index, index=df['title'])

In [None]:
# Function that takes in movie title as input and gives recommendations
def content_based_recommnder(title:str, cosine_sim:np.ndarray=cosine_sim, df:pd.DataFrame=df, indices:pd.Series=indices):
    """Return the titles of the 10 movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Title of the movie to find recommendations for; it must be a label of
        ``indices``.
    cosine_sim : np.ndarray
        Pairwise similarity matrix; row ``i`` holds the scores of movie ``i``
        against every movie. Defaults to the matrix computed earlier in the
        notebook (not the ``cosine_similarity`` function, which cannot be
        indexed).
    df : pd.DataFrame
        Movie metadata with a ``'title'`` column, in the same row order as
        ``cosine_sim``.
    indices : pd.Series
        Mapping from title to row position in ``df``.

    Returns
    -------
    pd.Series
        Titles of the 10 highest-scoring movies, excluding the query movie.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Obtain the row position of the movie that matches the title
    idx = indices[title]

    # Titles are not guaranteed to be unique; a duplicated label returns a
    # Series of positions, so fall back to the first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every other movie's position with its score against the query.
    # The query movie is excluded explicitly rather than assumed to sort
    # first, because another movie can tie with a score of 1.0.
    similarity_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Sort by similarity score, highest first
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    # Keep the 10 most similar movies
    similarity_scores = similarity_scores[:10]

    # Extract the row positions of those movies
    movie_indices = [i[0] for i in similarity_scores]

    # Return the titles of the top 10 most similar movies
    return df['title'].iloc[movie_indices]

In [None]:
#Get recommendations for The Lion King
content_based_recommnder('The Lion King')