<a href="https://colab.research.google.com/github/XiaoxuanLu/Movie_recommender_system/blob/main/Content_based_recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

We would like to build a content-based recommender system that computes similarity between movies based on certain metrics and suggests movies that are most similar to a particular movie that a user liked.


The content-based movie recommender will be based on
1. Movie Overviews and Taglines
2. Movie Cast, Crew, Keywords and Genre

## Load the dataset

In [1]:
import pandas as pd
import warnings; warnings.simplefilter('ignore')

links = pd.read_csv('data/links.csv')
links = links[links['tmdbId'].notnull()]['tmdbId'].astype('int')
md = pd.read_csv('data/movies_metadata.csv',low_memory=False)

In [2]:
# delete the rows which have hyphen in input
md = md.drop([19730, 29503, 35587])
md['id'] = md['id'].astype('int')

In [3]:
# only keep the movies which are also in links 
md = md[md['id'].isin(links)]
md.shape

(45463, 24)

## Build a recommender based on Overview and Taglines

In [4]:
md

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45461,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",http://www.imdb.com/title/tt6209470/,439050,tt6209470,fa,رگ خواب,Rising and falling between a man and woman.,...,,0.0,90.0,"[{'iso_639_1': 'fa', 'name': 'فارسی'}]",Released,Rising and falling between a man and woman,Subdue,False,4.0,1.0
45462,False,,0,"[{'id': 18, 'name': 'Drama'}]",,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,...,2011-11-17,0.0,360.0,"[{'iso_639_1': 'tl', 'name': ''}]",Released,,Century of Birthing,False,9.0,3.0
45463,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",...,2003-08-01,0.0,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A deadly game of wits.,Betrayal,False,3.8,6.0
45464,False,,0,[],,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",...,1917-10-21,0.0,87.0,[],Released,,Satan Triumphant,False,0.0,0.0


In [5]:
md['overview'].isnull().sum()

954

In [6]:
md['tagline'].isnull().sum()

25051

In [7]:
md['tagline'] = md['tagline'].fillna('')
md['overview'] = md['overview'].fillna('')

In [8]:
print(md['tagline'].isnull().sum())
print(md['overview'].isnull().sum())

0
0


In [9]:
md['general'] = md['overview'] + md['tagline']

### Preprocess the general information

In [10]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import string

lemmatizer = WordNetLemmatizer()

def tokenize(text):
    return word_tokenize(text)

def lemmatize(words):
    words = [lemmatizer.lemmatize(word) for word in words]
    return words
    
# Lower casing
def lower(text):
    low_text = text.lower()
    return low_text

def remove_html(text):
    html = re.compile(r'<.*?>')
    return html.sub(r'', text)

# Number removal
def remove_num(text):
    remove = re.sub(r'[^a-zA-Z ]', '' ,text)
    return remove

def remove_space(text):
    remove = text.replace(r'( +)'," ")
    return remove.strip()
    

# punctuation_removal
def punctuation_removal(messy_str):
    clean_list = [char for char in messy_str if char not in string.punctuation]
    clean_str = ''.join(clean_list)
    return clean_str

# convert a list into string
def list_str(lst):
    return ' '.join([str(item) for item in lst])

In [11]:
md['general'] = md['general'].apply(lambda x: remove_html(x))
md['general'] = md['general'].apply(lambda x: remove_num(x))
md['general'] = md['general'].apply(lambda x: remove_space(x))
md['general'] = md['general'].apply(lambda x: punctuation_removal(x))
md['general'] = md['general'].apply(lambda x: lower(x))
md['general'] = md['general'].apply(lambda x: tokenize(x))
md['general'] = md['general'].apply(lambda x: lemmatize(x))
md['general'] = md['general'].apply(lambda x: list_str(x))

### Similarity Methods
#### 1. Cosine Similarity with TF-IDF

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
vectorizer = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(md['general'])

In [38]:
cs = linear_kernel(tfidf_matrix, tfidf_matrix)


In [39]:
cs[0]

array([1.        , 0.00499328, 0.        , ..., 0.        , 0.00239637,
       0.        ])

In [14]:
md = md.reset_index()
movies = pd.Series(md.index, index=md['title'])

We implemented a method called get_recommendations_cs, which utilized the TF-IDF based cosine similarity method to find the most similar movies for users. Users can choose the number of moview they want the system to recommend, and also they can choose to input the minimum rating of the moviews the system could recommend. 

In [26]:
def get_recommendations_cs(title, n=20, vote = 0):
    idx = movies[title]
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    scores = list(enumerate(cs[idx]))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    indices = [i[0] for i in scores]
    votes = list(md['vote_average'].iloc[indices])
    qualified_scores = []
    for i in range(len(votes)):
        if votes[i] >= vote:
            qualified_scores.append(scores[i])
    qualified_scores = qualified_scores[1:n+1]
    indices = [i[0] for i in qualified_scores]
    return md['title'].iloc[indices]

#### Test the first method

In [16]:
get_recommendations_cs('The Dark Knight',10,7)

18252                                The Dark Knight Rises
21193    Batman Unmasked: The Psychology of the Dark Kn...
15511                           Batman: Under the Red Hood
23870                            In Order of Disappearance
41979    Batman Beyond Darwyn Cooke's Batman 75th Anniv...
18035                                     Batman: Year One
19791              Batman: The Dark Knight Returns, Part 1
3267                                                   JFK
3095                          Batman: Mask of the Phantasm
20231              Batman: The Dark Knight Returns, Part 2
Name: title, dtype: object

#### 2. Kullback–Leibler divergence

Compared with the first method, KLD method is too slow to process, so it's not a good choice for the movie recommender system. Therefore, we choose to utilize the cosine similarity with TF-IDF as the primary method for the content-based recommender system in the project.

In [69]:
import collections
import math

def get_counts(word_list):
    return collections.Counter(word_list)


def create_prob_dist(count_dict):
    total_ct = sum(count_dict.values())
    p = {x: ct / total_ct for x, ct in count_dict.items()}
    return p


def count_smoothing(freq_dist, vocabulary, alpha=1):
    return {w: freq_dist.get(w, 0) + alpha for w in vocabulary}


def entropy(p):

    h = 0
    for x in p:
        h -= p[x] * math.log(p[x])

    return h


def cross_entropy(p1, p2):
    xh = 0
    for x in p1:
        xh -= p1[x] * math.log(p2[x])

    return xh


def kl_divergence(p1, p2):
    kl = 0
    kl = cross_entropy(p1, p2) - entropy(p1)

    return kl

In [70]:
md['general_tokens'] = md['general'].apply(lambda x: tokenize(x))
md['counts'] = md['general_tokens'].apply(lambda x: get_counts(x))

In [None]:
def get_recommendations_kld(title, n, vote = 0):
    idx = movies[title]
    ct_a = md['counts'].iloc[idx]
    kld = []
    scores = list(enumerate(md['counts']))
    for i in scores:
        vocab = set(ct_a.keys()) | set(i[1].keys())
        ct_a = count_smoothing(ct_a, vocab)
        ct_b = count_smoothing(i[1], vocab)
        p_a = create_prob_dist(ct_a)
        p_b = create_prob_dist(ct_b)
        kl_ab = kl_divergence(p_a, p_b)
        kld.append((i[0], kl_ab))

    kld = sorted(kld, key=lambda x: x[1])
    indices = [i[0] for i in kld]
    votes = list(md['vote_average'].iloc[indices])
    qualified_scores = []
    for i in range(len(votes)):
        if votes[i] >= vote:
            qualified_scores.append(scores[i])
    qualified_scores = qualified_scores[1:n+1]
    indices = [i[0] for i in qualified_scores]
    return md['title'].iloc[indices]

## Build a recommender based on Movie Cast, Crew, Keywords and Genre

### Merge the dataset

In [12]:
credits = pd.read_csv('data/credits.csv')
keywords = pd.read_csv('data/keywords.csv')

In [13]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')

In [14]:
# merge the credits and keywords to md by 'id'
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

In [15]:
# only keep the movies which are also in links 
md = md[md['id'].isin(links)]
md.shape

(46628, 28)

In [16]:
from ast import literal_eval
md['cast'] = md['cast'].apply(literal_eval)
md['crew'] = md['crew'].apply(literal_eval)
md['keywords'] = md['keywords'].apply(literal_eval)
md['genres'] = md['genres'].apply(literal_eval)

### Preprocess the genre, cast, crew and keywords

In [19]:
# To check what jobs inside of the crew
for i in md['crew'][1]:
    print(i['job'],end=' ')

Executive Producer Screenplay Original Music Composer Director Editor Casting Animation Supervisor Production Design Producer Executive Producer Executive Producer Director of Photography Novel Producer Screenplay Screenplay 

1. In the crew, we assume that most users concern about the Director and Screenplay, so we would like to keep director and screenplay in the crew. To expand the weight of director and screeplay, we will multiply the director'name by 5 and screenplay'name by 3. 

2. For cast, we want to keep the first five actors or actresses since they are the primary characters. 

3. We store the keywords of a movie in a list

In [17]:
import numpy as np
def get_director(crew):
    for c in crew:
        if c['job'] == 'Director':
            return [c['name']] * 5
    return np.nan

def get_screenplay(crew):
    for c in crew:
        if c['job'] == 'Screenplay':
            return [c['name']] * 3
    return np.nan

def get_five_actors(cast):
    l = []
    for c in cast:
        l.append(c['name'])
    return l[:5]

def get_genres(genres):
    l = []
    for g in genres:
        l.append(g['name'])
    return l

In [18]:
md['director'] = md['crew'].apply(get_director)
md['screenplay'] = md['crew'].apply(get_screenplay)
md['cast'] = md['cast'].apply(get_five_actors)
md['keywords'] = md['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
md['genres'] = md['genres'].apply(get_genres)

In [19]:
md['meta'] = md['director'] + md['screenplay'] + md ['cast'] + md['keywords'] + md['genres']

In [20]:
md['meta'] = md['meta'].fillna('')
md['meta'] = md['meta'].apply(lambda x: list_str(x))
md['meta'] = md['meta'].apply(lambda x: lower(x))
md['meta'] = md['meta'].apply(lambda x: tokenize(x))
md['meta'] = md['meta'].apply(lambda x: lemmatize(x))
md['meta'] = md['meta'].apply(lambda x: list_str(x))

### Build the second recommender

This time we try to use the CountVectorizer to measure the cosine similarity. The second recommender will be based on movie, cast, crew, and keywords

In [40]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

cv = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
cv_matrix = cv.fit_transform(md['meta'])

In [41]:
alter = cosine_similarity(cv_matrix, cv_matrix)

In [44]:
md = md.reset_index()
movies = pd.Series(md.index, index=md['title'])

At this time, we improve the previous recommender to make the user to choose which recommender they would like to use. The default recommender will be based on tagline and overview, but if they want to recommend the moview based on the genre, cast, crew, and keywords, they can change the recommender from default to alter. 

In [49]:
def get_recommendations_cs(title, n=20, vote = 0, recommender = cs):
    idx = movies[title]
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    scores = list(enumerate(recommender[idx]))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    indices = [i[0] for i in scores]
    votes = list(md['vote_average'].iloc[indices])
    qualified_scores = []
    for i in range(len(votes)):
        if votes[i] >= vote:
            qualified_scores.append(scores[i])
    qualified_scores = qualified_scores[1:n+1]
    indices = [i[0] for i in qualified_scores]
    return md['title'].iloc[indices]

### Test two recommenders

We will compare the two recommenders: the recommender based on overview and tagline VS the recommender based on genres, cast, crew, and keywords.

In [28]:
get_recommendations_cs('The Dark Knight') # based on overview and tagline

18750                                The Shrine
9235                                   The Hole
28871                        Des roses en hiver
21908                               Mood Indigo
28720                                 Vuonna 85
38831                                Vinodentro
33337                                    Let Go
14668                 The Princess and the Frog
41438                                  Our Gang
44257           The Most Hated Woman in America
17730    The Autobiography of Nicolae Ceausescu
26638            Chestnut: Hero of Central Park
3862                                    Haunted
15667                            The Karate Kid
33282                          Mary of Nazareth
1957                    Flight of the Navigator
25279                          Listen Up Philip
42264               Trancers 6: Life After Deth
20855      Big Girls Don't Cry... They Get Even
25859                              Fighting Mad
Name: title, dtype: object

In [29]:
get_recommendations_cs('The Dark Knight', recommender = alter) # based on director, actors, screeplay, and keywords

18442                                The Dark Knight Rises
10210                                        Batman Begins
11463                                         The Prestige
2486                                             Following
15651                                            Inception
4126                                               Memento
5302                                              Insomnia
164                                          Feast of July
20253                                         Jack Reacher
3799                                    The Way of the Gun
10241                                           Retrograde
23668                   Mission: Impossible - Rogue Nation
8243                                  Whispers in the Dark
10639                                                Creep
463      The Englishman Who Went Up a Hill But Came Dow...
34                                              Carrington
3814                                          Best in Sh

In [51]:
get_recommendations_cs('Toy Story', recommender = alter) # based on director, actors, screeplay, and keywords

11074                              Cars
3024                        Toy Story 2
17551                            Cars 2
2262                       A Bug's Life
13624    Dr. Horrible's Sing-Along Blog
18008                      The Avengers
21498              Sky West and Crooked
16475                   The Company Men
4902                       Joe Somebody
1903                     Child's Play 2
4973                            Seconds
8673               The Hallelujah Trail
12455                         3 Bad Men
4229                 The Horse Soldiers
26423                      Pickup Alley
8010               How the West Was Won
8092                            Biggles
961                             Algiers
6370          Heaven Knows, Mr. Allison
6478                          Curly Sue
Name: title, dtype: object

In [56]:
get_recommendations_cs('The Bat Man', recommender = alter)

1                            Jumanji
2                   Grumpier Old Men
3                  Waiting to Exhale
4        Father of the Bride Part II
5                               Heat
6                            Sabrina
7                       Tom and Huck
8                       Sudden Death
9                          GoldenEye
10            The American President
11       Dracula: Dead and Loving It
12                             Balto
13                             Nixon
14                  Cutthroat Island
15                            Casino
16             Sense and Sensibility
17                        Four Rooms
18    Ace Ventura: When Nature Calls
19                       Money Train
20                        Get Shorty
Name: title, dtype: object

## Output the dataset

In [53]:
df = md[['id','title','vote_average','vote_count','general','meta','imdb_id']]

In [54]:
df

Unnamed: 0,id,title,vote_average,vote_count,general,meta,imdb_id
0,862,Toy Story,7.7,5415.0,led by woody andys toy live happily in his roo...,john lasseter john lasseter john lasseter john...,tt0114709
1,8844,Jumanji,6.9,2413.0,when sibling judy and peter discover an enchan...,joe johnston joe johnston joe johnston joe joh...,tt0113497
2,15602,Grumpier Old Men,6.5,92.0,a family wedding reignites the ancient feud be...,,tt0113228
3,31357,Waiting to Exhale,6.1,34.0,cheated on mistreated and stepped on the woman...,forest whitaker forest whitaker forest whitake...,tt0114885
4,11862,Father of the Bride Part II,5.7,173.0,just when george bank ha recovered from his da...,charles shyer charles shyer charles shyer char...,tt0113041
...,...,...,...,...,...,...,...
46623,439050,Subdue,4.0,1.0,rising and falling between a man and womanrisi...,,tt6209470
46624,111109,Century of Birthing,9.0,3.0,an artist struggle to finish his work while a ...,,tt2028550
46625,67758,Betrayal,3.8,6.0,when one of her hit go wrong a professional as...,mark l. lester mark l. lester mark l. lester m...,tt0303758
46626,227506,Satan Triumphant,0.0,0.0,in a small town live two brother one a ministe...,,tt0008536


In [55]:
import pickle

pickle.dump(df, open('movies.pkl', 'wb'))
pickle.dump(df.to_dict(), open('movies_dict.pkl', 'wb'))
