<a href="https://colab.research.google.com/github/XiaoxuanLu/Movie_recommender_system/blob/main/Content_based_recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

We would like to build a content-based recommender system that computes similarity between movies based on certain metrics and suggests movies that are most similar to a particular movie that a user liked.


The content-based movie recommender will be based on
1. Movie Overviews and Taglines
2. Movie Cast, Crew, Keywords and Genre

## Load the dataset

In [1]:
import pandas as pd
import warnings; warnings.simplefilter('ignore')

links = pd.read_csv('data/links.csv')
links = links[links['tmdbId'].notnull()]['tmdbId'].astype('int')
md = pd.read_csv('data/movies_metadata.csv',low_memory=False)

In [2]:
# delete the rows which have hyphen in input
md = md.drop([19730, 29503, 35587])
md['id'] = md['id'].astype('int')

In [3]:
# only keep the movies which are also in links 
md = md[md['id'].isin(links)]
md.shape

(45463, 24)

## Build a recommender based on Overview and Taglines

In [4]:
md

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45461,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",http://www.imdb.com/title/tt6209470/,439050,tt6209470,fa,رگ خواب,Rising and falling between a man and woman.,...,,0.0,90.0,"[{'iso_639_1': 'fa', 'name': 'فارسی'}]",Released,Rising and falling between a man and woman,Subdue,False,4.0,1.0
45462,False,,0,"[{'id': 18, 'name': 'Drama'}]",,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,...,2011-11-17,0.0,360.0,"[{'iso_639_1': 'tl', 'name': ''}]",Released,,Century of Birthing,False,9.0,3.0
45463,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",...,2003-08-01,0.0,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A deadly game of wits.,Betrayal,False,3.8,6.0
45464,False,,0,[],,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",...,1917-10-21,0.0,87.0,[],Released,,Satan Triumphant,False,0.0,0.0


In [5]:
md['overview'].isnull().sum()

954

In [6]:
md['tagline'].isnull().sum()

25051

In [7]:
md['tagline'] = md['tagline'].fillna('')
md['overview'] = md['overview'].fillna('')

In [8]:
print(md['tagline'].isnull().sum())
print(md['overview'].isnull().sum())

0
0


In [9]:
md['general'] = md['overview'] + md['tagline']

### Preprocess the general information

In [10]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import string

lemmatizer = WordNetLemmatizer()

def tokenize(text):
    return word_tokenize(text)

def lemmatize(words):
    words = [lemmatizer.lemmatize(word) for word in words]
    return words
    
# Lower casing
def lower(text):
    low_text = text.lower()
    return low_text

def remove_html(text):
    html = re.compile(r'<.*?>')
    return html.sub(r'', text)

# Number removal
def remove_num(text):
    remove = re.sub(r'[^a-zA-Z ]', '' ,text)
    return remove

def remove_space(text):
    remove = text.replace(r'( +)'," ")
    return remove.strip()
    

# punctuation_removal
def punctuation_removal(messy_str):
    clean_list = [char for char in messy_str if char not in string.punctuation]
    clean_str = ''.join(clean_list)
    return clean_str

# convert a list into string
def list_str(lst):
    return ' '.join([str(item) for item in lst])

In [11]:
md['general'] = md['general'].apply(lambda x: remove_html(x))
md['general'] = md['general'].apply(lambda x: remove_num(x))
md['general'] = md['general'].apply(lambda x: remove_space(x))
md['general'] = md['general'].apply(lambda x: punctuation_removal(x))
md['general'] = md['general'].apply(lambda x: lower(x))
md['general'] = md['general'].apply(lambda x: tokenize(x))
md['general'] = md['general'].apply(lambda x: lemmatize(x))
md['general'] = md['general'].apply(lambda x: list_str(x))

### Similarity Methods
#### 1. Cosine Similarity with TF-IDF

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
vectorizer = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(md['general'])

In [13]:
cs = linear_kernel(tfidf_matrix, tfidf_matrix)
cs[0]

array([1.        , 0.00502439, 0.        , ..., 0.        , 0.0024031 ,
       0.        ])

In [14]:
md = md.reset_index()
movies = pd.Series(md.index, index=md['title'])

We implemented a method called get_recommendations_cs, which utilized the TF-IDF based cosine similarity method to find the most similar movies for users. Users can choose the number of moview they want the system to recommend, and also they can choose to input the minimum rating of the moviews the system could recommend. 

In [61]:
def get_recommendations_cs(title, n, vote = 0):
    idx = movies[title]
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    scores = list(enumerate(cs[idx]))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    indices = [i[0] for i in scores]
    votes = list(md['vote_average'].iloc[indices])
    qualified_scores = []
    for i in range(len(votes)):
        if votes[i] >= vote:
            qualified_scores.append(scores[i])
    qualified_scores = qualified_scores[1:n+1]
    indices = [i[0] for i in qualified_scores]
    return md['title'].iloc[indices]

#### Test the first method

In [64]:
get_recommendations_cs('The Dark Knight',10,7)

18252                                The Dark Knight Rises
21193    Batman Unmasked: The Psychology of the Dark Kn...
15511                           Batman: Under the Red Hood
23870                            In Order of Disappearance
41979    Batman Beyond Darwyn Cooke's Batman 75th Anniv...
18035                                     Batman: Year One
19791              Batman: The Dark Knight Returns, Part 1
3267                                                   JFK
3095                          Batman: Mask of the Phantasm
20231              Batman: The Dark Knight Returns, Part 2
Name: title, dtype: object

#### 2. Kullback–Leibler divergence

Compared with the first method, KLD method is too slow to process, so it's not a good choice for the movie recommender system. Therefore, we choose to utilize the cosine similarity with TF-IDF as the primary method for the content-based recommender system in the project.

In [69]:
import collections
import math

def get_counts(word_list):
    return collections.Counter(word_list)


def create_prob_dist(count_dict):
    total_ct = sum(count_dict.values())
    p = {x: ct / total_ct for x, ct in count_dict.items()}
    return p


def count_smoothing(freq_dist, vocabulary, alpha=1):
    return {w: freq_dist.get(w, 0) + alpha for w in vocabulary}


def entropy(p):

    h = 0
    for x in p:
        h -= p[x] * math.log(p[x])

    return h


def cross_entropy(p1, p2):
    xh = 0
    for x in p1:
        xh -= p1[x] * math.log(p2[x])

    return xh


def kl_divergence(p1, p2):
    kl = 0
    kl = cross_entropy(p1, p2) - entropy(p1)

    return kl

In [70]:
md['general_tokens'] = md['general'].apply(lambda x: tokenize(x))
md['counts'] = md['general_tokens'].apply(lambda x: get_counts(x))

In [None]:
def get_recommendations_kld(title, n, vote = 0):
    idx = movies[title]
    ct_a = md['counts'].iloc[idx]
    kld = []
    scores = list(enumerate(md['counts']))
    for i in scores:
        vocab = set(ct_a.keys()) | set(i[1].keys())
        ct_a = count_smoothing(ct_a, vocab)
        ct_b = count_smoothing(i[1], vocab)
        p_a = create_prob_dist(ct_a)
        p_b = create_prob_dist(ct_b)
        kl_ab = kl_divergence(p_a, p_b)
        kld.append((i[0], kl_ab))

    kld = sorted(kld, key=lambda x: x[1])
    indices = [i[0] for i in kld]
    votes = list(md['vote_average'].iloc[indices])
    qualified_scores = []
    for i in range(len(votes)):
        if votes[i] >= vote:
            qualified_scores.append(scores[i])
    qualified_scores = qualified_scores[1:n+1]
    indices = [i[0] for i in qualified_scores]
    return md['title'].iloc[indices]

## Build a recommender based on Movie Cast, Crew, Keywords and Genre

### Merge the dataset

In [95]:
credits = pd.read_csv('data/credits.csv')
keywords = pd.read_csv('data/keywords.csv')

In [96]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')

In [97]:
# merge the credits and keywords to md by 'id'
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

In [99]:
# only keep the movies which are also in links 
md = md[md['id'].isin(links)]
md.shape

(46628, 31)

In [106]:
from ast import literal_eval
md['cast'] = md['cast'].apply(literal_eval)
md['crew'] = md['crew'].apply(literal_eval)
md['keywords'] = md['keywords'].apply(literal_eval)

In [110]:
md['cast'][1]

[{'cast_id': 1,
  'character': 'Alan Parrish',
  'credit_id': '52fe44bfc3a36847f80a7c73',
  'gender': 2,
  'id': 2157,
  'name': 'Robin Williams',
  'order': 0,
  'profile_path': '/sojtJyIV3lkUeThD7A2oHNm8183.jpg'},
 {'cast_id': 8,
  'character': 'Samuel Alan Parrish / Van Pelt',
  'credit_id': '52fe44bfc3a36847f80a7c99',
  'gender': 2,
  'id': 8537,
  'name': 'Jonathan Hyde',
  'order': 1,
  'profile_path': '/7il5D76vx6QVRVlpVvBPEC40MBi.jpg'},
 {'cast_id': 2,
  'character': 'Judy Sheperd',
  'credit_id': '52fe44bfc3a36847f80a7c77',
  'gender': 1,
  'id': 205,
  'name': 'Kirsten Dunst',
  'order': 2,
  'profile_path': '/wBXvh6PJd0IUVNpvatPC1kzuHtm.jpg'},
 {'cast_id': 24,
  'character': 'Peter Shepherd',
  'credit_id': '52fe44c0c3a36847f80a7ce7',
  'gender': 0,
  'id': 145151,
  'name': 'Bradley Pierce',
  'order': 3,
  'profile_path': '/j6iW0vVA23GQniAPSYI6mi4hiEW.jpg'},
 {'cast_id': 10,
  'character': 'Sarah Whittle',
  'credit_id': '52fe44bfc3a36847f80a7c9d',
  'gender': 1,
  'id': 5

In [111]:
get_recommendations_cs('Mrs. Doubtfire',10)

31928                The Sci-Fi Boys
31136    Pokémon: Jirachi Wish Maker
30547              The Better Angels
41381                            Sky
39543                 Eden and After
37685           Becoming Bulletproof
33149             The Pumpkin Karver
13781             The Young Victoria
12112                The Last Legion
1855           From Here to Eternity
Name: title, dtype: object