<a href="https://colab.research.google.com/github/XiaoxuanLu/Movie_recommender_system/blob/main/Content_based_recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

We would like to build a content-based recommender system that computes similarity between movies based on certain metrics and suggests movies that are most similar to a particular movie that a user liked.


The content-based movie recommender will be based on
Movie Overviews and Taglines
Movie Cast, Crew, Keywords and Genre

## Load the dataset

In [1]:
import pandas as pd
import warnings; warnings.simplefilter('ignore')

links = pd.read_csv('data/links.csv')
links = links[links['tmdbId'].notnull()]['tmdbId'].astype('int')
md = pd.read_csv('data/movies_metadata.csv',low_memory=False)

In [2]:
# delete the rows which have hyphen in input
md = md.drop([19730, 29503, 35587])
md['id'] = md['id'].astype('int')

In [3]:
# only keep the movies which are also in links 
md = md[md['id'].isin(links)]
md.shape

(45463, 24)

## Build a recommender based on Overview and Taglines

In [4]:
md

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45461,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",http://www.imdb.com/title/tt6209470/,439050,tt6209470,fa,رگ خواب,Rising and falling between a man and woman.,...,,0.0,90.0,"[{'iso_639_1': 'fa', 'name': 'فارسی'}]",Released,Rising and falling between a man and woman,Subdue,False,4.0,1.0
45462,False,,0,"[{'id': 18, 'name': 'Drama'}]",,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,...,2011-11-17,0.0,360.0,"[{'iso_639_1': 'tl', 'name': ''}]",Released,,Century of Birthing,False,9.0,3.0
45463,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",...,2003-08-01,0.0,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A deadly game of wits.,Betrayal,False,3.8,6.0
45464,False,,0,[],,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",...,1917-10-21,0.0,87.0,[],Released,,Satan Triumphant,False,0.0,0.0


In [5]:
md['overview'].isnull().sum()

954

In [6]:
md['tagline'].isnull().sum()

25051

In [7]:
md['tagline'] = md['tagline'].fillna('')
md['overview'] = md['overview'].fillna('')

In [8]:
print(md['tagline'].isnull().sum())
print(md['overview'].isnull().sum())

0
0


In [9]:
md['general'] = md['overview'] + md['tagline']

### Preprocess the general information

In [10]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import string

lemmatizer = WordNetLemmatizer()

def tokenize(text):
    return word_tokenize(text)

def lemmatize(words):
    words = [lemmatizer.lemmatize(word) for word in words]
    return words
    
# Lower casing
def lower(text):
    low_text = text.lower()
    return low_text

def remove_html(text):
    html = re.compile(r'<.*?>')
    return html.sub(r'', text)

# Number removal
def remove_num(text):
    remove = re.sub(r'[^a-zA-Z ]', '' ,text)
    return remove

def remove_space(text):
    remove = text.replace(r'( +)'," ")
    return remove.strip()
    

# punctuation_removal
def punctuation_removal(messy_str):
    clean_list = [char for char in messy_str if char not in string.punctuation]
    clean_str = ''.join(clean_list)
    return clean_str

# convert a list into string
def list_str(lst):
    return ' '.join([str(item) for item in lst])

In [11]:
md['general'] = md['general'].apply(lambda x: remove_html(x))
md['general'] = md['general'].apply(lambda x: remove_num(x))
md['general'] = md['general'].apply(lambda x: remove_space(x))
md['general'] = md['general'].apply(lambda x: punctuation_removal(x))
md['general'] = md['general'].apply(lambda x: lower(x))
md['general'] = md['general'].apply(lambda x: tokenize(x))
md['general'] = md['general'].apply(lambda x: lemmatize(x))
md['general'] = md['general'].apply(lambda x: list_str(x))

### Similarity Methods
#### 1. Cosine Similarity with TF-IDF

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
vectorizer = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(md['general'])

In [14]:
cs = linear_kernel(tfidf_matrix, tfidf_matrix)
cs[0]

array([1.        , 0.00502439, 0.        , ..., 0.        , 0.0024031 ,
       0.        ])

In [15]:
md = md.reset_index()
movies = pd.Series(md.index, index=md['title'])

In [21]:
def get_recommendations_cs(title, n):
    idx = movies[title]
    scores = list(enumerate(cs[idx]))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    scores = scores[1:n+1]
    indices = [i[0] for i in scores]
    return md['title'].iloc[indices]

In [24]:
get_recommendations_cs('3 Idiots', 20).head(20)

16575                          Craig's Wife
36972                             Bheja Fry
43101                You Should Meet My Son
41944        Poison Ivy: The Secret Society
43284                             3 Idiotas
22642                     Backstreet Dreams
34573                       The Murder Pact
21619                          At Any Price
15238                                 Dread
17249    Then I Sentenced Them All to Death
39398             Heart Like a Hand Grenade
288                                Outbreak
33603                              The Hive
29525                    Our Brief Eternity
15070        The Education of Charlie Banks
43737      Sars Wars: Bangkok Zombie Crisis
34687                      Come Tomorrow...
40366                    Mouse in Manhattan
19752                   Student of the Year
2318                            Patch Adams
Name: title, dtype: object