# Content-Based Recommendation System

## 1. Loading Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
movies  = pd.read_csv("Data/tmdb_5000_movies.csv")
credits = pd.read_csv("Data/tmdb_5000_credits.csv")

In [3]:
movies.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [4]:
credits.head(2)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [5]:
print("Movies\' shape: ", movies.shape)
print("Credits\' shape: ", credits.shape)

Movies' shape:  (4803, 20)
Credits' shape:  (4803, 4)


In [6]:
movies_df = movies.merge(credits, on = "title")
movies_df.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [7]:
# Select necessary attributes
movies_df = movies_df[["movie_id", "title", "overview", "genres", "keywords", "cast", "crew", "vote_average", "vote_count"]]

In [8]:
movies_df.isnull().sum()

movie_id        0
title           0
overview        3
genres          0
keywords        0
cast            0
crew            0
vote_average    0
vote_count      0
dtype: int64

In [9]:
movies_df.dropna(inplace = True)

## 2. Building Item Profile

In [10]:
print(movies_df.iloc[0].genres)
print(type(movies_df.iloc[0].genres))

[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]
<class 'str'>


Now you can see the genre of each movie is a string within the form of a list of dictionaries. We have to convert it into a list that contains only unique genres for each movie.

In [11]:
#'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [12]:
# Example of ast packages
import ast
ast.literal_eval('[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]')

[{'id': 28, 'name': 'Action'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 878, 'name': 'Science Fiction'}]

In [13]:
# define a function to convert
import ast 
def convert(string):
    list_genres = []
    for i in ast.literal_eval(string):
        list_genres.append(i['name'])
    return list_genres

In [14]:
movies_df["genres"] = movies_df['genres'].apply(convert)
movies_df["genres"].head()

0    [Action, Adventure, Fantasy, Science Fiction]
1                     [Adventure, Fantasy, Action]
2                       [Action, Adventure, Crime]
3                 [Action, Crime, Drama, Thriller]
4             [Action, Adventure, Science Fiction]
Name: genres, dtype: object

In [15]:
movies_df["keywords"] = movies_df["keywords"].apply(convert)
movies_df["keywords"].head()

0    [culture clash, future, space war, space colon...
1    [ocean, drug abuse, exotic island, east india ...
2    [spy, based on novel, secret agent, sequel, mi...
3    [dc comics, crime fighter, terrorist, secret i...
4    [based on novel, mars, medallion, space travel...
Name: keywords, dtype: object

#### `Cast` and `Crew` Attributes

In [16]:
movies_df["cast"].iloc[0]

'[{"cast_id": 242, "character": "Jake Sully", "credit_id": "5602a8a7c3a3685532001c9a", "gender": 2, "id": 65731, "name": "Sam Worthington", "order": 0}, {"cast_id": 3, "character": "Neytiri", "credit_id": "52fe48009251416c750ac9cb", "gender": 1, "id": 8691, "name": "Zoe Saldana", "order": 1}, {"cast_id": 25, "character": "Dr. Grace Augustine", "credit_id": "52fe48009251416c750aca39", "gender": 1, "id": 10205, "name": "Sigourney Weaver", "order": 2}, {"cast_id": 4, "character": "Col. Quaritch", "credit_id": "52fe48009251416c750ac9cf", "gender": 2, "id": 32747, "name": "Stephen Lang", "order": 3}, {"cast_id": 5, "character": "Trudy Chacon", "credit_id": "52fe48009251416c750ac9d3", "gender": 1, "id": 17647, "name": "Michelle Rodriguez", "order": 4}, {"cast_id": 8, "character": "Selfridge", "credit_id": "52fe48009251416c750ac9e1", "gender": 2, "id": 1771, "name": "Giovanni Ribisi", "order": 5}, {"cast_id": 7, "character": "Norm Spellman", "credit_id": "52fe48009251416c750ac9dd", "gender": 

The `cast` and `crew` attributes are a more challenged. We will do the same thing to convert it to a list 

In [17]:
def convert5(string):
    list_element = []
    counter = 0
    for i in ast.literal_eval(string):
        if counter <= 5: # We only get 5 casts in the movie
            list_element.append(i["name"])
            counter += 1
        else:
            break
    return list_element

In [18]:
movies_df["cast"] = movies_df["cast"].apply(convert5)
movies_df["cast"].head(2)

0    [Sam Worthington, Zoe Saldana, Sigourney Weave...
1    [Johnny Depp, Orlando Bloom, Keira Knightley, ...
Name: cast, dtype: object

In [19]:
movies_df["crew"].iloc[0]

'[{"credit_id": "52fe48009251416c750aca23", "department": "Editing", "gender": 0, "id": 1721, "job": "Editor", "name": "Stephen E. Rivkin"}, {"credit_id": "539c47ecc3a36810e3001f87", "department": "Art", "gender": 2, "id": 496, "job": "Production Design", "name": "Rick Carter"}, {"credit_id": "54491c89c3a3680fb4001cf7", "department": "Sound", "gender": 0, "id": 900, "job": "Sound Designer", "name": "Christopher Boyes"}, {"credit_id": "54491cb70e0a267480001bd0", "department": "Sound", "gender": 0, "id": 900, "job": "Supervising Sound Editor", "name": "Christopher Boyes"}, {"credit_id": "539c4a4cc3a36810c9002101", "department": "Production", "gender": 1, "id": 1262, "job": "Casting", "name": "Mali Finn"}, {"credit_id": "5544ee3b925141499f0008fc", "department": "Sound", "gender": 2, "id": 1729, "job": "Original Music Composer", "name": "James Horner"}, {"credit_id": "52fe48009251416c750ac9c3", "department": "Directing", "gender": 2, "id": 2710, "job": "Director", "name": "James Cameron"},

In each movie, we will fetch the name of the director for that movie in `crew` attribute.

In [20]:
def fetch_crews(string):
    director = []
    for st in ast.literal_eval(string):
        if st["job"] == "Director" or st["job"] == "Writer" or st["job"] == "Editor":
            director.append(st["name"])
    return director

In [21]:
movies_df["crew"] = movies_df["crew"].apply(fetch_crews)
movies_df["crew"].head()

0    [Stephen E. Rivkin, James Cameron, James Camer...
1      [Gore Verbinski, Stephen E. Rivkin, Craig Wood]
2                              [Sam Mendes, Lee Smith]
3                       [Christopher Nolan, Lee Smith]
4                    [Andrew Stanton, Eric Zumbrunnen]
Name: crew, dtype: object

In [22]:
type(movies_df["overview"].iloc[0].split())

list

In [23]:
movies_df["overview"] = movies_df["overview"].apply(lambda x: x.split())
movies_df["overview"].head(1)

0    [In, the, 22nd, century,, a, paraplegic, Marin...
Name: overview, dtype: object

In [24]:
# result of new df
movies_df.head(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,vote_average,vote_count
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...","[Stephen E. Rivkin, James Cameron, James Camer...",7.2,11800
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...","[Gore Verbinski, Stephen E. Rivkin, Craig Wood]",6.9,4500


In [25]:
def remove_space(word):
    l = []
    for i in word:
        l.append(i.replace(" ", ""))
    return l

In [26]:
movies_df["genres"] = movies_df["genres"].apply(remove_space)
movies_df["keywords"] = movies_df["keywords"].apply(remove_space)
movies_df["cast"] = movies_df["cast"].apply(remove_space)
movies_df["crew"] = movies_df["crew"].apply(remove_space)

In [27]:
movies_df["tags"] = movies_df["overview"] + movies_df["genres"] + movies_df["keywords"] + movies_df["cast"] + movies_df["crew"]
movies_df.head(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,vote_average,vote_count,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...","[StephenE.Rivkin, JamesCameron, JamesCameron, ...",7.2,11800,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley, Ste...","[GoreVerbinski, StephenE.Rivkin, CraigWood]",6.9,4500,"[Captain, Barbossa,, long, believed, to, be, d..."


In [28]:
movies_df = movies_df[["movie_id", "title", "tags", "vote_average", "vote_count"]]
movies_df["tags"] = movies_df["tags"].apply(lambda x: " ".join(x))

In [29]:
movies_df.head()

Unnamed: 0,movie_id,title,tags,vote_average,vote_count
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",7.2,11800
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",6.9,4500
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,6.3,4466
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,7.6,9106
4,49529,John Carter,"John Carter is a war-weary, former military ca...",6.1,2124


##### Lowercase tags

In [30]:
movies_df["tags"] = movies_df["tags"].apply(lambda x: x.lower())
movies_df.head()

Unnamed: 0,movie_id,title,tags,vote_average,vote_count
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di...",7.2,11800
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha...",6.9,4500
2,206647,Spectre,a cryptic message from bond’s past sends him o...,6.3,4466
3,49026,The Dark Knight Rises,following the death of district attorney harve...,7.6,9106
4,49529,John Carter,"john carter is a war-weary, former military ca...",6.1,2124


In [31]:
movies_df["tags"][0]

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver stephenlang michellerodriguez giovanniribisi stephene.rivkin jamescameron jamescameron jamescameron johnrefoua'

## Generating Embeddings

Generate a vector for each movie based on profile

### First step - Stem words

In [32]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()

In [33]:
def process(text):
    temp = []
    for i in text.split():
        temp.append(ps.stem(i))
    return " ".join(temp)

movies_df["tags"] = movies_df["tags"].apply(process)
movies_df["tags"].iloc[0]

'in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d samworthington zoesaldana sigourneyweav stephenlang michellerodriguez giovanniribisi stephene.rivkin jamescameron jamescameron jamescameron johnrefoua'

## Second Step - Vectorization

### Using CountVectorizer

In [34]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=5000, stop_words="english")

In [35]:
vectorizer.fit(movies_df["tags"])
vector_transform = vectorizer.transform(movies_df["tags"])
print(vectorizer.vocabulary_)
print(vector_transform.toarray())

{'century': 729, 'marin': 2841, 'dispatch': 1287, 'moon': 3061, 'uniqu': 4695, 'mission': 3034, 'becom': 425, 'torn': 4573, 'follow': 1703, 'order': 3267, 'protect': 3554, 'alien': 156, 'action': 68, 'adventur': 95, 'fantasi': 1618, 'sciencefict': 3959, 'cultureclash': 1074, 'futur': 1785, 'societi': 4166, 'spacetravel': 4206, 'futurist': 1787, 'romanc': 3837, 'space': 4200, 'tribe': 4622, 'soldier': 4172, 'battl': 410, '3d': 36, 'samworthington': 3920, 'zoesaldana': 4994, 'sigourneyweav': 4097, 'michellerodriguez': 2987, 'giovanniribisi': 1844, 'stephene': 4266, 'rivkin': 3788, 'jamescameron': 2290, 'captain': 672, 'long': 2725, 'believ': 437, 'dead': 1152, 'ha': 1938, 'come': 916, 'life': 2677, 'head': 1996, 'edg': 1409, 'earth': 1397, 'turner': 4645, 'elizabeth': 1436, 'noth': 3213, 'quit': 3588, 'ocean': 3234, 'drugabus': 1360, 'exoticisland': 1568, 'loveofone': 2757, 'slif': 4146, 'traitor': 4601, 'shipwreck': 4077, 'ship': 4076, 'allianc': 166, 'afterlif': 107, 'fighter': 1663, '

In [36]:
vector = vector_transform.toarray()

In [37]:
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(vector)
similarity_matrix[0]

array([1.        , 0.11293849, 0.07460471, ..., 0.0404061 , 0.        ,
       0.        ])

In [38]:
print(len(vectorizer.vocabulary_))
print(vector_transform.shape)

5000
(4806, 5000)


In [39]:
similarity_matrix.shape

(4806, 4806)

### Using TF-IDF

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer_tf = TfidfVectorizer()
vectorizer_tf.fit(movies_df["tags"])
vector_transform_tf = vectorizer_tf.transform(movies_df["tags"])
print(vectorizer_tf.vocabulary_)
print(vector_transform_tf.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [41]:
vector_tf = vector_transform_tf.toarray()

In [42]:
from sklearn.metrics.pairwise import cosine_similarity

In [43]:
similarity_matrix_tf = cosine_similarity(vector_tf)
similarity_matrix_tf[0]

array([1.        , 0.06343249, 0.02303776, ..., 0.02564359, 0.00817532,
       0.00632016])

### Recommendation

In [44]:
def recommend_movie(movie_name, n = 5):
    pos_movie = movies_df[movies_df["title"] == movie_name].index[0]
    similarity = similarity_matrix[pos_movie]
    pos_movies_list = sorted(list(enumerate(similarity)), reverse=True, key=lambda x: x[1])[1:n+1]
    
    for pos_movie in pos_movies_list:
        print(movies_df.iloc[pos_movie[0]].title)

In [45]:
recommend_movie("The Avengers", 10)

Avengers: Age of Ultron
Captain America: Civil War
Iron Man 3
Captain America: The First Avenger
Thor: The Dark World
Iron Man
Captain America: The Winter Soldier
Ant-Man
Iron Man 2
Fantastic 4: Rise of the Silver Surfer


In [46]:
def recommend_movie_tf(movie_name, n = 5):
    pos_movie = movies_df[movies_df["title"] == movie_name].index[0]
    similarity = similarity_matrix_tf[pos_movie]
    pos_movies_list = sorted(list(enumerate(similarity)), reverse=True, key=lambda x: x[1])[1:n+1]
    
    for pos_movie in pos_movies_list:
        print(movies_df.iloc[pos_movie[0]].title)

In [47]:
recommend_movie_tf("The Avengers", 10)

Avengers: Age of Ultron
Captain America: Civil War
Captain America: The Winter Soldier
Captain America: The First Avenger
Iron Man 2
Iron Man 3
X-Men: Apocalypse
Thor: The Dark World
Serenity
Fantastic 4: Rise of the Silver Surfer


## 4. Save the Movies DataFrame and The Similarity Matrix for later use.

In [48]:
import pickle

top_10_movies = movies.sort_values(["vote_count","vote_average"], ascending=False).id[:10]

pickle.dump(movies_df, open("Data/movies_list.pkl", "wb"))
pickle.dump(similarity_matrix, open("Data/similarity_ct.pkl", "wb"))
pickle.dump(similarity_matrix_tf, open("Data/similarity_tf.pkl", "wb"))
pickle.dump(top_10_movies, open("Data/top_10_movies.pkl", "wb"))