# Content-Based Movie Recommendation Engine

In [1]:
! pip install rake_nltk

import pandas as pd
import numpy as np
from rake_nltk import Rake

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

from ast import literal_eval



## Data Preprocessing
### Loading Data

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# Join datasets
credits.columns = ['id', 'title', 'cast', 'crew']

alldata = movies.merge(credits, on = 'id')
alldata.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title_x,vote_average,vote_count,title_y,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


### Cleaning Features

Our content-based filtering system will not be using all of these columns, so I will cut the dataset down to only include the relevant features. Then we can clean up the feature contents.

In [3]:
import warnings
warnings.filterwarnings('ignore')


df = alldata[['id', 'original_title', 'genres', 'keywords', 'overview', 'original_language', 'cast', 'crew']]


features = ['keywords', 'genres', 'cast', 'crew']
for i in features:
    df[i] = alldata[i].apply(literal_eval)
    
# Extract list of genres
def list_genres(x):
    l = [d['name'] for d in x]
    return(l)
df['genres'] = df['genres'].apply(list_genres)

# Extract top 3 cast members
def list_cast(x):
    l = [d['name'] for d in x]
    if len(l) > 3:
        l = l[:3]
    return(l)
df['cast'] = df['cast'].apply(list_cast)

# Extract top 5 keywords
def list_keywords(x):
    l = [d['name'] for d in x]
    if len(l) > 5:
        l = l[:5]
    return(l)
df['keywords'] = df['keywords'].apply(list_keywords)

# Extract director
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan
df['director'] = df['crew'].apply(get_director)

# Drop the now unnecessary crew feature
df = df.drop('crew', axis = 1)

# Clean features of spaces and lowercase all to ensure uniques
def clean_feat(x):
    if isinstance(x, list):
        return [i.lower().replace(" ","") for i in x]
    else:
        if isinstance(x, str):
            return x.lower().replace(" ", "")
        else:
            return ''

features = ['keywords', 'genres', 'cast', 'director']
for i in features:
    df[i] = df[i].apply(clean_feat)

In [4]:
df.head()

Unnamed: 0,id,original_title,genres,keywords,overview,original_language,cast,director
0,19995,Avatar,"[action, adventure, fantasy, sciencefiction]","[cultureclash, future, spacewar, spacecolony, ...","In the 22nd century, a paraplegic Marine is di...",en,"[samworthington, zoesaldana, sigourneyweaver]",jamescameron
1,285,Pirates of the Caribbean: At World's End,"[adventure, fantasy, action]","[ocean, drugabuse, exoticisland, eastindiatrad...","Captain Barbossa, long believed to be dead, ha...",en,"[johnnydepp, orlandobloom, keiraknightley]",goreverbinski
2,206647,Spectre,"[action, adventure, crime]","[spy, basedonnovel, secretagent, sequel, mi6]",A cryptic message from Bond’s past sends him o...,en,"[danielcraig, christophwaltz, léaseydoux]",sammendes
3,49026,The Dark Knight Rises,"[action, crime, drama, thriller]","[dccomics, crimefighter, terrorist, secretiden...",Following the death of District Attorney Harve...,en,"[christianbale, michaelcaine, garyoldman]",christophernolan
4,49529,John Carter,"[action, adventure, sciencefiction]","[basedonnovel, mars, medallion, spacetravel, p...","John Carter is a war-weary, former military ca...",en,"[taylorkitsch, lynncollins, samanthamorton]",andrewstanton


Now we have several features with lists of keywords that are all lowercase and stripped of spaces, therefore making them unique keywords. 

### Missing Values
Let's check for missing values, since they could be problematic when it comes to creating more keywords for overview.

In [5]:
missing = df.columns[df.isnull().any()]
df[missing].isnull().sum().to_frame()

Unnamed: 0,0
overview,3


In [6]:
# Replace NaN from overview with an empty string
df['overview'] = df['overview'].fillna('')

### Creating bag of keywords

We will use genres, keywords, overview, cast, and director to create a bag of words column.

Let's use Rake from the nltk package to extract keywords from the overview feature, which is a summary of the plot. We'll put those keywords into a new column: plotwords.

In [7]:
# Initialize empty column
df['plotwords'] = ''

# function to get keywords from a text
def get_keywords(x):
    plot = x
    
    
    rake = Rake()
    
    # extract keywords from text
    rake.extract_keywords_from_text(plot)
    
    
    scores = rake.get_word_degrees()
    
    
    return(list(scores.keys()))

# Apply function to generate keywords
df['plotwords'] = df['overview'].apply(get_keywords)

Now that we have our plot keywords, let's combine our our cleaned features with them to create a bag of words. We'll make a new dataframe.

In [8]:
df_keys = pd.DataFrame() 

df_keys['title'] = df['original_title']
df_keys['keywords'] = ''

def bag_words(x):
    return(' '.join(x['genres']) + ' ' + ' '.join(x['keywords']) + ' ' +  ' '.join(x['cast']) + 
           ' ' + ' '.join(x['director']) + ' ' + ' '.join(x['plotwords']))
df_keys['keywords'] = df.apply(bag_words, axis = 1)

df_keys.head()

Unnamed: 0,title,keywords
0,Avatar,action adventure fantasy sciencefiction cultur...
1,Pirates of the Caribbean: At World's End,adventure fantasy action ocean drugabuse exoti...
2,Spectre,action adventure crime spy basedonnovel secret...
3,The Dark Knight Rises,action crime drama thriller dccomics crimefigh...
4,John Carter,action adventure sciencefiction basedonnovel m...


## Creating Model

We will use CountVectorizer from scikit-learn to convet the keywords into a matrix of token counts, producing the frequency of each word.

In [9]:
# create count matrix
cv = CountVectorizer()
cv_mx = cv.fit_transform(df_keys['keywords'])

In [10]:
# create cosine similarity matrix
cosine_sim = cosine_similarity(cv_mx, cv_mx)
cosine_sim

array([[1.        , 0.10527936, 0.06299408, ..., 0.        , 0.        ,
        0.        ],
       [0.10527936, 1.        , 0.09284767, ..., 0.02438299, 0.        ,
        0.        ],
       [0.06299408, 0.09284767, 1.        , ..., 0.02188441, 0.        ,
        0.        ],
       ...,
       [0.        , 0.02438299, 0.02188441, ..., 1.        , 0.05938557,
        0.03959038],
       [0.        , 0.        , 0.        , ..., 0.05938557, 1.        ,
        0.06818182],
       [0.        , 0.        , 0.        , ..., 0.03959038, 0.06818182,
        1.        ]])

In [11]:
# create list of indices for later matching
indices = pd.Series(df_keys.index, index = df_keys['title'])

# Recommendation 
Now we will write the actual recommendation function.

In [12]:
def recommend_movie(title, n = 10, cosine_sim = cosine_sim):
    movies = []
    
    # retrieve matching movie title index
    if title not in indices.index:
        print("Movie not in database.")`11
        return
    else:
        idx = indices[title]
    
    # cosine similarity scores of movies in descending order
    scores = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    
    # top n most similar movies indexes
    # use 1:n because 0 is the same movie entered
    top_n_idx = list(scores.iloc[1:n].index)
        
    return df_keys['title'].iloc[top_n_idx]

### Testing our Recommendation Engine!

In [13]:
recommend_movie('Toy Story', n = 5)

343        Toy Story 2
42         Toy Story 3
221    Stuart Little 2
837         Free Birds
Name: title, dtype: object

In [14]:
recommend_movie('The Avengers')

7                  Avengers: Age of Ultron
174                    The Incredible Hulk
511                                  X-Men
85     Captain America: The Winter Soldier
79                              Iron Man 2
26              Captain America: Civil War
169     Captain America: The First Avenger
31                              Iron Man 3
68                                Iron Man
Name: title, dtype: object

In [15]:
recommend_movie('The Hobbit: An Unexpected Journey')

22                    The Hobbit: The Desolation of Smaug
19              The Hobbit: The Battle of the Five Armies
262     The Lord of the Rings: The Fellowship of the Ring
329         The Lord of the Rings: The Return of the King
330                 The Lord of the Rings: The Two Towers
292                                                Eragon
84                                               47 Ronin
1044                   Journey to the Center of the Earth
1                Pirates of the Caribbean: At World's End
Name: title, dtype: object

In [16]:
recommend_movie('Go', n = 7)

3727       Snabba Cash
4019        Jesus' Son
2055         Mad Money
3623              Made
388     Ocean's Eleven
3488         Novocaine
Name: title, dtype: object

In [17]:

recommend_movie('Made', n = 7)

3851                 Taxman
1873             Blood Ties
4401    The Helix... Loaded
3180     The Way of the Gun
1038        The Infiltrator
534                 Bandits
Name: title, dtype: object