In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer used later to turn each movie's "soup" string into
# token counts; stop_words='english' selects the built-in English stop list.
# NOTE(review): the default token pattern presumably splits hyphenated tokens
# such as "sci-fi" or "josephgordon-levitt" into separate terms — confirm that
# this is acceptable for the name-joining approach used in this notebook.
countv = CountVectorizer(stop_words = 'english')

In [3]:
data = pd.read_csv('my_final_data.csv')

In [4]:
df = data.copy()
df.head()

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi,avatar,CCH Pounder Joel David Moore Wes Studi James C...
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,pirates of the caribbean: at world's end,Johnny Depp Orlando Bloom Jack Davenport Gore ...
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,spectre,Christoph Waltz Rory Kinnear Stephanie Sigman ...
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller,the dark knight rises,Tom Hardy Christian Bale Joseph Gordon-Levitt ...
4,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary,star wars: episode vii - the force awakens ...,Doug Walker Rob Walker unknown Doug Walker Doc...


In [5]:
df.shape

(6147, 7)

Removing the spaces inside each name is an important preprocessing step. It is done so that your vectorizer doesn't count the Johnny of "Johnny Depp" and "Johnny Galecki" as the same token. After this step (which also lower-cases the text), the aforementioned actors will be represented as "johnnydepp" and "johnnygalecki" and will be distinct to your vectorizer.

In [6]:
# Convert all strings to lower case and strip names of spaces

def clean_data(x):
    """Lower-case a name and remove the spaces inside it.

    Parameters
    ----------
    x : str, list of str, or anything else
        A single name, a list of names, or a missing/non-text value.

    Returns
    -------
    str or list of str
        The normalised name (or a list of normalised names when ``x`` is a
        list). Any value that is neither a string nor a list, such as NaN or
        None, yields an empty string.
    """
    # Single name: "Johnny Depp" -> "johnnydepp"
    if isinstance(x, str):
        return x.replace(" ", "").lower()

    # Several names: normalise each entry independently
    if isinstance(x, list):
        return [name.replace(" ", "").lower() for name in x]

    # Missing or non-text value
    return ''

In [7]:
# Name columns whose values are normalised with clean_data
features = [
    'director_name',
    'actor_1_name',
    'actor_2_name',
    'actor_3_name',
]

# Replace each name column with its lower-cased, space-free version
for feature in features:
    df[feature] = df[feature].apply(clean_data)

In [8]:
df.head()

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,jamescameron,cchpounder,joeldavidmoore,wesstudi,Action Adventure Fantasy Sci-Fi,avatar,CCH Pounder Joel David Moore Wes Studi James C...
1,goreverbinski,johnnydepp,orlandobloom,jackdavenport,Action Adventure Fantasy,pirates of the caribbean: at world's end,Johnny Depp Orlando Bloom Jack Davenport Gore ...
2,sammendes,christophwaltz,rorykinnear,stephaniesigman,Action Adventure Thriller,spectre,Christoph Waltz Rory Kinnear Stephanie Sigman ...
3,christophernolan,tomhardy,christianbale,josephgordon-levitt,Action Thriller,the dark knight rises,Tom Hardy Christian Bale Joseph Gordon-Levitt ...
4,dougwalker,dougwalker,robwalker,unknown,Documentary,star wars: episode vii - the force awakens ...,Doug Walker Rob Walker unknown Doug Walker Doc...


In [9]:
df['genres'] = df['genres'].str.lower()

In [10]:
df.head()

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,jamescameron,cchpounder,joeldavidmoore,wesstudi,action adventure fantasy sci-fi,avatar,CCH Pounder Joel David Moore Wes Studi James C...
1,goreverbinski,johnnydepp,orlandobloom,jackdavenport,action adventure fantasy,pirates of the caribbean: at world's end,Johnny Depp Orlando Bloom Jack Davenport Gore ...
2,sammendes,christophwaltz,rorykinnear,stephaniesigman,action adventure thriller,spectre,Christoph Waltz Rory Kinnear Stephanie Sigman ...
3,christophernolan,tomhardy,christianbale,josephgordon-levitt,action thriller,the dark knight rises,Tom Hardy Christian Bale Joseph Gordon-Levitt ...
4,dougwalker,dougwalker,robwalker,unknown,documentary,star wars: episode vii - the force awakens ...,Doug Walker Rob Walker unknown Doug Walker Doc...


In [11]:
def create_soup(x):
    """Build the text "soup" for one movie.

    Parameters
    ----------
    x : mapping or pandas.Series
        A row providing string values for ``director_name``,
        ``actor_1_name``, ``actor_2_name``, ``actor_3_name`` and ``genres``.

    Returns
    -------
    str
        The five fields, in that order, separated by single spaces.
    """
    soup_columns = (
        'director_name',
        'actor_1_name',
        'actor_2_name',
        'actor_3_name',
        'genres',
    )
    return ' '.join(x[column] for column in soup_columns)

In [12]:
df['soup'] = df.apply(create_soup, axis = 1)

In [13]:
df.head()

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb,soup
0,jamescameron,cchpounder,joeldavidmoore,wesstudi,action adventure fantasy sci-fi,avatar,CCH Pounder Joel David Moore Wes Studi James C...,jamescameron cchpounder joeldavidmoore wesstud...
1,goreverbinski,johnnydepp,orlandobloom,jackdavenport,action adventure fantasy,pirates of the caribbean: at world's end,Johnny Depp Orlando Bloom Jack Davenport Gore ...,goreverbinski johnnydepp orlandobloom jackdave...
2,sammendes,christophwaltz,rorykinnear,stephaniesigman,action adventure thriller,spectre,Christoph Waltz Rory Kinnear Stephanie Sigman ...,sammendes christophwaltz rorykinnear stephanie...
3,christophernolan,tomhardy,christianbale,josephgordon-levitt,action thriller,the dark knight rises,Tom Hardy Christian Bale Joseph Gordon-Levitt ...,christophernolan tomhardy christianbale joseph...
4,dougwalker,dougwalker,robwalker,unknown,documentary,star wars: episode vii - the force awakens ...,Doug Walker Rob Walker unknown Doug Walker Doc...,dougwalker dougwalker robwalker unknown docume...


In [14]:
# Drop the pre-built 'comb' column now that 'soup' exists.
# A non-mutating drop with errors='ignore' keeps this cell re-runnable: the
# in-place version raised KeyError on a second run because 'comb' was gone.
df = df.drop(columns=['comb'], errors='ignore')

In [15]:
df.head()

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,soup
0,jamescameron,cchpounder,joeldavidmoore,wesstudi,action adventure fantasy sci-fi,avatar,jamescameron cchpounder joeldavidmoore wesstud...
1,goreverbinski,johnnydepp,orlandobloom,jackdavenport,action adventure fantasy,pirates of the caribbean: at world's end,goreverbinski johnnydepp orlandobloom jackdave...
2,sammendes,christophwaltz,rorykinnear,stephaniesigman,action adventure thriller,spectre,sammendes christophwaltz rorykinnear stephanie...
3,christophernolan,tomhardy,christianbale,josephgordon-levitt,action thriller,the dark knight rises,christophernolan tomhardy christianbale joseph...
4,dougwalker,dougwalker,robwalker,unknown,documentary,star wars: episode vii - the force awakens ...,dougwalker dougwalker robwalker unknown docume...


In [16]:
df['soup'][1]

'goreverbinski johnnydepp orlandobloom jackdavenport action adventure fantasy'

## Content-Based Recommender

In [17]:
count_matrix = countv.fit_transform(df['soup'])

In [18]:
count_matrix.shape

(6147, 11767)

In [19]:
# Cosine similarity between every pair of rows of count_matrix.
# Called with a single argument, cosine_similarity compares the matrix
# against itself.
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(count_matrix)

In [20]:
cosine_sim

array([[1.        , 0.37796447, 0.25197632, ..., 0.        , 0.        ,
        0.        ],
       [0.37796447, 1.        , 0.28571429, ..., 0.        , 0.        ,
        0.        ],
       [0.25197632, 0.28571429, 1.        , ..., 0.        , 0.14285714,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.15430335,
        0.18257419],
       [0.        , 0.        , 0.14285714, ..., 0.15430335, 1.        ,
        0.16903085],
       [0.        , 0.        , 0.        , ..., 0.18257419, 0.16903085,
        1.        ]])

In [21]:
# Construct a reverse map from movie title to row index.
# Series.drop_duplicates() compares the values (the row labels), not the
# titles held in the index, so it cannot remove repeated titles. Filter on the
# index instead, keeping the first row for each title, so that indices[title]
# is always a single integer.
indices = pd.Series(df.index, index=df['movie_title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [22]:
indices.shape

(6147,)

In [23]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in ``indices``.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; row ``i`` holds the similarity of movie
        ``i`` to every movie.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Values of ``df['movie_title']`` for the most similar movies, ordered
        from most to least similar.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Get the index of the movie that matches the title
    idx = indices[title]

    # A title that occurs more than once in the lookup yields a Series of
    # indices rather than a scalar; use the first matching movie.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Get the pairwise similarity scores of all other movies with that movie.
    # The queried movie is excluded explicitly: skipping the first sorted entry
    # is wrong when another movie ties with it at the top score.
    sim_scores = [
        (i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx
    ]

    # Sort the movies based on the similarity scores (stable, so ties keep
    # their original row order)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Keep the top_n most similar movies
    sim_scores = sim_scores[:top_n]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the titles of the most similar movies
    return df['movie_title'].iloc[movie_indices]

In [24]:
get_recommendations('the dark knight rises', cosine_sim)

86                         inception
57                   the dark knight
648     sin city: a dame to kill for
1407                    premium rush
68       g.i. joe: the rise of cobra
1074             miracle at st. anna
5971                            7500
107                    batman begins
1542                          looper
853                            shaft
Name: movie_title, dtype: object

In [25]:
get_recommendations('the godfather', cosine_sim)

2722     the godfather: part ii
847     the godfather: part iii
1499             apocalypse now
934                   the judge
1823                   scarface
3005              the outsiders
4119      to kill a mockingbird
590                   the score
1000            the cotton club
1188              the rainmaker
Name: movie_title, dtype: object

In [26]:
get_recommendations('american pie', cosine_sim)

1515                    american pie 2
929                   american reunion
1035                     scary movie 2
2012    jay and silent bob strike back
2677                dead man on campus
2823           johnson family vacation
2930                           tomcats
265                     little fockers
798                   american wedding
1406                            cursed
Name: movie_title, dtype: object