# A content based movie recommendation system. 

In [211]:
import pandas as pd
import numpy as np

In [212]:
!ls

Content Based Movie Recomender.ipynb [34mevaluating-recommenders[m[m
[34mExample projects[m[m                     [34mml-latest-small[m[m
[34mModule_4_Project_Flatiron[m[m            ml-latest-small (1).zip


In [213]:
import glob
print(glob.glob("ml-latest-small/*.csv"))

['ml-latest-small/links.csv', 'ml-latest-small/tags.csv', 'ml-latest-small/ratings.csv', 'ml-latest-small/movies.csv']


## Read in the data sets

In [214]:
links = pd.read_csv('ml-latest-small/links.csv')
tags = pd.read_csv('ml-latest-small/tags.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')
movies = pd.read_csv('ml-latest-small/movies.csv')

In [215]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [216]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [217]:
print(ratings.shape)
ratings.head()

(100836, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [218]:
print(movies.shape)
movies.head()

(9742, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [219]:
print(tags['tag'].nunique(), tags['userId'].nunique())

1589 58


In [220]:
tags.shape

(3683, 4)

In [221]:
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


In [222]:
# tags.rename(columns = {'movieId':'movie_Id'}, inplace=True)
# tags.head()

We want build a content based recommendation engine but we currently have no reviews or movie details column. In order to carry out the necessary NLP vectorization and given the available information is relatively light on the ground we have decided to split and merge any appropriate column. The following will be merged into a single details column: release date, tag and genre

## Data preparation

In [223]:
new_movies = movies['title'].str.split("(",  n=1, expand=True)

In [224]:
movies["title"] = new_movies[0]
movies["release_date"] = new_movies[1]
# movies.drop(columns=["title"], inplace=True)

movies.head()

Unnamed: 0,movieId,title,genres,release_date
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995)
1,2,Jumanji,Adventure|Children|Fantasy,1995)
2,3,Grumpier Old Men,Comedy|Romance,1995)
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995)
4,5,Father of the Bride Part II,Comedy,1995)


In [225]:
movies['release_date'] = movies['release_date'].str.replace(r")","")

In [226]:
movies.head()

Unnamed: 0,movieId,title,genres,release_date
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [227]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   movieId       9742 non-null   int64 
 1   title         9742 non-null   object
 2   genres        9742 non-null   object
 3   release_date  9730 non-null   object
dtypes: int64(1), object(3)
memory usage: 304.6+ KB


In [228]:
df = movies.merge(tags, how='left', left_on='movieId', right_on='movieId')
print(df.shape)
df.head()

(11853, 7)


Unnamed: 0,movieId,title,genres,release_date,userId,tag,timestamp
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,336.0,pixar,1139046000.0
1,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,474.0,pixar,1137207000.0
2,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,567.0,fun,1525286000.0
3,2,Jumanji,Adventure|Children|Fantasy,1995,62.0,fantasy,1528844000.0
4,2,Jumanji,Adventure|Children|Fantasy,1995,62.0,magic board game,1528844000.0


In [229]:
# df_info = pd.concat([movies, tags], axis=1)
# df_info.head()

In [230]:
#merge the required columns into a single 'details' column. 
df_details = df.assign(details = df['genres'].astype(str) + ',' + \
                     df['release_date'].astype(str) + ',' + df['tag'].astype(str))

In [231]:
df_details.head()

Unnamed: 0,movieId,title,genres,release_date,userId,tag,timestamp,details
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,336.0,pixar,1139046000.0,"Adventure|Animation|Children|Comedy|Fantasy,19..."
1,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,474.0,pixar,1137207000.0,"Adventure|Animation|Children|Comedy|Fantasy,19..."
2,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,567.0,fun,1525286000.0,"Adventure|Animation|Children|Comedy|Fantasy,19..."
3,2,Jumanji,Adventure|Children|Fantasy,1995,62.0,fantasy,1528844000.0,"Adventure|Children|Fantasy,1995,fantasy"
4,2,Jumanji,Adventure|Children|Fantasy,1995,62.0,magic board game,1528844000.0,"Adventure|Children|Fantasy,1995,magic board game"


In [232]:
df_details.drop(columns=['genres', 'release_date', 'tag', 'timestamp'], inplace=True)

In [233]:
df_details.head(12000)

Unnamed: 0,movieId,title,userId,details
0,1,Toy Story,336.0,"Adventure|Animation|Children|Comedy|Fantasy,19..."
1,1,Toy Story,474.0,"Adventure|Animation|Children|Comedy|Fantasy,19..."
2,1,Toy Story,567.0,"Adventure|Animation|Children|Comedy|Fantasy,19..."
3,2,Jumanji,62.0,"Adventure|Children|Fantasy,1995,fantasy"
4,2,Jumanji,62.0,"Adventure|Children|Fantasy,1995,magic board game"
...,...,...,...,...
11848,193581,Black Butler: Book of the Atlantic,,"Action|Animation|Comedy|Fantasy,2017,nan"
11849,193583,No Game No Life: Zero,,"Animation|Comedy|Fantasy,2017,nan"
11850,193585,Flint,,"Drama,2017,nan"
11851,193587,Bungo Stray Dogs: Dead Apple,,"Action|Animation,2018,nan"


We have multiple movie entries in this format which will make the recommendations nonsensical or hard to understand... for example we could end up being recommended the same movie as we're comparing to or the same movie more than once. We will group the dataframe by movieID and merge the details info. We will lose the user id as this is not important for the content. 

In [234]:
#Reduce to a single instance of each movie with all the info associated with multiples kept in the details col. 
data = df_details.groupby('movieId').agg({'title':'first',
                                          'details': ', '.join}).reset_index()
data.head() 

Unnamed: 0,movieId,title,details
0,1,Toy Story,"Adventure|Animation|Children|Comedy|Fantasy,19..."
1,2,Jumanji,"Adventure|Children|Fantasy,1995,fantasy, Adven..."
2,3,Grumpier Old Men,"Comedy|Romance,1995,moldy, Comedy|Romance,1995..."
3,4,Waiting to Exhale,"Comedy|Drama|Romance,1995,nan"
4,5,Father of the Bride Part II,"Comedy,1995,pregnancy, Comedy,1995,remake"


In [237]:
data.to_csv(r'ml-latest-small/movies_titles.csv')

In [73]:
data.shape

(9742, 3)

In [206]:
#Check to make sure no whitespace in the title after we previously split the column?
data.title.loc[0]

'Toy Story '

In [210]:
#Turns out there is some remaining white space that needs removing. 
data.title = data.title.apply(lambda x: x.strip())
data.title.loc[0] #sorted

'Toy Story'

In [74]:
data['title'].unique()

array(['Toy Story ', 'Jumanji ', 'Grumpier Old Men ', ..., 'Flint ',
       'Bungo Stray Dogs: Dead Apple ', 'Andrew Dice Clay: Dice Rules '],
      dtype=object)

In [75]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   details  9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [54]:
#Use Term Frequence, Inverse Document Frequency to determine the weights of each term in our dataframe
tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                      ngram_range=(1,3), stop_words='english')

In [76]:
#Replace NAN with empty strings
data['details'] = data['details'].fillna('')

In [77]:
#Fit the TF-IDF on the 'details' text and create sparse matrix
tfv_matrix = tfv.fit_transform(data['details'])

In [78]:
tfv_matrix

<9742x3496 sparse matrix of type '<class 'numpy.float64'>'
	with 94614 stored elements in Compressed Sparse Row format>

In [79]:
tfv_matrix.shape

(9742, 3496)

In [80]:
from sklearn.metrics.pairwise import sigmoid_kernel

#create an instance of the sigmoid kernel class
sig = sigmoid_kernel(tfv_matrix, tfv_matrix) #compare how overviews are related to each other

In [81]:
#sigmoid score for all details comparisons with the title at index 0
sig[0]

array([0.76171426, 0.76163688, 0.76160539, ..., 0.76159416, 0.76160057,
       0.7615958 ])

In [175]:
indices = pd.Series(data.index, index=data['title']).drop_duplicates()

In [174]:
# indices.reset_index(inplace=True)

TypeError: Cannot reset_index inplace on a Series to create a DataFrame

In [203]:
indices[3000:3020]

title
Emperor's New Groove, The      3000
Pollock                        3001
What Women Want                3002
Finding Forrester              3003
Gift, The                      3004
Before Night Falls             3005
Cast Away                      3006
Family Man, The                3007
House of Mirth, The            3008
Miss Congeniality              3009
O Brother, Where Art Thou?     3010
State and Main                 3011
Dracula 2000                   3012
All the Pretty Horses          3013
Everlasting Piece, An          3014
Thirteen Days                  3015
Traffic                        3016
Claim, The                     3017
Shadow of the Vampire          3018
House of Games                 3019
dtype: int64

In [199]:
indices['Dumb & Dumber ']

197

In [178]:
# indices[indices['title'].astype(str)=='Toy Story'].index.item()

In [142]:
# indices.loc[indices['title']=='Toy_Story']
# indices.query('title == "Jumanji"')

Unnamed: 0,title


In [180]:
def recommender(title, sig=sig):
    """
    This function takes a movie title as an argument and 
    returns the n=5 most similar movies based on the content
    of the movie details and sig score. 
    
    """
    
    #First we get the index corresponding to the argument / original title
    index = indices[title]
    
    #Then we fetch all the sigmoid scores for the pairwise comparisons with the argument title
    sig_scores = list(enumerate(sig[index]))
    
    #Now we sort the scores so the top recommended movies are at the top
    sorted_sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse=True)
    
    #Slice the top off to give us a list equal to n=5
    top_sig_scores = sorted_sig_scores[1:6]
    
    #Identify the movie indices corresponding to the above list
    movie_index = [i[0] for i in top_sig_scores]
    
    #Now find the title names at these idices and return them as our recommendations!!
    return data['title'].iloc[movie_index]
    

In [205]:
recommender('Dracula 2000 ')

3012                  Dracula 2000 
5671              Ju-on: The Curse 
3018         Shadow of the Vampire 
2828                   Scary Movie 
6177    Dead Hate the Living!, The 
Name: title, dtype: object