# A content based movie recommendation system. 

In [1]:
import pandas as pd
import numpy as np

In [5]:
!ls

Content Based Movie Recomender.ipynb [34mml-latest-small[m[m
[34mevaluating-recommenders[m[m              ml-latest-small (1).zip


In [8]:
import glob
print(glob.glob("ml-latest-small/*.csv"))

['ml-latest-small/links.csv', 'ml-latest-small/tags.csv', 'ml-latest-small/ratings.csv', 'ml-latest-small/movies.csv']


## Read in the data sets

In [140]:
links = pd.read_csv('ml-latest-small/links.csv')
tags = pd.read_csv('ml-latest-small/tags.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')
movies = pd.read_csv('ml-latest-small/movies.csv')

In [141]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [142]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [143]:
print(ratings.shape)
ratings.head()

(100836, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [144]:
print(movies.shape)
movies.head()

(9742, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [145]:
print(tags['tag'].nunique(), tags['userId'].nunique())

1589 58


In [146]:
tags.shape

(3683, 4)

In [147]:
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


In [148]:
# tags.rename(columns = {'movieId':'movie_Id'}, inplace=True)
# tags.head()

We want build a content based recommendation engine but we currently have no reviews or movie details column. In order to carry out the necessary NLP vectorization and given the available information is relatively light on the ground we have decided to split and merge any appropriate column. The following will be merged into a single details column: release date, tag and genre

## Data preparation

In [149]:
new_movies = movies['title'].str.split("(",  n=1, expand=True)

In [150]:
movies["title"] = new_movies[0]
movies["release_date"] = new_movies[1]
# movies.drop(columns=["title"], inplace=True)

movies.head()

Unnamed: 0,movieId,title,genres,release_date
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995)
1,2,Jumanji,Adventure|Children|Fantasy,1995)
2,3,Grumpier Old Men,Comedy|Romance,1995)
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995)
4,5,Father of the Bride Part II,Comedy,1995)


In [151]:
movies['release_date'] = movies['release_date'].str.replace(r")","")

In [152]:
movies.head()

Unnamed: 0,movieId,title,genres,release_date
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [153]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   movieId       9742 non-null   int64 
 1   title         9742 non-null   object
 2   genres        9742 non-null   object
 3   release_date  9730 non-null   object
dtypes: int64(1), object(3)
memory usage: 304.6+ KB


In [171]:
df = movies.merge(tags, how='left', left_on='movieId', right_on='movieId')
print(df.shape)
df.head()

(11853, 7)


Unnamed: 0,movieId,title,genres,release_date,userId,tag,timestamp
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,336.0,pixar,1139046000.0
1,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,474.0,pixar,1137207000.0
2,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,567.0,fun,1525286000.0
3,2,Jumanji,Adventure|Children|Fantasy,1995,62.0,fantasy,1528844000.0
4,2,Jumanji,Adventure|Children|Fantasy,1995,62.0,magic board game,1528844000.0


In [159]:
# df_info = pd.concat([movies, tags], axis=1)
# df_info.head()

In [160]:
#merge the required columns into a single 'details' column. 
df_details = df.assign(details = df['genres'].astype(str) + ',' + \
                     df['release_date'].astype(str) + ',' + df['tag'].astype(str))

In [161]:
df_details.head()

Unnamed: 0,movieId,title,genres,release_date,userId,tag,timestamp,details
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,336.0,pixar,1139046000.0,"Adventure|Animation|Children|Comedy|Fantasy,19..."
1,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,474.0,pixar,1137207000.0,"Adventure|Animation|Children|Comedy|Fantasy,19..."
2,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,567.0,fun,1525286000.0,"Adventure|Animation|Children|Comedy|Fantasy,19..."
3,2,Jumanji,Adventure|Children|Fantasy,1995,62.0,fantasy,1528844000.0,"Adventure|Children|Fantasy,1995,fantasy"
4,2,Jumanji,Adventure|Children|Fantasy,1995,62.0,magic board game,1528844000.0,"Adventure|Children|Fantasy,1995,magic board game"


In [162]:
df_details.drop(columns=['genres', 'release_date', 'tag', 'timestamp'], inplace=True)

In [172]:
df_details.head(12000)

Unnamed: 0,movieId,title,userId,details
0,1,Toy Story,336.0,"Adventure|Animation|Children|Comedy|Fantasy,19..."
1,1,Toy Story,474.0,"Adventure|Animation|Children|Comedy|Fantasy,19..."
2,1,Toy Story,567.0,"Adventure|Animation|Children|Comedy|Fantasy,19..."
3,2,Jumanji,62.0,"Adventure|Children|Fantasy,1995,fantasy"
4,2,Jumanji,62.0,"Adventure|Children|Fantasy,1995,magic board game"
...,...,...,...,...
11848,193581,Black Butler: Book of the Atlantic,,"Action|Animation|Comedy|Fantasy,2017,nan"
11849,193583,No Game No Life: Zero,,"Animation|Comedy|Fantasy,2017,nan"
11850,193585,Flint,,"Drama,2017,nan"
11851,193587,Bungo Stray Dogs: Dead Apple,,"Action|Animation,2018,nan"


In [164]:
df_details['title'].unique()

array(['Toy Story ', 'Jumanji ', 'Grumpier Old Men ', ..., 'Flint ',
       'Bungo Stray Dogs: Dead Apple ', 'Andrew Dice Clay: Dice Rules '],
      dtype=object)