### Recommendation System

A Simple recommender system, using Numpy and Pandas on the MovieLens Movie Dataset

In [1]:
import pandas as pd
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
import os

%matplotlib inline

### Read Both the Movies Dataset containing Title, Genre and Year, and Movie Ratings information

In [2]:
#Storing the movie information into a pandas dataframe
movies_df = pd.read_csv(os.path.join('data','movies.csv'))

#Storing the user information into a pandas dataframe
ratings_df = pd.read_csv(os.path.join('data','ratings.csv'))

movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


#### Data Cleaning

In [3]:
movies_df['year'] = movies_df.title.str.extract('(\d\d\d\d)', expand = False)
movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '')
movies_df['genres'] = movies_df.genres.str.split('|')
movies_df['title'] = movies_df['title'].apply(lambda x: x.strip())
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


#### One Hot Encoding of Movie Genres

In [4]:
genre_matrix = movies_df.copy()

for index, row in movies_df.iterrows():
    for genre in row['genres']:
        genre_matrix.at[index, genre] = 1
genre_matrix = genre_matrix.fillna(0)
genre_matrix.head()

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [6]:
ratings_df = ratings_df.drop(columns ='timestamp')
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


#### Creating Random input rating of a user

In [7]:
userrate = [{'title': 'Grumpier Old Men', 'rating': 3.3},
           {'title': 'Jumanji', 'rating': 4.5}, {'title': 'Money Train', 'rating': 3.2},{'title': 'Milk Money', 'rating': 2.6},
           {'title': 'Priest', 'rating': 4.1}, {'title': 'Priest', 'rating': 3.7}]
userin = pd.DataFrame(userrate)
userin

Unnamed: 0,rating,title
0,3.3,Grumpier Old Men
1,4.5,Jumanji
2,3.2,Money Train
3,2.6,Milk Money
4,4.1,Priest
5,3.7,Priest


In [8]:
inputm = movies_df[movies_df['title'].isin(userin['title'].tolist())]
userin = pd.merge(inputm, userin)
userin

Unnamed: 0,movieId,title,genres,year,rating
0,2,Jumanji,"[Adventure, Children, Fantasy]",1995,4.5
1,3,Grumpier Old Men,"[Comedy, Romance]",1995,3.3
2,20,Money Train,"[Action, Comedy, Crime, Drama, Thriller]",1995,3.2
3,276,Milk Money,"[Comedy, Romance]",1994,2.6
4,299,Priest,[Drama],1994,4.1
5,299,Priest,[Drama],1994,3.7
6,86835,Priest,"[Action, Horror, Sci-Fi, Thriller]",2011,4.1
7,86835,Priest,"[Action, Horror, Sci-Fi, Thriller]",2011,3.7


In [9]:
genre_matrix2 = genre_matrix[genre_matrix['title'].isin(userin['title'].tolist())]
genre_matrix2

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,20,Money Train,"[Action, Comedy, Crime, Drama, Thriller]",1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
273,276,Milk Money,"[Comedy, Romance]",1994,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
296,299,Priest,[Drama],1994,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17207,86835,Priest,"[Action, Horror, Sci-Fi, Thriller]",2011,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
userin = userin.drop('genres', axis = 1)
userin

Unnamed: 0,movieId,title,year,rating
0,2,Jumanji,1995,4.5
1,3,Grumpier Old Men,1995,3.3
2,20,Money Train,1995,3.2
3,276,Milk Money,1994,2.6
4,299,Priest,1994,4.1
5,299,Priest,1994,3.7
6,86835,Priest,2011,4.1
7,86835,Priest,2011,3.7


In [11]:
userin = userin.drop(index = 4).drop(index = 7)
userin

Unnamed: 0,movieId,title,year,rating
0,2,Jumanji,1995,4.5
1,3,Grumpier Old Men,1995,3.3
2,20,Money Train,1995,3.2
3,276,Milk Money,1994,2.6
5,299,Priest,1994,3.7
6,86835,Priest,2011,4.1


In [12]:
userin.reset_index(drop = True)

Unnamed: 0,movieId,title,year,rating
0,2,Jumanji,1995,4.5
1,3,Grumpier Old Men,1995,3.3
2,20,Money Train,1995,3.2
3,276,Milk Money,1994,2.6
4,299,Priest,1994,3.7
5,86835,Priest,2011,4.1


In [13]:
userin = userin.reset_index(drop = True)

In [14]:
genre_matrix2 = genre_matrix2.drop(columns = ['movieId', 'title', 'genres', 'year'])
genre_matrix2 = genre_matrix2.reset_index(drop = True)
genre_matrix2

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Dot product of the weighted genre Matrix using User Rating Collected Initially

In [15]:
np.dot(userin['rating'], genre_matrix2)

array([4.5, 0. , 4.5, 9.1, 4.5, 5.9, 6.9, 7.3, 3.2, 7.3, 4.1, 0. , 4.1,
       0. , 0. , 0. , 0. , 0. , 0. , 0. ])

In [16]:
weightedrate = genre_matrix2.transpose().dot(userin['rating'])
weightedrate

Adventure             4.5
Animation             0.0
Children              4.5
Comedy                9.1
Fantasy               4.5
Romance               5.9
Drama                 6.9
Action                7.3
Crime                 3.2
Thriller              7.3
Horror                4.1
Mystery               0.0
Sci-Fi                4.1
IMAX                  0.0
Documentary           0.0
War                   0.0
Musical               0.0
Western               0.0
Film-Noir             0.0
(no genres listed)    0.0
dtype: float64

In [17]:
genre_matrix = genre_matrix.drop(columns = ['movieId', 'title', 'genres', 'year'])
genre_matrix = genre_matrix.reset_index(drop = True)
genre_matrix

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Display weighted matrix by movie

In [18]:
recommendertable = (genre_matrix*weightedrate).sum(axis = 1)/(weightedrate.sum())
recommendertable = recommendertable.sort_values(ascending = False)
recommendertable.head(20)

16055    0.690554
4861     0.667752
16504    0.667752
14397    0.659609
15073    0.646580
4625     0.646580
15001    0.646580
27514    0.646580
4923     0.644951
25218    0.644951
33692    0.644951
13250    0.623779
26442    0.623779
9296     0.622150
7124     0.617264
11497    0.607492
9697     0.607492
33914    0.600977
32307    0.600977
26806    0.600977
dtype: float64

### Recommendation

First 55 recommendation by weighted matrix gotten

In [23]:
recmovie = movies_df.loc[movies_df['movieId']. isin(recommendertable.head(55).keys())]
recmovie

Unnamed: 0,movieId,title,genres,year
18,19,Ace Ventura: When Nature Calls,[Comedy],1995
141,143,Gospa,[Drama],1995
372,376,"River Wild, The","[Action, Thriller]",1994
451,455,Free Willy,"[Adventure, Children, Drama]",1993
538,542,Son in Law,"[Comedy, Drama, Romance]",1993
575,581,"Celluloid Closet, The",[Documentary],1995
1367,1398,In Love and War,"[Romance, War]",1996
2449,2533,Escape from the Planet of the Apes,"[Action, Sci-Fi]",1971
4531,4625,Millennium,"[Drama, Sci-Fi, Thriller]",1989
4766,4861,Mission to Mir,"[Documentary, IMAX]",1997
