## Collaborative Filtering-Based Recommendation System

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

### Read the files and merge them on movieId

In [2]:
# Load the movie catalogue and preview its first rows.
movie_df = pd.read_csv('movies.csv')
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# Load the user ratings and preview the first rows.
rating_df = pd.read_csv('ratings.csv')
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# (rows, columns) of the ratings table, then of the movies table.
(rating_df.shape, movie_df.shape)

((100836, 4), (9742, 3))

In [5]:
# Inner-join each rating onto its movie's metadata via the shared movieId key.
merged_df = pd.merge(movie_df, rating_df, on='movieId')
merged_df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


In [6]:
# Every rating submitted by user 1.
merged_df.loc[merged_df['userId'].eq(1)]

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
325,3,Grumpier Old Men (1995),Comedy|Romance,1,4.0,964981247
433,6,Heat (1995),Action|Crime|Thriller,1,4.0,964982224
2107,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,1,5.0,964983815
2379,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,1,5.0,964982931
...,...,...,...,...,...,...
56816,3744,Shaft (2000),Action|Crime|Thriller,1,4.0,964980694
57276,3793,X-Men (2000),Action|Adventure|Sci-Fi,1,5.0,964981855
57457,3809,What About Bob? (1991),Comedy,1,4.0,964981220
59170,4006,Transformers: The Movie (1986),Adventure|Animation|Children|Sci-Fi,1,4.0,964982903


In [7]:
# Every rating received by the movie with movieId 1.
merged_df.loc[merged_df['movieId'].eq(1)]

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483
...,...,...,...,...,...,...
210,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,606,2.5,1349082950
211,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,607,4.0,964744033
212,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,608,2.5,1117408267
213,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,609,3.0,847221025


### Count the total number of ratings for each movie

In [8]:
# One row per title with the number of ratings that title received.
total_rating_df = (
    merged_df
    .groupby('title')['rating']
    .count()
    .rename('total_ratings')
    .reset_index()
)
total_rating_df

Unnamed: 0,title,total_ratings
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2
...,...,...
9714,eXistenZ (1999),22
9715,xXx (2002),24
9716,xXx: State of the Union (2005),5
9717,¡Three Amigos! (1986),26


In [9]:
# (rows, columns) of the merged ratings, then of the per-title counts.
(merged_df.shape, total_rating_df.shape)

((100836, 6), (9719, 2))

In [10]:
# Attach each title's rating count to every individual rating row.
total_rating_merged_df = pd.merge(merged_df, total_rating_df, on='title')
total_rating_merged_df

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,total_ratings
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703,215
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962,215
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946,215
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970,215
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483,215
...,...,...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0,1537109082,1
100832,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184,3.5,1537109545,1
100833,193585,Flint (2017),Drama,184,3.5,1537109805,1
100834,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184,3.5,1537110021,1


In [11]:
# Summary statistics of the rating counts; used to pick a popularity cut-off.
total_rating_merged_df.loc[:, 'total_ratings'].describe()

count    100836.000000
mean         58.758777
std          61.965384
min           1.000000
25%          13.000000
50%          39.000000
75%          84.000000
max         329.000000
Name: total_ratings, dtype: float64

### Eliminate less popular movies by setting a rating-count threshold.

In [12]:
# Keep only the ratings of titles that received at least this many ratings.
popularity_threshold = 45
popular_movie_df = total_rating_merged_df.loc[
    total_rating_merged_df['total_ratings'].ge(popularity_threshold)
]

In [13]:
# Preview the first five rows that survived the popularity filter.
popular_movie_df.head(5)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,total_ratings
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703,215
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962,215
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946,215
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970,215
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483,215


### Create a pivot table of movies (rows) by users (columns) holding each user's rating of each movie.

In [14]:
# Movies as rows, users as columns, each cell holding that user's rating
# (pivot_table averages duplicates, its default aggregation).
# Missing ratings are filled with the number 0 rather than the string '0',
# so every column keeps a float dtype and the table can be passed straight
# to numeric estimators without a later to-numeric conversion.
movie_feature_table = (
    popular_movie_df
    .pivot_table(index='title', columns='userId', values='rating')
    .fillna(0)
)
movie_feature_table.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0,0,0,0.0,0,0,0.0,0,0,0,...,0.0,0,3.0,0,5.0,0.0,0,0.0,0,0.0
101 Dalmatians (1996),0,0,0,0.0,0,0,0.0,0,0,0,...,0.0,0,4.0,0,3.0,0.0,0,0.0,0,0.0
12 Angry Men (1957),0,0,0,5.0,0,0,0.0,0,0,0,...,5.0,0,0.0,0,0.0,0.0,0,0.0,0,0.0
2001: A Space Odyssey (1968),0,0,0,0.0,0,0,4.0,0,0,0,...,0.0,0,5.0,0,0.0,5.0,0,3.0,0,4.5
28 Days Later (2002),0,0,0,0.0,0,0,0.0,0,0,0,...,0.0,0,0.0,0,0.0,0.0,0,3.5,0,5.0


In [15]:
# Raw rating rows behind one pivot-table row, as a sanity check.
popular_movie_df.loc[popular_movie_df['title'].eq('10 Things I Hate About You (1999)')]

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,total_ratings
45292,2572,10 Things I Hate About You (1999),Comedy|Romance,12,5.0,1247263689,54
45293,2572,10 Things I Hate About You (1999),Comedy|Romance,19,3.0,965706365,54
45294,2572,10 Things I Hate About You (1999),Comedy|Romance,68,4.5,1158534664,54
45295,2572,10 Things I Hate About You (1999),Comedy|Romance,92,5.0,1294941782,54
45296,2572,10 Things I Hate About You (1999),Comedy|Romance,104,5.0,1048586179,54
45297,2572,10 Things I Hate About You (1999),Comedy|Romance,111,4.0,1516153699,54
45298,2572,10 Things I Hate About You (1999),Comedy|Romance,132,4.0,1157923396,54
45299,2572,10 Things I Hate About You (1999),Comedy|Romance,153,1.0,1525552933,54
45300,2572,10 Things I Hate About You (1999),Comedy|Romance,158,3.5,1290764961,54
45301,2572,10 Things I Hate About You (1999),Comedy|Romance,177,4.5,1435534583,54


#### Here we use unsupervised nearest-neighbour learning. Nearest neighbours based on cosine similarity are very effective for finding recommendations for a particular movie.

In [16]:
# Coerce every column to a numeric dtype (values that cannot be parsed
# become NaN), then fit a brute-force nearest-neighbour index that compares
# movies by the cosine distance between their rating vectors.
movie_feature_table = movie_feature_table.apply(
    lambda ratings: pd.to_numeric(ratings, errors='coerce')
)
movie_knn = NearestNeighbors(algorithm='brute', metric='cosine')
movie_knn.fit(movie_feature_table)

In [17]:
# Pick a random movie row to use as an example query. A locally seeded
# generator makes the pick identical on every Restart & Run All without
# touching NumPy's global random state; int() keeps it a plain Python int.
query_index = int(np.random.default_rng(42).integers(movie_feature_table.shape[0]))
query_index

361

In [18]:
# Rating vector (one value per user) of the movie selected as the query.
movie_feature_table.iloc[query_index, :]

userId
1      0.0
2      0.0
3      0.0
4      0.0
5      0.0
      ... 
606    2.0
607    0.0
608    0.0
609    0.0
610    3.5
Name: Pirates of the Caribbean: Dead Man's Chest (2006), Length: 608, dtype: float64

In [19]:
# Preview the first five rows of the feature table after numeric coercion.
movie_feature_table.head(5)

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
101 Dalmatians (1996),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,5.0


#### Function that takes a row index of movie_feature_table and prints that movie's closest neighbours as recommendations (up to `recom_no - 1` of them, since the query movie itself counts as one neighbour).

In [20]:
def recommendation_sys(query_index, recom_no=6):
    """Print the movies whose rating vectors are closest to a given movie.

    Parameters
    ----------
    query_index : int
        Positional row index into ``movie_feature_table`` of the movie to
        find recommendations for.
    recom_no : int, default 6
        Number of neighbours requested from ``movie_knn``. The query movie
        normally counts as one of them, so at most ``recom_no - 1``
        recommendations are printed.

    Returns
    -------
    None
        A header line followed by one numbered line per recommendation is
        written to stdout.
    """
    query_vector = movie_feature_table.iloc[query_index, :].to_numpy().reshape(1, -1)
    distances, positions = movie_knn.kneighbors(query_vector, n_neighbors=recom_no)

    # Normalise a negative index so it can be compared with the non-negative
    # row positions returned by kneighbors.
    query_pos = query_index % movie_feature_table.shape[0]

    # Drop the query movie by position instead of assuming it is neighbour 0:
    # when distances tie (e.g. identical rating vectors) it may appear later
    # in the result, or not at all.
    neighbours = [
        (pos, distance)
        for pos, distance in zip(positions.ravel(), distances.ravel())
        if pos != query_pos
    ][:recom_no - 1]

    print('Recommendations for {0}:\n'.format(movie_feature_table.index[query_index]))
    for rank, (pos, distance) in enumerate(neighbours, start=1):
        print('{0}: {1}, with distance of {2}:'.format(rank, movie_feature_table.index[pos], distance))

In [22]:
# Ask for 2 neighbours of the movie in row 3 of the feature table.
recommendation_sys(3, recom_no=2)

Recommendations for 2001: A Space Odyssey (1968):

1: Blade Runner (1982), with distance of 0.3292643457846741:
