In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Data source: MovieLens "latest" dataset, downloadable from
# https://grouplens.org/datasets/movielens/latest/
# ratings.csv is read from the current working directory; the preview below
# shows its columns: userId, movieId, rating, timestamp.
ratings_df = pd.read_csv('ratings.csv')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
# movies.csv is read from the current working directory; the preview below
# shows its columns: movieId, title, genres (pipe-separated).
movies_df = pd.read_csv('movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Idea 1:
- Form a DataFrame in which each column is a user ID and each row is a movie title. That way we can see which user rated which movie.

In [5]:
# Attach title/genres to every rating via the shared movieId key, then
# discard the timestamp column, which the rest of the analysis does not use.
df = (
    pd.merge(ratings_df, movies_df, on='movieId')
    .drop(columns='timestamp')
)
df.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


##### Summary: 
- We can see that 'Toy Story' was rated by users 1, 5, 7, 15, 17, ...

In [6]:
# Summary statistics for the numeric columns (userId, movieId, rating).
df.describe()

Unnamed: 0,userId,movieId,rating
count,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557
std,182.618491,35530.987199,1.042529
min,1.0,1.0,0.5
25%,177.0,1199.0,3.0
50%,325.0,2991.0,3.5
75%,477.0,8122.0,4.0
max,610.0,193609.0,5.0


In [7]:
# Column dtypes, non-null counts and memory footprint of the merged frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   userId   100836 non-null  int64  
 1   movieId  100836 non-null  int64  
 2   rating   100836 non-null  float64
 3   title    100836 non-null  object 
 4   genres   100836 non-null  object 
dtypes: float64(1), int64(2), object(2)
memory usage: 4.6+ MB


In [8]:
# Number of missing values in each column of the merged frame.
df.isna().sum()

userId     0
movieId    0
rating     0
title      0
genres     0
dtype: int64

In [10]:
# Keep only the rows that have a movie title (rows are dropped, not columns).
combine_movie_rating = df.dropna(subset=['title'])
combine_movie_rating.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [11]:
# How many ratings each title received.
combine_movie_rating.groupby('title')['rating'].count()

title
'71 (2014)                                    1
'Hellboy': The Seeds of Creation (2004)       1
'Round Midnight (1986)                        2
'Salem's Lot (2004)                           1
'Til There Was You (1997)                     2
                                             ..
eXistenZ (1999)                              22
xXx (2002)                                   24
xXx: State of the Union (2005)                5
¡Three Amigos! (1986)                        26
À nous la liberté (Freedom for Us) (1931)     1
Name: rating, Length: 9719, dtype: int64

In [18]:
# Number of ratings per title (a Series indexed by title).
# The name `count` is kept so that any other cell referring to it still works.
count = combine_movie_rating.groupby('title')['rating'].count()

# Attach each row's title-level count as a new `ratingCount` column.
# `.assign` builds a new frame instead of writing a column into the existing
# one, so the cell never mutates a frame derived from `df` (which is what can
# trigger pandas' SettingWithCopyWarning) and is safe to re-run.
combine_movie_rating = combine_movie_rating.assign(
    ratingCount=combine_movie_rating['title'].map(count)
)
combine_movie_rating.head()

Unnamed: 0,userId,movieId,rating,title,genres,ratingCount
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
2,7,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
3,15,1,2.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
4,17,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215


`Summary:` We can see that Toy Story was rated 215 times by users.

In [19]:
# Summary statistics again, now including the new ratingCount column.
combine_movie_rating.describe()

Unnamed: 0,userId,movieId,rating,ratingCount
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,58.758777
std,182.618491,35530.987199,1.042529,61.965384
min,1.0,1.0,0.5,1.0
25%,177.0,1199.0,3.0,13.0
50%,325.0,2991.0,3.5,39.0
75%,477.0,8122.0,4.0,84.0
max,610.0,193609.0,5.0,329.0


#### Idea 2:
- Only consider a movie if it has been rated by more than 50 users.

In [22]:
# Minimum popularity: a movie is kept only if it has strictly more than this
# many ratings. Tune this single value to change the cut-off.
rating_count_threshold = 50

# Use the named threshold in the filter (previously the literal 50 was
# repeated here, so changing the variable above had no effect).
popular_movies = combine_movie_rating[
    combine_movie_rating['ratingCount'] > rating_count_threshold
]
popular_movies.head()

Unnamed: 0,userId,movieId,rating,title,genres,ratingCount
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
2,7,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
3,15,1,2.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
4,17,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215


In [23]:
# Idea 1 in practice: build the pivot table with one row per movie title and
# one column per userId, holding the rating as the cell value.
# Movie/user pairs without a rating come out as NaN and are filled with 0.
movie_features_df = (
    popular_movies
    .pivot_table(values='rating', index='title', columns='userId')
    .fillna(0)
)
movie_features_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,5.0
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0


### Finding the Nearest Neighbors

In [24]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Store the title-by-user rating table in compressed sparse row form.
movie_feature_matrix = csr_matrix(movie_features_df.to_numpy())

# Brute-force nearest-neighbour search using cosine distance between the
# per-movie rating rows.
model_knn = NearestNeighbors(algorithm='brute', metric='cosine')
model_knn.fit(movie_feature_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [25]:
# (rows, columns) of the pivot table, i.e. (movie titles, user IDs).
movie_features_df.shape

(437, 606)

`Idea 3:` Note that there are 437 movies and 606 users. Take one random movie and look at its recommended movies, found as its nearest neighbours under cosine similarity.

In [29]:
# Sanity-check the shape of a single-movie query: one row of the pivot table
# reshaped into a 2-D array of shape (1, number_of_users), which is the form
# later passed to model_knn.kneighbors.
# Row 0 is used here because `query_index` is only assigned in the next cell;
# referring to it at this point raised NameError on a fresh
# Restart & Run All. Every row has the same length, so the shape is the same.
movie_features_df.iloc[0, :].values.reshape(1, -1).shape

(1, 606)

In [30]:
# Pick one movie (a row position in movie_features_df) to use as the query.
# A seeded generator makes the pick, and therefore the recommendations below,
# identical on every Restart & Run All; change the seed to try another movie.
rng = np.random.default_rng(42)
query_index = int(rng.integers(movie_features_df.shape[0]))

# Ask for 6 neighbours of the query row (reshaped to a single-row 2-D array).
# `distances` and `indices` each come back with shape (1, 6); `indices` holds
# row positions into movie_features_df.
distances, indices = model_knn.kneighbors(
    movie_features_df.iloc[query_index, :].values.reshape(1, -1),
    n_neighbors=6,
)
print(query_index, distances, indices)

135 [[0.         0.510017   0.55124723 0.55885329 0.5813702  0.59015644]] [[135 166 315 157  97 115]]


In [31]:
# Header: the title of the movie the recommendations are for.
print('Recommendations for {}'.format(movie_features_df.index[query_index]))

# Pair every returned row position with its distance, leaving out the query
# movie itself. The query is removed by comparing row positions instead of
# assuming it occupies slot 0: if another movie has an identical rating row
# (distance 0 as well), the tie order is not guaranteed and the query could
# otherwise be printed as its own recommendation.
neighbour_pairs = [
    (idx, dist)
    for idx, dist in zip(indices.flatten(), distances.flatten())
    if idx != query_index
]

# One slot of the kneighbors result is reserved for the query, so print at
# most (number of returned neighbours - 1) lines, ranked from closest.
for rank, (idx, dist) in enumerate(neighbour_pairs[:max(indices.size - 1, 0)], start=1):
    print('{0}:{1} with distance of {2}'.format(rank, movie_features_df.index[idx], dist))

Recommendations for Ed Wood (1994)
1:Get Shorty (1995) with distance of 0.510016997568939
2:Quiz Show (1994) with distance of 0.5512472343034351
3:Four Weddings and a Funeral (1994) with distance of 0.5588532932559689
4:Clerks (1994) with distance of 0.5813701976372736
5:Dave (1993) with distance of 0.5901564438443736
