# Recommender System with Python

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

## Read the Data

In [13]:
# Load only the columns the recommender uses, with explicit 32-bit dtypes
# instead of letting pandas infer them.
movies_df = pd.read_csv(
    'movies.csv',
    usecols=['movieId', 'title'],
    dtype={'movieId': 'int32', 'title': 'str'},
)
rating_df = pd.read_csv(
    'ratings.csv',
    usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'},
)

In [14]:
# Preview the first five movies to confirm the selected columns loaded.
movies_df.head(n=5)

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [15]:
# Preview the first five ratings (one row per userId / movieId pair).
rating_df.head(n=5)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [18]:
# Attach each rating to its movie's title. The join is an inner join on
# movieId, so only ids present in both tables are kept.
df = movies_df.merge(rating_df, how="inner", on="movieId")
df.head(n=5)

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5


In [23]:
# Discard rows that have no title before aggregating.
combine_movie_rating = df.dropna(subset=['title'], axis=0)

# Count the ratings recorded for each title. Titles are the grouping key, so
# rows that share a title are counted together. The result has exactly two
# columns: 'title' and 'totalRating'.
movie_rating_count = (
    combine_movie_rating
    .groupby('title')['rating']
    .count()
    .rename('totalRating')
    .reset_index()
)

movie_rating_count.head()

Unnamed: 0,title,totalRating
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [24]:
# Copy each title's rating count onto every one of its rating rows. The left
# join keeps all rows of combine_movie_rating.
total_rating_count = pd.merge(
    combine_movie_rating,
    movie_rating_count,
    on='title',
    how='left',
)

total_rating_count.head()

Unnamed: 0,movieId,title,userId,rating,totalRating
0,1,Toy Story (1995),1,4.0,215
1,1,Toy Story (1995),5,4.0,215
2,1,Toy Story (1995),7,4.5,215
3,1,Toy Story (1995),15,2.5,215
4,1,Toy Story (1995),17,4.5,215


In [26]:
# Render floats with three decimals from here on (a notebook-wide display option).
pd.set_option('display.float_format', '{:.3f}'.format)

# Summary statistics of the per-title rating counts; these inform the
# popularity cut-off chosen afterwards.
print(movie_rating_count['totalRating'].describe())

count   9719.000
mean      10.375
std       22.406
min        1.000
25%        1.000
50%        3.000
75%        9.000
max      329.000
Name: totalRating, dtype: float64


In [30]:
# Minimum number of ratings a title must have to be kept.
popularity_threshold = 60

# Keep only the rating rows whose title meets the threshold.
rating_popular_movie = total_rating_count.loc[
    lambda frame: frame['totalRating'] >= popularity_threshold
]
rating_popular_movie.head()

Unnamed: 0,movieId,title,userId,rating,totalRating
0,1,Toy Story (1995),1,4.0,215
1,1,Toy Story (1995),5,4.0,215
2,1,Toy Story (1995),7,4.5,215
3,1,Toy Story (1995),15,2.5,215
4,1,Toy Story (1995),17,4.5,215


In [31]:
# (rows, columns) of the ratings table after the popularity filter.
rating_popular_movie.shape

(35080, 5)

### Create a Pivot Table

In [32]:
# Build the title-by-user rating matrix: one row per title, one column per
# userId. pivot_table's default aggregation (mean) applies if a title/user
# pair occurs more than once. Missing pairs are filled with 0.
movie_features_df = (
    rating_popular_movie
    .pivot_table(values='rating', index='title', columns='userId')
    .fillna(0)
)

movie_features_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0
"40-Year-Old Virgin, The (2005)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5
"Abyss, The (1989)",4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0
Ace Ventura: Pet Detective (1994),0.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,2.0,0.0,0.0,0.0,3.5,0.0,3.0


In [35]:
# Hold the title-by-user ratings as a compressed sparse row matrix, which is
# the representation the model below is fitted on.
movie_features_df_matrix = csr_matrix(movie_features_df.values)

# Unsupervised nearest-neighbour index over the movie rows, using cosine
# distance with a brute-force search.
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(movie_features_df_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [36]:
# (number of titles, number of users) in the pivoted rating matrix.
movie_features_df.shape

(335, 603)

In [38]:
# Pick the query movie with a seeded generator so that Restart & Run All
# selects the same row every time. Change QUERY_SEED to sample a different one.
QUERY_SEED = 42
rng = np.random.default_rng(QUERY_SEED)

# Positional row index into movie_features_df, in [0, number of titles).
query_index = int(rng.integers(movie_features_df.shape[0]))
print(query_index)

276


In [43]:
# Query the model with the chosen movie's rating row. Six neighbours are
# requested because the first result is skipped below; it is presumably the
# query movie itself at distance ~0, which leaves five recommendations.
distances, indices = knn_model.kneighbors(
    movie_features_df.iloc[query_index, :].values.reshape(1, -1),
    n_neighbors=6,
)

# kneighbors returns 2-D arrays (one row per query); flatten them once.
neighbor_distances = distances.flatten()
neighbor_positions = indices.flatten()

print('Recommendations for {0}:\n'.format(movie_features_df.index[query_index]))

# Start at rank 1 so the first neighbour is not listed.
for rank in range(1, len(neighbor_distances)):
    print('{0} : {1}, with distance of {2}'.format(
        rank,
        movie_features_df.index[neighbor_positions[rank]],
        neighbor_distances[rank],
    ))

Recommendations for Snatch (2000):

1 : Big Lebowski, The (1998), with distance of 0.4327002167701721
2 : Lock, Stock & Two Smoking Barrels (1998), with distance of 0.43558841943740845
3 : Fight Club (1999), with distance of 0.44815343618392944
4 : Reservoir Dogs (1992), with distance of 0.45795756578445435
5 : Sin City (2005), with distance of 0.46202003955841064


# Conclusion

In this notebook, we built a simple item-based movie recommender system: each movie is represented by its vector of user ratings, movies are compared using cosine distance, and recommendations are produced with the k-nearest-neighbors algorithm.