In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the movie catalogue from tmdb_5000_movies.csv, keeping only the
# identifier and title columns.
movies_df = pd.read_csv(
    'tmdb_5000_movies.csv',
    usecols=['id', 'title'],
)

In [3]:
# Preview the first rows to confirm only 'id' and 'title' were loaded.
movies_df.head()

Unnamed: 0,id,title
0,19995,Avatar
1,285,Pirates of the Caribbean: At World's End
2,206647,Spectre
3,49026,The Dark Knight Rises
4,49529,John Carter


In [4]:
# (rows, columns) of the movie table.
movies_df.shape

(4803, 2)

In [5]:
# Load the user ratings from ratings_small.csv, keeping only the columns
# needed to relate a user, a movie and a score.
ratings_df = pd.read_csv(
    'ratings_small.csv',
    usecols=['userId', 'movieId', 'rating'],
)

In [6]:
# Preview the first ratings (one row per userId / movieId pair shown).
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [7]:
# Give the movie key the same name as in ratings_df so both tables can be
# merged on 'movieId'. Rebinding the name (instead of inplace=True) avoids
# mutating the frame object that an earlier cell already displayed.
movies_df = movies_df.rename(columns={'id': 'movieId'})

In [8]:
# (rows, columns) of the ratings table, i.e. number of ratings before merging.
ratings_df.shape

(100004, 3)

In [9]:
# Join each rating to its movie title on the shared 'movieId' column
# (pd.merge defaults to an inner join, so ratings without a matching movie
# row are dropped).
# NOTE(review): movies_df.movieId is the renamed 'id' column of
# tmdb_5000_movies.csv, while ratings_df.movieId comes from ratings_small.csv.
# If these files use different ID schemes (e.g. TMDB ids vs. MovieLens ids),
# this join pairs ratings with unrelated titles and an id-mapping table must
# be applied first. TODO confirm against the data sources.
df = pd.merge(ratings_df, movies_df, on='movieId')

In [10]:
# Preview the merged ratings, now carrying a 'title' column.
df.head()

Unnamed: 0,userId,movieId,rating,title
0,1,2105,4.0,American Pie
1,4,2105,4.0,American Pie
2,15,2105,4.0,American Pie
3,30,2105,2.0,American Pie
4,34,2105,4.0,American Pie


In [11]:
# Inspect the merged ratings that belong to userId 1.
is_user_one = df['userId'] == 1
df.loc[is_user_one]

Unnamed: 0,userId,movieId,rating,title
0,1,2105,4.0,American Pie
47,1,2294,2.0,Jay and Silent Bob Strike Back


In [12]:
# (rows, columns) after the merge; compare with ratings_df.shape to see how
# many ratings found a matching movie row.
df.shape

(18571, 4)

In [13]:
# Discard rating rows that have no title, then count the non-null ratings
# each title received.
df2 = df.dropna(subset=['title'], axis=0)
ratings_per_title = df2.groupby(['title'])['rating'].count()
# Name the count series before turning the 'title' index back into a column,
# giving a two-column frame: title, totalRatingCount.
movie_ratingCount = ratings_per_title.rename('totalRatingCount').reset_index()

In [14]:
# Preview the per-title rating counts.
movie_ratingCount.head()

Unnamed: 0,title,totalRatingCount
0,10 Things I Hate About You,7
1,12 Angry Men,1
2,1408,1
3,15 Minutes,5
4,16 Blocks,1


In [15]:
# Number of distinct titles that received at least one rating (rows), by 2 columns.
movie_ratingCount.shape

(856, 2)

In [16]:
# Attach each title's total rating count to every individual rating row.
# A left join keeps all rows of df2.
rating_with_totalRatingCount = pd.merge(
    df2,
    movie_ratingCount,
    how='left',
    on='title',
)

In [17]:
# Preview the ratings with the added 'totalRatingCount' column.
rating_with_totalRatingCount.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,2105,4.0,American Pie,47
1,4,2105,4.0,American Pie,47
2,15,2105,4.0,American Pie,47
3,30,2105,2.0,American Pie,47
4,34,2105,4.0,American Pie,47


In [18]:
# (rows, columns) after attaching the counts; the left join should leave the
# row count equal to df2's.
rating_with_totalRatingCount.shape

(18571, 5)

In [19]:
# Render floats with three decimals in all subsequent pandas output.
pd.options.display.float_format = lambda value: '%.3f' % value

# Summary statistics of the per-title rating counts.
rating_count_summary = movie_ratingCount['totalRatingCount'].describe()
print(rating_count_summary)

count   856.000
mean     21.695
std      36.900
min       1.000
25%       2.000
50%       7.000
75%      24.000
max     324.000
Name: totalRatingCount, dtype: float64


In [26]:
# Minimum number of ratings a title must have to be kept. Named so the cut-off
# can be tuned in one place instead of being a bare literal inside the filter.
popularity_threshold = 50

# Keep only the rating rows whose title meets the popularity threshold.
has_enough_ratings = rating_with_totalRatingCount['totalRatingCount'] >= popularity_threshold
rating_popular_movie = rating_with_totalRatingCount[has_enough_ratings]

In [27]:
# Preview the ratings that survived the popularity filter.
rating_popular_movie.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
47,1,2294,2.0,Jay and Silent Bob Strike Back,53
48,5,2294,4.0,Jay and Silent Bob Strike Back,53
49,9,2294,2.0,Jay and Silent Bob Strike Back,53
50,15,2294,2.0,Jay and Silent Bob Strike Back,53
51,48,2294,3.0,Jay and Silent Bob Strike Back,53


In [28]:
# (rows, columns) of the filtered ratings.
rating_popular_movie.shape

(10469, 5)

In [37]:
# Reshape the long rating table into a title-by-user matrix: one row per
# title, one column per userId, cell = rating (pivot_table averages duplicates).
ratings_by_title_and_user = rating_popular_movie.pivot_table(
    values='rating',
    index='title',
    columns='userId',
)

# A user who did not rate a title leaves a NaN cell; encode those as 0.
movie_features_df = ratings_by_title_and_user.fillna(0)

movie_features_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"20,000 Leagues Under the Sea",0.0,0.0,0.0,3.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
A Nightmare on Elm Street,0.0,3.0,2.5,0.0,4.0,0.0,3.0,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0
A Time to Kill,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Aliens vs Predator: Requiem,0.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0


In [38]:
# Dimensions of the pivoted matrix: (number of titles, number of userId columns).
movie_features_df.shape

(103, 644)

In [40]:
from scipy.sparse import csr_matrix

# Convert the title-by-user matrix to compressed sparse row form: take the
# raw ndarray behind the DataFrame, then wrap it.
dense_ratings = movie_features_df.values
movie_features_df_matrix = csr_matrix(dense_ratings)

In [41]:
from sklearn.neighbors import NearestNeighbors

# Nearest-neighbour index over the title rows, using cosine distance
# with a brute-force search.
model_knn = NearestNeighbors(algorithm='brute', metric='cosine')
# fit() is the last expression so the fitted estimator is shown as cell output.
model_knn.fit(movie_features_df_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [42]:
# Re-check the matrix dimensions; the row count is the range from which the
# query position is drawn in the next cell.
movie_features_df.shape

(103, 644)

In [92]:
# Fixed seed so the "random" query movie is identical on every
# Restart & Run All. Outputs stored from an earlier unseeded run will differ
# until the notebook is re-executed.
QUERY_SEED = 42
# Number of recommendations wanted. One extra neighbour is requested below
# because the query row is itself part of the fitted matrix.
N_RECOMMENDATIONS = 5

n_movies = movie_features_df.shape[0]
rng = np.random.default_rng(QUERY_SEED)
# Position (row number) of the query movie in movie_features_df.
query_index = int(rng.integers(n_movies))
print(query_index)

# kneighbors expects a 2-D array, so reshape the single row to (1, n_users).
query_vector = movie_features_df.iloc[query_index, :].values.reshape(1, -1)
distances, indices = model_knn.kneighbors(
    query_vector,
    # Never ask for more neighbours than there are fitted rows.
    n_neighbors=min(N_RECOMMENDATIONS + 1, n_movies),
)

24


In [93]:
# Rating vector (one value per userId column) of the movie at the chosen
# query position; the Series name shown in the output is its title.
movie_features_df.iloc[query_index]

userId
1     0.000
2     0.000
3     0.000
4     0.000
5     0.000
       ... 
667   0.000
668   0.000
669   0.000
670   0.000
671   0.000
Name: Die Hard 2, Length: 644, dtype: float64

In [94]:
# Print the query title followed by its nearest neighbours, numbered from 1.
neighbor_distances = distances.flatten()
neighbor_indices = indices.flatten()

print('Recommendations for {0}:\n'.format(movie_features_df.index[query_index]))

# Drop the query movie by identity rather than by assuming it is always the
# first neighbour: with tied distances it can appear at a later position, in
# which case skipping position 0 would list the query as its own recommendation.
recommendations = [
    (neighbor_index, neighbor_distance)
    for neighbor_index, neighbor_distance in zip(neighbor_indices, neighbor_distances)
    if neighbor_index != query_index
]
# Keep at most (neighbours requested - 1) entries, matching the count printed
# when the query does occupy one of the returned slots.
recommendations = recommendations[:max(len(neighbor_indices) - 1, 0)]

for rank, (neighbor_index, neighbor_distance) in enumerate(recommendations, start=1):
    print('{0}: {1}, with distance of {2}:'.format(rank, movie_features_df.index[neighbor_index], neighbor_distance))

Recommendations for Die Hard 2:

1: Y Tu Mamá También, with distance of 0.5210453206598713:
2: Rambo III, with distance of 0.5514447861645805:
3: A Nightmare on Elm Street, with distance of 0.5729068505432685:
4: Persepolis, with distance of 0.5799313774826955:
5: Fahrenheit 9/11, with distance of 0.5933993676153946:
