In [22]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
%matplotlib inline

In [23]:
# Load the movie catalogue and the user ratings; both CSV files are read from
# the current working directory (catalogue first, then ratings).
movies, ratings = (
    pd.read_csv(csv_name) for csv_name in ('movies.csv', 'ratings.csv')
)
# Show the catalogue as plain text.
print(movies)

       movieId                                              title  \
0            1                                   Toy Story (1995)   
1            2                                     Jumanji (1995)   
2            3                            Grumpier Old Men (1995)   
3            4                           Waiting to Exhale (1995)   
4            5                 Father of the Bride Part II (1995)   
5            6                                        Heat (1995)   
6            7                                     Sabrina (1995)   
7            8                                Tom and Huck (1995)   
8            9                                Sudden Death (1995)   
9           10                                   GoldenEye (1995)   
10          11                     American President, The (1995)   
11          12                 Dracula: Dead and Loving It (1995)   
12          13                                       Balto (1995)   
13          14                    

## Combine the two dataset files

In [24]:
# Join every rating row with its movie record on the shared 'movieId' key, then
# discard 'timestamp', which this method does not use.
combine_movie_rating = (
    ratings
    .merge(movies, on='movieId')
    .drop(columns=['timestamp'])
)
print(len(combine_movie_rating))
# Number of ratings per title, limited to the 20 most-rated titles.
combine_movie_rating.title.value_counts().head(20)

20000263


Pulp Fiction (1994)                                      67310
Forrest Gump (1994)                                      66172
Shawshank Redemption, The (1994)                         63366
Silence of the Lambs, The (1991)                         63299
Jurassic Park (1993)                                     59715
Star Wars: Episode IV - A New Hope (1977)                54502
Braveheart (1995)                                        53769
Terminator 2: Judgment Day (1991)                        52244
Matrix, The (1999)                                       51334
Schindler's List (1993)                                  50054
Toy Story (1995)                                         49695
Fugitive, The (1993)                                     49581
Apollo 13 (1995)                                         47777
Independence Day (a.k.a. ID4) (1996)                     47048
Usual Suspects, The (1995)                               47006
Star Wars: Episode VI - Return of the Jedi (1983)      

## Find the total rating count of each movie

In [25]:
# One row per movie: how many ratings it received, stored as 'totalRatingCount'.
movie_rating_count = (
    combine_movie_rating
    .groupby('movieId')['rating']
    .count()
    .rename('totalRatingCount')
    .reset_index()
)
movie_rating_count.head()

Unnamed: 0,movieId,totalRatingCount
0,1,49695
1,2,22243
2,3,12735
3,4,2756
4,5,12161


## Summarize the rating counts

In [26]:
def _three_decimals(value):
    """Render a float with exactly three digits after the decimal point."""
    return '%.3f' % value


# Global pandas display setting: affects how floats are shown from here on.
pd.set_option('display.float_format', _three_decimals)

# Distribution summary of the per-movie rating counts.
rating_count_summary = movie_rating_count['totalRatingCount'].describe()
print(rating_count_summary)

count   26744.000
mean      747.841
std      3085.818
min         1.000
25%         3.000
50%        18.000
75%       205.000
max     67310.000
Name: totalRatingCount, dtype: float64


## Filter the data again: use the top 10% of movies by rating count to define 'popular' movies and recommend from them

In [27]:
# Percentile levels from 0.90 upward in steps of 0.01, used to inspect the
# upper tail of the per-movie rating counts.
upper_tail_levels = np.arange(.9, 1, .01)
print(movie_rating_count['totalRatingCount'].quantile(upper_tail_levels))

0.900    1305.700
0.910    1543.000
0.920    1848.000
0.930    2285.980
0.940    2847.000
0.950    3612.950
0.960    4700.560
0.970    6219.970
0.980    8835.780
0.990   14388.690
Name: totalRatingCount, dtype: float64


In [28]:
# Attach each movie's total rating count to every one of its rating rows.
rating_with_totalRatingCount = pd.merge(
    combine_movie_rating,
    movie_rating_count,
    on='movieId',
)
# Sanity check: the join should neither drop nor duplicate rating rows, so both
# lengths are expected to match.
print(len(combine_movie_rating), len(rating_with_totalRatingCount), sep='\n')
rating_with_totalRatingCount.head()

20000263
20000263


Unnamed: 0,userId,movieId,rating,title,genres,totalRatingCount
0,1,2,3.5,Jumanji (1995),Adventure|Children|Fantasy,22243
1,5,2,3.0,Jumanji (1995),Adventure|Children|Fantasy,22243
2,13,2,3.0,Jumanji (1995),Adventure|Children|Fantasy,22243
3,29,2,3.0,Jumanji (1995),Adventure|Children|Fantasy,22243
4,34,2,3.0,Jumanji (1995),Adventure|Children|Fantasy,22243


## Select popular movies

In [29]:
# Minimum number of ratings a movie needs to count as "popular"; this is close
# to the 0.90 quantile (1305.7) printed earlier in the notebook.
popular_threshold = 1305
# Keep only the rating rows whose movie reaches the threshold.
rating_popular_movies = rating_with_totalRatingCount.loc[
    lambda frame: frame['totalRatingCount'] >= popular_threshold
]
rating_popular_movies.head()

Unnamed: 0,userId,movieId,rating,title,genres,totalRatingCount
0,1,2,3.5,Jumanji (1995),Adventure|Children|Fantasy,22243
1,5,2,3.0,Jumanji (1995),Adventure|Children|Fantasy,22243
2,13,2,3.0,Jumanji (1995),Adventure|Children|Fantasy,22243
3,29,2,3.0,Jumanji (1995),Adventure|Children|Fantasy,22243
4,34,2,3.0,Jumanji (1995),Adventure|Children|Fantasy,22243


## Construct a sparse movie-by-user rating matrix for the similarity calculation

In [30]:
# Movie-by-user rating table: one row per movieId, one column per userId.
# Missing (movie, user) pairs are filled with 0.
# NOTE(review): this intermediate table is dense before it is converted to CSR;
# presumably large for the full ratings file, so confirm memory headroom.
ratings_pivot = (
    rating_popular_movies
    .pivot(index='movieId', columns='userId', values='rating')
    .fillna(0)
)
# Compressed sparse row copy of the same values, used to fit the neighbour model.
ratings_pivot_sparse = csr_matrix(ratings_pivot.to_numpy())

## Calculate the cosine similarity between rating vectors

In [31]:
# Brute-force nearest-neighbour search over the movie rating vectors using the
# cosine metric. Despite the "_binary" suffix, the matrix it is fitted on holds
# the rating values themselves (with 0 for missing entries).
cosine_knn_settings = dict(algorithm='brute', metric='cosine')
model_nn_binary = NearestNeighbors(**cosine_knn_settings)
# Left as a bare expression so the fitted estimator is shown as the cell output.
model_nn_binary.fit(ratings_pivot_sparse)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [32]:
# Pick one of the popular movies at random and list its 10 most similar movies.
# NOTE: the draw is unseeded, so every run of this cell selects a different movie.
query_index = np.random.choice(ratings_pivot.shape[0])
# The query movie's id does not change inside the loop, so look it up once.
movieId = ratings_pivot.index[query_index]

# Ask for 11 neighbours because the query movie is itself part of the fitted
# data and is normally returned as its own nearest neighbour.
distances, indices = model_nn_binary.kneighbors(
    ratings_pivot.iloc[query_index, :].values.reshape(1, -1), n_neighbors=11)

print('Current movie:', movies[movies.movieId==movieId]['title'].values[0])

# Remove the query movie by its row position rather than assuming it occupies
# slot 0: when another movie has an identical rating vector the tie can put the
# query movie in a later slot, which would otherwise hide a real neighbour and
# list the query movie as a recommendation of itself.
recommendations = [
    (neighbour_index, distance)
    for neighbour_index, distance in zip(indices.flatten(), distances.flatten())
    if neighbour_index != query_index
][:10]

for i, (neighbour_index, distance) in enumerate(recommendations, start=1):
    likelymovieId = ratings_pivot.index[neighbour_index]
    print('Recommand movie{0}: {1}, distance is:{2}'.format(
        i, movies[movies.movieId==likelymovieId]['title'].values[0], distance))

Current movie: Dr. T and the Women (2000)
Recommand movie1: Nurse Betty (2000), distance is:0.7345368387187592
Recommand movie2: Bounce (2000), distance is:0.7551314017334508
Recommand movie3: Contender, The (2000), distance is:0.7609314227552183
Recommand movie4: State and Main (2000), distance is:0.7852909794846505
Recommand movie5: Wonder Boys (2000), distance is:0.7940448803908337
Recommand movie6: You Can Count on Me (2000), distance is:0.795414443395799
Recommand movie7: Legend of Bagger Vance, The (2000), distance is:0.8050870048548868
Recommand movie8: Keeping the Faith (2000), distance is:0.8076106764831801
Recommand movie9: What Women Want (2000), distance is:0.8078915941182924
Recommand movie10: Small Time Crooks (2000), distance is:0.8094581133132515


In [33]:
# List the 10 most similar movies for one specific movie, chosen by its id.
movieId = 29

# Only movies that passed the popularity filter have a row in ratings_pivot;
# fail with a clear message instead of an opaque error from kneighbors.
if movieId not in ratings_pivot.index:
    raise KeyError(
        'movieId {0} is not among the popular movies in ratings_pivot'.format(movieId))

# Row position of the query movie, used below to exclude it from its own results.
query_index = ratings_pivot.index.get_loc(movieId)

# Select the row through the movieId variable (not a second hard-coded literal),
# so changing the id above changes the query as well. The double brackets keep
# the result two-dimensional: one row, one column per user.
distances, indices = model_nn_binary.kneighbors(
    ratings_pivot.loc[[movieId]].values, n_neighbors=11)

print('Current movie:', movies[movies.movieId==movieId]['title'].values[0])

# Remove the query movie by its row position rather than assuming it occupies
# slot 0: a movie with an identical rating vector could otherwise push the
# query movie into the recommendation list.
recommendations = [
    (neighbour_index, distance)
    for neighbour_index, distance in zip(indices.flatten(), distances.flatten())
    if neighbour_index != query_index
][:10]

for i, (neighbour_index, distance) in enumerate(recommendations, start=1):
    likelymovieId = ratings_pivot.index[neighbour_index]
    print('Recommand movie{0}: {1}, distance is:{2}'.format(
        i, movies[movies.movieId==likelymovieId]['title'].values[0], distance))

Current movie: City of Lost Children, The (Cité des enfants perdus, La) (1995)
Recommand movie1: Delicatessen (1991), distance is:0.5609612915294158
Recommand movie2: Brazil (1985), distance is:0.6153414366133902
Recommand movie3: Blade Runner (1982), distance is:0.661584742255307
Recommand movie4: Dark City (1998), distance is:0.6697961262326608
Recommand movie5: Twelve Monkeys (a.k.a. 12 Monkeys) (1995), distance is:0.6713548084356827
Recommand movie6: Clockwork Orange, A (1971), distance is:0.6762167519110898
Recommand movie7: Trainspotting (1996), distance is:0.6864511942296707
Recommand movie8: Pi (1998), distance is:0.6898976955380305
Recommand movie9: Ghost in the Shell (Kôkaku kidôtai) (1995), distance is:0.7006300194646904
Recommand movie10: 2001: A Space Odyssey (1968), distance is:0.703187155348789
