In [2]:
%matplotlib inline
%load_ext nb_black

<IPython.core.display.Javascript object>

In [1]:
import os
import pandas as pd

import numpy as np

In [2]:
# Location of the input files, relative to the notebook's working directory.
data_path = os.path.join('.', 'Data')

# All three files are read with the same tab separator and latin-1 encoding.
read_options = {'sep': '\t', 'encoding': 'latin-1'}

movies = pd.read_csv(os.path.join(data_path, 'movies.csv'), **read_options)
ratings = pd.read_csv(os.path.join(data_path, 'ratings.csv'), **read_options)
users = pd.read_csv(os.path.join(data_path, 'users.csv'), **read_options)

In [8]:
# Print the number of distinct users, then the number of distinct movies,
# that appear in the ratings table.
for id_column in ('user_id', 'movie_id'):
    print(len(ratings[id_column].unique()))

6040
3706


In [11]:
# Wide user-by-movie matrix: one row per user_id, one column per movie_id,
# cells hold the rating. User/movie pairs with no rating are filled with 0.
Ratings = (
    ratings
    .pivot(index='user_id', columns='movie_id', values='rating')
    .fillna(0)
)

In [20]:
# Per-user mean over every column of the matrix (the zero-filled unrated
# cells are included in the mean), shaped as a column vector so that it
# broadcasts across each user's row.
user_mean_rating = Ratings.mean(axis=1).values.reshape(-1, 1)

# Centre each user's row on that user's own mean.
user_ratings_denormalized = Ratings.values - user_mean_rating

In [60]:
# Dimensions of the user-by-movie rating matrix as (rows, columns).
Ratings.shape

(6040, 3706)

In [34]:
# Total number of cells in the rating matrix (rows * columns); used as the
# denominator of the sparsity figure computed further down.
Ratings.size

22384240

In [35]:
# Count the cells equal to 0, i.e. the user/movie pairs that were filled in
# as "no rating" when the matrix was built.
Ratings_zeroes = np.count_nonzero(Ratings.values == 0)

In [39]:
# Sparsity = share of zero cells in the rating matrix, rounded to 3 decimals
# and reported as a percentage.
zero_fraction = Ratings_zeroes / Ratings.size
sparsity = np.round(zero_fraction, 3)
print('Sparsity : ', sparsity * 100, '%')

Sparsity :  95.5 %


In [44]:
from scipy.sparse.linalg import svds

In [47]:
# Truncated SVD of the mean-centred rating matrix, keeping k = 50 singular
# values/vectors (50 latent factors).
# The matrix is assigned as `user_ratings_denormalized`; the previously used
# name `users_ratings_denormalized` is never defined in this notebook and
# raises NameError when the notebook is run top to bottom on a fresh kernel.
U, sigma, V_t = svds(user_ratings_denormalized, k=50)

In [50]:
# Expand the 1-D vector of singular values into a diagonal matrix so it can be
# used in the matrix product that rebuilds the ratings.
# np.diag is its own inverse (given a 2-D matrix it returns the 1-D diagonal),
# so an unguarded `sigma = np.diag(sigma)` flips `sigma` back to a vector when
# the cell is run a second time. The guard makes the cell idempotent.
if np.ndim(sigma) == 1:
    sigma = np.diag(sigma)

In [56]:
# Rebuild the low-rank approximation U . Sigma . V^T of the centred matrix.
# NOTE(review): the per-user mean subtracted before the factorization is not
# added back here, so these values are relative to each user's mean rather
# than on the original rating scale - confirm this is intended.
scaled_item_factors = np.dot(sigma, V_t)
user_predicted_ratings = np.dot(U, scaled_item_factors)
user_predicted_ratings.shape

(6040, 3706)

In [58]:
# Column labels of the rating matrix: the movie_id values, which are reused
# below as the column labels of the predictions frame.
Ratings.columns

Int64Index([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
            ...
            3943, 3944, 3945, 3946, 3947, 3948, 3949, 3950, 3951, 3952],
           dtype='int64', name='movie_id', length=3706)

In [59]:
# Wrap the reconstructed matrix in a DataFrame whose columns carry the
# movie_id labels of the rating matrix. Rows keep the default 0-based
# positional index (they are NOT labelled by user_id).
predictions = pd.DataFrame(
    data=user_predicted_ratings,
    columns=Ratings.columns,
)
predictions

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
0,4.228958,0.083152,-0.254982,-0.078746,-0.047671,-0.236506,-0.134023,0.081455,-0.119456,-0.255853,...,-0.032096,-0.058263,-0.033508,-0.081927,-0.145318,0.343626,0.045676,-0.027991,-0.009453,0.029007
1,0.615466,0.040409,0.206168,-0.128492,-0.106775,1.223801,-0.077824,-0.057992,0.032352,1.437996,...,-0.185752,-0.142982,-0.139830,-0.066674,-0.145498,0.026540,-0.547986,-0.230352,-0.183348,-0.269438
2,1.765127,0.402440,0.037281,-0.096734,-0.079391,-0.212314,-0.185475,0.045281,-0.023145,0.681774,...,-0.013216,-0.058997,-0.040864,-0.024348,-0.032830,0.067835,0.022508,-0.041352,-0.038549,-0.163653
3,0.384312,-0.096705,0.015897,0.065618,0.018204,0.214008,-0.073172,-0.014279,0.021723,-0.135115,...,-0.015175,-0.029171,-0.032245,-0.027162,-0.107727,0.070767,0.033812,-0.049795,-0.008904,-0.057969
4,1.406167,-0.146867,-0.219406,0.078778,-0.200512,1.384175,-0.367735,-0.183026,-0.228604,0.282406,...,-0.057955,-0.122096,-0.161172,-0.184046,-0.218186,-0.220645,0.339083,-0.134276,-0.042400,0.031138
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,1.600969,-0.557455,-0.377743,-0.347693,-0.875060,1.400874,0.377517,-0.646182,-0.837970,-0.230524,...,-0.602927,-0.795858,-0.833691,-0.881520,-0.515107,-0.657613,-0.059045,-0.520185,-0.546437,-0.056649
6036,1.868116,-0.063350,-0.215310,-0.379634,0.058599,0.871590,-0.118646,-0.188830,-0.232823,-0.287600,...,-0.364193,-0.201460,-0.231868,-0.249732,-0.103609,-0.395297,-0.293910,-0.151846,-0.316071,-0.169362
6037,0.598581,-0.182276,0.086231,-0.013460,-0.095208,-0.100460,0.079713,-0.054520,-0.012836,-0.019227,...,-0.074053,-0.014673,-0.012956,-0.044589,-0.031246,-0.029370,-0.120281,-0.033877,-0.050862,-0.135443
6038,1.374895,-0.164918,-0.289978,-0.212112,-0.210327,-0.272227,-0.022042,-0.183114,-0.137536,0.077091,...,-0.134814,-0.119777,-0.121115,-0.166510,-0.077967,-0.104658,-0.301176,-0.139614,-0.167358,-0.297070


In [67]:
# Preview the first rows of the movies table.
movies.head()

Unnamed: 0,movie_id,title,genres,movie_embed_id
0,1,Toy Story (1995),Animation|Children's|Comedy,0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,1
2,3,Grumpier Old Men (1995),Comedy|Romance,2
3,4,Waiting to Exhale (1995),Comedy|Drama,3
4,5,Father of the Bride Part II (1995),Comedy,4


In [75]:
# Worked example for a single row of the predictions frame.
user_row_num = 3

# Predicted scores for that row, highest first.
sorted_users_predictions = predictions.loc[user_row_num, :].sort_values(ascending=False)

# Ratings this user actually gave, joined with the movie details and ordered
# from best- to worst-rated.
user_info = ratings.loc[ratings['user_embed_id'] == user_row_num]
user_complete = (
    pd.merge(user_info, movies, how='left', on='movie_id')
    .sort_values(['rating'], ascending=False)
)

# Movies the user has not rated, each with its predicted score attached
# (the score column is named after the row number).
unseen_movies = movies.loc[~movies['movie_id'].isin(user_complete['movie_id'])]
pd.merge(
    unseen_movies,
    pd.DataFrame(sorted_users_predictions.reset_index()),
    how='left',
    on='movie_id',
)

Unnamed: 0,movie_id,title,genres,movie_embed_id,3
0,1,Toy Story (1995),Animation|Children's|Comedy,0,0.384312
1,2,Jumanji (1995),Adventure|Children's|Fantasy,1,-0.096705
2,3,Grumpier Old Men (1995),Comedy|Romance,2,0.015897
3,4,Waiting to Exhale (1995),Comedy|Drama,3,0.065618
4,5,Father of the Bride Part II (1995),Comedy,4,0.018204
...,...,...,...,...,...
3857,3948,Meet the Parents (2000),Comedy,3947,0.070767
3858,3949,Requiem for a Dream (2000),Drama,3948,0.033812
3859,3950,Tigerland (2000),Drama,3949,-0.049795
3860,3951,Two Family House (2000),Drama,3950,-0.008904


In [82]:
def recommended_movies(predictions, user_id, ratings_actual, recommendations_num, movies_df=None):
    """Return a user's rated movies and the top-N unrated movies by predicted score.

    Args:
        predictions: DataFrame of predicted scores. Columns are movie ids (the
            column index must be named 'movie_id'); rows are looked up with the
            label ``user_id - 1``.
        user_id: 1-based user id. Assumes user ids are contiguous so that
            ``user_id - 1`` identifies both the row in ``predictions`` and the
            value of ``user_embed_id`` in ``ratings_actual``.
        ratings_actual: DataFrame of observed ratings with at least the columns
            'user_embed_id', 'movie_id' and 'rating'.
        recommendations_num: Maximum number of recommendations to return.
        movies_df: DataFrame of movie details with a 'movie_id' column. When
            omitted, the notebook-level ``movies`` frame is used, which keeps
            existing four-argument calls working.

    Returns:
        A tuple ``(user_complete, recommends)``:
        ``user_complete`` is the user's own ratings joined with the movie
        details, sorted by rating (highest first); ``recommends`` holds up to
        ``recommendations_num`` rows of movie details for movies the user has
        not rated, ordered by predicted score (highest first). The score
        column itself is not included.

    Raises:
        KeyError: If ``user_id - 1`` is not a row label of ``predictions``.
    """
    if movies_df is None:
        # Backward-compatible fallback to the frame loaded at notebook level.
        movies_df = movies

    user_row_num = user_id - 1

    # No pre-sorting here: the left merge below follows the order of the
    # movie table, so the ranking is done once, after the merge.
    user_predictions = predictions.loc[user_row_num, :].rename('Pred')

    user_info = ratings_actual.loc[ratings_actual['user_embed_id'] == user_row_num]
    user_complete = (
        pd.merge(user_info, movies_df, how='left', on='movie_id')
        .sort_values(['rating'], ascending=False)
    )
    print(f'The user has rated {user_complete.shape[0]} movies out of {movies_df.shape[0]} available movies')
    print(f'The top {recommendations_num} movie recommendations for userID {user_id} are : ')

    # Candidates are the movies this user has not rated yet.
    unseen_movies = movies_df.loc[~movies_df['movie_id'].isin(user_complete['movie_id'])]

    # Movies without a predicted score get NaN from the left merge and are
    # placed last by sort_values.
    recommends = (
        pd.merge(unseen_movies, user_predictions.reset_index(), how='left', on='movie_id')
        .sort_values(['Pred'], ascending=False)
        .iloc[:recommendations_num]
        .drop(columns='Pred')
    )

    return user_complete, recommends

In [83]:
# Top 30 recommendations for user 23, together with that user's own ratings.
user_rated, recommendations = recommended_movies(
    predictions=predictions,
    user_id=23,
    ratings_actual=ratings,
    recommendations_num=30,
)

The user has rated 304 movies out of 3883 available movies
The top 30 movie recommendations for userID 23 are : 


In [85]:
# Size of the returned recommendation table as (rows, columns).
recommendations.shape

(30, 4)

In [94]:
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import KFold

In [102]:
# Columns of the ratings table handed to the cross-validation splitter.
fold_columns = ['movie_id', 'user_id', 'rating', 'movie_embed_id']

# KFold with its default settings.
# NOTE(review): `split` produces the folds lazily, so `kfolds` can presumably
# be iterated only once - confirm before reusing it in later cells.
kfold = KFold()
kfolds = kfold.split(ratings[fold_columns])