In [1]:
import numpy as np
import pandas as pd
import math
import sklearn
import scipy

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import csr_matrix


##  Check data

In [2]:
df = pd.read_csv('ratings.csv')

In [3]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [4]:
df = df.sample(2000000)

In [5]:
df.userId.nunique()

204425

## Let's choose users with 10 and more interactions



In [6]:
groupby_user = df.groupby('userId').count()[['movieId']] 
groupby_user.head()

Unnamed: 0_level_0,movieId
userId,Unnamed: 1_level_1
4,13
6,3
7,2
8,4
9,1


In [7]:
index_users = groupby_user[(groupby_user.movieId > 5) & (groupby_user.movieId < 50)].index

In [8]:
df = df[df.userId.isin(index_users)]

## Check rating "0"

In [9]:
df.rating.value_counts()

4.0    329056
3.0    231698
5.0    186015
3.5    119659
4.5     97879
2.0     72840
2.5     41439
1.0     35794
1.5     13131
0.5     12773
Name: rating, dtype: int64

In [10]:
df.shape

(1140284, 4)

## Create base_model (cosine similarity)

In [11]:
interactions_train_df, interactions_test_df = train_test_split(df,
                                   stratify=df['userId'], 
                                   test_size=0.45,
                                   random_state=42)

print('# interactions on Train set: %d' % len(interactions_train_df))
print('# interactions on Test set: %d' % len(interactions_test_df))

# interactions on Train set: 627156
# interactions on Test set: 513128


In [12]:
users_items_pivot_matrix_df = interactions_train_df.pivot(index='userId', 
                                                          columns='movieId', 
                                                          values='rating').fillna(0)

users_items_pivot_matrix_df.head(10)

users_items_pivot_matrix_df

movieId,1,2,3,4,5,6,7,8,9,10,...,149683,149729,149804,150367,150548,150552,150724,150856,151485,151593
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
247736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
247738,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
247742,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
users_items_pivot_sparse_matrix = csr_matrix(users_items_pivot_matrix_df)

In [14]:
users_items_pivot_sparse_matrix

<73489x12185 sparse matrix of type '<class 'numpy.float64'>'
	with 627156 stored elements in Compressed Sparse Row format>

In [15]:
NUMBER_OF_FACTORS_MF = 15
#Performs matrix factorization of the original user item matrix
U, sigma, Vt = svds(users_items_pivot_sparse_matrix, k = NUMBER_OF_FACTORS_MF)

In [16]:
sigma = np.diag(sigma)

In [17]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) 
all_user_predicted_ratings.shape

(73489, 12185)

In [18]:
# all_user_predicted_ratings_norm = (all_user_predicted_ratings - all_user_predicted_ratings.min()) / (all_user_predicted_ratings.max() - all_user_predicted_ratings.min())

In [20]:
users_ids = users_items_pivot_matrix_df.index

#Converting the reconstructed matrix back to a Pandas dataframe
cf_preds_df = pd.DataFrame(all_user_predicted_ratings, columns = users_items_pivot_matrix_df.columns, index=users_ids).transpose()
cf_preds_df.head(10)

userId,4,11,13,14,15,20,21,23,28,30,...,247725,247729,247730,247732,247734,247735,247736,247738,247742,247751
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.053942,0.030434,0.140199,0.122929,0.247625,0.02146,0.003043,0.20694,-0.004808,-0.272261,...,0.181591,0.082382,0.061841,0.360262,-0.088399,0.602886,-0.052973,0.182746,0.001505,0.46693
2,0.014349,0.008754,0.030543,0.037159,0.05557,0.00642,0.000514,0.040216,0.041825,-0.002837,...,0.042877,0.029857,0.022078,0.045477,0.02181,0.090572,-0.000883,0.041855,0.000562,0.068838
3,0.004309,0.002951,0.008948,0.011993,0.017061,0.002144,0.000106,0.010843,0.006404,0.006122,...,0.013266,0.01028,0.007939,0.009082,0.000575,0.024316,0.029343,0.012855,0.000233,0.013239
4,0.000384,0.000398,0.001526,0.003179,0.00267,0.000304,-1.3e-05,0.001524,0.00562,-4.8e-05,...,0.00305,0.002162,0.00146,-0.00034,-0.001463,0.008666,0.00102,0.00237,5e-05,0.000222
5,0.003429,0.002113,0.007591,0.01128,0.013983,0.001614,9.9e-05,0.009508,0.014956,-0.003315,...,0.011767,0.008627,0.00611,0.007929,-0.00039,0.028462,0.000186,0.010951,0.00017,0.013496
6,0.015298,0.009731,0.030339,0.03642,0.057344,0.006923,0.000524,0.039245,0.044872,0.003082,...,0.041328,0.031239,0.024065,0.048271,0.045154,0.068056,0.039302,0.040942,0.000552,0.072323
7,0.004331,0.003029,0.010162,0.013204,0.01865,0.002355,9.1e-05,0.013261,0.006025,0.009435,...,0.015129,0.011134,0.008541,0.008036,0.006584,0.032667,0.013599,0.015632,0.000276,0.011571
8,0.000393,0.000314,0.00118,0.002283,0.002002,0.000218,5e-06,0.001421,-0.000346,-0.000699,...,0.002041,0.001603,0.001022,0.000317,-0.001,0.003319,0.00181,0.001873,3.9e-05,0.00051
9,0.000561,0.000499,0.002035,0.003603,0.003361,0.000349,-8e-06,0.001938,0.000741,-9.3e-05,...,0.003467,0.002497,0.001632,0.000257,-0.001844,0.006774,0.003168,0.002914,6.9e-05,-0.000194
10,0.016089,0.01109,0.037738,0.049068,0.06759,0.008433,0.000434,0.04858,0.037994,0.033463,...,0.056201,0.039856,0.030231,0.037388,0.035828,0.121325,0.01347,0.054723,0.000854,0.060468


In [None]:
pred = cf_preds_df.unstack()
data_pred = pd.DataFrame(pred)

In [49]:
data_pred

Unnamed: 0_level_0,Unnamed: 1_level_0,0
userId,movieId,Unnamed: 2_level_1
4,1,0.053942
4,2,0.014349
4,3,0.004309
4,4,0.000384
4,5,0.003429
...,...,...
247751,150552,0.000207
247751,150724,0.000031
247751,150856,0.000277
247751,151485,0.000260


## Сheck films, which user rating.

In [29]:
def check(user_id=4):
    row_user = users_items_pivot_matrix_df.loc[user_id]
    print(f'User: {user_id}')
    print(row_user[row_user > 0])
    return row_user[row_user > 0].values

In [30]:
check()

User: 4
movieId
1258    4.0
2384    3.0
3083    5.0
3176    4.0
3261    5.0
3476    3.0
4239    3.0
Name: 4, dtype: float64


array([4., 3., 5., 4., 5., 3., 3.])

In [66]:
def recommend(user_id=4, k=11):
    pred = cf_preds_df.T
    pred = data_pred.loc[4]
    pred = pred.sort_values(by=[0], ascending=False)[:k]
    for ids, rait in zip(pred.index, pred.values):
        print(ids, rait)

In [67]:
recommend()

50 [0.11077591]
2858 [0.09914586]
4993 [0.09256557]
2959 [0.0906238]
1210 [0.08049848]
858 [0.0768245]
7153 [0.06649697]
2762 [0.06460161]
5952 [0.05745212]
1196 [0.05655157]
1270 [0.05544688]
