In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split

In [2]:
df_ratings = pd.read_csv('ml-latest-small/ratings.csv')
df_movies = pd.read_csv('ml-latest-small/movies.csv')

In [3]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


# SVD

In [5]:
from scipy.linalg import svd

In [6]:
A = np.array([[1, 2], [3, 4], [5, 6]])
print(A)

[[1 2]
 [3 4]
 [5 6]]


In [7]:
U, s, VT = svd(A)
print('Unitary matrix having left singular vectors as columns: \n', U)
print('Singular values: \n', s)
print('Unitary matrix having right singular vectors as rows: \n', VT)

Unitary matrix having left singular vectors as columns: 
 [[-0.2298477   0.88346102  0.40824829]
 [-0.52474482  0.24078249 -0.81649658]
 [-0.81964194 -0.40189603  0.40824829]]
Singular values: 
 [9.52551809 0.51430058]
Unitary matrix having right singular vectors as rows: 
 [[-0.61962948 -0.78489445]
 [-0.78489445  0.61962948]]


In [8]:
m, n = A.shape[0], A.shape[1]

sigma = np.zeros((m, n))
for i in range(min(m, n)):
    sigma[i, i] = s[i]
A_rec = np.dot(np.dot(U, sigma), VT)
print('Reconstructed matrix: \n', A_rec)

Reconstructed matrix: 
 [[1. 2.]
 [3. 4.]
 [5. 6.]]


# Split

In [9]:
import scipy.sparse as sp
from scipy import sparse
from scipy.sparse.linalg import spsolve

In [12]:
user_item_matrix = df_ratings.pivot_table(index=['userId'], columns=['movieId'], values='rating')
sparse_ui = sp.csr_matrix(user_item_matrix)
sparse_ui.todense()

matrix([[4. , nan, 4. , ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        ...,
        [2.5, 2. , 2. , ..., nan, nan, nan],
        [3. , nan, nan, ..., nan, nan, nan],
        [5. , nan, nan, ..., nan, nan, nan]])

In [14]:
x_train, X_test = train_test_split(sparse_ui, test_size=0.25, random_state=57)
ind_train, ind_test = train_test_split(user_item_matrix, test_size=0.25, random_state=57)

In [15]:
x_res = pd.DataFrame(index=ind_train.index, data=[], columns=['actual'])
for i in x_res.index:
    x_res.loc[i]['actual'] = list(set(ind_train.loc[i][ind_train.loc[i].notnull()].index))

In [16]:
x_res

Unnamed: 0_level_0,actual
userId,Unnamed: 1_level_1
575,"[2560, 2436, 2566, 2567, 2568, 2571, 2572, 257..."
323,"[1, 2, 2571, 1037, 527, 17, 19, 22, 110102, 29..."
14,"[4, 7, 266, 524, 527, 784, 19, 150, 153, 25, 2..."
496,"[4993, 111362, 2950, 904, 104841, 912, 84374, ..."
531,"[4993, 260, 10, 1291, 919, 1961, 2473, 1198, 1..."
...,...
41,"[54272, 115713, 69122, 122882, 7173, 102407, 3..."
99,"[256, 257, 259, 10, 266, 145, 276, 22, 23, 150..."
80,"[33794, 69122, 81417, 91658, 2571, 78349, 5428..."
407,"[4993, 260, 1291, 1036, 2571, 6539, 2959, 5428..."


In [18]:
ind_train_u = pd.Series(ind_train.index.tolist())
ind_train_i = pd.Series(ind_train.columns.values.tolist())

# Latent Factor Model

In [22]:
from sklearn.preprocessing import MinMaxScaler

In [25]:
seed =57
rstate = np.random.RandomState(seed)
rank_size = 10
lambda_val = 0.1
num_user = x_train.shape[0]
num_item = x_train.shape[1]
P = sparse.csr_matrix((rstate.normal(size = (num_user, rank_size))))
Q = sparse.csr_matrix((rstate.normal(size = (num_item, rank_size))))
QTQ = Q.T.dot(Q)
PTP = P.T.dot(P)
P_eye = sparse.eye(num_user)
Q_eye = sparse.eye(num_item)
lambda_eye= lambda_val * sparse.eye(rank_size)

In [26]:
u=5
pref = x_train[u, :].toarray()
pref_u = pref[~np.isnan(pref)]
u_rated_movies_ind = np.argwhere(~np.isnan(pref))[:,1]
Qu = Q[u_rated_movies_ind, :]
QuTru = Qu.T.dot(pref_u.T)
Q[u] = spsolve(QTQ + lambda_eye, QuTru)
print(Q[u].toarray())

[[ 0.0044179  -0.01638758 -0.00766912 -0.00914071  0.02136258 -0.00095192
  -0.00905147  0.01610489  0.00381576 -0.00747813]]


In [29]:
def lfm_als(training_set, lambda_val, iter=1, rank_size=20, seed=57):
    
    num_user = training_set.shape[0]
    num_item = training_set.shape[1]
    
    rstate = np.random.RandomState(seed)
    
    P = sparse.csr_matrix((rstate.normal(size = (num_user, rank_size))))
    Q = sparse.csr_matrix((rstate.normal(size = (num_item, rank_size))))
    #QTQ = Q.T.dot(Q)
    #PTP = P.T.dot(P)
    P_eye = sparse.eye(num_user)
    Q_eye = sparse.eye(num_item)
    lambda_eye= lambda_val * sparse.eye(rank_size)
    
    for iter_step in range(iter):
        QTQ = Q.T.dot(Q)
        PTP = P.T.dot(P)
        
        for u in range(num_user):
            pref = training_set[u, :].toarray()
            pref_u = pref[~np.isnan(pref)]
            u_rated_movies_ind = np.argwhere(~np.isnan(pref))[:,1]
            Qu = Q[u_rated_movies_ind, :]
            QuTru = Qu.T.dot(pref_u.T)
            Q[u] = spsolve(QTQ + lambda_eye, QuTru)
            
        for i in range(num_item):
            pref = training_set[:, i].toarray()
            pref_i = pref[~np.isnan(pref)]
            i_rated_movies_ind = np.argwhere(~np.isnan(pref))[:,0]
            Pi = P[i_rated_movies_ind, :]
            PiTri = Pi.T.dot(pref_i.T)
            P[u] = spsolve(PTP + lambda_eye, PiTri)
    return P, Q.T

In [30]:
%%time
P, Q = lfm_als(x_train, lambda_val=0.1, iter=1, rank_size=20, seed=57)

Wall time: 1min 2s


In [31]:
print(f'P.shape {P.shape}')
print(f'Q.shape {Q.shape}')

P.shape (457, 20)
Q.shape (20, 9724)


In [46]:
movieId = 5615
ind_i = ind_train_i[ind_train_i == movieId].index[0]

qi = Q[:, ind_i].toarray()[:, 0]
qi

array([ 0.08579076, -0.35538698, -2.11810291,  1.61670292,  0.58289662,
        1.75592755,  0.58424818, -0.72698759, -0.76775365, -0.71726321,
       -0.05294368, -1.78220518, -0.69233456,  1.35600943, -1.83786014,
        1.11416858,  0.65358287,  0.32460426,  2.53947959,  0.16712385])

In [47]:
scores = Q.T.dot(qi)
top_10=np.argsort(scores)[::-1][:10]

In [48]:
top_10

array([3977, 6043, 6192, 6991, 5879, 4048, 4439, 7068, 1166, 9547],
      dtype=int64)

In [49]:
df_movies[df_movies['movieId']==ind_train_i[3977]]['title'].iloc[0]

'Invincible (2001)'

In [50]:
movies = []
movies_genres = []
movies_scores = []
movies_ids = []

for ind in top_10:
    movies_ids.append(ind_train_i[ind])
    movies.append(df_movies[df_movies['movieId']==ind_train_i[ind]]['title'].iloc[0])
    movies_genres.append(df_movies[df_movies['movieId']==ind_train_i[ind]]['genres'].iloc[0])
    movies_scores.append(scores[ind])
    
similar = pd.DataFrame({'movieId' : movies_ids, 'movies' : movies, 'scores' : movies_scores, 'genres' : movies_genres})
similar

Unnamed: 0,movieId,movies,scores,genres
0,5615,Invincible (2001),29.756241,Drama
1,40723,Wolf Creek (2005),21.278858,Crime|Horror|Thriller
2,45499,X-Men: The Last Stand (2006),19.570919,Action|Sci-Fi|Thriller
3,68073,Pirate Radio (2009),19.309013,Comedy|Drama
4,33437,Unleashed (Danny the Dog) (2005),18.446562,Action|Crime|Drama|Thriller
5,5772,My Dinner with André (1981),18.351032,Drama
6,6564,Lara Croft Tomb Raider: The Cradle of Life (2003),18.269228,Action|Adventure|Comedy|Romance|Thriller
7,70015,Polytechnique (2009),18.228665,Crime|Drama
8,1547,Shiloh (1997),17.807164,Children|Drama
9,173873,Gulliver's Travels (1996),17.672585,Adventure|Children|Fantasy


In [51]:
def predict_top_k(user_id, train_set, P, Q, df_movies, ind_train_i, k=10):
    
    user_inter = train_set[user_id, :].toarray()
    user_inter = np.where(~np.isnan(user_inter), 0, user_inter)
    user_inter = np.nan_to_num(user_inter[0], nan=1)
    
    rec_vector = P[user_id, :].dot(Q).toarray()
    
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0]
    recommend_vector = user_inter * rec_vector_scaled
    
    item_idx = np.argsort(recommend_vector)[::-1][:k]
    
    movies = []
    movies_scores = []
    movies_ids = []

    for ind in item_idx:
        movies_ids.append(ind_train_i[ind])
        movies.append(df_movies[df_movies['movieId']==ind_train_i[ind]]['title'].iloc[0])
        movies_scores.append(recommend_vector[ind])
    
    similar = pd.DataFrame({'movieId' : movies_ids, 'title' : movies, 'scores' : movies_scores})
    return similar
    

In [52]:
user_id = 103
recommendations = predict_top_k(user_id, x_train, P, Q, df_movies, ind_train_i, k=10)
print(recommendations)

   movieId                                              title    scores
0     5615                                  Invincible (2001)  1.000000
1    93502                                  Ledge, The (2011)  0.994485
2    89072                                  Stake Land (2010)  0.957493
3     4334                                       Yi Yi (2000)  0.947172
4     5570                              Thesis (Tesis) (1996)  0.934872
5    26236  White Sun of the Desert, The (Beloe solntse pu...  0.932483
6    80549                                      Easy A (2010)  0.931932
7     4395  Big Deal on Madonna Street (I Soliti Ignoti) (...  0.926718
8   144222                             Bros Before Hos (2013)  0.924318
9     3790                                      Groove (2000)  0.922260
