In [65]:
import numpy as np
import pandas as pd

In [66]:
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')

In [67]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [68]:
ratings[['userId', 'movieId']].apply(pd.Series.nunique)

userId      610
movieId    9724
dtype: int64

In [69]:
user_movies = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)

In [70]:
import scipy.sparse as spsp
data_matrix = spsp.csr_matrix(user_movies.values)
print(data_matrix.shape)

(610, 9724)


In [71]:
data_matrix.toarray()

array([[4. , 0. , 4. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [2.5, 2. , 2. , ..., 0. , 0. , 0. ],
       [3. , 0. , 0. , ..., 0. , 0. , 0. ],
       [5. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [72]:
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(data_matrix, test_size=0.068, random_state=42, shuffle=True)

In [73]:
len(X_test.data)

10086

In [74]:
from sklearn.linear_model import ElasticNet

In [75]:
X = X_train.tocsc(copy=True)

In [76]:
from multiprocessing import Pool
from functools import partial

In [87]:
def _partial_fit(j, X):
    model = ElasticNet(alpha=0.1, l1_ratio=0.1, positive=True, fit_intercept=False, max_iter=1000)
    X_j = X.copy()
    # get the target column
    y = X_j[:, j].toarray()
    # set the j-th column of X to zero
    X_j.data[X_j.indptr[j]:X_j.indptr[j + 1]] = 0.0
    # fit one ElasticNet model per column
    model.fit(X_j, y)
    # self.model.coef_ contains the coefficient of the ElasticNet model
    # let's keep only the non-zero values
    nnz_idx = model.coef_ > 0.0
    values = model.coef_[nnz_idx]
    rows = np.arange(X.shape[1])[nnz_idx]
    cols = np.ones(nnz_idx.sum()) * j
    return values, rows, cols

In [88]:
X

<568x9724 sparse matrix of type '<class 'numpy.float64'>'
	with 90750 stored elements in Compressed Sparse Column format>

In [89]:
import warnings
warnings.filterwarnings('ignore')

In [90]:
n_items = X.shape[1]
_pfit = partial(_partial_fit, X=X)
pool = Pool(processes=10)
res = pool.map(_pfit, np.arange(n_items))

# res contains a vector of (values, rows, cols) tuples
values, rows, cols = [], [], []
for values_, rows_, cols_ in res:
    values.extend(values_)
    rows.extend(rows_)
    cols.extend(cols_)
# generate the sparse weight matrix
W_sparse = spsp.csc_matrix((values, (rows, cols)), shape=(n_items, n_items), dtype=np.float32)

In [91]:
X_pred = X_test @ W_sparse

In [231]:
user_id = 18 # 4
user_profile = X_test[user_id]
scores = user_profile.dot(W_sparse).toarray().ravel()
ranking = scores.argsort()[::-1]
top_n_ranking = ranking[:10]

In [232]:
top_n = 10

In [233]:
print(np.sort(scores)[::-1][:top_n])

[3.91985286 3.36406538 3.27896743 2.87473185 2.83498253 2.79368444
 2.79356236 2.57041756 2.52917585 2.47926755]


In [234]:
jj = np.unique(user_profile.data, return_counts=True)[1]

In [235]:
tt = jj[-1]+jj[-2]
tt

41

In [236]:
kek = user_profile.toarray().ravel().argsort()[::-1]

In [237]:
kek

array([ 314, 1083, 1290, ..., 6480, 6479,    0])

In [238]:
movies.loc[kek[:tt]].genres.str.split('|').apply(lambda x: pd.Series(x).value_counts()).sum().sort_values()[::-1]

Action       20.0
Drama        18.0
Thriller     17.0
Adventure    11.0
Crime         8.0
Romance       8.0
Comedy        8.0
Sci-Fi        7.0
Horror        4.0
Mystery       3.0
Children      2.0
War           2.0
Western       1.0
Fantasy       1.0
IMAX          1.0
dtype: float64

In [239]:
movies.loc[top_n_ranking]

Unnamed: 0,movieId,title,genres
97,110,Braveheart (1995),Action|Drama|War
398,457,"Fugitive, The (1993)",Thriller
314,356,Forrest Gump (1994),Comedy|Drama|Romance|War
546,648,Mission: Impossible (1996),Action|Adventure|Mystery|Thriller
461,527,Schindler's List (1993),Drama|War
9,10,GoldenEye (1995),Action|Adventure|Thriller
418,480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller
615,780,Independence Day (a.k.a. ID4) (1996),Action|Adventure|Sci-Fi|Thriller
334,377,Speed (1994),Action|Romance|Thriller
337,380,True Lies (1994),Action|Adventure|Comedy|Romance|Thriller
