In [1]:
import numpy as np
import pandas as pd

In [2]:
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')

In [3]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [4]:
ratings[['userId', 'movieId']].apply(pd.Series.nunique)

userId      610
movieId    9724
dtype: int64

In [5]:
user_movies = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)

In [6]:
import scipy.sparse as spsp
data_matrix = spsp.csr_matrix(user_movies.values)
print(data_matrix.shape)

(610, 9724)


In [7]:
data_matrix.toarray()

array([[4. , 0. , 4. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [2.5, 2. , 2. , ..., 0. , 0. , 0. ],
       [3. , 0. , 0. , ..., 0. , 0. , 0. ],
       [5. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(data_matrix, test_size=0.068, random_state=42, shuffle=True)

In [9]:
len(X_test.data)

10086

In [10]:
from sklearn.linear_model import ElasticNet

In [11]:
X = X_train.tocsc(copy=True)

In [12]:
from multiprocessing import Pool
from functools import partial

In [13]:
def _partial_fit(j, X):
    model = ElasticNet(alpha=0.1, l1_ratio=0.1, positive=True, fit_intercept=False, max_iter=1000)
    X_j = X.copy()
    # get the target column
    y = X_j[:, j].toarray()
    # set the j-th column of X to zero
    X_j.data[X_j.indptr[j]:X_j.indptr[j + 1]] = 0.0
    # fit one ElasticNet model per column
    model.fit(X_j, y)
    # self.model.coef_ contains the coefficient of the ElasticNet model
    # let's keep only the non-zero values
    nnz_idx = model.coef_ > 0.0
    values = model.coef_[nnz_idx]
    rows = np.arange(X.shape[1])[nnz_idx]
    cols = np.ones(nnz_idx.sum()) * j
    return values, rows, cols

In [14]:
X

<568x9724 sparse matrix of type '<class 'numpy.float64'>'
	with 90750 stored elements in Compressed Sparse Column format>

In [15]:
import warnings
warnings.filterwarnings('ignore')

In [16]:
n_items = X.shape[1]
_pfit = partial(_partial_fit, X=X)
pool = Pool(processes=10)
res = pool.map(_pfit, np.arange(n_items))

# res contains a vector of (values, rows, cols) tuples
values, rows, cols = [], [], []
for values_, rows_, cols_ in res:
    values.extend(values_)
    rows.extend(rows_)
    cols.extend(cols_)
# generate the sparse weight matrix
W_sparse = spsp.csc_matrix((values, (rows, cols)), shape=(n_items, n_items), dtype=np.float32)

In [17]:
X_pred = X_test @ W_sparse

In [87]:
user_id = 18 # 4
user_profile = X_test[user_id]
scores = user_profile.dot(W_sparse).toarray().ravel()
ranking = scores.argsort()[::-1]
top_n_ranking = ranking[:30]

In [88]:
l1 = np.argwhere(user_profile.toarray().ravel() != 0).ravel()

In [89]:
top_n = 30

In [90]:
print(np.sort(scores)[::-1][:top_n])

[3.91985286 3.36406538 3.27896743 2.87473185 2.83498253 2.79368444
 2.79356236 2.57041756 2.52917585 2.47926755 2.36404622 2.32217447
 2.28975585 2.28105557 2.24451515 2.22496392 2.16329177 2.13158916
 2.11784517 2.09086994 1.99474336 1.99099808 1.93194224 1.84248989
 1.75123691 1.73335147 1.6959451  1.67878424 1.64924482 1.63249784]


In [91]:
jj = np.unique(user_profile.data, return_counts=True)[1]

In [92]:
tt = jj[-1]+jj[-2]
tt

41

In [93]:
kek = user_profile.toarray().ravel().argsort()[::-1]

In [94]:
kek

array([ 314, 1083, 1290, ..., 6480, 6479,    0])

In [95]:
movies.loc[kek[:tt]].genres.str.split('|').apply(lambda x: pd.Series(x).value_counts()).sum().sort_values()[::-1]

Action       20.0
Drama        18.0
Thriller     17.0
Adventure    11.0
Crime         8.0
Romance       8.0
Comedy        8.0
Sci-Fi        7.0
Horror        4.0
Mystery       3.0
Children      2.0
War           2.0
Western       1.0
Fantasy       1.0
IMAX          1.0
dtype: float64

In [97]:
l2 = np.array(movies.loc[top_n_ranking].movieId.tolist())

In [109]:
movies.loc[np.setdiff1d(top_n_ranking, np.intersect1d(l1, top_n_ranking) )]

Unnamed: 0,movieId,title,genres
134,161,Crimson Tide (1995),Drama|Thriller|War
224,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
257,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
287,329,Star Trek: Generations (1994),Adventure|Drama|Sci-Fi
395,454,"Firm, The (1993)",Drama|Thriller
461,527,Schindler's List (1993),Drama|War
508,590,Dances with Wolves (1990),Adventure|Drama|Western
509,592,Batman (1989),Action|Crime|Thriller
938,1238,Local Hero (1983),Comedy
2077,2761,"Iron Giant, The (1999)",Adventure|Animation|Children|Drama|Sci-Fi
