In [18]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import random

from sklearn.metrics.pairwise import cosine_similarity

In [6]:
ratings = pd.read_csv("rating.csv")
ratings = ratings.sort_values(['timestamp'])

In [7]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
4182421,28507,1176,4.0,1995-01-09 11:46:44
18950979,131160,1079,3.0,1995-01-09 11:46:49
18950936,131160,47,5.0,1995-01-09 11:46:49
18950930,131160,21,3.0,1995-01-09 11:46:49
12341178,85252,45,3.0,1996-01-29 00:00:00
...,...,...,...,...
7819902,53930,118706,3.5,2015-03-31 06:00:51
2508834,16978,2093,3.5,2015-03-31 06:03:17
12898546,89081,55232,3.5,2015-03-31 06:11:26
12898527,89081,52458,4.0,2015-03-31 06:11:28


In [10]:
print(ratings['userId'].unique().size)
print(ratings['movieId'].unique().size)

138493
26744


In [14]:
train, test = ratings[:8000], ratings[8000:10000]

# Metrics

In [28]:
def average_precision(actual, recommended, k=10):
    ap_sum = 0
    hits = 0
    for i in range(k):
        product_id = recommended[i] if i < len(recommended) else None
        if product_id is not None and product_id in actual:
            hits += 1
            ap_sum += hits / (i + 1)
    return ap_sum / k


def normalized_average_precision(actual, recommended, k=10):
    actual = set(actual)
    if len(actual) == 0:
        return 0.0

    ap = average_precision(actual, recommended, k=k)
    ap_ideal = average_precision(actual, list(actual)[:k], k=k)
    return ap / ap_ideal

Выбрал **NAP**, так как данная метрика неплохо подходит для данной задачи и она очень популярна в задачах RecSys. Во многих крутых статьях её советуют, так почему бы её не попробовать. :) 

# Non-personalized recommendations

In [15]:
user_ratings = sorted(ratings['rating'].unique())
user_ratings

[0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]

In [16]:
user_ratings_with_coef = (
    (0.5, -3), 
    (1.0, -1),
    (1.5, -1),
    (2.0, -0.5), 
    (2.5, -0.5),
    
    (3.0, 3),
    (3.5, 4),
    
    (4.0, 6),
    (4.5, 7),
    (5.0, 8),
)

In [22]:
def weight(item, user_ratings_with_coef):    
    sum = 0
    for rate, coef in user_ratings_with_coef:
        cnt = train[(train['movieId'] == item) & (train['rating'] == rate)]['userId'].count()
        sum += coef * cnt
    return sum

In [23]:
dct = {
    'item' : [],
    'weight' : []
}
for item in tqdm(train['movieId'].unique()):
    dct['item'].append(item)
    dct['weight'].append(weight(item, user_ratings_with_coef))

best_item = pd.DataFrame(dct)

100%|███████████████████████████████████████████████████████████████████████████████| 490/490 [00:04<00:00, 115.02it/s]


In [25]:
best_item.sort_values(by=['weight'], ascending=False, inplace=True)

In [26]:
best_item.head()

Unnamed: 0,item,weight
299,150,883.5
105,296,794.5
222,318,624.0
173,300,580.5
316,153,569.0


In [29]:
def non_personalized_recommend(user, k=10):    
    best_items = best_item['item'].values
    
    if user in train['userId']:
        old = train[train['userId'] == user]['movieId'].unique()
        indices = np.in1d(best_items, old, invert=True)
        return best_items[indices][:k]
    else:
        return best_items[:k]

In [31]:
scores = []
for user in tqdm(test['userId'].unique()):
    actual = list(test[test['userId'] == user]['movieId'])
    recommended = non_personalized_recommend(user)
    
    scores.append(normalized_average_precision(actual, recommended))

print(np.mean(scores))
print(np.mean(scores) > 0.3)

100%|████████████████████████████████████████████████████████████████████████████████| 56/56 [00:00<00:00, 1193.77it/s]

0.36890447845804986
True





# Collaborative filtering with cosine similarity

In [34]:
train_data = train.copy()

for col in ['userId', 'movieId']:
    train_data[col].replace({val: i for i, val in enumerate(train_data[col].unique())}, inplace=True)

In [35]:
train_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
4182421,0,0,4.0,1995-01-09 11:46:44
18950979,1,1,3.0,1995-01-09 11:46:49
18950936,1,2,5.0,1995-01-09 11:46:49
18950930,1,3,3.0,1995-01-09 11:46:49
12341178,2,4,3.0,1996-01-29 00:00:00


In [36]:
n_users = train.userId.unique().shape[0]
n_items = train.movieId.unique().shape[0]

ratings_mat = np.zeros((n_users, n_items))
for line in tqdm(train_data.itertuples()):
    ratings_mat[line[1], line[2]] = line[3]

8000it [00:00, 500842.32it/s]


In [37]:
ratings_mat

array([[4., 0., 0., ..., 0., 0., 0.],
       [0., 3., 5., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 5., ..., 0., 0., 5.],
       [0., 0., 4., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [39]:
%%time
user_similarity = cosine_similarity(ratings_mat)
item_similarity = cosine_similarity(ratings_mat.T)

Wall time: 17 ms


In [40]:
user_similarity[:5, :5]

array([[1.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 1.        , 0.08866296, 0.        , 0.08185242],
       [0.        , 0.08866296, 1.        , 0.13807723, 0.16469988],
       [0.        , 0.        , 0.13807723, 1.        , 0.0637355 ],
       [0.        , 0.08185242, 0.16469988, 0.0637355 , 1.        ]])

In [41]:
item_similarity[:5, :5]

array([[1.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 1.        , 0.15764167, 0.08796935, 0.        ],
       [0.        , 0.15764167, 1.        , 0.29491838, 0.23922842],
       [0.        , 0.08796935, 0.29491838, 1.        , 0.3913978 ],
       [0.        , 0.        , 0.23922842, 0.3913978 , 1.        ]])

In [42]:
def predict(ratings_mat, similarity, kind='user'):
    if kind == 'user':
        return similarity.dot(ratings_mat) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif kind == 'item':
        return ratings_mat.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])

In [45]:
data_for_predict = {}
pred = predict(ratings_mat, user_similarity, kind='user')

for user in tqdm(train['userId'].unique()):
    user_ind = np.where(train['userId'].unique() == user)[0][0]
    
    tup_predict = []
    for i in range(len(pred[user_ind, :])):
        tup_predict.append((pred[user_ind, :][i],
                            train['movieId'].unique()[i]))
    data_for_predict[user] = tup_predict

100%|████████████████████████████████████████████████████████████████████████████████| 217/217 [00:13<00:00, 16.40it/s]


### User-based

In [46]:
def recommend_user_based(user, k=10): 
    if user in train['userId'].unique():
        tup_predict = data_for_predict[user]   
            
        old = train[train['userId'] == user]['movieId'].values
        tup_predict.sort(reverse=True)
        return [i[1] for i in tup_predict if i[1] not in old][:k]
    else:
        return non_personalized_recommend(user)

In [47]:
scores = []
for user in tqdm(test['userId'].unique()):
    actual = list(test[test['userId'] == user]['movieId'])
    recommended = recommend_user_based(user)
    
    scores.append(normalized_average_precision(actual, recommended))

np.mean(scores)

100%|█████████████████████████████████████████████████████████████████████████████████| 56/56 [00:00<00:00, 715.67it/s]


0.3728840702947846

### Item-based

In [50]:
data_for_predict = {}
pred = predict(ratings_mat, item_similarity, kind='item')

for user in tqdm(train['userId'].unique()):
    user_ind = np.where(train['userId'].unique() == user)[0][0]
    
    tup_predict = []
    for i in range(len(pred[user_ind, :])):
        tup_predict.append((pred[user_ind, :][i],
                            train['movieId'].unique()[i]))
    data_for_predict[user] = tup_predict

100%|████████████████████████████████████████████████████████████████████████████████| 217/217 [00:21<00:00,  9.93it/s]


In [51]:
def recommend_item_based(user, k=10): 
    if user in train['userId'].unique():
        tup_predict = data_for_predict[user]   
            
        old = train[train['userId'] == user]['movieId'].values
        tup_predict.sort(reverse=True)
        return [i[1] for i in tup_predict if i[1] not in old][:k]
    else:
        return non_personalized_recommend(user)

In [52]:
scores = []
for user in tqdm(test['userId'].unique()):
    actual = list(test[test['userId'] == user]['movieId'])
    recommended = recommend_item_based(user)
    
    scores.append(normalized_average_precision(actual, recommended))

np.mean(scores)

100%|█████████████████████████████████████████████████████████████████████████████████| 56/56 [00:00<00:00, 369.12it/s]


0.37819019274376414