In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import random

In [3]:
data = pd.read_csv('dataset.csv')
data = data.sort_values(['timestamp'])

In [4]:
data.head()

Unnamed: 0,user_id,item_id,rating,timestamp
217,259,255,4,874724710
83968,259,286,4,874724727
43030,259,298,4,874724754
21399,259,185,4,874724781
82658,259,173,4,874724843


In [5]:
print(data['user_id'].unique().size)
print(data['item_id'].unique().size)

944
1682


In [6]:
train = data[:80000]
test = data[80000:]

In [7]:
train.head()

Unnamed: 0,user_id,item_id,rating,timestamp
217,259,255,4,874724710
83968,259,286,4,874724727
43030,259,298,4,874724754
21399,259,185,4,874724781
82658,259,173,4,874724843


In [8]:
test.head()

Unnamed: 0,user_id,item_id,rating,timestamp
1346,3,245,1,889237247
27978,3,355,3,889237247
1260,3,335,1,889237269
38673,3,322,3,889237269
3761,3,323,2,889237269


In [9]:
def average_precision(actual, recommended, k=30):
    ap_sum = 0
    hits = 0
    for i in range(k):
        product_id = recommended[i] if i < len(recommended) else None
        if product_id is not None and product_id in actual:
            hits += 1
            ap_sum += hits / (i + 1)
    return ap_sum / k


def normalized_average_precision(actual, recommended, k=30):
    actual = set(actual)
    if len(actual) == 0:
        return 0.0

    ap = average_precision(actual, recommended, k=k)
    ap_ideal = average_precision(actual, list(actual)[:k], k=k)
    return ap / ap_ideal

In [10]:
def recommend(user):
    return [288, 1, 286, 121, 174]

In [11]:
scores = []
for user in tqdm(test['user_id'].unique()):
    actual = list(test[test['user_id'] == user]['item_id'])
    recommended = recommend(user)
    
    scores.append(normalized_average_precision(actual, recommended))

np.mean(scores)

100%|██████████████████████████████████████████████████████████████████████████████| 301/301 [00:00<00:00, 1132.70it/s]


0.03566965142495101

In [12]:
# Задача: Обучить модель так, чтобы мера была больше 0.1

# EDA

In [14]:
train.describe()

Unnamed: 0,user_id,item_id,rating,timestamp
count,80000.0,80000.0,80000.0,80000.0
mean,461.714725,422.387738,3.5177,881565900.0
std,265.691828,331.103273,1.12741,4027973.0
min,0.0,1.0,1.0,874724700.0
25%,256.0,172.0,3.0,878963300.0
50%,450.0,318.0,4.0,880845200.0
75%,684.0,629.0,4.0,884673900.0
max,943.0,1682.0,5.0,889237200.0


In [15]:
train.groupby('rating').count()

Unnamed: 0_level_0,user_id,item_id,timestamp
rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,5062,5062,5062
2,9101,9101,9101
3,21763,21763,21763
4,27507,27507,27507
5,16567,16567,16567


In [16]:
train.groupby(['user_id'])['item_id'].count()

user_id
0        3
1      265
2       62
3       29
5      175
      ... 
937     40
939     49
940    102
941     22
943    168
Name: item_id, Length: 752, dtype: int64

In [17]:
print(train['user_id'].unique().size)
print(train['item_id'].unique().size)

752
1616


In [18]:
test.groupby(['user_id'])['rating'].mean()

user_id
1      3.571429
3      3.320000
4      4.333333
7      3.965261
11     3.464088
         ...   
932    3.966805
934    3.701149
938    3.268519
940    3.200000
942    4.265823
Name: rating, Length: 301, dtype: float64

In [19]:
train.groupby(['item_id', 'rating']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,user_id,timestamp
item_id,rating,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,7,7
1,2,20,20
1,3,81,81
1,4,169,169
1,5,97,97
...,...,...,...
1675,3,1,1
1676,2,1,1
1677,3,1,1
1681,3,1,1


In [20]:
u = train.groupby('user_id')['rating'].count()
topu = u.sort_values(ascending = False)[:10]

i = train.groupby('item_id')['rating'].count()
topi = i.sort_values(ascending = False)[:10]

In [21]:
joined = train.join(topu, on='user_id', how = 'inner', rsuffix='_r')
joined = joined.join(topi, on='item_id', how = 'inner', rsuffix = '_r')

pd.crosstab(joined.user_id, joined.item_id, joined.rating, aggfunc=np.sum)

item_id,1,50,100,121,174,181,258,286,288,294
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
13,3.0,5.0,5.0,5.0,4.0,5.0,4.0,3.0,1.0,2.0
181,3.0,,3.0,4.0,,,3.0,1.0,4.0,2.0
276,5.0,5.0,5.0,4.0,5.0,5.0,5.0,,4.0,4.0
303,5.0,5.0,5.0,3.0,5.0,5.0,4.0,5.0,4.0,4.0
405,,5.0,,,5.0,5.0,,,5.0,
429,3.0,5.0,5.0,3.0,4.0,5.0,4.0,,3.0,
450,4.0,5.0,4.0,3.0,5.0,4.0,4.0,4.0,3.0,4.0
537,2.0,4.0,4.0,1.0,3.0,2.0,4.0,3.0,2.0,1.0
655,2.0,4.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0
846,,5.0,,,5.0,5.0,3.0,,4.0,3.0


# Non-personalized recommendations

In [88]:
def weight(item):
#     coef = (0.1, 0.5, 1, 2, 3)
    coef = (0.1, 0.2, 0.5, 1.5, 3)
    sum = 0
    for i in range(1, 6):
        cnt = train[(train['item_id'] == item) & (train['rating'] == i)]['user_id'].count()
        sum += coef[i-1] * cnt
    return sum

In [90]:
dct = {
    'item' : [],
    'weight' : []
}
for item in tqdm(train['item_id'].unique()):
    dct['item'].append(item)
    dct['weight'].append(weight(item))

best_item = pd.DataFrame(dct)

100%|█████████████████████████████████████████████████████████████████████████████| 1616/1616 [00:11<00:00, 137.97it/s]


In [91]:
best_item.head(5)

Unnamed: 0,item,weight
0,255,163.0
1,286,557.1
2,298,244.3
3,185,351.5
4,173,543.3


In [92]:
best_item.sort_values(by=['weight'], ascending=False, inplace=True)

In [93]:
best_item.head(5)

Unnamed: 0,item,weight
30,50,1031.2
123,100,810.2
40,181,731.4
233,127,706.3
49,174,705.1


In [94]:
def non_personalized_recommend(user):    
    best_items = best_item['item'].values
    
    if user in train['user_id']:
        old = train[train['user_id'] == user]['item_id'].unique()
        indices = np.in1d(best_items, old, invert=True)
        return best_items[indices][:30]
    else:
        return best_items[:30]

In [95]:
scores = []
for user in tqdm(test['user_id'].unique()):
    actual = list(test[test['user_id'] == user]['item_id'])
    recommended = non_personalized_recommend(user)
    
    scores.append(normalized_average_precision(actual, recommended))

print(np.mean(scores))
print(np.mean(scores) > 0.1)

100%|███████████████████████████████████████████████████████████████████████████████| 301/301 [00:00<00:00, 531.55it/s]

0.16941113364006952
True





# Collaborative filtering

In [30]:
n_users = train.user_id.unique().shape[0]
n_items = train.item_id.unique().shape[0]

ratings = np.zeros((n_users, n_items))
for row in train.itertuples():
    ratings[(np.where(train.user_id.unique() == row[1])[0])[0], 
            (np.where(train.item_id.unique() == row[2])[0])[0]] = row[3]
ratings

KeyboardInterrupt: 

In [31]:
train_data = train.copy()

In [32]:
for col in ['user_id', 'item_id']:
    train_data[col].replace({val: i for i, val in enumerate(train_data[col].unique())}, inplace=True)

In [33]:
train.head(5)

Unnamed: 0,user_id,item_id,rating,timestamp
217,259,255,4,874724710
83968,259,286,4,874724727
43030,259,298,4,874724754
21399,259,185,4,874724781
82658,259,173,4,874724843


In [34]:
train_data.head(5)

Unnamed: 0,user_id,item_id,rating,timestamp
217,0,0,4,874724710
83968,0,1,4,874724727
43030,0,2,4,874724754
21399,0,3,4,874724781
82658,0,4,4,874724843


In [35]:
ratings = np.zeros((n_users, n_items))
for line in tqdm(train_data.itertuples()):
    ratings[line[1], line[2]] = line[3]
ratings

80000it [00:00, 538260.92it/s]


array([[4., 4., 4., ..., 0., 0., 0.],
       [0., 4., 5., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 4., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [131]:
def my_similarity(ratings, kind='user'):
    if kind == 'user':
        sim = ratings.dot(ratings.T)
    elif kind == 'item':
        sim = ratings.T.dot(ratings)
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms.T / norms)

In [140]:
# %%time
# user_my_similarity = my_similarity(ratings, kind='user')
# item_my_similarity = my_similarity(ratings, kind='item')

Wall time: 78.8 ms


In [133]:
# user_my_similarity[:5, :5]

array([[1.        , 0.18978178, 0.11447675, 0.22761207, 0.16692304],
       [0.18978178, 1.        , 0.09353405, 0.32072696, 0.21306018],
       [0.11447675, 0.09353405, 1.        , 0.15367694, 0.15629993],
       [0.22761207, 0.32072696, 0.15367694, 1.        , 0.21032327],
       [0.16692304, 0.21306018, 0.15629993, 0.21032327, 1.        ]])

In [134]:
# item_my_similarity[:5, :5]

array([[1.        , 0.30167344, 0.37279859, 0.17310643, 0.23574217],
       [0.30167344, 1.        , 0.3185424 , 0.31781269, 0.33084429],
       [0.37279859, 0.3185424 , 1.        , 0.26044903, 0.31065805],
       [0.17310643, 0.31781269, 0.26044903, 1.        , 0.5090957 ],
       [0.23574217, 0.33084429, 0.31065805, 0.5090957 , 1.        ]])

In [119]:
from sklearn.metrics.pairwise import cosine_similarity

In [143]:
%%time
user_similarity = cosine_similarity(ratings)
item_similarity = cosine_similarity(ratings.T)

Wall time: 70.8 ms


In [136]:
user_similarity[:5, :5]

array([[1.        , 0.18978178, 0.11447675, 0.22761207, 0.16692304],
       [0.18978178, 1.        , 0.09353405, 0.32072696, 0.21306018],
       [0.11447675, 0.09353405, 1.        , 0.15367694, 0.15629993],
       [0.22761207, 0.32072696, 0.15367694, 1.        , 0.21032327],
       [0.16692304, 0.21306018, 0.15629993, 0.21032327, 1.        ]])

In [137]:
item_similarity[:5, :5]

array([[1.        , 0.30167344, 0.37279859, 0.17310643, 0.23574217],
       [0.30167344, 1.        , 0.3185424 , 0.31781269, 0.33084429],
       [0.37279859, 0.3185424 , 1.        , 0.26044903, 0.31065805],
       [0.17310643, 0.31781269, 0.26044903, 1.        , 0.5090957 ],
       [0.23574217, 0.33084429, 0.31065805, 0.5090957 , 1.        ]])

In [44]:
def predict(ratings, similarity, kind='user'):
    if kind == 'user':
        return similarity.dot(ratings) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif kind == 'item':
        return ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])

In [68]:
data_for_predict = {}
pred = predict(ratings, user_similarity, kind='user')

for user in tqdm(train['user_id'].unique()):
#     print(user)
    user_ind = np.where(train['user_id'].unique() == user)[0][0]
    
    tup_predict = []
    for i in range(len(pred_my[user_ind, :])):
        tup_predict.append((pred_my[user_ind, :][i],
                            train['item_id'].unique()[i]))
    data_for_predict[user] = tup_predict

100%|████████████████████████████████████████████████████████████████████████████████| 752/752 [27:06<00:00,  2.16s/it]


In [70]:
# df = pd.DataFrame.from_dict(data_for_predict)
# df.to_csv('data_for_predict.csv')

In [144]:
def recommend_user_based(user): 
    if user in train['user_id'].unique():
#         print(user)
        tup_predict = data_for_predict[user]   
            
        old = train[train['user_id'] == user]['item_id'].values
        tup_predict.sort(reverse=True)
        return [i[1] for i in tup_predict if i[1] not in old][:30]
    else:
        return non_personalized_recommend(user)

In [145]:
scores = []
for user in tqdm(test['user_id'].unique()):
    actual = list(test[test['user_id'] == user]['item_id'])
    recommended = recommend_user_based(user)
    
    scores.append(normalized_average_precision(actual, recommended))

np.mean(scores)

100%|███████████████████████████████████████████████████████████████████████████████| 301/301 [00:01<00:00, 176.79it/s]


0.17030585816199656