In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k


import category_encoders as ce

In [2]:
def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list[:k]
    
    flags = np.isin(bought_list, recommended_list)
    precision = flags.sum() / len(recommended_list)
        
    return precision

In [3]:
test = pd.read_csv('./data/test_users.csv')
test_users = test['user_id'].unique()
len(test_users)

1708

In [4]:
data = pd.read_csv('./data/retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [5]:
data['weighted'] = data['week_no'] ** 4

In [6]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

In [7]:
result_users = data_train['user_id'].unique()
len(result_users)

fake_ids = list(set(test_users) - set(result_users))

fake_data = data.iloc[-len(fake_ids):,:]
fake_data['user_id'] = fake_ids
data_train = pd.concat([data_train, fake_data ])
data.tail()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,weighted
2396799,1613,41655820646,663,16102849,1,2.0,3262,-1.15,1231,95,0.0,0.0,81450625
2396800,1001,41655829421,663,13217063,1,1.69,3131,0.0,2231,95,0.0,0.0,81450625
2396801,1001,41655829421,663,13217800,1,1.69,3131,0.0,2231,95,0.0,0.0,81450625
2396802,1167,41656790510,663,6410462,22451,43.98,3385,-0.65,1059,95,0.0,0.0,81450625
2396803,2325,41656790510,663,6430664,1,8.0,3385,0.0,1059,95,0.0,0.0,81450625


In [8]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

popularity.head(3)

Unnamed: 0,item_id,n_sold
0,25671,6
1,26081,1
2,26093,1


In [9]:
data_train_raw = data_train.copy()

top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

# Заведем фиктивный item_id (если юзер НЕ покупал товары из топ-5000, то он "купил" такой товар)
data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 999999

user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity',
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix[user_item_matrix > 0] = 1 # так как в итоге хотим предсказать 
user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат saprse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

user_item_matrix.head(3)

item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819227,...,15778533,15831255,15926712,15926775,15926844,15926886,15927403,15927661,15927850,16809471
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Задание 3*. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.

In [10]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)


top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

# Заведем фиктивный item_id (если юзер НЕ покупал товары из топ-5000, то он "купил" такой товар)
data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 999999

user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='weighted', #       values='quantity',
                                  aggfunc='sum',   #aggfunc='count',
                                  #aggfunc=lambda rows: np.average(rows, weights=data_train.loc[rows.index, 'week_no']),
                                  fill_value=0
                                 )

#user_item_matrix[user_item_matrix > 0] = 1 # так как в итоге хотим предсказать 
user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат saprse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

user_item_matrix.head(9)

item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819227,...,15778533,15831255,15926712,15926775,15926844,15926886,15927403,15927661,15927850,16809471
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,45212176.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,68574961.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
from scipy.spatial.distance import cdist
import tqdm

def get_N_niarest_vector(matrix, sample_vector, n=5):
    dist_line = cdist(sample_vector, matrix, metric='cosine')
    items = dist_line[0].argsort()[-n:][::-1]
    batch = matrix[items]
    dist_selected = dist_line[0][items]
    return batch, (dist_selected.sum()-1)/(n-1)

In [12]:
def get_user_recomrndations( user_item_matrix, sample_vector, k_neigbours=9, n_recomendation = 5):
    batch, dist = get_N_niarest_vector(user_item_matrix.values, sample_vector, k_neigbours)
    
    #Вектор личных предпочтений
    rec_from_yourself = sample_vector
    
    #Вектор всех предпочтений.
    rec_from_all = user_item_matrix.values.sum(axis=0)/len(user_item_matrix)
    
    #Вектор ближайших предпочтений.
    rec_from_neighbours = batch.sum(axis=0)/len(batch)
    
    k1=(dist*5.5)   
    k2=(dist*0.001) 
     
    #Некий целевой вектор.
    rec = rec_from_yourself + rec_from_all/k1 # + rec_from_neighbours*k2
    
    #Отбор товаров с максимальными весами.
    items = rec[0].argsort()[-n_recomendation-1:][::-1]
    
    idx_to_zerro = itemid_to_id[999999]
    items = list(items)
    if idx_to_zerro in items:
        items.remove(idx_to_zerro)
    items =items[:n_recomendation]
    
    return items

In [13]:
# перенумеруем пользователей и товары
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))


In [14]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result['actual'] = result['actual'].apply(lambda x: list(x))
result.head(10)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412..."
5,9,"[864335, 990865, 1029743, 9297474, 10457112, 8..."
6,13,"[6534178, 1104146, 829197, 840361, 862070, 884..."
7,14,"[840601, 867293, 933067, 951590, 952408, 96569..."
8,15,"[910439, 1082185, 959076, 1023958, 1082310, 13..."
9,16,"[1062973, 1082185, 13007710]"


In [15]:
k_neigbours=9 
n_recomendation = 5

In [16]:
id_list = result['user_id'].map(userid_to_id).values

rec_list=[]
for i in tqdm.trange(len(id_list)):    
    sample_vector = np.reshape(user_item_matrix.values[id_list[i]],(1,-1))
    item = get_user_recomrndations( user_item_matrix, sample_vector, k_neigbours=k_neigbours, n_recomendation = n_recomendation)
    rec_list.append(item)
#rec_list    

100%|██████████| 2042/2042 [08:38<00:00,  3.94it/s]


In [17]:
result_list=[]
for i in tqdm.trange(len(rec_list)):
    item = [id_to_itemid[rec] for rec in rec_list[i]]
    result_list.append(item)
#result_list       

100%|██████████| 2042/2042 [00:00<00:00, 40062.53it/s]


In [18]:
result['test'] = result_list

In [19]:
result.apply(lambda x: precision_at_k(x['test'], x['actual'],  5), axis=1).mean()

0.4375122428991185

### Пока это лучший  результат: (0.4375122428991185) ###

In [20]:
popularity = data.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)


top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

# Заведем фиктивный item_id (если юзер НЕ покупал товары из топ-5000, то он "купил" такой товар)
data.loc[~data['item_id'].isin(top_5000), 'item_id'] = 999999

user_item_matrix = pd.pivot_table(data, 
                                  index='user_id', columns='item_id', 
                                  values='weighted', #       values='quantity',
                                  aggfunc='sum',   #aggfunc='count',
                                  fill_value=0
                                 )

#user_item_matrix[user_item_matrix > 0] = 1 # так как в итоге хотим предсказать 
user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат saprse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

user_item_matrix.head(9)

item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819227,...,15926775,15926844,15926885,15926886,15926927,15927403,15927661,15927850,16809471,17105257
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,45212176.0,0.0,0.0,93160625.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,71639296.0,71639296.0,38950081.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# перенумеруем пользователей и товары
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [22]:
id_list = test['user_id'].map(userid_to_id).values

rec_list=[]
for i in tqdm.trange(len(id_list)):    
    sample_vector = np.reshape(user_item_matrix.values[id_list[i]],(1,-1))
    item = get_user_recomrndations( user_item_matrix, sample_vector, k_neigbours=k_neigbours, n_recomendation = n_recomendation)
    rec_list.append(item)

100%|██████████| 1708/1708 [07:14<00:00,  3.93it/s]


In [23]:
result_list=[]
for i in tqdm.trange(len(rec_list)):
    item = [id_to_itemid[rec] for rec in rec_list[i]]
    result_list.append(item)

100%|██████████| 1708/1708 [00:00<00:00, 41684.10it/s]


In [24]:
for i in range(len(result_list)):
    for j in range(len(result_list[i])):
        result_list[i][j] = str(result_list[i][j])
    result_list[i] = ' '.join(result_list[i])
result_list

['856942 1082185 995242 940947 8293439',
 '1106523 1133018 838136 826784 916122',
 '1053690 9526563 6463658 910032 13842214',
 '1082185 1037863 1119051 840361 1024306',
 '1082185 1122358 1106523 1022003 1013321',
 '1082185 1029743 840361 1116578 5569230',
 '1029743 995242 889692 9297474 1039126',
 '1082185 868764 15596488 6534178 900370',
 '6534178 1082185 1029743 862070 1038985',
 '981760 13876901 6534178 951590 1025611',
 '1082185 1070820 1029743 845208 995242',
 '9677886 948420 6533889 938118 1116821',
 '6534178 1033615 1029743 962229 1082185',
 '1075368 1058997 1126899 894360 949836',
 '1029743 1031864 6534178 934697 1096036',
 '1082185 1070820 6534178 1104349 827656',
 '1034176 1075368 1138858 995242 1005274',
 '903230 844165 1092026 1036501 8203451',
 '1029743 826249 929018 995242 1065593',
 '1021133 985119 1084591 9707137 907014',
 '15927661 7409957 959409 918733 1057777',
 '903567 1005186 926646 6534030 1017195',
 '6534178 1082185 9526100 1040807 1126899',
 '1003441 972569 9088

In [25]:
test['Predicted'] = result_list
test

Unnamed: 0,user_id,Predicted
0,1,856942 1082185 995242 940947 8293439
1,2,1106523 1133018 838136 826784 916122
2,3,1053690 9526563 6463658 910032 13842214
3,6,1082185 1037863 1119051 840361 1024306
4,7,1082185 1122358 1106523 1022003 1013321
...,...,...
1703,2494,1082185 1071939 840361 862349 8119157
1704,2496,981760 916122 1106523 883404 992870
1705,2498,1070820 1082185 1022066 901776 1053690
1706,2499,1070820 5568378 1060872 5570048 5569327


In [26]:
test.columns = [['UserId', 'Predicted']]
test

Unnamed: 0,UserId,Predicted
0,1,856942 1082185 995242 940947 8293439
1,2,1106523 1133018 838136 826784 916122
2,3,1053690 9526563 6463658 910032 13842214
3,6,1082185 1037863 1119051 840361 1024306
4,7,1082185 1122358 1106523 1022003 1013321
...,...,...
1703,2494,1082185 1071939 840361 862349 8119157
1704,2496,981760 916122 1106523 883404 992870
1705,2498,1070820 1082185 1022066 901776 1053690
1706,2499,1070820 5568378 1060872 5570048 5569327


In [27]:
test.to_csv('./data/result_final.csv', index=False)