In [1]:
import pandas as pd

interactions.csv — файл хранит данные по взаимодействию товаров и покупателей. Среди данных есть "холодные" товары и покупатели. В колонке row хранятся идентификаторы покупателя. В колонке col идентификаторы товара. В колонке data - значение взаимодействия.

Данные по товарам

item_asset.csv - файл хранит качественную характеристику товара. row - идентификатор товара, data - значение характеристики. col - порядковый номер фичи при выгрузке данных (смысла не несет, можно избавиться от этого столбца)

item_price.csv - файл хранит цену товара (уже нормализована). row - идентификатор товара, data - нормализованное значение цены. col - порядковый номер фичи при выгрузке данных (смысла не несет, можно избавиться от этого столбца)

item_subclass.csv - файл хранит значения категорий, к которым относится товар. row - идентификатор товара, col - номер категории, data - признак отношения к категории

Данные по пользователям

user_age.csv - файл хранит данные по возрасту пользователей. row - идентификатор пользователя, data - значение возраста (уже нормализованное), col - порядковый номер фичи при выгрузке данных (смысла не несет, можно избавиться от этого столбца)

user_region.csv - файл хранит one-hot encoded значения региона пользователя. row - идентификатор пользователя, col - номер one-hot feature региона, data - признак региона.

In [81]:
interactions = pd.read_csv("./interactions.csv")
item_asset = pd.read_csv("./item_asset.csv")
item_price = pd.read_csv("./item_price.csv")
item_subclass = pd.read_csv("./item_subclass.csv")
user_age = pd.read_csv("./user_age.csv")
user_region = pd.read_csv("./user_region.csv")

In [76]:
len(interactions['row'].unique())
len(interactions['col'].unique())
# row - id_person
# col - id_thing
# data - interaction_value

15277

## probably filter

In [82]:
interactions[['row', 'col']]

Unnamed: 0,row,col
0,0,3568
1,0,3827
2,0,4844
3,0,5734
4,0,6518
...,...,...
398631,30910,18176
398632,30910,18185
398633,30910,18248
398634,30910,18349


In [83]:
a = interactions[['row', 'col']].groupby('row').count().copy()
# print(a)
a = a[a['col'] > 0].reset_index().drop('col',axis=1)

In [6]:
# a

In [84]:
interactions = interactions.merge(a, left_on='row', right_on='row')

In [85]:
interactions

Unnamed: 0,row,col,data
0,0,3568,1.0
1,0,3827,1.0
2,0,4844,1.0
3,0,5734,1.0
4,0,6518,1.0
...,...,...,...
398631,30910,18176,1.0
398632,30910,18185,1.0
398633,30910,18248,1.0
398634,30910,18349,1.0


#########################

# Split train and val interactions by user_id

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
train_val_users, test_users = train_test_split(interactions[['row', 'col']].groupby('row').count().reset_index(), test_size=0.05, random_state=7)
train_users, val_users = train_test_split(train_val_users, test_size=0.2, random_state=7)

In [11]:
train_interactions = train_users.drop(['col'], axis=1).merge(interactions[['row', 'col']], left_on='row', right_on='row')
val_interactions = val_users.drop(['col'], axis=1).merge(interactions[['row', 'col']], left_on='row', right_on='row')
test_interactions = test_users.drop(['col'], axis=1).merge(interactions[['row', 'col']], left_on='row', right_on='row')

# Check datasets

In [12]:
item_asset[['row', 'data']]
# row - id_thing
# data - value

Unnamed: 0,row,data
0,0,0.009497
1,1,0.004226
2,2,0.003371
3,3,0.002991
4,4,0.002991
...,...,...
18485,18490,0.006268
18486,18491,0.012536
18487,18492,0.006268
18488,18493,0.005698


In [13]:
item_price[['row', 'data']]
# row - идентификатор товара
# data - нормализованное значение цены

Unnamed: 0,row,data
0,0,0.012911
1,1,0.005211
2,2,0.004131
3,3,0.003521
4,4,0.003521
...,...,...
18488,18490,0.008216
18489,18491,0.016620
18490,18492,0.008216
18491,18493,0.007418


In [14]:
item_subclass[['row', 'col']]
# row - идентификатор товара
# col - номер категории !!!! категориальный признак !!!!
# data - признак отношения к категории

Unnamed: 0,row,col
0,0,679
1,1,1376
2,2,1495
3,3,502
4,4,502
...,...,...
18490,18490,1207
18491,18491,1207
18492,18492,1207
18493,18493,1207


In [15]:
user_age[['row', 'data']] # мб не правильно описали столбцы
# row - идентификатор пользователя
# data - значение возраста 

Unnamed: 0,row,data
0,2,1.0
1,7,1.0
2,8,1.0
3,10,1.0
4,12,1.0
...,...,...
30312,30905,1.0
30313,30906,1.0
30314,30907,1.0
30315,30908,1.0


In [16]:
# user_region['col'].unique() = 1,2,3,4,5,6,7
# user_region['data'].unique() = 1
user_region
# row - идентификатор пользователя
# col - номер one-hot feature региона
# data - признак региона.

Unnamed: 0,row,col,data
0,0,6,1.0
1,0,0,1.0
2,1,7,1.0
3,1,0,1.0
4,2,5,1.0
...,...,...,...
26604,30902,4,1.0
26605,30904,7,1.0
26606,30907,6,1.0
26607,30909,6,1.0


# Users features (мб заполнить None и добавить в датасет)

In [17]:
# user_region # 26609 rows × 3 columns
# user_age[['row', 'data']] # 30317 rows × 2 columns

# user_features   # 25781 rows × 5 columns

In [18]:
user_features = user_region.merge(user_age, left_on='row', right_on='row',
          suffixes=('_region', '_age'))

In [19]:
user_features

Unnamed: 0,row,col_region,data_region,col_age,data_age
0,2,5,1.0,4,1.0
1,7,6,1.0,10,1.0
2,10,7,1.0,4,1.0
3,12,7,1.0,4,1.0
4,13,7,1.0,4,1.0
...,...,...,...,...,...
25776,30901,6,1.0,3,1.0
25777,30902,4,1.0,10,1.0
25778,30904,7,1.0,8,1.0
25779,30907,6,1.0,4,1.0


# item_features

In [20]:
# item_asset[['row', 'data']] # 18490 rows × 2 columns
# # row - id_thing
# # data - value

# item_price[['row', 'data']] # 18493 rows × 2 columns
# # row - идентификатор товара
# # data - нормализованное значение цены

# item_subclass[['row', 'col']] # 18495 rows × 2 columns
# # row - идентификатор товара
# # col - номер категории !!!! категориальный признак !!!!
# # data - признак отношения к категории

# item_features_1 # 18488 rows × 3 columns

# item_features # 18488 rows × 4 columns

In [21]:
item_features_1 = item_asset[['row', 'data']].merge(item_price[['row', 'data']], left_on='row', right_on='row',
          suffixes=('_asset', '_price'))

In [22]:
item_features = item_features_1.merge(item_subclass[['row', 'col']], left_on='row', right_on='row').rename(columns={'col': 'col_subclass'})

In [23]:
item_features

Unnamed: 0,row,data_asset,data_price,col_subclass
0,0,0.009497,0.012911,679
1,1,0.004226,0.005211,1376
2,2,0.003371,0.004131,1495
3,3,0.002991,0.003521,502
4,4,0.002991,0.003521,502
...,...,...,...,...
18483,18490,0.006268,0.008216,1207
18484,18491,0.012536,0.016620,1207
18485,18492,0.006268,0.008216,1207
18486,18493,0.005698,0.007418,1207


# Сделаем матрицу из товаров-итемов, которые провзаимодействовали

In [24]:
def get_full_dataset_from_interactions(interactions: pd.DataFrame, user_features: pd.DataFrame, item_features: pd.DataFrame) -> pd.DataFrame:

    user_interact = interactions[['row',	'col']].merge(user_features, left_on='row', right_on='row')
    user_item_interact = user_interact.merge(item_features, left_on='col', right_on='row')

    user_item_interact = user_item_interact.drop(['row_y'], axis=1).rename(columns={'row_x': 'user_id', 'col': 'item_id'})
    return user_item_interact

In [25]:
train_user_item_interact = get_full_dataset_from_interactions(train_interactions, user_features, item_features)
val_user_item_interact = get_full_dataset_from_interactions(val_interactions, user_features, item_features)
test_user_item_interact = get_full_dataset_from_interactions(test_interactions, user_features, item_features)

# Получим список всех итемов, чтобы из них случайно селектить негативные сэмплы

In [26]:
def get_all_items(item_features: pd.DataFrame) -> set:
  return set(list(item_features['row'].unique()))

In [27]:
all_items = get_all_items(item_features)

# Селектим негативные семплы

In [28]:
import random
from collections import defaultdict
from tqdm import tqdm

In [29]:
random.seed(7)

In [30]:
def select_negative_samples(user_item_interact: pd.DataFrame, all_items, alpha: int=1) -> pd.DataFrame:
  
  d_positive = defaultdict(list)
  for row in user_item_interact[['user_id', 'item_id']].values:
    d_positive[row[0]].append(row[1])

  tmp = []
  for key in tqdm(d_positive):
    key_list = [int(key)]*alpha*len(d_positive[key])
    tmp.extend(tuple(zip(key_list, random.sample(all_items - set(d_positive[key]), alpha*len(d_positive[key])))))

  df_negative = pd.DataFrame(tmp, columns=['row', 'col'])
  return df_negative

In [31]:
alpha=1
train_df_negative = select_negative_samples(train_user_item_interact, all_items, alpha=alpha)
val_df_negative = select_negative_samples(val_user_item_interact, all_items, alpha=alpha)

100%|██████████| 10512/10512 [00:10<00:00, 1018.45it/s]
100%|██████████| 2563/2563 [00:03<00:00, 752.70it/s]


# Получаем негативные интеракции

In [32]:
# train_user_item_interact = get_full_dataset_from_interactions(train_interactions, user_features, item_features)
# val_user_item_interact = get_full_dataset_from_interactions(val_interactions, user_features, item_features)

In [33]:
train_user_item_interact_negative = get_full_dataset_from_interactions(train_df_negative, user_features, item_features)
val_user_item_interact_negative = get_full_dataset_from_interactions(val_df_negative, user_features, item_features)

In [34]:
# val_user_item_interact

In [35]:
# val_user_item_interact_negative

# Add targets to dataset and concatinate (maybe shauffle rows)

In [36]:
import numpy as np

In [37]:
def add_targets_and_concat(positive_dataset: pd.DataFrame, negative_dataset: pd.DataFrame) -> pd.DataFrame:
  positive_dataset['target'] = pd.Series(np.ones(positive_dataset.shape[0]), index=positive_dataset.index)
  negative_dataset['target'] = pd.Series(np.zeros(negative_dataset.shape[0]), index=negative_dataset.index)

  dataset = pd.concat([positive_dataset, negative_dataset])
  return dataset

In [38]:
train_data = add_targets_and_concat(train_user_item_interact, train_user_item_interact_negative)
val_data = add_targets_and_concat(val_user_item_interact, val_user_item_interact_negative)

In [39]:
# train_data.shape

In [40]:
# val_data.shape

In [41]:
# test_data.shape

## add target to test_data

In [42]:
test_user_item_interact['target'] = pd.Series(np.ones(test_user_item_interact.shape[0]), index=test_user_item_interact.index)
test_data = test_user_item_interact

# Training

In [43]:
from sklearn.model_selection import ParameterGrid

In [44]:
!pip install catboost



In [45]:
from catboost import Pool, CatBoostClassifier

In [46]:
X_train = train_data.drop(['target', 'user_id',	'item_id'], axis=1) #.astype('float')
y_train = train_data['target'] #.astype('float')

X_val = val_data.drop(['target', 'user_id',	'item_id'], axis=1) #.astype('float')
y_val = val_data['target'] #.astype('float')

In [47]:
# from sklearn.metrics import roc_auc_score

In [48]:
grid = {'learning_rate': [1], 'depth': [2, 4, 6]}
# grid = {'learning_rate': [1], 'depth': [6]}

train_dataset = Pool(data=X_train,
                     label=y_train,
                     cat_features=[X_train.shape[1]-1])

eval_dataset = Pool(data=X_val,
                     label=y_val,
                     cat_features=[X_val.shape[1]-1])

best_roc_auc = 0.5

for params in ParameterGrid(grid):
  model = CatBoostClassifier(random_seed=7, eval_metric='AUC')

  model.set_params(**params)
  # Fit model

  model.fit(train_dataset,
            use_best_model=True,
            eval_set=eval_dataset)
  
  if model.get_best_score()['validation']['AUC'] > best_roc_auc:
    best_roc_auc = model.get_best_score()['validation']['AUC']
    best_params = model.get_params()
    best_iteration = model.get_best_iteration()
    best_model = model

# preds_proba = model.predict_proba(eval_dataset)
# print("best_roc_auc =", best_roc_auc)
# import scikitplot as skplt
# import matplotlib.pyplot as plt

# y_true = y_val
# y_probas = preds_proba# predicted probabilities generated by sklearn classifier
# skplt.metrics.plot_roc_curve(y_true, y_probas)
# plt.show()

# model = CatBoostClassifier(iterations=best_iteration, random_seed=0, eval_metric='AUC')
# model.set_params(**best_params)
# model.fit(train_dataset,
#             use_best_model=True,
#             eval_set=eval_dataset)

  # # Get predicted classes
  # preds_class = model.predict(eval_dataset)

  # # Get predicted probabilities for each class
  # preds_proba = model.predict_proba(eval_dataset)

0:	test: 0.7437352	best: 0.7437352 (0)	total: 341ms	remaining: 5m 41s
1:	test: 0.7587243	best: 0.7587243 (1)	total: 550ms	remaining: 4m 34s
2:	test: 0.7689221	best: 0.7689221 (2)	total: 755ms	remaining: 4m 10s
3:	test: 0.7703463	best: 0.7703463 (3)	total: 949ms	remaining: 3m 56s
4:	test: 0.7721307	best: 0.7721307 (4)	total: 1.15s	remaining: 3m 48s
5:	test: 0.7760410	best: 0.7760410 (5)	total: 1.4s	remaining: 3m 51s
6:	test: 0.7768544	best: 0.7768544 (6)	total: 1.59s	remaining: 3m 45s
7:	test: 0.7776841	best: 0.7776841 (7)	total: 1.85s	remaining: 3m 49s
8:	test: 0.7778940	best: 0.7778940 (8)	total: 2.07s	remaining: 3m 48s
9:	test: 0.7779873	best: 0.7779873 (9)	total: 2.31s	remaining: 3m 49s
10:	test: 0.7784392	best: 0.7784392 (10)	total: 2.51s	remaining: 3m 45s
11:	test: 0.7786580	best: 0.7786580 (11)	total: 2.73s	remaining: 3m 44s
12:	test: 0.7804163	best: 0.7804163 (12)	total: 2.97s	remaining: 3m 45s
13:	test: 0.7806907	best: 0.7806907 (13)	total: 3.17s	remaining: 3m 43s
14:	test: 0.7

# Full test data

In [49]:
def all_items_to_users(test_users: pd.DataFrame, all_items) -> pd.DataFrame:

  tmp = []
  for user in tqdm(test_users):
    for item in all_items:
      tmp.append((user, item))

  all_items_to_user = pd.DataFrame(tmp, columns=['row', 'col'])

  return all_items_to_user

In [50]:
test_users = test_data.groupby('user_id').count().reset_index()['user_id']

In [51]:
all_items_to_user = all_items_to_users(test_users, all_items)

100%|██████████| 696/696 [00:04<00:00, 140.62it/s]


In [52]:
# all_items_to_user #.rename(columns={'user_id': 'row', 'item_id': 'col'})

In [53]:
####

In [54]:
test_user_item_interact = get_full_dataset_from_interactions(all_items_to_user, user_features, item_features)

In [55]:
test_user_item_interact

Unnamed: 0,user_id,item_id,col_region,data_region,col_age,data_age,data_asset,data_price,col_subclass
0,39,0,7,1.0,4,1.0,0.009497,0.012911,679
1,78,0,6,1.0,6,1.0,0.009497,0.012911,679
2,159,0,6,1.0,5,1.0,0.009497,0.012911,679
3,184,0,6,1.0,5,1.0,0.009497,0.012911,679
4,277,0,6,1.0,10,1.0,0.009497,0.012911,679
...,...,...,...,...,...,...,...,...,...
12867643,30398,18494,7,1.0,2,1.0,0.007882,0.009859,1207
12867644,30582,18494,5,1.0,4,1.0,0.007882,0.009859,1207
12867645,30616,18494,6,1.0,3,1.0,0.007882,0.009859,1207
12867646,30702,18494,4,1.0,3,1.0,0.007882,0.009859,1207


# Predict test dataset

In [56]:
X_test = test_user_item_interact.drop(['user_id',	'item_id'], axis=1) #.astype('float')
test_dataset = Pool(data=X_test,
                     cat_features=[X_test.shape[1]-1])

preds_proba = model.predict_proba(test_dataset)

In [57]:
preds_proba[:, 1]

array([0.06422251, 0.07725609, 0.07479064, ..., 0.08472231, 0.0282197 ,
       0.04121314])

In [58]:
test_user_item_interact['probs'] = pd.Series(preds_proba[:,1], index=test_user_item_interact.index)
test_user_item_interact

Unnamed: 0,user_id,item_id,col_region,data_region,col_age,data_age,data_asset,data_price,col_subclass,probs
0,39,0,7,1.0,4,1.0,0.009497,0.012911,679,0.064223
1,78,0,6,1.0,6,1.0,0.009497,0.012911,679,0.077256
2,159,0,6,1.0,5,1.0,0.009497,0.012911,679,0.074791
3,184,0,6,1.0,5,1.0,0.009497,0.012911,679,0.074791
4,277,0,6,1.0,10,1.0,0.009497,0.012911,679,0.038322
...,...,...,...,...,...,...,...,...,...,...
12867643,30398,18494,7,1.0,2,1.0,0.007882,0.009859,1207,0.111089
12867644,30582,18494,5,1.0,4,1.0,0.007882,0.009859,1207,0.057694
12867645,30616,18494,6,1.0,3,1.0,0.007882,0.009859,1207,0.084722
12867646,30702,18494,4,1.0,3,1.0,0.007882,0.009859,1207,0.028220


In [59]:
tmp = []

sorted_test_user_item_interact = test_user_item_interact.sort_values(by=['user_id', 'probs'])
len_all_items = len(all_items)

for user_number in tqdm(range(len(test_users))):
    start_idx = (user_number+1)*len_all_items-10
    end_idx = (user_number+1)*len_all_items
    user_id = sorted_test_user_item_interact.iloc[start_idx:end_idx][['user_id', 'item_id']].values[:,0]
    item_id = sorted_test_user_item_interact.iloc[start_idx:end_idx][['user_id', 'item_id']].values[:,1]

    tmp.extend(tuple(zip(user_id, item_id)))

top_items_to_user = pd.DataFrame(tmp, columns=['row', 'col'])

100%|██████████| 696/696 [00:01<00:00, 650.91it/s]


In [60]:
top_items_to_user

Unnamed: 0,row,col
0,39,1664
1,39,12469
2,39,8982
3,39,15892
4,39,5113
...,...,...
6955,30859,4124
6956,30859,3922
6957,30859,2310
6958,30859,10116


In [61]:
test = top_items_to_user.merge(test_user_item_interact, left_on=['row', 'col'], right_on=['user_id', 'item_id'])

In [62]:
test = test.sort_values(by=['user_id', 'probs'], ascending=False)

In [63]:
test

Unnamed: 0,row,col,user_id,item_id,col_region,data_region,col_age,data_age,data_asset,data_price,col_subclass,probs
6959,30859,592,30859,592,4,1.0,4,1.0,0.001425,0.000845,1108,0.999953
6958,30859,10116,30859,10116,4,1.0,4,1.0,0.007312,0.006667,8,0.994276
6957,30859,2310,30859,2310,4,1.0,4,1.0,0.002374,0.002394,809,0.988660
6956,30859,3922,30859,3922,4,1.0,4,1.0,0.003229,0.002723,258,0.987390
6955,30859,4124,30859,4124,4,1.0,4,1.0,0.007075,0.006291,142,0.986975
...,...,...,...,...,...,...,...,...,...,...,...,...
4,39,5113,39,5113,7,1.0,4,1.0,0.004368,0.004601,805,0.982286
3,39,15892,39,15892,7,1.0,4,1.0,0.005223,0.007277,497,0.981964
2,39,8982,39,8982,7,1.0,4,1.0,0.001425,0.001502,1763,0.981208
1,39,12469,39,12469,7,1.0,4,1.0,0.005128,0.005305,788,0.981165


In [64]:
d_pred = defaultdict(list)
for row in test[['user_id', 'item_id']].values:
    d_pred[row[0]].append(row[1])

In [65]:
d_pred[43]

[]

In [66]:
d_test = defaultdict(list)
for row in test_data[['user_id', 'item_id']].values:
    d_test[row[0]].append(row[1])

In [67]:
d_test.values()

dict_values([[256, 5118, 5710, 5725, 6759, 7595, 8481, 8570, 8666, 10227, 10466, 12355, 13825], [256, 11294, 3569, 11640, 3128, 3564, 3833, 3879, 4024, 5120, 5128, 5166, 7770, 7775, 9947, 11379, 11941, 12670, 14353, 16339, 16356], [256, 3565, 4657, 7330, 4967, 7354, 7174, 7662, 4057, 15688, 18098, 161, 9036, 151, 155, 162, 2290, 2783, 4855, 5372, 5680, 6280, 6389, 10497, 14040, 15617], [256, 10227, 10230, 8668, 7462, 144, 15223, 6273, 4278, 9190, 15230, 17135], [256, 11191, 10067, 16211, 10068, 10003, 15706, 16215, 2612, 3368, 5022, 9347, 16176, 16179, 16213], [256, 9302, 10045, 5700, 7493, 1648, 13842, 7495, 17988, 17989, 7581, 4720, 4314, 4220, 6132, 13082, 11140, 5568, 10031, 13210, 6281, 890, 2378, 2422, 2423, 2425, 2427, 2444, 2728, 2850, 6483, 9196, 17950], [256, 16689, 1023, 187, 8347, 10916, 7215, 3805, 5196, 431, 6391, 4687, 5508, 7223, 131, 4179, 9053, 6526, 6948, 5135, 18260, 16421, 9143, 1944, 7241, 3636, 5179, 5560, 3995, 16298, 17333, 4486, 12980, 6981, 9049, 171, 4496, 4

# MAP@10

In [68]:
!pip install ml_metrics



In [69]:
import ml_metrics

In [70]:
ml_metrics.mapk(d_test.values(), d_pred.values(), 10)

0.007903106565103691

In [71]:
# 0.011075688097577498

In [72]:
# y = [[1,2,3,4,5,6,7,8,9,10]]
# y_prd = [[0,0,0,0,0,0,0,0,10,0]]

In [73]:
# ml_metrics.mapk(y, y_prd, 10)