In [22]:
import pandas as pd
import dask.dataframe as dd

In [2]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from scipy.sparse import csr_matrix

from implicit import als

from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier

import warnings

warnings.filterwarnings("ignore")

### Подготовка данных:

In [67]:
data_train = dd.read_csv('data_train.csv')
data_train = data_train.drop(columns='Unnamed: 0')
data_test = dd.read_csv('data_test.csv')
data_test = data_test.drop(columns='Unnamed: 0')

In [52]:
data_train.head(5)

Unnamed: 0,id,vas_id,buy_time,target
0,540968,8.0,1537131600,0.0
1,1454121,4.0,1531688400,0.0
2,2458816,1.0,1534107600,0.0
3,3535012,5.0,1535922000,0.0
4,1693214,1.0,1535922000,0.0


In [57]:
data_train.shape[0].compute()

831653

In [54]:
train_id = list(data_train['id'].unique())

In [55]:
data_test.head(5)

Unnamed: 0,id,vas_id,buy_time
0,3130519,2.0,1548018000
1,2000860,4.0,1548018000
2,1099444,2.0,1546808400
3,1343255,5.0,1547413200
4,1277040,2.0,1546808400


In [58]:
data_test.shape[0].compute()

71231

In [59]:
test_id = list(data_test['id'].unique())

In [60]:
features = dd.read_csv('features.csv', sep='\t')
features = features.drop(columns='Unnamed: 0')

In [61]:
features.head(5)

Unnamed: 0,id,buy_time,0,1,2,3,4,5,6,7,...,243,244,245,246,247,248,249,250,251,252
0,2013026,1531688400,18.910029,46.980888,4.969214,-1.386798,3.791754,-14.01179,-16.08618,-65.076097,...,-977.373846,-613.770792,-25.996269,-37.630448,-301.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0
1,2014722,1539550800,36.690029,152.400888,448.069214,563.833202,463.841754,568.99821,-16.08618,-53.216097,...,-891.373846,-544.770792,-20.996269,48.369552,80.252276,-13.832889,-0.694428,-1.175933,-0.45614,0.0
2,2015199,1545598800,-67.019971,157.050888,-63.180786,178.103202,-68.598246,156.99821,3.51382,25.183903,...,-977.373846,-613.770792,-12.996269,-37.630448,10829.252276,-25.832889,-0.694428,-12.175933,-0.45614,0.0
3,2021765,1534107600,7.010029,150.200888,-6.930786,216.213202,76.621754,351.84821,-16.08618,-65.076097,...,-973.373846,-613.770792,-23.996269,-37.630448,-205.747724,-24.832889,-0.694428,-11.175933,-0.45614,1.0
4,2027465,1533502800,-90.439971,134.220888,-104.380786,153.643202,-109.798246,132.53821,-16.08618,-65.076097,...,1643.626154,2007.229208,206.003731,-21.630448,6667.252276,92.167111,-0.694428,49.824067,47.54386,0.0


In [62]:
train_features = features[features['id'].isin(train_id)]

In [63]:
train_features.head(5)

Unnamed: 0,id,buy_time,0,1,2,3,4,5,6,7,...,243,244,245,246,247,248,249,250,251,252
13,2046132,1534712400,300.820029,1599.480888,286.879214,1585.013202,281.461754,1563.90821,-16.08618,654.013903,...,-977.373846,-613.770792,-25.996269,-35.630448,-295.747724,-17.832889,-0.694428,-4.175933,-0.45614,0.0
16,2050810,1540760400,-86.209971,91.820888,-84.480786,110.333202,-89.898246,89.22821,-16.08618,-65.076097,...,-977.373846,-613.770792,-23.996269,190.369552,-286.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0
19,2070757,1540760400,-96.799971,-408.179112,-110.740786,-460.786798,-114.038246,-479.77179,-16.08618,-65.076097,...,-925.373846,-561.770792,-21.996269,-37.630448,-151.747724,-24.832889,0.305572,-12.175933,-0.45614,1.0
20,2071522,1544994000,-94.939971,-363.699112,-108.880786,-411.226798,-114.298246,-432.33179,-16.08618,-65.076097,...,-977.373846,-613.770792,-25.996269,-37.630448,-306.747724,-25.832889,-0.694428,-12.175933,-0.45614,0.0
22,2075318,1533502800,-75.639971,669.690888,-89.580786,732.343202,-94.998246,736.65821,-16.08618,782.383903,...,-501.373846,-242.770792,-25.996269,-37.630448,-167.747724,-14.832889,2.305572,-4.175933,-0.45614,0.0


In [68]:
test_features = features[features['id'].isin(test_id)]

In [69]:
features = features.merge(train_features, how='left')

In [70]:
data_train.head(5)

KeyboardInterrupt: 

In [8]:
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [9]:
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(data_train_lvl_1, 5000)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


In [10]:
recommender = MainRecommender(data_train_lvl_1)



HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))




In [11]:
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']

train_users = data_train_lvl_1['user_id'].unique()

result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1 = result_lvl_1.query('user_id in @train_users')
result_lvl_1.columns=['user_id', 'actual']
result_lvl_1.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


#### Результат работы модели первого уровня (лучший из полученых мною результатов на Kaggle, в функции fit_own_recommender использовалось значение K=3, вес в MainRecommender - tfidf_weight):

In [12]:
result_lvl_1['own_rec'] = [recommender.get_own_recommendations(i, N=5) for i in list(result_lvl_1['user_id'])]

result_lvl_1.apply(lambda x: recall_at_k(x['own_rec'], x['actual'], 5), axis=1).mean() * 100

3.699310170377275

In [14]:
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

In [15]:
data_train_lvl_2.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2104867,2070,40618492260,594,1019940,1,1.0,311,-0.29,40,86,0.0,0.0
2107468,2021,40618753059,594,840361,1,0.99,443,0.0,101,86,0.0,0.0


In [16]:
n_items_before = data_train_lvl_2['item_id'].nunique()

data_train_lvl_2 = prefilter_items(data_train_lvl_2)

n_items_after = data_train_lvl_2['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 27649 to 5001


In [17]:
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']

train_users = data_train_lvl_1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=5))

In [18]:
users_lvl_2.head(2)

Unnamed: 0,user_id,candidates
0,2021,"[1082185.0, 6534178.0, 981760.0, 951590.0, 102..."
1,1753,"[1082185.0, 6534178.0, 1029743.0, 1106523.0, 1..."


In [19]:
s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
users_lvl_2['flag'] = 1

users_lvl_2.head(4)

Unnamed: 0,user_id,item_id,flag
0,2021,1082185.0,1
0,2021,6534178.0,1
0,2021,981760.0,1
0,2021,951590.0,1


## Генерация признаков:

In [20]:
# генерация признаков:
# для пользователя:

# общая сумма покупок за период покупателя:
for i in user_features["user_id"]:
    receipts = list()
    max_quantity = 0
    most_popular = 0
    
    user_features.loc[user_features['user_id'] == i, 'total_sum'] = data_train_lvl_2[data_train_lvl_2['user_id'] == i]['sales_value'].sum()
    
    # средний чек:
    for j in data_train_lvl_2[data_train_lvl_2['user_id'] == i]['basket_id'].unique():
        receipts.append(data_train_lvl_2[data_train_lvl_2['user_id'] == i][data_train_lvl_2['basket_id'] == j]['sales_value'].sum())
    if len(receipts) != 0:
        user_features.loc[user_features['user_id'] == i, 'average_receipt'] = sum(receipts)/len(receipts)
    else: user_features.loc[user_features['user_id'] == i, 'average_receipt'] = 0
    
    # самый популярный магазин:
    for j in data_train_lvl_2[data_train_lvl_2['user_id'] == i]['store_id'].unique():
        new_m = len(data_train_lvl_2[(data_train_lvl_2['user_id'] == i)&(data_train_lvl_2['store_id'] == j)]['basket_id'].unique())
        if max_quantity < new_m:
            max_max_quantity = new_m
            most_popular = j
    user_features.loc[user_features['user_id'] == i, 'most_popular_store'] = j 

In [21]:
# генерация признаков:
# для товаров:

for i in item_features["commodity_desc"].unique():
    
    # средняя цена в категории:
    items_list = np.array(item_features[item_features["commodity_desc"] == i]["item_id"])
    item_features.loc[item_features["commodity_desc"] == i, 'average_cat_value'] = data_train_lvl_2[(data_train_lvl_2['item_id'].isin(items_list))]['sales_value'].sum()/len(items_list)
    
for j in item_features["item_id"].unique():
    
    # покупок в неделю:
    weeks = np.array(data_train_lvl_2[(data_train_lvl_2['item_id'] == j)]['week_no'].unique())
    item_features.loc[item_features['item_id'] == j, 'average_weekly_quantity'] = data_train_lvl_2[(data_train_lvl_2['item_id'] == j)]['quantity'].sum()/len(weeks)
        
    # соотношение цены товара к средней в категории:
    item_features.loc[item_features['item_id'] == j, 'worth'] = data_train_lvl_2[data_train_lvl_2['item_id'] == j]['sales_value'].mean()/item_features[item_features['item_id'] == j]['average_cat_value']

In [22]:
user_items_features = data_train_lvl_2[['user_id', 'item_id', 'quantity']].copy()
user_items_features = user_items_features.merge(item_features[['item_id', 'commodity_desc', 'average_weekly_quantity']], on=['item_id'], how='right')

user_items_features.head(2)

Unnamed: 0,user_id,item_id,quantity,commodity_desc,average_weekly_quantity
0,2021.0,840361.0,1.0,EGGS,78.666667
1,950.0,840361.0,1.0,EGGS,78.666667


In [23]:
# признаки для отношений пользователь-товар слишком долго считались, поэтому решила ввести коэффициенты популярности товаров

# коэффициент популярности среди покупателей для каждой категории
for j in list(user_items_features['commodity_desc'].unique()):
    popularity = len(user_items_features[user_items_features['commodity_desc'] == j]['user_id'].unique())
    if popularity < 100:
        user_items_features.loc[user_items_features['commodity_desc'] == j, 'is_popular'] = 0.8
    elif popularity < 600:
        user_items_features.loc[user_items_features['commodity_desc'] == j, 'is_popular'] = 0.6
    elif popularity < 1000:
        user_items_features.loc[user_items_features['commodity_desc'] == j, 'is_popular'] = 0.4
    else:
        user_items_features.loc[user_items_features['commodity_desc'] == j, 'is_popular'] = 0.2
        
# насколько популярен товар для пользователя, учитывая его популярность среди всех пользователей
for i in list(user_items_features['user_id'].unique()):
    for j in list(user_items_features['is_popular'].unique()):
        user_items_features.loc[(user_items_features['user_id'] == i) & (user_items_features['is_popular'] == j), 'is_favorite_cat'] = user_items_features[(user_items_features['user_id'] == i) & (user_items_features['is_popular'] == j)]['quantity'].sum() * user_items_features['is_popular']

In [24]:
user_items_features = user_items_features.drop(['quantity', 'commodity_desc', 'average_weekly_quantity'], axis=1)

In [25]:
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  # тут только покупки 

targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)
targets_lvl_2.drop('flag', axis=1, inplace=True)

t_lvl2_users = targets_lvl_2['user_id']
t_lvl2_items = targets_lvl_2['item_id']
t_lvl2_targets = targets_lvl_2['target']

In [26]:
targets_lvl_2 = targets_lvl_2[['target', 'user_id','item_id']]

In [27]:
targets_lvl_2

Unnamed: 0,target,user_id,item_id
0,0.0,2021,1082185.0
1,0.0,2021,6534178.0
2,0.0,2021,981760.0
3,1.0,2021,951590.0
4,1.0,2021,951590.0
...,...,...,...
13805,0.0,1697,1082185.0
13806,0.0,1697,6534178.0
13807,0.0,1697,1029743.0
13808,0.0,1697,995242.0


In [28]:
targets_lvl_2 = targets_lvl_2.merge(item_features, on=['item_id'], how='left')

In [29]:
targets_lvl_2.head(2)

Unnamed: 0,target,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,average_cat_value,average_weekly_quantity,worth
0,0.0,2021,1082185.0,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,30.856627,313.166667,0.03467
1,0.0,2021,6534178.0,69,KIOSK-GAS,Private,COUPON/MISC ITEMS,GASOLINE-REG UNLEADED,,16.178906,,


In [30]:
targets_lvl_2 = targets_lvl_2.merge(user_features, on=['user_id'], how='left')

In [31]:
targets_lvl_2 = targets_lvl_2.merge(user_items_features, on=['user_id', 'item_id'], how='left')

### Обучение модели второго уровня:

In [32]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2[['target']]

In [33]:
X_train.head(2)

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,average_cat_value,average_weekly_quantity,...,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,total_sum,average_receipt,most_popular_store,is_popular,is_favorite_cat
0,2021,1082185.0,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,30.856627,313.166667,...,,,,,,,,,,
1,2021,6534178.0,69,KIOSK-GAS,Private,COUPON/MISC ITEMS,GASOLINE-REG UNLEADED,,16.178906,,...,,,,,,,,,,


In [34]:
# категоризация признаков (для работы модели)

digital_features = ['manufacturer', 'average_cat_value', 'average_weekly_quantity', 'worth', 'total_sum', 'average_receipt', 'most_popular_store', 'is_popular','is_favorite_cat']
for i in digital_features:
    a = X_train[i].min()
    b = X_train[i].mean()
    c = X_train[i].max()
    # print(a, (b-a)/2, b, (c-b)/2, c)
    X_train[i] = pd.cut(X_train[i], bins=[a, a+(b-a)/2, b, b+(c-b)/2, c], labels=False)
    
not_digital_features = ['department', 'brand', 'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product', 'average_weekly_quantity', 'age_desc',
                        'marital_status_code', 'income_desc', 'homeowner_desc', 'hh_comp_desc', 'household_size_desc', 'kid_category_desc']

for i in not_digital_features:
    cat_numbers = list(c for c in range(len(list(X_train[i].unique()))))
    for j,n in zip(list(X_train[i].unique()), cat_numbers):
        X_train.loc[X_train[i] == j, i] = n
        
X_train.fillna(99999, inplace= True)

In [35]:
X_train.head(2)

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,average_cat_value,average_weekly_quantity,...,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,total_sum,average_receipt,most_popular_store,is_popular,is_favorite_cat
0,2021,1082185.0,99999.0,0,0,0,0,0,2.0,3.0,...,99999,99999,99999,99999,99999,99999.0,99999.0,99999.0,99999.0,99999.0
1,2021,6534178.0,0.0,1,1,1,1,1,1.0,99999.0,...,99999,99999,99999,99999,99999,99999.0,99999.0,99999.0,99999.0,99999.0


In [36]:
# модель - MLPClassifier из sklearn.neural_network

mlp = MLPClassifier(random_state=1, solver='sgd', learning_rate_init=0.000001, max_iter=500).fit(X_train, y_train)
train_preds = mlp.predict(X_train)

In [37]:
user_val_lvl_2 = pd.DataFrame(data_val_lvl_2['user_id'].unique())
user_val_lvl_2.columns = ['user_id']

train_users = data_train_lvl_1['user_id'].unique()
user_val_lvl_2 = user_val_lvl_2[user_val_lvl_2['user_id'].isin(train_users)]

user_val_lvl_2['candidates'] = user_val_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=5))

In [38]:
s = user_val_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

user_val_lvl_2 = user_val_lvl_2.drop('candidates', axis=1).join(s)

user_val_lvl_2.head(4)

Unnamed: 0,user_id,item_id
0,338,1082185.0
0,338,6534178.0
0,338,1029743.0
0,338,995242.0


### Обучение модели на валидационной выборке:

In [39]:
preds_2 = user_val_lvl_2.merge(item_features, on='item_id', how='left')
preds_2 = preds_2.merge(user_features, on='user_id', how='left')
preds_2 = preds_2.merge(user_items_features, on=['user_id', 'item_id'], how='left')

preds_2.head(2)

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,average_cat_value,average_weekly_quantity,...,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,total_sum,average_receipt,most_popular_store,is_popular,is_favorite_cat
0,338,1082185.0,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,30.856627,313.166667,...,,,,,,,,,,
1,338,6534178.0,69,KIOSK-GAS,Private,COUPON/MISC ITEMS,GASOLINE-REG UNLEADED,,16.178906,,...,,,,,,,,,,


In [40]:
# категоризация валидационной выборки:

digital_features = ['manufacturer', 'average_cat_value', 'average_weekly_quantity', 'worth', 'total_sum', 'average_receipt', 'most_popular_store', 'is_popular','is_favorite_cat']

for i in digital_features:
    a = preds_2[i].min()
    b = preds_2[i].mean()
    c = preds_2[i].max()
    # print(a, (b-a)/2, b, (c-b)/2, c)
    preds_2[i] = pd.cut(X_train[i], bins=[a, a+(b-a)/2, b, b+(c-b)/2, c], labels=False)
    
not_digital_features = ['department', 'brand', 'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product', 'average_weekly_quantity', 'age_desc',
                        'marital_status_code', 'income_desc', 'homeowner_desc', 'hh_comp_desc', 'household_size_desc', 'kid_category_desc']

for i in not_digital_features:
    cat_numbers = list(c for c in range(len(list(preds_2[i].unique()))))
    for j,n in zip(list(preds_2[i].unique()), cat_numbers):
        preds_2.loc[preds_2[i] == j, i] = n
        
preds_2.fillna(99999, inplace= True)

In [41]:
features = digital_features + not_digital_features

In [46]:
X_val = preds_2

val_preds = mlp.predict_proba(X_val[features])[:,1]

In [47]:
preds_2['proba'] = val_preds

recomendations = preds_2[['user_id', 'item_id', 'proba']]

In [48]:
mlp_recs = pd.DataFrame(recomendations.sort_values(['user_id', 'proba'], ascending=False).groupby('user_id')\
             .apply(lambda x: x['item_id'].iloc[:5].values)).rename(columns={0: 'mlp_recs'})

mlp_recs_test = mlp_recs.merge(
    pd.DataFrame(
        data_val_lvl_2.groupby('user_id')['item_id'].unique()).rename(columns={'item_id': 'actual'}).reset_index(), how='left',on='user_id')

mlp_recs_test.head(5)

Unnamed: 0,user_id,mlp_recs,actual
0,1,"[995242.0, 995242.0, 1029743.0, 840361.0, 1082...","[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[1082185.0, 6534178.0, 1029743.0, 1106523.0, 9...","[835476, 851057, 872021, 878302, 879948, 90963..."
2,6,"[1082185.0, 1082185.0, 6534178.0, 1029743.0, 9...","[920308, 926804, 946489, 1006718, 1017061, 107..."
3,7,"[1082185.0, 1082185.0, 6534178.0, 1029743.0, 1...","[840386, 889774, 898068, 909714, 929067, 95347..."
4,8,"[1082185.0, 1082185.0, 6534178.0, 1029743.0, 1...","[835098, 872137, 910439, 924610, 992977, 10412..."


### Результаты работы модели второго уровня:

In [49]:
# приблизительная оценка работы модели (на неё ориентировалась, подбирая параметры):

mlp_recs_test.apply(lambda x: recall_at_k(x['mlp_recs'], x['actual']), axis=1).mean() * 100

3.6566656979547476

In [50]:
mlp_recs.rename(columns={'user_id': 'user_id'})
mlp_recs = mlp_recs.reset_index()

mlp_recs

Unnamed: 0,user_id,mlp_recs
0,1,"[995242.0, 995242.0, 1029743.0, 840361.0, 1082..."
1,3,"[1082185.0, 6534178.0, 1029743.0, 1106523.0, 9..."
2,6,"[1082185.0, 1082185.0, 6534178.0, 1029743.0, 9..."
3,7,"[1082185.0, 1082185.0, 6534178.0, 1029743.0, 1..."
4,8,"[1082185.0, 1082185.0, 6534178.0, 1029743.0, 1..."
...,...,...
2036,2496,"[1082185.0, 6534178.0, 981760.0, 981760.0, 108..."
2037,2497,"[1082185.0, 6534178.0, 1029743.0, 995242.0, 86..."
2038,2498,"[1082185.0, 6534178.0, 1029743.0, 1106523.0, 1..."
2039,2499,"[1070820.0, 1070820.0, 1082185.0, 6534178.0, 1..."


In [53]:
data_test = pd.read_csv('test_users.csv')

# data_test

In [54]:
for i in list(data_test['user_id'].unique()):
    if bool(len(mlp_recs[mlp_recs['user_id'] == i])):
        pred_value = str(tuple(mlp_recs[mlp_recs['user_id'] == i]['mlp_recs'])[0])
        data_test.loc[data_test['user_id'] == i, 'preds'] = pred_value[1:len(pred_value)-1].replace('.','')
    else: 
        data_test.loc[data_test['user_id'] == i, 'preds'] = '1082185 981760 995242 1029743 840361'

data_test = data_test.rename(columns={'user_id': 'UserId', 'preds': 'Predicted'})
data_test

Unnamed: 0,UserId,Predicted
0,1,995242 995242 1029743 840361 1082185
1,2,1082185 981760 995242 1029743 840361
2,3,1082185 6534178 1029743 1106523 951590
3,6,1082185 1082185 6534178 1029743 995242
4,7,1082185 1082185 6534178 1029743 1106523
...,...,...
1703,2494,1082185 6534178 1029743 840361 1127831
1704,2496,1082185 6534178 981760 981760 1082185
1705,2498,1082185 6534178 1029743 1106523 1106523
1706,2499,1070820 1070820 1082185 6534178 1029743


In [55]:
data_test.to_csv('submission_2StepModel.csv', index=False)

##### Подбор наиболее значимых признаков (не улучшил результат)

In [57]:
X_val = preds_2

for feature in features:
    new_features = features.copy()
    new_features.remove(feature)
    mlp = MLPClassifier(random_state=1, solver='sgd', learning_rate_init=0.000001, max_iter=50).fit(X_train[new_features], y_train)
    val_preds = mlp.predict_proba(X_val[new_features])[:,1]
    preds_2['proba_' + feature] = val_preds

In [66]:
best_features = []
for feature in features:
    recomendations = preds_2[['user_id', 'item_id', 'proba_'+feature]]
    mlp_recs = pd.DataFrame(recomendations.sort_values(['user_id', 'proba_'+feature], ascending=False).groupby('user_id')\
             .apply(lambda x: x['item_id'].iloc[:5].values)).rename(columns={0: 'mlp_recs'})
    mlp_recs_test = mlp_recs.merge(
    pd.DataFrame(
        data_val_lvl_2.groupby('user_id')['item_id'].unique()).rename(columns={'item_id': 'actual'}).reset_index(), how='left',on='user_id')
    # print('excluding', feature, mlp_recs_test.apply(lambda x: recall_at_k(x['mlp_recs'], x['actual']), axis=1).mean() * 100 - 3.6445371300449025)
    c = mlp_recs_test.apply(lambda x: recall_at_k(x['mlp_recs'], x['actual']), axis=1).mean() * 100 - 3.6445371300449025
    if (c < 0):
        print(feature, 'полезный признак')
        best_features.append(feature)
    else:
        print(feature, 'не очень полезный признак', c)

manufacturer не очень полезный признак 0.00801087455717564
average_cat_value полезный признак
average_weekly_quantity полезный признак
worth не очень полезный признак 0.00013265071824708485
total_sum полезный признак
average_receipt полезный признак
most_popular_store полезный признак
is_popular полезный признак
is_favorite_cat не очень полезный признак 0.012533794700746892
department полезный признак
brand не очень полезный признак 0.006581973733361668
commodity_desc полезный признак
sub_commodity_desc не очень полезный признак 0.006074636197885397
curr_size_of_product не очень полезный признак 0.014859589044072141
average_weekly_quantity полезный признак
age_desc полезный признак
marital_status_code не очень полезный признак 0.0008776390398539391
income_desc полезный признак
homeowner_desc не очень полезный признак 0.009033497158346115
hh_comp_desc не очень полезный признак 0.0006508735966161971
household_size_desc полезный признак
kid_category_desc полезный признак


In [67]:
best_features = best_features + ['commodity_desc', 'sub_commodity_desc']

mlp = MLPClassifier(random_state=1, solver='sgd', learning_rate_init=0.000001, max_iter=50).fit(X_train[best_features], y_train)
val_preds = mlp.predict_proba(X_val[best_features])[:,1]
preds_2['proba'] = val_preds

In [68]:
recomendations = preds_2[['user_id', 'item_id', 'proba']]
mlp_recs = pd.DataFrame(recomendations.sort_values(['user_id', 'proba'], ascending=False).groupby('user_id')\
                        .apply(lambda x: x['item_id'].iloc[:5].values)).rename(columns={0: 'mlp_recs'})
mlp_recs_test = mlp_recs.merge(
    pd.DataFrame(
        data_val_lvl_2.groupby('user_id')['item_id'].unique()).rename(columns={'item_id': 'actual'}).reset_index(), how='left',on='user_id')

mlp_recs_test.apply(lambda x: recall_at_k(x['mlp_recs'], x['actual']), axis=1).mean() * 100

3.6447933206133016