In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als
from sklearn.model_selection import train_test_split
# Модель второго уровня
from lightgbm import LGBMClassifier
import lightgbm  as lgb 
import catboost as catb

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items, postfilter_items, popularity_recommendation, perpare_lvl2_1, perpare_lvl2, category_to_digit
from src.recommenders import MainRecommender
from tqdm import tqdm

tqdm.pandas()
# from random import random

In [2]:
# Purchase transactions.
data = pd.read_csv('../retail_train.csv')

# Item and household attribute tables: lower-case the column names and rename
# the key columns to the 'item_id' / 'user_id' names used by the transactions.
item_features = (
    pd.read_csv('../product.csv')
    .rename(columns=str.lower)
    .rename(columns={'product_id': 'item_id'})
)
user_features = (
    pd.read_csv('../hh_demographic.csv')
    .rename(columns=str.lower)
    .rename(columns={'household_key': 'user_id'})
)

In [3]:
# Notebook-wide settings.
# Number of first-level candidates per user — presumably the N argument of perpare_lvl2_1; TODO confirm.
N = 150 
# Passed as N to get_items inside get_final_recomendations.
final_predict_count = 30 
# Passed as N to postfilter_items for the final per-user recommendation list.
val_count = 5 
# NOTE(review): not referenced in the visible cells; presumably the item cut-off used by prefilter_items — confirm.
top_items_count = 5000 

In [4]:
data.head(3)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0


In [5]:
item_features.head(3)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,


In [6]:
user_features.head(3)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7
2,25-34,U,25-34K,Unknown,2 Adults Kids,3,1,8


In [7]:
# Training / validation scheme:
# -- older purchases -- | -- 6 weeks -- | -- 3 weeks --
# The size of the second dataset (6 weeks) can be tuned with a learning curve
# (recall@k as a function of dataset size).

val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

week_no = data['week_no']
last_week = week_no.max()
lvl_2_first_week = last_week - val_lvl_2_size_weeks
lvl_1_first_week = last_week - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)

data_train_lvl_1 = data[week_no < lvl_1_first_week]
data_val_lvl_1 = data[(week_no >= lvl_1_first_week) & (week_no < lvl_2_first_week)]

# Separate copy for clarity: it is modified later and will diverge from data_val_lvl_1.
data_train_lvl_2 = data_val_lvl_1.copy()
# NOTE(review): the lower bound is inclusive, so this slice covers week numbers
# last_week - 3 .. last_week, i.e. four distinct values rather than three.
data_val_lvl_2 = data[week_no >= lvl_2_first_week]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [8]:
# Count distinct items before and after prefilter_items to report how far the catalogue shrinks.
n_items_before = data_train_lvl_1['item_id'].nunique()

# Overwrites data_train_lvl_1, so re-running this cell filters the already filtered frame.
data_train_lvl_1 = prefilter_items(data_train_lvl_1, item_features=item_features)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


In [9]:
def get_new_values(old_df, new_df, feature):
    """Return the values of ``feature`` that occur in ``new_df`` but not in ``old_df``.

    Args:
        old_df: DataFrame providing the reference set of values.
        new_df: DataFrame whose values are checked against the reference set.
        feature: name of the column to compare; must exist in both frames.

    Returns:
        Sorted numpy array of the distinct values found only in ``new_df``
        (empty when there are none).
    """
    old_values = np.asarray(old_df[feature].unique())
    new_values = np.asarray(new_df[feature].unique())

    # One vectorised membership test instead of a linear array scan per value.
    is_new = np.isin(new_values, old_values, invert=True)

    # np.unique keeps the sorted order the callers got before.
    return np.unique(new_values[is_new])

In [10]:
# Distinct users / items in the first-level training data.
first_users_count = data_train_lvl_1['user_id'].unique().size
first_items_count = data_train_lvl_1['item_id'].unique().size

# IDs that appear in each later window but are absent from the first-level training data.
new_user_lvl_1, new_items_lvl_1 = [
    get_new_values(data_train_lvl_1, data_train_lvl_2, column)
    for column in ('user_id', 'item_id')
]
new_user_lvl_2, new_items_lvl_2 = [
    get_new_values(data_train_lvl_1, data_val_lvl_2, column)
    for column in ('user_id', 'item_id')
]

print(f'Изначальное к-во юзеров: {first_users_count}, items: {first_items_count}')
print(f'1-й уровень: {len(new_user_lvl_1)}, items: {len(new_items_lvl_1)}')
print(f'2-й уровень: {len(new_user_lvl_2)}, items: {len(new_items_lvl_2)}')

Изначальное к-во юзеров: 2299, items: 5001
1-й уровень: 70, items: 22772
2-й уровень: 74, items: 19567


In [11]:
# One row per basket_id holding the mean of every column of `data` over that basket's rows:
# 'quantity' and 'sales_value' become per-line averages (not basket totals), and 'user_id'
# equals the basket's user only when all of its rows share one user_id.
data_gr = data.groupby('basket_id').mean()

In [12]:
# Group the per-basket averages by user once, instead of masking every basket
# again for each user.
baskets_by_user = data_gr.groupby('user_id')

# Median, over the user's baskets, of the basket's average quantity per line.
# Users without any basket get NaN.
user_features['median_quantity'] = user_features['user_id'].map(
    baskets_by_user['quantity'].median()
)

# Mean, over the user's baskets, of the basket's average sales_value per line.
# NOTE(review): this is not the average basket total ("average receipt"), because
# data_gr holds per-line means rather than per-basket sums.
user_features['mean_sales_value'] = user_features['user_id'].map(
    baskets_by_user['sales_value'].mean()
)

In [13]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id,median_quantity,mean_sales_value
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1,1.1,2.726818
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7,1.181818,2.989986


In [14]:
features=['income_desc','age_desc','homeowner_desc','kid_category_desc','household_size_desc','hh_comp_desc']
for feature_name in features:
    print(feature_name)
    print(user_features[feature_name].unique())
    print()

income_desc
['35-49K' '50-74K' '25-34K' '75-99K' 'Under 15K' '100-124K' '15-24K'
 '125-149K' '150-174K' '250K+' '175-199K' '200-249K']

age_desc
['65+' '45-54' '25-34' '35-44' '19-24' '55-64']

homeowner_desc
['Homeowner' 'Unknown' 'Renter' 'Probable Renter' 'Probable Owner']

kid_category_desc
['None/Unknown' '1' '2' '3+']

household_size_desc
['2' '3' '4' '1' '5+']

hh_comp_desc
['2 Adults No Kids' '2 Adults Kids' 'Single Female' 'Unknown'
 'Single Male' '1 Adult Kids']



In [15]:
# Replace each income bracket label with one representative number (in thousands):
# roughly the bracket midpoint, or the stated bound for the open-ended brackets.
# The lookup raises KeyError for any label missing from the dict, so re-running this
# cell fails once the column already holds numbers.
income_desc = {'35-49K':42, '50-74K':62, '25-34K':30, '75-99K':87, 'Under 15K':15, '100-124K':112,
       '15-24K':20, '125-149K':137, '150-174K':162, '250K+':250, '175-199K':187, '200-249K':225}

user_features['income_desc'] = user_features['income_desc'].apply(lambda x: income_desc[x]) 

# Same approach for the age brackets (approximate midpoint; '65+' becomes 65).
age_desc = {'65+':65, '45-54':50, '25-34':30, '35-44':40, '19-24':21, '55-64':60}

user_features['age_desc'] = user_features['age_desc'].apply(lambda x: age_desc[x])

In [16]:
# Household size label -> integer; '5+' becomes 5 and a missing value is meant to become 0.
# NOTE(review): a dict matches a NaN key only by object identity, so the np.nan entry works
# only if the column's missing values are that same np.nan object — confirm.
household_size_desc = {np.nan: 0, '1':1, '2':2, '3':3, '4':4, '5+':5 }

user_features['household_size_desc'] = user_features['household_size_desc'].apply(lambda x: household_size_desc[x])

In [17]:
# Number-of-kids label -> integer; 'None/Unknown' and missing values become 0, '3+' becomes 3.
# NOTE(review): the np.nan key is matched by object identity only — see whether missing
# values in this column are the same np.nan object.
kid_category_desc = {'None/Unknown':0, np.nan: 0, '1':1, '2':2, '3+':3 }

user_features['kid_category_desc'] = user_features['kid_category_desc'].apply(lambda x: kid_category_desc[x])

In [18]:
# Turn the household-composition label into three numeric columns, then drop the label.
hh_comp = user_features['hh_comp_desc']

# Missing and 'Unknown' compositions get 0 in both the female and the male flag.
composition_known = hh_comp.notna() & (hh_comp != 'Unknown')

# Flag is 1 for every known composition except the single-person household of the other sex.
user_features['hh_comp_desc_female'] = np.where(composition_known & (hh_comp != 'Single Male'), 1, 0)
user_features['hh_comp_desc_male'] = np.where(composition_known & (hh_comp != 'Single Female'), 1, 0)

# Number of adults in households with kids (2 or 1); 0 for every other composition.
user_features['hh_comp_desc_Adults_Kids'] = (
    hh_comp.map({'2 Adults Kids': 2, '1 Adult Kids': 1}).fillna(0).astype('int64')
)

user_features.drop('hh_comp_desc', axis=1, inplace=True)

In [19]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,household_size_desc,kid_category_desc,user_id,median_quantity,mean_sales_value,hh_comp_desc_female,hh_comp_desc_male,hh_comp_desc_Adults_Kids
0,65,A,42,Homeowner,2,0,1,1.1,2.726818,1,1,0
1,50,A,62,Homeowner,2,0,7,1.181818,2.989986,1,1,0


In [20]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [21]:
# Attach every transaction row to its item; the left join keeps items without any
# transaction, as a single row with missing transaction columns.
item_features_temp = item_features.merge(data, on='item_id', how='left')

In [22]:
# Mean sales_value of each item, one row per (item_id, commodity_desc).
item_price = (
    item_features_temp
    .groupby(['item_id', 'commodity_desc'])['sales_value']
    .mean()
    .reset_index()
)

# Average of those per-item means within each commodity, broadcast back to every item
# of the commodity. transform('mean') replaces the explicit per-commodity loop and the
# np.NaN pre-fill (the np.NaN alias no longer exists in NumPy 2).
item_price['commodity_desc_mean_sale'] = (
    item_price.groupby('commodity_desc')['sales_value'].transform('mean')
)

# Drop a previously merged copy first so that re-running the cell does not produce
# commodity_desc_mean_sale_x / _y columns.
item_features = (
    item_features
    .drop(columns='commodity_desc_mean_sale', errors='ignore')
    .merge(item_price[['item_id', 'commodity_desc_mean_sale']], on='item_id', how='left')
)

In [23]:
# Both aggregates come from the same grouping, so their rows line up by position.
sales_by_item = item_features_temp.groupby(['item_id'])

# Total quantity sold per item.
quantity_count = sales_by_item['quantity'].sum().reset_index()
quantity_count.columns = ['item_id', 'quantity']

# Distinct week numbers in which each item appears.
quantity_in_week = sales_by_item['week_no'].unique().reset_index()
quantity_in_week.columns = ['item_id', 'weeks']
quantity_in_week['weeks_count'] = quantity_in_week['weeks'].map(len)

# Average quantity per week in which the item appears.
quantity_in_week['sale_in_week'] = quantity_count['quantity'].div(quantity_in_week['weeks_count'])

item_features = item_features.merge(
    quantity_in_week[['item_id', 'sale_in_week']],
    on='item_id',
    how='left',
)

In [24]:
recommender = MainRecommender(data_train_lvl_1)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

In [25]:
# Level-2 training table built for the data_train_lvl_2 window (it carries the 'target'
# column split off below). N is taken from the notebook-wide setting so the train and
# test tables cannot drift apart.
train_data = perpare_lvl2_1(data_train_lvl_2, data_train_lvl_1, recommender, item_features, user_features, N=N)

  s = users_warm.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)


In [26]:
X_train = train_data.drop('target', axis=1)
y_train = train_data[['target']]

In [27]:
# Level-2 test table built for the data_val_lvl_2 window with the same recommender and
# the same notebook-wide N as the training table.
test_data = perpare_lvl2_1(data_val_lvl_2, data_train_lvl_1, recommender, item_features, user_features, N=N)

  s = users_warm.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)


In [28]:
X_test = test_data.drop('target', axis=1)
y_test = test_data[['target']]

In [29]:
X_train.head(2)

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,commodity_desc_mean_sale,sale_in_week,...,marital_status_code,income_desc,homeowner_desc,household_size_desc,kid_category_desc,median_quantity,mean_sales_value,hh_comp_desc_female,hh_comp_desc_male,hh_comp_desc_Adults_Kids
0,2070,5569471.0,1208.0,GROCERY,National,SOFT DRINKS,SOFT DRINKS 12/18&15PK CAN CAR,12 OZ,2.705525,43.138298,...,U,62.0,Unknown,1.0,0.0,1.0,2.413486,0.0,0.0,0.0
1,2070,1022003.0,1251.0,GROCERY,National,SOUP,CONDENSED SOUP,10.5OZ,2.202742,47.680851,...,U,62.0,Unknown,1.0,0.0,1.0,2.413486,0.0,0.0,0.0


In [30]:
# Split the feature columns by dtype: object columns go to `categorical`, all others
# to `numerical`. DataFrame.items() replaces iteritems(), which was deprecated in
# pandas 1.5 and removed in pandas 2.0.
categorical = []
numerical = []
for col, value in X_train.items():
    if value.dtype == 'object':
        categorical.append(col)
    else:
        numerical.append(col)

In [31]:
print(categorical)

['department', 'brand', 'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product', 'marital_status_code', 'homeowner_desc']


In [32]:
print(numerical)

['user_id', 'item_id', 'manufacturer', 'commodity_desc_mean_sale', 'sale_in_week', 'age_desc', 'income_desc', 'household_size_desc', 'kid_category_desc', 'median_quantity', 'mean_sales_value', 'hh_comp_desc_female', 'hh_comp_desc_male', 'hh_comp_desc_Adults_Kids']


In [33]:
for feature in categorical:
 print(f'{feature}: {len(X_train[feature].unique())}')

department: 21
brand: 3
commodity_desc: 200
sub_commodity_desc: 746
curr_size_of_product: 657
marital_status_code: 4
homeowner_desc: 6


In [34]:
features = ['commodity_desc', 'sub_commodity_desc', 'curr_size_of_product']
X_train = X_train.drop(features, axis=1)
X_test = X_test.drop(features, axis=1)

In [35]:
print(X_train['department'].unique())

['GROCERY' 'MISC. TRANS.' 'PRODUCE' 'PASTRY' 'MEAT-PCKGD' 'MEAT'
 'KIOSK-GAS' 'NUTRITION' 'SALAD BAR' 'DRUG GM' 'DELI' 'FLORAL' nan
 'MISC SALES TRAN' 'GARDEN CENTER' 'SEAFOOD' 'CHEF SHOPPE' 'SEAFOOD-PCKGD'
 'TRAVEL & LEISUR' 'COUP/STR & MFG' 'FROZEN GROCERY']


In [36]:
features = [ 'department',
            'brand',
            'marital_status_code',
            'homeowner_desc',
           ]

In [37]:
# Convert the columns listed in `features` to numeric codes via the project helper.
X_train = category_to_digit(X_train, features)

In [38]:
# Same conversion for the test table.
# NOTE(review): the helper is called separately for X_train and X_test; if it derives the
# codes from the frame it receives, one category may get different codes in the two
# tables — confirm against src.utils.category_to_digit.
X_test = category_to_digit(X_test, features)

In [39]:
y_train.mean()

target    0.239681
dtype: float64

In [40]:
y_test.mean()

target    0.165276
dtype: float64

In [41]:
# Hold out 20% of the level-2 training rows; the held-out part is the eval_set used when
# fitting the CatBoost model.
# The split is by row, so one user's rows can land on both sides. X_train / y_train are
# overwritten, so re-running this cell shrinks them again.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                  test_size=0.2,
                                                  random_state=27,
                                                 )

In [42]:
# Model inputs: every column except the user_id and item_id identifiers. They are dropped
# by name, so the selection does not depend on column order and fails loudly if either
# column is missing.
x_features = X_train.columns.drop(['user_id', 'item_id']).values

In [43]:
# Second-level classifier. Up to 10000 boosting rounds are allowed; early_stopping_rounds
# and use_best_model rely on the eval_set passed to fit() in the next cell.
model_catb = catb.CatBoostClassifier(silent=True, 
                                    random_state=27,
                                    # eval_metric='F1',
                                     early_stopping_rounds=20,
                                     use_best_model=True,
                                     num_boost_round=10000
                                    )

In [44]:
model_catb.fit(X_train[x_features], y_train, eval_set=(X_val[x_features], y_val))

<catboost.core.CatBoostClassifier at 0x1b2cf5cf580>

In [45]:
def get_items(x_data, items, user_id, item_name, N=5, overall_top_purchases=None):
    """Select the items of one user whose mean ``item_name`` value exceeds 0.3.

    Args:
        x_data: DataFrame with 'user_id', 'item_id' and the ``item_name`` column.
        items: item ids to check; the result keeps their order.
        user_id: user whose rows are inspected.
        item_name: column averaged per item (this notebook passes 'actual' or 'predict').
        N: target list length; applied only when ``overall_top_purchases`` is given.
        overall_top_purchases: optional ordered fallback item ids used to pad the
            result up to ``N`` items.

    Returns:
        List of item ids. Without a fallback, every qualifying item is returned and
        ``N`` is ignored. With a fallback, at most ``N`` items are returned:
        qualifying items first, then fallback items that are not already selected.
    """
    # Slice the user's rows once instead of masking the whole frame for every item.
    user_rows = x_data.loc[x_data['user_id'] == user_id]
    mean_by_item = user_rows.groupby('item_id')[item_name].mean().to_dict()

    items_list = []
    for item in items:
        score = mean_by_item.get(item)
        if score is not None and score > 0.3:
            items_list.append(item)

    if overall_top_purchases is not None:
        if len(items_list) < N:
            # Pad with popular items, skipping any already selected so the final
            # list contains no duplicate recommendations.
            already_selected = set(items_list)
            for item in overall_top_purchases:
                if len(items_list) >= N:
                    break
                if item not in already_selected:
                    items_list.append(item)
                    already_selected.add(item)
        items_list = items_list[:N]
    return items_list

In [46]:
def get_final_recomendations(x_data, y_data, preds):
    """Build per-user lists of actual and recommended items from second-level scores.

    Args:
        x_data: feature table with 'user_id' and 'item_id' columns; not modified.
        y_data: DataFrame whose 'target' column lines up with ``x_data`` row by row
            (it is attached positionally).
        preds: scores for the rows of ``x_data``.

    Returns:
        DataFrame with columns 'user_id', 'actual' and 'predict'. Users for whom
        get_items returns no actual items are left out. Relies on the notebook-level
        names final_predict_count, val_count and item_features.
    """
    scored = x_data.copy()
    scored['predict'] = preds
    scored['actual'] = y_data['target'].values

    # Distinct items of every user, best score first.
    ranked = scored.sort_values('predict', ascending=False)
    items_by_user = ranked.groupby('user_id')['item_id'].unique().reset_index()

    # Item ids ordered by the number of rows of the table that mention them.
    item_counts = scored.groupby('item_id')['item_id'].count()
    overall_top_purchases = item_counts.sort_values(ascending=False).index.values

    user_ids, actual_lists, predicted_lists = [], [], []

    for _, row in tqdm(items_by_user.iterrows()):
        user_id = row['user_id']
        item_ids = row['item_id']

        actual = get_items(scored, item_ids, user_id, 'actual', N=final_predict_count)
        if len(actual) == 0:
            continue

        predict_items = get_items(scored, item_ids, user_id, 'predict',
                                  N=final_predict_count,
                                  overall_top_purchases=overall_top_purchases)

        user_ids.append(user_id)
        # Business constraints are applied by postfilter_items.
        predicted_lists.append(postfilter_items(predict_items, item_features, N=val_count))
        actual_lists.append(actual)

    return pd.DataFrame({'user_id': user_ids, 'actual': actual_lists, 'predict': predicted_lists})

In [47]:
train_preds = model_catb.predict_proba(X_train[x_features])[:,1]

In [48]:
result_train = get_final_recomendations(X_train, y_train, train_preds)

result_train.head(3)

2141it [01:01, 35.03it/s]


Unnamed: 0,user_id,actual,predict
0,1,"[1082185.0, 995242.0, 840361.0, 820165.0, 9409...","[1082185.0, 840361.0, 940947.0, 10455984.0, 82..."
1,2,"[1106523.0, 838136.0, 916122.0, 852864.0, 8850...","[1106523.0, 900072.0, 1082185.0, 923746.0, 104..."
2,4,[6773204.0],"[6534178.0, 1029743.0, 923746.0, 1044078.0, 91..."


In [49]:
precision_train = result_train.apply(lambda row: precision_at_k(row['predict'], row['actual']), axis=1).mean()
print(f'Train precision: {precision_train:.03}')

Train precision: 0.39


In [50]:
test_preds = model_catb.predict_proba(X_test[x_features])[:,1]

In [51]:
result_test = get_final_recomendations(X_test, y_test, test_preds)

result_test.head(3)

2042it [01:15, 27.05it/s]


Unnamed: 0,user_id,actual,predict
0,1,"[1082185.0, 995242.0, 940947.0, 8293439.0, 100...","[1082185.0, 840361.0, 940947.0, 10455984.0, 82..."
1,3,"[1053690.0, 9526563.0, 6463658.0]","[951590.0, 938700.0, 1082185.0, 1029743.0, 923..."
2,6,"[995242.0, 1119051.0, 825541.0, 840361.0, 5569...","[1082185.0, 1029743.0, 878715.0, 981760.0, 104..."


In [52]:
precision_test = result_test.apply(lambda row: precision_at_k(row['predict'], row['actual']), axis=1).mean()
print(f'Test precision: {precision_test:.03}')

Test precision: 0.295


In [53]:
result_test.to_csv('prediction.csv', index=False)

In [54]:
import pickle

# Serialise the fitted classifier to catboost_model.pickle in the working directory.
# NOTE(review): unpickling can execute arbitrary code, so load this file only from a
# trusted location.
with open('catboost_model.pickle', 'wb') as f:
    pickle.dump(model_catb, f)