<a href="https://colab.research.google.com/github/SovetovAleksey/ReqSys/blob/course_project/course_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

%load_ext autoreload
%autoreload 2

# Sparse matrices
from scipy.sparse import csr_matrix

# Matrix factorization
from implicit import als

# Second-level model
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Project-local helper functions
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

In [None]:
# Load the transaction log and the item / household reference tables.
data = pd.read_csv('data/retail_train.csv')
item_features = pd.read_csv('raw_data/product.csv')
user_features = pd.read_csv('raw_data/hh_demographic.csv')

# Normalise the reference tables: lower-case headers and shared key names
# ('item_id', 'user_id') so they can be joined to the transactions.
item_features.columns = item_features.columns.str.lower()
user_features.columns = user_features.columns.str.lower()

item_features = item_features.rename(columns={'product_id': 'item_id'})
user_features = user_features.rename(columns={'household_key': 'user_id'})


# The train/validation scheme matters!
# -- older purchases -- | -- 6 weeks -- | -- 3 weeks --
# The size of the middle window (6 weeks) can be tuned with a learning curve
# (recall@k as a function of the dataset size).
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

week_no = data['week_no']
last_week = week_no.max()
lvl_1_val_start = last_week - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)
# NOTE(review): `>= last_week - 3` spans the week numbers last_week-3 .. last_week,
# i.e. four distinct values rather than three — confirm this is intended.
lvl_2_val_start = last_week - val_lvl_2_size_weeks

data_train_lvl_1 = data[week_no < lvl_1_val_start]
data_val_lvl_1 = data[(week_no >= lvl_1_val_start) & (week_no < lvl_2_val_start)]

# Kept as a separate copy for clarity; it is modified later and will diverge.
data_train_lvl_2 = data_val_lvl_1.copy()
data_val_lvl_2 = data[week_no >= lvl_2_val_start]

data_train_lvl_1.head(2)

Unnamed: 0.1,Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1832874,1078,35573861879,524,1082185,1,0.56,375,0.0,1440,76,0.0,0.0
1,402281,324,29170411703,165,7168774,2,6.98,367,0.0,1115,24,0.0,0.0


In [None]:
# Shapes (rows, columns) of the level-1 train, level-2 train and level-2 validation frames.
data_train_lvl_1.shape, data_train_lvl_2.shape, data_val_lvl_2.shape

((199764, 13), (16752, 13), (11333, 13))

In [None]:
# Count distinct items before and after prefiltering the level-1 training set.
# NOTE: this cell overwrites data_train_lvl_1, so re-running it filters the
# already filtered frame.
n_items_before = data_train_lvl_1['item_id'].nunique()
data_train_lvl_1 = prefilter_items(data_train_lvl_1, take_n_popular=5000)
n_items_after = data_train_lvl_1['item_id'].nunique()

report_template = 'Decreased # items from {} to {}'
print(report_template.format(n_items_before, n_items_after))

Decreased # items from 33411 to 5001


In [None]:
# Build the recommender from the prefiltered level-1 training transactions.
# NOTE(review): model fitting presumably happens inside the constructor
# (progress bars are printed) — confirm in src.recommenders.
recommender = MainRecommender(data_train_lvl_1)



  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

In [None]:
# Ask the recommender's ALS method for N=5 items for user 2375.
recommender.get_als_recommendations(2375, N=5)

[899624, 1044078, 896613, 1106523, 926905]

In [None]:
# Ask the similar-items method for N=5 items for user 2375.
recommender.get_similar_items_recommendation(2375, N=5)

[1045779, 1137483, 940726, 1064408, 933248]

In [None]:
# Ask the similar-users method for N=5 items.
# NOTE(review): the user id here is 237, while the neighbouring cells use 2375 —
# possibly a typo; confirm which user was intended.
recommender.get_similar_users_recommendation(237, N=5)

[8090509, 1070820, 1029743, 948384, 8090521]

In [None]:
# Ask the own-recommendations method for N=5 items for user 2375.
recommender.get_own_recommendations(2375, N=5)

[896613, 899624, 835300, 907099, 973181]

In [None]:
# One row per user of the level-1 validation window, with the array of
# distinct items that user actually bought ('actual').
result_lvl_1 = (
    data_val_lvl_1
    .groupby('user_id')['item_id']
    .unique()
    .reset_index()
    .rename(columns={'item_id': 'actual'})
)
result_lvl_1.head(2)

Unnamed: 0,user_id,actual
0,1,"[1098248, 1017299, 1077430, 938004, 1082185, 8..."
1,2,"[830127, 7442008, 899624, 944568]"


In [None]:
# Attach 5 own-recommendation candidates to every validation user.
result_lvl_1['candidates'] = result_lvl_1['user_id'].map(
    lambda user_id: recommender.get_own_recommendations(user_id, N=5)
)
result_lvl_1.head(2)

Unnamed: 0,user_id,actual,candidates
0,1,"[1098248, 1017299, 1077430, 938004, 1082185, 8...","[856942, 1087895, 940947, 1104349, 5577022]"
1,2,"[830127, 7442008, 899624, 944568]","[1075368, 940947, 1007414, 909714, 959737]"


In [None]:
# Average precision_at_k over all validation users for the level-1 candidates.
# k is not passed, so precision_at_k's default applies (see src.metrics).
result_lvl_1.apply(lambda row: precision_at_k(row['candidates'], row['actual']), axis=1).mean()

0.04732704402515749

In [None]:
# Users present in the level-2 training window, one row per user.
users_lvl_2 = pd.DataFrame({'user_id': data_train_lvl_2['user_id'].unique()})

# Warm start only for now: keep users the level-1 model has seen.
train_users = data_train_lvl_1['user_id'].unique()
# .copy() so the column assignment below writes to an independent frame
# instead of a filtered view (avoids SettingWithCopyWarning).
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)].copy()

# 10 own-recommendation candidates per user, stored as a list in one cell.
users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(
    lambda user_id: recommender.get_own_recommendations(user_id, N=10)
)

In [None]:
# Peek at the per-user candidate lists.
users_lvl_2.head(2)

Unnamed: 0,user_id,candidates
0,1501,"[1070820, 1079528, 1044078, 939789, 1115175, 1..."
1,1633,"[1029743, 1028891, 844179, 933835, 854852, 110..."


In [None]:
# Unroll the candidate lists to one (user_id, item_id) row per candidate.
# DataFrame.explode replaces the slow apply(pd.Series).stack() idiom, which
# also silently turned item ids into floats. Rows for users with an empty
# candidate list carry no item and are dropped before the integer cast.
# The original (repeated) row index is preserved, as before.
users_lvl_2 = (
    users_lvl_2
    .explode('candidates')
    .rename(columns={'candidates': 'item_id'})
    .dropna(subset=['item_id'])
    .astype({'item_id': 'int64'})
)
# Marker column; it is dropped again once the targets are attached.
users_lvl_2['flag'] = 1

users_lvl_2.head(4)

Unnamed: 0,user_id,item_id,flag
0,1501,1070820.0,1
0,1501,1079528.0,1
0,1501,1044078.0,1
0,1501,939789.0,1


In [None]:
# Positive examples: every distinct (user, item) pair bought in the level-2
# training window. De-duplicating matters: a pair bought several times would
# otherwise multiply the matching candidate row in the left merge below.
purchases_lvl_2 = (
    data_train_lvl_2[['user_id', 'item_id']]
    .drop_duplicates()
    .assign(target=1)  # only purchases are present here
)

# Candidates that were bought get target=1; all other candidates get 0.
targets_lvl_2 = users_lvl_2.merge(purchases_lvl_2, on=['user_id', 'item_id'], how='left')

# Plain assignment instead of chained fillna(inplace=True), which newer
# pandas versions do not guarantee to write back to the frame.
targets_lvl_2['target'] = targets_lvl_2['target'].fillna(0)
targets_lvl_2 = targets_lvl_2.drop(columns='flag')

targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,target
0,1501,1070820.0,0.0
1,1501,1079528.0,0.0


In [None]:
# Share of candidate rows with target == 1 (positive-class rate of the level-2 training set).
targets_lvl_2['target'].mean()

0.03745093763628434

In [None]:
# Preview the item reference table.
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [None]:
# Preview the household reference table.
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [None]:
# Enrich every (user, item) candidate row with item and household attributes.
# Left joins keep all candidate rows; missing attributes become NaN.
targets_lvl_2 = (
    targets_lvl_2
    .merge(item_features, on='item_id', how='left')
    .merge(user_features, on='user_id', how='left')
)

targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,1501,1070820.0,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,,,,,,,
1,1501,1079528.0,0.0,737,DRUG GM,National,NEWSPAPER,NEWSPAPER,,,,,,,,


In [None]:
pd.set_option('display.max_columns', 100)

# Transactions used for feature engineering: level-1 train plus level-1
# validation, joined with the item and household attributes.
data_for_feature = (
    pd.concat([data_train_lvl_1, data_val_lvl_1])
    .merge(item_features, on='item_id', how='left')
    .merge(user_features, on='user_id', how='left')
)

n_items_before = data_for_feature['item_id'].nunique()
data_for_feature = prefilter_items(data_for_feature, item_features=item_features, take_n_popular=5000)
n_items_after = data_for_feature['item_id'].nunique()

report_template = 'Decreased # items from {} to {}'
print(report_template.format(n_items_before, n_items_after))

Decreased # items from 11232 to 5000


In [None]:
# Column dtypes and non-null counts of the feature-engineering frame.
data_for_feature.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 88326 entries, 0 to 98931
Data columns (total 27 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            88326 non-null  int64  
 1   user_id               88326 non-null  int64  
 2   basket_id             88326 non-null  int64  
 3   day                   88326 non-null  int64  
 4   item_id               88326 non-null  int64  
 5   quantity              88326 non-null  int64  
 6   sales_value           88326 non-null  float64
 7   store_id              88326 non-null  int64  
 8   retail_disc           88326 non-null  float64
 9   trans_time            88326 non-null  int64  
 10  week_no               88326 non-null  int64  
 11  coupon_disc           88326 non-null  float64
 12  coupon_match_disc     88326 non-null  float64
 13  price                 88326 non-null  float64
 14  manufacturer          64043 non-null  float64
 15  department         

In [None]:
# Preview the feature-engineering frame.
data_for_feature.head(2)

Unnamed: 0.1,Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,402281,324,29170411703,165,7168774,2,6.98,367,0.0,1115,24,0.0,0.0,3.49,794.0,GROCERY,National,CONVENIENT BRKFST/WHLSM SNACKS,CEREAL BARS,7.4 OZ,45-54,U,50-74K,Unknown,Single Female,1,None/Unknown
1,1348564,1982,32957769022,404,12811490,1,3.99,319,0.0,2101,58,0.0,0.0,3.99,1091.0,DRUG GM,National,CHARCOAL AND LIGHTER FLUID,CHARCOAL,,25-34,U,100-124K,Unknown,Single Male,1,None/Unknown


In [None]:
# на какую в среднем сумму юзер делает покупки за неделю
mean_weekly_expenses = data_for_feature.groupby(['user_id', 'week_no'])['sales_value'].agg('sum').groupby(['user_id']).agg('mean')\
                                    .rename('mean_weekly_expenses')
targets_lvl_2 = targets_lvl_2.merge(mean_weekly_expenses, how='left', on='user_id')
targets_lvl_2.head(1)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,mean_weekly_expenses
0,1501,1070820.0,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,,,,,,,,8.432727


In [None]:
# среднее количество продаж в неделю в категории
amount_sales_in_category = data_for_feature.groupby(['commodity_desc', 'week_no'])['quantity'].agg('sum')\
                        .groupby(['commodity_desc']).agg('mean').rename('amount_sales_in_category')
targets_lvl_2 = targets_lvl_2.merge(amount_sales_in_category, how='left', on=['commodity_desc'])
targets_lvl_2.head(1)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,mean_weekly_expenses,amount_sales_in_category
0,1501,1070820.0,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,,,,,,,,8.432727,37.643678


In [None]:
# средняя цена товара в категории
mean_price_in_category = data_for_feature.groupby(['commodity_desc'])['sales_value'].agg('mean').rename('mean_price_in_category')
targets_lvl_2 = targets_lvl_2.merge(mean_price_in_category, how='left', on=['commodity_desc'])
targets_lvl_2.head(1)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,mean_weekly_expenses,amount_sales_in_category,mean_price_in_category
0,1501,1070820.0,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,,,,,,,,8.432727,37.643678,2.986169


In [None]:
# среднее количество покупок юзера в неделю в текущей категории
users_buys_in_category = data_for_feature.groupby(['user_id', 'commodity_desc', 'week_no'])['quantity'].agg('sum')\
                                .groupby(['user_id', 'commodity_desc']).agg('mean').rename('users_buys_in_category')
targets_lvl_2 = targets_lvl_2.merge(users_buys_in_category, how='left', on=['user_id', 'commodity_desc'])
targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,mean_weekly_expenses,amount_sales_in_category,mean_price_in_category,users_buys_in_category
0,1501,1070820.0,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,,,,,,,,8.432727,37.643678,2.986169,1.0
1,1501,1079528.0,0.0,737,DRUG GM,National,NEWSPAPER,NEWSPAPER,,,,,,,,,8.432727,1.0,5.0,1.0


In [None]:
# Dtypes and non-null counts of the level-2 training frame before missing values are filled.
targets_lvl_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18344 entries, 0 to 18343
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   user_id                   18344 non-null  int64  
 1   item_id                   18344 non-null  float64
 2   target                    18344 non-null  float64
 3   manufacturer              18344 non-null  int64  
 4   department                18344 non-null  object 
 5   brand                     18344 non-null  object 
 6   commodity_desc            18344 non-null  object 
 7   sub_commodity_desc        18344 non-null  object 
 8   curr_size_of_product      18344 non-null  object 
 9   age_desc                  7705 non-null   object 
 10  marital_status_code       7705 non-null   object 
 11  income_desc               7705 non-null   object 
 12  homeowner_desc            7705 non-null   object 
 13  hh_comp_desc              7705 non-null   object 
 14  househ

In [None]:
# Categorical features are taken by position: columns 3..15 of targets_lvl_2
# are the item and household descriptors (manufacturer .. kid_category_desc).
# NOTE(review): this slice depends on the column order produced by the
# earlier merges; revisit it if features are added or reordered.
cat_features = targets_lvl_2.columns[3:16].to_list()
targets_lvl_2[cat_features] = targets_lvl_2[cat_features].fillna('Unknown')
targets_lvl_2[cat_features] = targets_lvl_2[cat_features].astype('category')

# Remaining non-categorical columns, minus the keys and the label, are the
# numeric features; their gaps are filled with the column mean.
# `columns=` replaces the positional axis argument `drop(..., 1)`, which is
# deprecated and rejected by pandas 2.x.
num_features = (
    targets_lvl_2
    .select_dtypes(exclude=['category'])
    .drop(columns=['user_id', 'item_id', 'target'])
    .columns
    .to_list()
)
targets_lvl_2[num_features] = targets_lvl_2[num_features].fillna(targets_lvl_2[num_features].mean())

  num_features = targets_lvl_2.select_dtypes(exclude=['category']).drop(['user_id', 'item_id', 'target'], 1).columns.to_list()


In [None]:
# Re-check dtypes and non-null counts after the categorical cast and the fill of missing values.
targets_lvl_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18344 entries, 0 to 18343
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   user_id                   18344 non-null  int64   
 1   item_id                   18344 non-null  float64 
 2   target                    18344 non-null  float64 
 3   manufacturer              18344 non-null  category
 4   department                18344 non-null  category
 5   brand                     18344 non-null  category
 6   commodity_desc            18344 non-null  category
 7   sub_commodity_desc        18344 non-null  category
 8   curr_size_of_product      18344 non-null  category
 9   age_desc                  18344 non-null  category
 10  marital_status_code       18344 non-null  category
 11  income_desc               18344 non-null  category
 12  homeowner_desc            18344 non-null  category
 13  hh_comp_desc              18344 non-null  cate

In [None]:
%%time
from catboost import CatBoostClassifier

# Second-level ranker: gradient-boosted trees over the candidate rows.
cat_model = CatBoostClassifier(random_state=100,
                               iterations=400,
                               depth=3,
                               l2_leaf_reg=2,
                               learning_rate=0.5,
                               verbose=False,
                               cat_features=cat_features)

# Features are every column except the label. 'preds_proba' is excluded too:
# a later cell adds it to targets_lvl_2, and without this guard re-running
# this cell would feed the model its own predictions. errors='ignore' keeps
# the first run (column not yet present) working.
# `columns=` replaces the positional axis argument `drop('target', 1)`,
# which is deprecated and rejected by pandas 2.x.
# NOTE(review): user_id and item_id remain in X_train as numeric features —
# confirm that is intended.
X_train = targets_lvl_2.drop(columns=['target', 'preds_proba'], errors='ignore')
y_train = targets_lvl_2['target']

cat_model.fit(X_train, y_train)



Wall time: 13.6 s


<catboost.core.CatBoostClassifier at 0x13d42a958e0>

In [None]:
# Probability of the positive class (column 1 of predict_proba) for every
# candidate row, stored next to the features for ranking.
class_probabilities = cat_model.predict_proba(X_train)
train_preds = class_probabilities[:, 1]
targets_lvl_2 = targets_lvl_2.assign(preds_proba=train_preds)

In [None]:
# Rank candidates by predicted probability (descending) within each user,
# keep the first 5 rows per user and collect their distinct item ids.
ranked_candidates = targets_lvl_2.sort_values(['user_id', 'preds_proba'], ascending=False)
top_rows_per_user = ranked_candidates.groupby('user_id').head(5)
recs_cb_k5 = (
    top_rows_per_user
    .groupby('user_id')['item_id']
    .unique()
    .reset_index()
)
recs_cb_k5.head(2)

Unnamed: 0,user_id,item_id
0,1,"[1087895.0, 940947.0, 856942.0, 5582712.0, 110..."
1,2,"[959737.0, 1007414.0, 952163.0, 909714.0, 1016..."


In [None]:
# Level-2 evaluation frame: actual purchases per user in the final validation
# window, left-joined with the ranker's recommendations ('candidates').
# Users without recommendations keep NaN in 'candidates'.
result_lvl_2 = (
    data_val_lvl_2
    .groupby('user_id')['item_id']
    .unique()
    .reset_index()
    .rename(columns={'item_id': 'actual'})
    .merge(recs_cb_k5, on='user_id', how='left')
    .rename(columns={'item_id': 'candidates'})
)

result_lvl_2.head(2)

Unnamed: 0,user_id,actual,candidates
0,1,"[1035805, 829563, 931136, 877391, 995242, 1082...","[1087895.0, 940947.0, 856942.0, 5582712.0, 110..."
1,2,"[980666, 1096261, 13876377, 901062, 13190188, ...","[959737.0, 1007414.0, 952163.0, 909714.0, 1016..."


In [None]:
# Score only users that received recommendations, then average
# precision_at_k (k left at its default) across them.
has_candidates = result_lvl_2['candidates'].notna()
result_lvl_2 = result_lvl_2[has_candidates]

precision_per_user = result_lvl_2.apply(
    lambda row: precision_at_k(row['candidates'], row['actual']),
    axis=1,
)
precision_per_user.mean()

0.03220617202889041