# Course project

- Целевая метрика - precision@5. Порог для уcпешной сдачи проекта precision@5 > 25%
- Будет public тестовый датасет, на котором вы сможете измерять метрику
- Также будет private тестовый датасет для измерения финального качества
- НЕ обязательно, но крайне желательно использовать 2-ух уровневые рекоммендательные системы в проекте
- Вы сдаете код проекта в виде github репозитория и csv файл с рекомендациями 

__Загрузим библиотеки и данные__

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

from metrics import precision_at_k, recall_at_k
from utils import prefilter_items
from recommenders import MainRecommender

import warnings
warnings.simplefilter('ignore')

In [2]:
train = pd.read_csv('data/retail_train.csv')
test = pd.read_csv('data/retail_test1.csv')
item_features = pd.read_csv('data/product.csv')
user_features = pd.read_csv('data/hh_demographic.csv')

__Посмотрим на данные__

In [3]:
train.head(4)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0


In [4]:
test.head(4)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0
2,2070,41652857291,664,995242,5,9.1,311,-0.6,46,96,0.0,0.0
3,1602,41665647035,664,827939,1,7.99,334,0.0,1741,96,0.0,0.0


In [5]:
print(train.shape[0], test.shape[0])

2396804 88734


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2396804 entries, 0 to 2396803
Data columns (total 12 columns):
 #   Column             Dtype  
---  ------             -----  
 0   user_id            int64  
 1   basket_id          int64  
 2   day                int64  
 3   item_id            int64  
 4   quantity           int64  
 5   sales_value        float64
 6   store_id           int64  
 7   retail_disc        float64
 8   trans_time         int64  
 9   week_no            int64  
 10  coupon_disc        float64
 11  coupon_match_disc  float64
dtypes: float64(4), int64(8)
memory usage: 219.4 MB


In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88734 entries, 0 to 88733
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   user_id            88734 non-null  int64  
 1   basket_id          88734 non-null  int64  
 2   day                88734 non-null  int64  
 3   item_id            88734 non-null  int64  
 4   quantity           88734 non-null  int64  
 5   sales_value        88734 non-null  float64
 6   store_id           88734 non-null  int64  
 7   retail_disc        88734 non-null  float64
 8   trans_time         88734 non-null  int64  
 9   week_no            88734 non-null  int64  
 10  coupon_disc        88734 non-null  float64
 11  coupon_match_disc  88734 non-null  float64
dtypes: float64(4), int64(8)
memory usage: 8.1 MB


__Проверка на пропуски__

In [8]:
train.isna().sum()

user_id              0
basket_id            0
day                  0
item_id              0
quantity             0
sales_value          0
store_id             0
retail_disc          0
trans_time           0
week_no              0
coupon_disc          0
coupon_match_disc    0
dtype: int64

In [9]:
test.isna().sum()

user_id              0
basket_id            0
day                  0
item_id              0
quantity             0
sales_value          0
store_id             0
retail_disc          0
trans_time           0
week_no              0
coupon_disc          0
coupon_match_disc    0
dtype: int64

__Обработаем столбцы__

In [10]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

__Применим фильтрацию__

In [11]:
VAL_SIZE = 5

train_1 = train[train['week_no'] < train['week_no'].max() - (VAL_SIZE)]
val_1 = train[train['week_no'] >= train['week_no'].max() - (VAL_SIZE)]

train_2 = val_1.copy()

In [12]:
n_items_before = train_1['item_id'].nunique()
train_1 = prefilter_items(train_1, item_features=item_features, take_n_popular=200)
n_items_after = train_1['item_id'].nunique()

print(f'Decreased # items from {n_items_before} to {n_items_after}')

Decreased # items from 85828 to 201


In [13]:
recommender = MainRecommender(train_1)



  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/201 [00:00<?, ?it/s]

__Сделаем эмбеддинг товаров и юзеров__

In [14]:
items_emb_df = recommender.items_emb_df
users_emb_df = recommender.users_emb_df

__Напишем функции для генерации новых фич__

In [15]:
def new_item_features(data, item_features, items_emb_df):
    """Новые признаки для продуктов"""

    new_item_features = item_features.merge(data, on='item_id', how='left')
    
    # Добавим эмбеддинг
    item_features = item_features.merge(items_emb_df, how='left')

    
    # discount
    mean_disc = new_item_features.groupby('item_id')['coupon_disc'].mean().reset_index().sort_values('coupon_disc')
    item_features = item_features.merge(mean_disc, on='item_id', how='left')
    

    # manufacturer
    rare_manufacturer = item_features.manufacturer.value_counts()[item_features.manufacturer.value_counts() < 50].index
    item_features.loc[item_features.manufacturer.isin(rare_manufacturer), 'manufacturer'] = 999999999
    item_features.manufacturer = item_features.manufacturer.astype('object')
    

    # Количество продаж и среднее количество продаж товара
    item_qnt = new_item_features.groupby(['item_id'])['quantity'].count().reset_index()
    item_qnt.rename(columns={'quantity': 'quantity_of_sales'}, inplace=True)

    item_qnt['quantity_of_sales_per_week'] = item_qnt['quantity_of_sales'] / new_item_features['week_no'].nunique()
    item_features = item_features.merge(item_qnt, on='item_id', how='left')
    

    # Среднее количество продаж товара в категории за неделю
    items_in_department = new_item_features.groupby('department')['item_id'].count().reset_index().sort_values(
        'item_id', ascending=False
    )
    items_in_department.rename(columns={'item_id': 'items_in_department'}, inplace=True)

    qnt_of_sales_per_dep = new_item_features.groupby(['department'])['quantity'].count().reset_index().sort_values(
        'quantity', ascending=False
    )
    qnt_of_sales_per_dep.rename(columns={'quantity': 'qnt_of_sales_per_dep'}, inplace=True)


    items_in_department = items_in_department.merge(qnt_of_sales_per_dep, on='department')
    items_in_department['qnt_of_sales_per_item_per_dep_per_week'] = (
        items_in_department['qnt_of_sales_per_dep'] / 
        items_in_department['items_in_department'] / 
        new_item_features['week_no'].nunique()
    )
    items_in_department = items_in_department.drop(['items_in_department'], axis=1)
    item_features = item_features.merge(items_in_department, on=['department'], how='left')
    

    # sub_commodity_desc
    items_in_department = new_item_features.groupby('sub_commodity_desc')['item_id'].count().reset_index().sort_values(
        'item_id', ascending=False
    )
    items_in_department.rename(columns={'item_id': 'items_in_sub_commodity_desc'}, inplace=True)

    qnt_of_sales_per_dep = new_item_features.groupby(['sub_commodity_desc'])['quantity'].count().reset_index().sort_values(
        'quantity', ascending=False
    )
    qnt_of_sales_per_dep.rename(columns={'quantity': 'qnt_of_sales_per_sub_commodity_desc'}, inplace=True)


    items_in_department = items_in_department.merge(qnt_of_sales_per_dep, on='sub_commodity_desc')
    items_in_department['qnt_of_sales_per_item_per_sub_commodity_desc_per_week'] = (
        items_in_department['qnt_of_sales_per_sub_commodity_desc'] / 
        items_in_department['items_in_sub_commodity_desc'] / 
        new_item_features['week_no'].nunique()
    )
    items_in_department = items_in_department.drop(['items_in_sub_commodity_desc'], axis=1)
    item_features = item_features.merge(items_in_department, on=['sub_commodity_desc'], how='left')
    
    return item_features

In [16]:
def new_user_features(data, user_features, users_emb_df):
    """Новые признаки для пользователей"""

    new_user_features = user_features.merge(data, on='user_id', how='left')
    
    # Добавим эмбеддинг
    user_features = user_features.merge(users_emb_df, how='left')
    

    # Обычное время покупки
    time = new_user_features.groupby('user_id')['trans_time'].mean().reset_index()
    time.rename(columns={'trans_time': 'mean_time'}, inplace=True)
    time = time.astype(np.float32)
    user_features = user_features.merge(time, how='left')
    

    # Возраст
    user_features['age'] = user_features['age_desc'].replace(
        {'65+': 70, '45-54': 50, '25-34': 30, '35-44': 40, '19-24':20, '55-64':60}
    )
    user_features = user_features.drop('age_desc', axis=1)
    

    # Доход
    user_features['income'] = user_features['income_desc'].replace(
        {'35-49K': 45,
     '50-74K': 70,
     '25-34K': 30,
     '75-99K': 95,
     'Under 15K': 15,
     '100-124K': 120,
     '15-24K': 20,
     '125-149K': 145,
     '150-174K': 170,
     '250K+': 250,
     '175-199K': 195,
     '200-249K': 245}
    )
    user_features = user_features.drop('income_desc', axis=1)
    

    # Дети
    user_features['kids'] = 0
    user_features.loc[(user_features['kid_category_desc'] == '1'), 'kids'] = 1
    user_features.loc[(user_features['kid_category_desc'] == '2'), 'kids'] = 2
    user_features.loc[(user_features['kid_category_desc'] == '3'), 'kids'] = 3
    user_features = user_features.drop('kid_category_desc', axis=1)
    

    # Средний чек, средний чек в неделю
    basket = new_user_features.groupby(['user_id'])['sales_value'].sum().reset_index()

    baskets_qnt = new_user_features.groupby('user_id')['basket_id'].count().reset_index()
    baskets_qnt.rename(columns={'basket_id': 'baskets_qnt'}, inplace=True)

    average_basket = basket.merge(baskets_qnt)

    average_basket['average_basket'] = average_basket.sales_value / average_basket.baskets_qnt
    average_basket['sum_per_week'] = average_basket.sales_value / new_user_features.week_no.nunique()

    average_basket = average_basket.drop(['sales_value', 'baskets_qnt'], axis=1)
    user_features = user_features.merge(average_basket, how='left')

    return user_features

In [17]:
item_features = new_item_features(train_2, item_features, items_emb_df)
item_features.head(4)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,0,1,2,...,12,13,14,coupon_disc,quantity_of_sales,quantity_of_sales_per_week,qnt_of_sales_per_dep,qnt_of_sales_per_item_per_dep_per_week,qnt_of_sales_per_sub_commodity_desc,qnt_of_sales_per_item_per_sub_commodity_desc_per_week
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB,,,,...,,,,,0,0.0,112255,0.137313,101,0.134667
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,,,,,...,,,,,0,0.0,226,0.05858,225,0.064433
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,,,,,...,,,,,0,0.0,2436,0.102267,356,0.140268
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ,,,,...,,,,,0,0.0,112255,0.137313,141,0.141566


In [18]:
user_features = new_user_features(train_2, user_features, users_emb_df)
user_features.head(4)

Unnamed: 0,marital_status_code,homeowner_desc,hh_comp_desc,household_size_desc,user_id,0,1,2,3,4,...,11,12,13,14,mean_time,age,income,kids,average_basket,sum_per_week
0,A,Homeowner,2 Adults No Kids,2,1,4.860968,20.688519,4.658322,13.580238,-2.266062,...,20.169758,18.893497,-3.333689,1.003967,1324.803711,70,45,0,2.48729,44.356667
1,A,Homeowner,2 Adults No Kids,2,7,12.806444,12.456957,14.80447,8.6687,2.65675,...,-8.434517,14.779305,10.636637,11.172967,1622.862915,50,70,0,2.547257,74.295
2,U,Unknown,2 Adults Kids,3,8,14.641075,3.815117,10.5734,1.016455,12.063002,...,8.317652,5.88717,16.699127,9.250943,1824.990112,30,30,1,2.413793,81.666667
3,U,Homeowner,2 Adults Kids,4,13,7.826128,2.127095,15.92153,2.852006,10.092723,...,15.939384,9.8855,24.726446,2.252765,1608.363647,30,95,2,6.446182,177.27


__Разделим данные__

In [19]:
def train_test_preprocessing(data, train_1, recommender, item_features, user_features):
    """Подготовка обучающего и валидационного датасетов, разбиение на X и y"""
    
    users = pd.DataFrame(data['user_id'].unique())

    users.columns = ['user_id']

    # Пока только warm start
    train_users = train_1['user_id'].unique()
    users = users[users['user_id'].isin(train_users)]
    
    # Рекомендации на основе собственных покупок
    users = users.copy()
    users['candidates'] = users['user_id'].apply(
        lambda x: recommender.get_own_recommendations(x, N=10))

    s = users.apply(
        lambda x: pd.Series(x['candidates']), axis=1
    ).stack().reset_index(level=1, drop=True)

    s.name = 'item_id'

    users = users.drop('candidates', axis=1).join(s)

    users['flag'] = 1

    targets = data[['user_id', 'item_id']].copy()
    targets.head(2)

    targets['target'] = 1 

    targets = users.merge(targets, on=['user_id', 'item_id'], how='left')

    targets['target'].fillna(0, inplace= True)
    targets.drop('flag', axis=1, inplace=True)
    targets = targets.drop_duplicates()      

    targets = targets.merge(item_features, on='item_id', how='left')
    targets = targets.merge(user_features, on='user_id', how='left')

    X = targets.drop('target', axis=1)
    y = targets['target']
    
    return X, y

In [20]:
X_train, y_train = train_test_preprocessing(train_2, train_1, recommender, item_features, user_features)

In [21]:
cat_features = X_train.select_dtypes(include=['object']).columns.tolist()
cat_features

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'marital_status_code',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc']

In [22]:
X_train[cat_features + ['user_id', 'item_id']] = X_train[cat_features + ['user_id', 'item_id']].astype('category')

In [23]:
X_test, y_test = train_test_preprocessing(test, train_1, recommender, item_features, user_features)
X_test[cat_features + ['user_id', 'item_id']] = X_test[cat_features + ['user_id', 'item_id']].astype('category')

__Обучение__

In [24]:
lgbc = LGBMClassifier(objective='binary', categorical_feature = cat_features)

In [25]:
def get_important_features(model, X_train, y_train):
    """Список важных признаков"""
    
    model.fit(X_train, y_train)
    feature_imp = list(zip(X_train.columns.tolist(), model.feature_importances_))
    feature_imp = pd.DataFrame(feature_imp, columns=['feature', 'value'])
    basic_feats = feature_imp.loc[feature_imp.value > 0, 'feature'].tolist()
    return basic_feats

In [26]:
basic_features = get_important_features(lgbc, X_train, y_train)
basic_features

['user_id',
 'item_id',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 '0_x',
 '1_x',
 '2_x',
 '3_x',
 '4_x',
 '5_x',
 '6_x',
 '7_x',
 '8_x',
 '9_x',
 '10_x',
 '11_x',
 '12_x',
 '13_x',
 '14_x',
 'coupon_disc',
 'quantity_of_sales',
 'qnt_of_sales_per_dep',
 'qnt_of_sales_per_item_per_dep_per_week',
 'qnt_of_sales_per_sub_commodity_desc',
 'qnt_of_sales_per_item_per_sub_commodity_desc_per_week',
 'marital_status_code',
 'hh_comp_desc',
 'household_size_desc',
 '0_y',
 '1_y',
 '2_y',
 '3_y',
 '4_y',
 '5_y',
 '6_y',
 '7_y',
 '8_y',
 '9_y',
 '10_y',
 '11_y',
 '12_y',
 '13_y',
 '14_y',
 'mean_time',
 'age',
 'income',
 'kids',
 'average_basket',
 'sum_per_week']

In [27]:
lgbc = LGBMClassifier(categorical_feature=cat_features, n_estimators=525)
lgbc.fit(X_train[basic_features], y_train)

LGBMClassifier(categorical_feature=['manufacturer', 'department', 'brand',
                                    'commodity_desc', 'sub_commodity_desc',
                                    'curr_size_of_product',
                                    'marital_status_code', 'homeowner_desc',
                                    'hh_comp_desc', 'household_size_desc'],
               n_estimators=525)

In [28]:
def get_final_recomendation(X_test, test_preds_proba, val_2, train_1):
    """Финальный список рекомендованных товаров"""
    
    X_test['predict_proba'] = test_preds_proba

    X_test.sort_values(['user_id', 'predict_proba'], ascending=False, inplace=True)

    result = X_test.groupby('user_id').head(5)

    recs = result.groupby('user_id')['item_id']
    recomendations = []
    for user, preds in recs:
        recomendations.append({'user_id': user, 'recomendations': preds.tolist()})

    recomendations = pd.DataFrame(recomendations)

    result_2 = val_2.groupby('user_id')['item_id'].unique().reset_index()
    result_2.columns=['user_id', 'actual']

    result = result_2.merge(recomendations, how='left')

    cold_users = np.setdiff1d(val_2['user_id'], train_1['user_id']).tolist()

    popular_recs = popularity_recommendation(train_1, n=5)
    result.loc[result['user_id'].isin(cold_users), 'recomendations'] = result['user_id'].apply(lambda x: popular_recs)
   
    return result

In [29]:
def popularity_recommendation(data, n=5):
    """Топ-n популярных товаров"""
    
    popular = data.groupby('item_id')['quantity'].count().reset_index()
    popular.sort_values('quantity', ascending=False, inplace=True)
    popular = popular[popular['item_id'] != 999999]
    recs = popular.head(n).item_id
    return recs.tolist()

__Вычислим precision@5__

In [30]:
test_preds_proba = lgbc.predict_proba(X_test[basic_features])[:, 1]

result = get_final_recomendation(X_test, test_preds_proba, test, train_1)

In [31]:
result.head(4)

Unnamed: 0,user_id,actual,recomendations
0,1,"[880007, 883616, 931136, 938004, 940947, 94726...","[1082185, 840361, 1004906, 995242, 986947]"
1,2,"[820165, 820291, 826784, 826835, 829009, 85784...","[1133018, 1106523, 916122, 940947, 901062]"
2,3,"[827683, 908531, 989069, 1071377, 1080155, 109...","[1082185, 1106523, 1133018, 1092026, 938700]"
3,6,"[956902, 960791, 1037863, 1119051, 1137688, 84...","[1082185, 849843, 845208, 5569230, 840361]"


In [32]:
result.apply(lambda row: precision_at_k(row['recomendations'], row['actual']), axis=1).mean()

0.33198938992042437

__Сохраним результат__

In [33]:
result.drop('actual', axis=1, inplace=True)

In [34]:
result.head(4)

Unnamed: 0,user_id,recomendations
0,1,"[1082185, 840361, 1004906, 995242, 986947]"
1,2,"[1133018, 1106523, 916122, 940947, 901062]"
2,3,"[1082185, 1106523, 1133018, 1092026, 938700]"
3,6,"[1082185, 849843, 845208, 5569230, 840361]"


In [38]:
result.to_csv('recommendations.csv', index=False)