### Рекомендательные системы. Курсовой проект.

##### Целевая метрика precision@5

In [1]:
import pandas as pd
import numpy as np

# Для работы с матрицами
from scipy.sparse import csr_matrix
# Матричная факторизация
from implicit import als
# Модель второго уровня
from lightgbm import LGBMClassifier

import os
import sys

# Make the parent of the current working directory importable
# (presumably where the `src` package used by later cells lives).
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
import matplotlib
import matplotlib.image as img
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
%matplotlib inline
# Notebook-wide plot defaults: font size, figure size and a grid on every axes.
matplotlib.rcParams.update({
    'font.size': 12,
    'figure.figsize': (12, 6),
    'axes.grid': True,
})

In [3]:
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items, get_targets_sec_level, extend_new_user_features, extend_new_item_features, \
extend_user_item_new_features, get_popularity_recommendations, postfilter_items, get_final_recomendations
from src.recommenders import MainRecommender

In [4]:
# Raw inputs: purchase transactions, product catalogue and household demographics.
data = pd.read_csv('../raw_data/retail_train.csv')
item_features = pd.read_csv('../raw_data/product.csv')
user_features = pd.read_csv('../raw_data/hh_demographic.csv')

# Column processing: lower-case the feature-table headers and rename their keys
# to the `item_id` / `user_id` names used by the transaction table.
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features = item_features.rename(columns={'product_id': 'item_id'})
user_features = user_features.rename(columns={'household_key': 'user_id'})

# -- older purchases -- | -- 6 weeks -- | -- 3 weeks --
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

# Week boundaries, computed once instead of re-scanning `week_no` for every mask.
last_week = data['week_no'].max()
lvl_1_val_start = last_week - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)
lvl_2_val_start = last_week - val_lvl_2_size_weeks
# NOTE(review): `week_no >= last_week - 3` keeps week numbers last_week-3 .. last_week
# inclusive, i.e. four week numbers rather than three. Left unchanged so the
# reported metric stays comparable; confirm whether this is intended.

# Each split is an explicit copy: the un-copied slices of `data` made pandas emit
# SettingWithCopyWarning when a helper later assigned a new column to them.
data_train_lvl_1 = data[data['week_no'] < lvl_1_val_start].copy()
data_val_lvl_1 = data[(data['week_no'] >= lvl_1_val_start) &
                      (data['week_no'] < lvl_2_val_start)].copy()

# The level-1 validation window doubles as the level-2 training window.
data_train_lvl_2 = data_val_lvl_1.copy()
data_val_lvl_2 = data[data['week_no'] >= lvl_2_val_start].copy()

data_train_lvl_1.head(5)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


#### Предварительная фильтрация

In [5]:
# Count distinct items before and after pre-filtering to show how far the catalogue shrinks.
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(
    data_train_lvl_1,
    item_features=item_features,
    take_n_popular=3000,
)

n_items_after = data_train_lvl_1['item_id'].nunique()
shrink_message = 'Decreased # items from {} to {}'.format(n_items_before, n_items_after)
print(shrink_message)

Decreased # items from 83685 to 3001


#### Обучаем модель первого уровня

In [6]:
# Build the first-level recommender from the pre-filtered level-1 training transactions.
recommender = MainRecommender(data_train_lvl_1)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3001.0), HTML(value='')))




#### Эмбеддинги

In [7]:
# Item and user embedding tables exposed by the recommender; passed to the
# feature-building helper in the following cells.
items_emb_df = recommender.items_emb_df
users_emb_df = recommender.users_emb_df

#### Добавляем новые признаки

In [8]:
# Number of recommendations
N = 100

# Second-level training frame: level-2 training window plus the features derived
# from the level-1 data, the recommender, the side tables and the embeddings.
train = extend_user_item_new_features(
    data_train_lvl_2,
    data_train_lvl_1,
    recommender,
    item_features,
    user_features,
    items_emb_df,
    users_emb_df,
    N,
)
train.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,19_y,mean_time,age,income,children,avr_bask,sum_per_week,count_purchases_week_mean,sum_purchases_week_mean,target
0,2070,40618492260,594,1019940,1,1.0,311,-0.29,40,86,...,-1.880183,1274.421509,50.0,70.0,0.0,2.290045,77.86153,0.000659,0.00274,0.0
1,2021,40618753059,594,840361,1,0.99,443,0.0,101,86,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002445,0.002537,0.0
2,2021,40618753059,594,856060,1,1.77,443,-0.09,101,86,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002657,0.002727,0.0
3,2021,40618753059,594,869344,1,1.67,443,-0.22,101,86,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003816,0.005256,0.0
4,2021,40618753059,594,896862,2,5.0,443,-2.98,101,86,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011038,0.007207,1.0


In [9]:
# Separate the feature matrix from the `target` column.
X_train = train.drop(columns=['target'])
# `Series.ravel()` is deprecated in recent pandas; `to_numpy()` returns the same 1-D ndarray.
y_train = train['target'].to_numpy()

In [10]:
# Collect the object-dtype columns. The builtin `object` is used because the
# `np.object` alias was removed from NumPy (AttributeError on NumPy >= 1.24).
cat_features = [col for col in X_train.columns if X_train[col].dtype == object]

# Cast those columns, plus the two id columns, to pandas `category` dtype.
train_category_cols = cat_features + ['user_id', 'item_id']
X_train[train_category_cols] = X_train[train_category_cols].astype('category')

In [11]:
# Build the hold-out frame with the same helper and argument layout as the training frame.
test = extend_user_item_new_features(
    data_val_lvl_2,
    data_val_lvl_1,
    recommender,
    item_features,
    user_features,
    items_emb_df,
    users_emb_df,
    N,
)
X_test = test.drop(columns=['target'])
# `Series.ravel()` is deprecated in recent pandas; `to_numpy()` returns the same 1-D ndarray.
y_test = test['target'].to_numpy()

# Same categorical casting as applied to X_train.
test_category_cols = cat_features + ['user_id', 'item_id']
X_test[test_category_cols] = X_test[test_category_cols].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['price']=data['sales_value']/data['quantity']


In [12]:
%%time
lgb = LGBMClassifier(objective='binary', max_depth = 7, categorical_column=cat_features)
lgb.fit(X_train, y_train)



Wall time: 1.75 s


LGBMClassifier(categorical_column=['department', 'brand', 'commodity_desc',
                                   'sub_commodity_desc', 'curr_size_of_product',
                                   'marital_status_code', 'homeowner_desc',
                                   'hh_comp_desc', 'household_size_desc'],
               max_depth=7, objective='binary')

In [13]:
def get_important_features(model, X_train, y_train):
    """Fit ``model`` and return the names of features with positive importance.

    Args:
        model: estimator exposing ``fit(X, y)`` and, after fitting, a
            ``feature_importances_`` sequence.
        X_train: DataFrame of features; its columns are paired positionally
            with the importances.
        y_train: target passed straight through to ``model.fit``.

    Returns:
        list: column names whose importance is strictly greater than zero,
        in the original column order.
    """
    # Feature selection: note the model is fitted here, so the caller's estimator is mutated.
    model.fit(X_train, y_train)
    column_names = X_train.columns.tolist()
    return [
        name
        for name, importance in zip(column_names, model.feature_importances_)
        if importance > 0
    ]

In [14]:
# Names of the columns the classifier assigned a positive importance to.
# The helper refits `lgb` on the full feature set as a side effect.
important_features = get_important_features(lgb, X_train, y_train)

#### Обучаем модель второго уровня

In [15]:
%%time
# Refit the same classifier using only the columns listed in `important_features`.
lgb.fit(X_train[important_features], y_train)



Wall time: 1.89 s


LGBMClassifier(categorical_column=['department', 'brand', 'commodity_desc',
                                   'sub_commodity_desc', 'curr_size_of_product',
                                   'marital_status_code', 'homeowner_desc',
                                   'hh_comp_desc', 'household_size_desc'],
               max_depth=7, objective='binary')

In [16]:
# Score the hold-out rows on the selected columns only.
X_test_selected = X_test[important_features]
preds = lgb.predict(X_test_selected)
# Keep the second `predict_proba` column (presumably the positive-class probability).
test_preds_proba = lgb.predict_proba(X_test_selected)[:, 1]

#### Финальная фильтрация данных

In [17]:
from tqdm.notebook import tqdm_notebook
# Register tqdm's pandas integration; presumably required by the progress bar
# shown while the final recommendations are built in the next cell.
tqdm_notebook.pandas()

  from pandas import Panel


In [18]:
# Build the per-user final recommendation table from the second-level scores.
# NOTE(review): the full `data` frame, which includes the validation weeks, is
# passed here; confirm the helper does not leak hold-out purchases into the
# recommendations.
result = get_final_recomendations(X_test, test_preds_proba, data, data_train_lvl_1, item_features)

HBox(children=(FloatProgress(value=0.0, max=2499.0), HTML(value='')))




In [19]:
# Preview the first rows of the recommendation table.
result.head()

Unnamed: 0,user_id,actual,recomendations
0,1,"[825123, 831447, 840361, 845307, 852014, 85498...","[874972, 1132231, 1048918, 8293439, 1132771]"
1,2,"[854852, 930118, 1077555, 1098066, 5567388, 55...","[874972, 1029743, 844179, 5569230, 916122]"
2,3,"[866211, 878996, 882830, 904360, 921345, 93194...","[874972, 1029743, 1106523, 5569230, 916122]"
3,4,"[836163, 857849, 877523, 878909, 883932, 89142...","[874972, 1029743, 5569230, 916122, 844179]"
4,5,"[938983, 5980822, 1012352, 825538, 1002499, 69...","[874972, 1106523, 5569230, 916122, 844179]"


#### Метрика precision@5

In [20]:
# Precision@5 for every user, then the mean over all users.
precision_per_user = result.apply(
    lambda row: precision_at_k(row['recomendations'], row['actual'], k=5),
    axis=1,
)
precision_per_user.mean()

0.6982793117246832

#### Сохранение предсказаний

In [21]:
# Save every column except `actual`. The drop is done on a copy, so `result`
# is left untouched and the cell can be re-run without a KeyError on the
# already-removed column.
result.drop(columns='actual').to_csv('recommendations.csv', index=False)