# Вебинар 6. Двухуровневые модели рекомендаций


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
# from src.utils import prefilter_items
from src.recommenders import MainRecommender
from lightgbm import LGBMRanker
from lightgbm import LGBMClassifier



In [2]:
data = pd.read_csv('../data/retail_train.csv')
item_features = pd.read_csv('../data/product.csv')
user_features = pd.read_csv('../data/hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)


# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

top_popular_500 = data.groupby('item_id')['item_id'].count().sort_values(ascending=False).head(500).tolist()

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
# n_items_before = data_train_lvl_1['item_id'].nunique()

# #параметры функции могут отличаться
# data_train_lvl_1 = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=5000)


# n_items_after = data_train_lvl_1['item_id'].nunique()
# print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

In [4]:
# recommender.get_als_recommendations(2375, N=5)

In [5]:
# recommender.get_own_recommendations(2375, N=5)

In [6]:
# recommender.get_similar_items_recommendation(2375, N=5)

In [7]:
# recommender.get_similar_users_recommendation(2375, N=5)

### Задание 1

A) Попробуйте различные варианты генерации кандидатов. Какие из них дают наибольший recall@k ?
- Пока пробуем отобрать 50 кандидатов (k=50)
- Качество измеряем на data_val_lvl_1: следующие 6 недель после трейна

Дают ли own recommendtions + top-popular лучший recall?  

B)* Как зависит recall@k от k? Постройте для одной схемы генерации кандидатов эту зависимость для k = {20, 50, 100, 200, 500}  
C)* Исходя из прошлого вопроса, как вы думаете, какое значение k является наиболее разумным?


In [8]:
my_recomender = MainRecommender(data=data_train_lvl_1,data_test=data_val_lvl_1)

In [9]:
# параметры предсказаний
params = {'filter_already_liked_items':False, 
                        'filter_items':[999999], 
                        "recalculate_user":True}
# параметры als
param_als = {'factors':1100, 'regularization':35, 'iterations':20, 
                             'num_threads':-1,'calculate_training_loss':True}

In [10]:
my_recomender.make_data(agg_column=('quantity','count'),filtr=[4,1],full =False)

{'status': True,
 'matrix': None,
 'params': {'agg_column': ('quantity', 'count'),
  'filtr': [4, 1],
  'full': False},
 'uim_matrix_w': <2497x5001 sparse matrix of type '<class 'numpy.float64'>'
 	with 641491 stored elements in Compressed Sparse Row format>,
 'uim_matrix': <2497x5001 sparse matrix of type '<class 'numpy.float64'>'
 	with 641491 stored elements in Compressed Sparse Row format>,
 'ium_matrix_w_tfidf': <5001x2497 sparse matrix of type '<class 'numpy.float64'>'
 	with 641491 stored elements in COOrdinate format>,
 'ium_matrix_tfidf': <5001x2497 sparse matrix of type '<class 'numpy.float64'>'
 	with 641491 stored elements in COOrdinate format>,
 'ium_matrix_w_bm25': <5001x2497 sparse matrix of type '<class 'numpy.float64'>'
 	with 641491 stored elements in COOrdinate format>,
 'ium_matrix_bm25': <5001x2497 sparse matrix of type '<class 'numpy.float64'>'
 	with 641491 stored elements in COOrdinate format>}

In [11]:
my_recomender.fit_own_recommender()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




<implicit.nearest_neighbours.ItemItemRecommender at 0x1e70dfc0340>

In [12]:
my_recomender.validation_own_recommender(N=5,params=params)

0.23307013469576984

In [13]:
my_recomender.fit_own_recommender('tf_idf')
my_recomender.validation_own_recommender(N=5,params=params)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




0.2505341384115157

In [14]:
my_recomender.fit_own_recommender('bm25')
my_recomender.validation_own_recommender(N=5,params=params)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




0.07691593125870867

In [15]:
my_recomender.fit_als(params=param_als,weighting ='bm25' )



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




<implicit.als.AlternatingLeastSquares at 0x1e72c7c7940>

In [16]:
my_recomender.validation_similar_items_recommendation()

0.4509986065954484

In [17]:
# my_recomender.validation_als_recommender(params=params)

In [18]:
result = my_recomender.data_validation['data'].drop(['full_train','predict'],axis=1)

In [19]:
def add_items(x,n):
    delta = n- len(x)
    free = [i for i in top_popular_500 if i not in x]
    if delta:
        add = free[:delta]
        x = x+add
    return x    
        

In [20]:
users_lev_1 = result.user_id.tolist()

In [None]:
res_recall = pd.DataFrame()

In [69]:
%%time

for i in [20, 50, 100, 200, 500]:
    col = f'item_simular_{i}'
    predict = my_recomender.get_similar_items_recommendation(users = users_lev_1,N=i)['similar_recommendation'].tolist()
    result[col] = predict
    result[col] = result[col].apply(lambda x: add_items (x,i))
    res_recall.loc[col,'recall'] = result.apply(lambda row : recall_at_k(row[col],row['test'],k=i),axis = 1).mean()


Wall time: 8min 29s


In [67]:
%%time

for i in [20, 50, 100, 200, 500]:
    col = f'als_{i}'
    predict = my_recomender.predict_als(users=users_lev_1,N=i,params=params)['result'].tolist()
    result[col] = predict
    result[col] = result[col].apply(lambda x: add_items (x,i))
    res_recall.loc[col,'recall'] = result.apply(lambda row : recall_at_k(row[col],row['test'],k=i),axis = 1).mean()

Wall time: 2h 40min 30s


In [70]:
res_recall

Unnamed: 0,recall
als_20,0.107261
als_50,0.168085
als_100,0.22533
als_200,0.289185
als_500,0.3737
item_simular_20,0.112366
item_simular_50,0.167533
item_simular_100,0.216337
item_simular_200,0.262514
item_simular_500,0.300258


In [72]:
result

Unnamed: 0,user_id,test,train,item_simular_20,item_simular_50,item_simular_100,item_simular_200,item_simular_500,lgb_clas,als_20,als_50,als_100,als_200,als_500
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[825123, 831447, 840361, 845307, 852014, 85498...","[856942, 1082185, 995242, 9527290, 940947, 557...","[856942, 1082185, 995242, 9527290, 940947, 557...","[856942, 1082185, 995242, 9527290, 940947, 557...","[856942, 1082185, 995242, 9527290, 940947, 557...","[856942, 1082185, 995242, 9527290, 940947, 557...","[1082185, 9655212, 1075074, 931136, 5978648]","[1082185, 995242, 9527290, 856942, 940947, 934...","[1082185, 995242, 9527290, 856942, 940947, 934...","[1082185, 995242, 9527290, 856942, 940947, 934...","[1082185, 995242, 9527290, 856942, 940947, 934...","[1082185, 995242, 9527290, 856942, 940947, 934..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[854852, 930118, 1077555, 1098066, 5567388, 55...","[8090521, 5569230, 1133018, 1106523, 1075368, ...","[8090521, 5569230, 1133018, 1106523, 1075368, ...","[8090521, 5569230, 1133018, 1106523, 1075368, ...","[8090521, 5569230, 1133018, 1106523, 1075368, ...","[8090521, 5569230, 1133018, 1106523, 1075368, ...","[1106523, 914190, 1053690, 1075368, 1133018]","[1133018, 5569230, 1082185, 1106523, 8090521, ...","[1133018, 5569230, 1082185, 1106523, 8090521, ...","[1133018, 5569230, 1082185, 1106523, 8090521, ...","[1133018, 5569230, 1082185, 1106523, 8090521, ...","[1133018, 5569230, 1082185, 1106523, 8090521, ..."
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67...","[836163, 857849, 877523, 878909, 883932, 89142...","[883932, 891423, 962229, 910109, 1121367, 1075...","[883932, 891423, 962229, 910109, 1121367, 1075...","[883932, 891423, 962229, 910109, 1121367, 1075...","[883932, 891423, 962229, 910109, 1121367, 1075...","[883932, 891423, 962229, 910109, 1121367, 1075...","[962229, 1029743, 1052294, 1075368, 944534]","[883932, 902172, 891423, 962229, 1075368, 9518...","[883932, 902172, 891423, 962229, 1075368, 9518...","[883932, 902172, 891423, 962229, 1075368, 9518...","[883932, 902172, 891423, 962229, 1075368, 9518...","[883932, 902172, 891423, 962229, 1075368, 9518..."
3,6,"[1024306, 1102949, 6548453, 835394, 940804, 96...","[840361, 851494, 851819, 851903, 863447, 87623...","[1082185, 840361, 1119051, 1037863, 863447, 55...","[1082185, 840361, 1119051, 1037863, 863447, 55...","[1082185, 840361, 1119051, 1037863, 863447, 55...","[1082185, 840361, 1119051, 1037863, 863447, 55...","[1082185, 840361, 1119051, 1037863, 863447, 55...","[1098844, 1082185, 1029743, 6534178, 981760]","[1082185, 878996, 1024306, 1037863, 1119051, 9...","[1082185, 878996, 1024306, 1037863, 1119051, 9...","[1082185, 878996, 1024306, 1037863, 1119051, 9...","[1082185, 878996, 1024306, 1037863, 1119051, 9...","[1082185, 878996, 1024306, 1037863, 1119051, 9..."
4,7,"[836281, 843306, 845294, 914190, 920456, 93886...","[865569, 886703, 889731, 893400, 995436, 10205...","[1082185, 1122358, 6944571, 1022003, 828867, 1...","[1082185, 1122358, 6944571, 1022003, 828867, 1...","[1082185, 1122358, 6944571, 1022003, 828867, 1...","[1082185, 1122358, 6944571, 1022003, 828867, 1...","[1082185, 1122358, 6944571, 1022003, 828867, 1...","[1082185, 1029743, 1126899, 1106523, 5978656]","[1082185, 1122358, 828867, 6944571, 1022003, 8...","[1082185, 1122358, 828867, 6944571, 1022003, 8...","[1082185, 1122358, 828867, 6944571, 1022003, 8...","[1082185, 1122358, 828867, 6944571, 1022003, 8...","[1082185, 1122358, 828867, 6944571, 1022003, 8..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2148,2496,"[831509, 867188, 1013623, 1048851, 5592734, 16...","[840361, 852159, 871756, 886703, 899624, 91612...","[981760, 883404, 916122, 5569230, 1056509, 995...","[981760, 883404, 916122, 5569230, 1056509, 995...","[981760, 883404, 916122, 5569230, 1056509, 995...","[981760, 883404, 916122, 5569230, 1056509, 995...","[981760, 883404, 916122, 5569230, 1056509, 995...","[6534178, 1082185, 1106523, 10285187, 10285149]","[1133018, 914190, 981760, 916122, 995876, 1106...","[1133018, 914190, 981760, 916122, 995876, 1106...","[1133018, 914190, 981760, 916122, 995876, 1106...","[1133018, 914190, 981760, 916122, 995876, 1106...","[1133018, 914190, 981760, 916122, 995876, 1106..."
2149,2497,"[820291, 824759, 838797, 859010, 859075, 86077...","[838220, 1037840, 1052294, 5569230, 8090537, 1...","[860776, 995785, 1066685, 965719, 896938, 8705...","[860776, 995785, 1066685, 965719, 896938, 8705...","[860776, 995785, 1066685, 965719, 896938, 8705...","[860776, 995785, 1066685, 965719, 896938, 8705...","[860776, 995785, 1066685, 965719, 896938, 8705...","[1082185, 1029743, 1119089, 896938, 965719]","[995785, 904360, 860776, 845208, 5569230, 8996...","[995785, 904360, 860776, 845208, 5569230, 8996...","[995785, 904360, 860776, 845208, 5569230, 8996...","[995785, 904360, 860776, 845208, 5569230, 8996...","[995785, 904360, 860776, 845208, 5569230, 8996..."
2150,2498,"[865511, 962991, 1076374, 1102358, 5564901, 15...","[824555, 835576, 901776, 904023, 911215, 91749...","[1070820, 1126899, 1082185, 961554, 1053690, 8...","[1070820, 1126899, 1082185, 961554, 1053690, 8...","[1070820, 1126899, 1082185, 961554, 1053690, 8...","[1070820, 1126899, 1082185, 961554, 1053690, 8...","[1070820, 1126899, 1082185, 961554, 1053690, 8...","[1070820, 1106523, 1126899, 1082185, 1053690]","[1082185, 1070820, 861272, 1053690, 1126899, 8...","[1082185, 1070820, 861272, 1053690, 1126899, 8...","[1082185, 1070820, 861272, 1053690, 1126899, 8...","[1082185, 1070820, 861272, 1053690, 1126899, 8...","[1082185, 1070820, 861272, 1053690, 1126899, 8..."
2151,2499,"[861282, 921744, 1050968, 13842089, 828837, 86...","[838186, 853197, 864143, 883665, 932949, 93383...","[1070820, 826249, 5570048, 944317, 1074405, 99...","[1070820, 826249, 5570048, 944317, 1074405, 99...","[1070820, 826249, 5570048, 944317, 1074405, 99...","[1070820, 826249, 5570048, 944317, 1074405, 99...","[1070820, 826249, 5570048, 944317, 1074405, 99...","[1070820, 1082185, 6534178, 6533889, 866528]","[826249, 883404, 1070820, 1098066, 965766, 999...","[826249, 883404, 1070820, 1098066, 965766, 999...","[826249, 883404, 1070820, 1098066, 965766, 999...","[826249, 883404, 1070820, 1098066, 965766, 999...","[826249, 883404, 1070820, 1098066, 965766, 999..."


### Задание 2.

Обучите модель 2-ого уровня, при этом:
    - Добавьте минимум по 2 фичи для юзера, товара и пары юзер-товар
    - Измерьте отдельно precision@5 модели 1-ого уровня и двухуровневой модели на data_val_lvl_2
    - Вырос ли precision@5 при использовании двухуровневой модели?

In [24]:
df = pd.DataFrame({'user_id':result.user_id.values.repeat(50),
                 'item_id':np.concatenate(result.item_simular_50.values)})

In [283]:
df_als = pd.DataFrame({'user_id':result.user_id.values.repeat(50),
                 'item_id':np.concatenate(result.als_50.values)})

In [284]:
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_2= targets_lvl_2.groupby(['user_id', 'item_id']).head(1)
targets_lvl_2['target'] = 1  # тут только покупки 

targets_lvl_2 = df.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')
targets_lvl_2_als = df_als.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')
targets_lvl_2['target'].fillna(0, inplace= True)
targets_lvl_2_als['target'].fillna(0, inplace= True)

In [285]:
targets_lvl_2.target.mean(),targets_lvl_2_als.target.mean()

(0.19505805852299118, 0.15119368323269855)

In [29]:
item_features

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ
...,...,...,...,...,...,...,...
92348,18293142,6384,DRUG GM,National,BOOKSTORE,PAPERBACK BOOKS,
92349,18293439,6393,DRUG GM,National,BOOKSTORE,CHILDRENS LOW END,
92350,18293696,6406,DRUG GM,National,BOOKSTORE,PAPERBACK BEST SELLER,
92351,18294080,6442,DRUG GM,National,BOOKSTORE,PAPERBACK BOOKS,


In [30]:
user_features

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7
2,25-34,U,25-34K,Unknown,2 Adults Kids,3,1,8
3,25-34,U,75-99K,Homeowner,2 Adults Kids,4,2,13
4,45-54,B,50-74K,Homeowner,Single Female,1,None/Unknown,16
...,...,...,...,...,...,...,...,...
796,35-44,U,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,2494
797,45-54,A,75-99K,Homeowner,Unknown,3,1,2496
798,45-54,U,35-49K,Unknown,Single Male,1,None/Unknown,2497
799,25-34,U,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,2498


In [286]:
targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')
targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left')

targets_lvl_2_als = targets_lvl_2_als.merge(item_features, on='item_id', how='left')
targets_lvl_2_als = targets_lvl_2_als.merge(user_features, on='user_id', how='left')

In [32]:
data_for_featch= data[data.week_no<(data.week_no.max()-val_lvl_2_size_weeks)]

In [33]:
data_for_featch

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.60,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.00,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.30,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.00,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.50,364,-0.39,1631,1,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2282320,222,41297772783,635,1120741,1,0.59,304,0.00,1716,91,0.0,0.0
2282321,462,41297773713,635,993339,1,1.99,304,0.00,2040,91,0.0,0.0
2282322,462,41297773713,635,995242,1,1.00,304,-0.89,2040,91,0.0,0.0
2282323,462,41297773713,635,10180324,1,3.00,304,-0.29,2040,91,0.0,0.0


In [34]:
total_user_purchase = data_for_featch.groupby('user_id').user_id.count()
total_item_purchase = data_for_featch.groupby('item_id').item_id.count()

In [35]:
user_average_check = data_for_featch.groupby(['user_id','basket_id'])['sales_value'].mean()
user_average_check = user_average_check.groupby('user_id').mean()
user_average_check = dict(user_average_check)

In [36]:
number_of_baskets_per_week = data_for_featch.groupby(['user_id','week_no'])['user_id'].count()
number_of_baskets_per_week = number_of_baskets_per_week.groupby('user_id').mean()
number_of_baskets_per_week = dict(number_of_baskets_per_week)

In [37]:
user_average_basket_count = data_for_featch.groupby(['user_id','basket_id'])['sales_value'].count()
user_average_basket_count = user_average_basket_count.groupby('user_id').mean()
user_average_basket_count = dict(user_average_check)

In [38]:
days_of_purchase = data_for_featch[['user_id','day']].copy()
days_of_purchase['week_day'] = days_of_purchase.day%7
days_of_purchase.loc[days_of_purchase.week_day==0,'week_day'] = 7
days_of_purchase = pd.pivot_table(days_of_purchase,index='user_id',columns='week_day',values = 'day',
                           aggfunc='count',   fill_value=0)
days_of_purchase['total'] = total_user_purchase

In [39]:
days_of_purchase

week_day,1,2,3,4,5,6,7,total
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,315,381,179,356,43,106,127,1507
2,21,99,116,86,101,187,45,655
3,199,98,125,66,258,5,119,870
4,41,0,58,7,32,110,53,301
5,9,2,29,14,39,90,36,219
...,...,...,...,...,...,...,...,...
2496,197,148,71,495,399,68,41,1419
2497,270,292,254,221,208,229,302,1776
2498,75,116,100,114,156,130,102,793
2499,184,31,167,307,170,46,38,943


In [287]:
targets_lvl_2['user_average_check']=targets_lvl_2.user_id.map(user_average_check)
targets_lvl_2_als['user_average_check']=targets_lvl_2_als.user_id.map(user_average_check)

In [288]:
targets_lvl_2['number_of_baskets_per_week']=targets_lvl_2.user_id.map(number_of_baskets_per_week)
targets_lvl_2_als['number_of_baskets_per_week']=targets_lvl_2_als.user_id.map(number_of_baskets_per_week)

In [289]:
targets_lvl_2['user_average_basket_count']=targets_lvl_2.user_id.map(user_average_basket_count)
targets_lvl_2_als['user_average_basket_count']=targets_lvl_2_als.user_id.map(user_average_basket_count)

In [290]:
for i in range(1,8):
    targets_lvl_2[f'week_day{i}']= targets_lvl_2.user_id.map(days_of_purchase[i]/days_of_purchase['total'])
    targets_lvl_2_als[f'week_day{i}']= targets_lvl_2_als.user_id.map(days_of_purchase[i]/days_of_purchase['total'])

In [45]:
item_average_basket_count = data_for_featch.groupby(['item_id','basket_id'])['sales_value'].count()
item_average_basket_count = item_average_basket_count.groupby('item_id').mean()


In [46]:
days_of_purchase_item = data_for_featch[['item_id','day']].copy()
days_of_purchase_item['week_day'] = days_of_purchase_item.day%7
days_of_purchase_item.loc[days_of_purchase_item.week_day==0,'week_day'] = 7
days_of_purchase_item = pd.pivot_table(days_of_purchase_item,index='item_id',columns='week_day',values = 'day',
                           aggfunc='count',   fill_value=0)
days_of_purchase_item['total'] = total_item_purchase

In [47]:
item_price = data_for_featch[['item_id','quantity','sales_value']].copy()
item_price['price'] = item_price.sales_value/item_price.quantity
item_price = item_price.groupby('item_id')['price'].max()

In [291]:
targets_lvl_2['item_average_basket_count']=targets_lvl_2.item_id.map(item_average_basket_count)
targets_lvl_2_als['item_average_basket_count']=targets_lvl_2_als.item_id.map(item_average_basket_count)

In [292]:
targets_lvl_2['item_price']=targets_lvl_2.item_id.map(item_price)
targets_lvl_2_als['item_price']=targets_lvl_2_als.item_id.map(item_price)

In [293]:
for i in range(1,8):
    targets_lvl_2[f'week_day_{i}_item']= targets_lvl_2.item_id.map(days_of_purchase_item[i]/days_of_purchase_item['total'])
    targets_lvl_2_als[f'week_day_{i}_item']= targets_lvl_2_als.item_id.map(days_of_purchase_item[i]/days_of_purchase_item['total'])

In [294]:
targets_lvl_2.loc[targets_lvl_2.item_price==np.inf,'item_price']=0
targets_lvl_2_als.loc[targets_lvl_2_als.item_price==np.inf,'item_price']=0

In [295]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2['target']
X_train_als = targets_lvl_2_als.drop('target', axis=1)
y_train_als = targets_lvl_2_als['target']

In [76]:
train_users = result['user_id'].unique()
result_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_2.columns=['user_id', 'actual']
result_lvl_2 = result_lvl_2[result_lvl_2['user_id'].isin(train_users)]
users_lev_2 = result_lvl_2.user_id.tolist()

In [78]:
result_lvl_2['item_sim'] =\
    my_recomender.get_similar_items_recommendation(users = users_lev_2,N=5)['similar_recommendation'].tolist()

In [455]:
score = pd.DataFrame()

In [456]:
score.loc['1_level_item_simular','precision@5'] =\
        result_lvl_2.apply(lambda row : precision_at_k(row['item_sim'],row['actual'],k=5),axis = 1).mean()

In [457]:
score

Unnamed: 0,precision@5
1_level_item_simular,0.399896


### Применим бустинг

In [458]:
cat_feats = ['manufacturer', 'department', 'brand',
       'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product',
       'age_desc', 'marital_status_code', 'income_desc', 'homeowner_desc',
       'hh_comp_desc', 'household_size_desc', 'kid_category_desc']
X_train[cat_feats] = X_train[cat_feats].astype('category')
X_train_als[cat_feats] = X_train_als[cat_feats].astype('category')

In [459]:
params_lgb = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "learning_rate": 0.1}

params_tune={    
    "n_estimators": 2000,
    "n_jobs": 15,
    "seed": 27,
    'reg_alpha': 0,
    'reg_lambda': 500,
    'max_depth': 4,
    'min_child_samples':6,
    'num_leaves':6
}

In [460]:
%%time
params_lgb.update(params_tune)
lgb = LGBMClassifier(**params_lgb)
lgb.fit(X_train, y_train,early_stopping_rounds = 90, eval_set=[(X_train, y_train)], eval_metric=['auc'],verbose=False)

train_preds = lgb.predict_proba(X_train)[:,1]

res_lgb_clas = pd.DataFrame({'user_id':X_train.user_id,'item_id':X_train.item_id,'predict':train_preds})
res_lgb_clas = res_lgb_clas.sort_values(by=['user_id','predict'],ascending=[True,False])
res_lgb_clas = res_lgb_clas.groupby('user_id').head(5)
result_lvl_2['lgb_clas'] = result_lvl_2.user_id.apply(lambda x : res_lgb_clas[res_lgb_clas.user_id==x]['item_id'].tolist())
score.loc['boost_sim_item','precision@5'] =\
        result_lvl_2.apply(lambda row : precision_at_k(row['lgb_clas'],row['actual'],k=5),axis = 1).mean()

Wall time: 12.5 s


In [461]:
score

Unnamed: 0,precision@5
1_level_item_simular,0.399896
boost_sim_item,0.33577


In [462]:
%%time
params_lgb.update(params_tune)
lgb = LGBMClassifier(**params_lgb)
lgb.fit(X_train_als, y_train_als,early_stopping_rounds = 90,eval_metric  = 'auc',eval_set=[(X_train_als,y_train_als)],verbose=False)

train_preds = lgb.predict_proba(X_train_als)[:,1]

res_lgb_clas = pd.DataFrame({'user_id':X_train_als.user_id,'item_id':X_train_als.item_id,'predict':train_preds})
res_lgb_clas = res_lgb_clas.sort_values(by=['user_id','predict'],ascending=[True,False])
res_lgb_clas = res_lgb_clas.groupby('user_id').head(5)
result_lvl_2['lgb_clas'] = result_lvl_2.user_id.apply(lambda x : res_lgb_clas[res_lgb_clas.user_id==x]['item_id'].tolist())
score.loc['boost_als','precision@5'] =\
        result_lvl_2.apply(lambda row : precision_at_k(row['lgb_clas'],row['actual'],k=5),axis = 1).mean()

Wall time: 12.8 s


In [463]:
score

Unnamed: 0,precision@5
1_level_item_simular,0.399896
boost_sim_item,0.33577
boost_als,0.328251


In [464]:
grs = X_train.groupby(['user_id'], sort=False)['item_id'].count().to_numpy()

In [465]:
lgb_params = { 
    'objective':'lambdarank',
    'boosting_type': 'gbdt',
    'n_estimators': 3000,
    'learning_rate': 0.1,
    'max_depth': 4,
    'verbose': 1,
    'is_unbalance': True,
    "n_jobs": 15,
    "seed": 27,
    'reg_alpha': 0,
    'reg_lambda': 0,
    'max_depth':6,
    'min_child_samples':10,
    'num_leaves':8
}

In [466]:
lgb = LGBMRanker(**lgb_params, silent=False)
lgb.fit(X_train, y_train, group=grs,
        eval_set=[(X_train, y_train)], eval_group=[grs],  
        eval_metric=['ndcg'],
        eval_at=[5, 10 ], early_stopping_rounds=50,verbose=False)
train_preds = lgb.predict(X_train)

res_lgb_clas = pd.DataFrame({'user_id':X_train.user_id,'item_id':X_train.item_id,'predict':train_preds})
res_lgb_clas = res_lgb_clas.sort_values(by=['user_id','predict'],ascending=[True,False])
res_lgb_clas = res_lgb_clas.groupby('user_id').head(5)
result_lvl_2['lgb_ranker'] = result_lvl_2.user_id.apply(lambda x : res_lgb_clas[res_lgb_clas.user_id==x]['item_id'].tolist())
score.loc['lgb_ranker','precision@5'] =\
        result_lvl_2.apply(lambda row : precision_at_k(row['lgb_ranker'],row['actual'],k=5),axis = 1).mean()

In [467]:
score

Unnamed: 0,precision@5
1_level_item_simular,0.399896
boost_sim_item,0.33577
boost_als,0.328251
lgb_ranker,0.35342


In [468]:
grs = X_train_als.groupby(['user_id'], sort=False)['item_id'].count().to_numpy()

In [469]:
lgb = LGBMRanker(**lgb_params, silent=False)
lgb.fit(X_train_als, y_train_als, group=grs,
        eval_set=[(X_train_als, y_train_als)], eval_group=[grs],  
        eval_metric=['ndcg'],
        eval_at=[5, 10 ], early_stopping_rounds=50,verbose=False)
train_preds = lgb.predict(X_train_als)

res_lgb_clas = pd.DataFrame({'user_id':X_train_als.user_id,'item_id':X_train_als.item_id,'predict':train_preds})
res_lgb_clas = res_lgb_clas.sort_values(by=['user_id','predict'],ascending=[True,False])
res_lgb_clas = res_lgb_clas.groupby('user_id').head(5)
result_lvl_2['lgb_ranker_als'] = result_lvl_2.user_id.apply(lambda x : res_lgb_clas[res_lgb_clas.user_id==x]['item_id'].tolist())
score.loc['lgb_ranker_als','precision@5'] =\
        result_lvl_2.apply(lambda row : precision_at_k(row['lgb_ranker_als'],row['actual'],k=5),axis = 1).mean()

In [471]:
score.sort_values(by='precision@5',ascending=False)

Unnamed: 0,precision@5
1_level_item_simular,0.399896
lgb_ranker_als,0.357493
lgb_ranker,0.35342
boost_sim_item,0.33577
boost_als,0.328251


### Финальный проект

Мы уже прошли всю необходимуб теорию для финального проекта. Проект осуществляется на данных из вебинара (данные считаны в начале ДЗ).
Рекомендуем вам **начать делать проект сразу после этого домашнего задания**
- Целевая метрика - precision@5. Порог для уcпешной сдачи проекта precision@5 > 0.27%
- Будет public тестовый датасет, на котором вы сможете измерять метрику
- Также будет private тестовый датасет для измерения финального качества
- НЕ обязательно, но крайне желательно использовать 2-ух уровневые рекоммендательные системы в проекте
- Вы сдаете код проекта в виде github репозитория и csv файл с рекомендациями 