In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Бустинг
from lightgbm import LGBMRanker
from lightgbm import LGBMClassifier

# Самостоятельно написанные классы и  функции 
from src.metrics import precision_at_k, recall_at_k
from src.recommenders import MainRecommender



In [2]:
# данные транзакций
data = pd.read_csv('data/retail_train.csv')
# Характеристики товаров
item_features = pd.read_csv('data/product.csv')
# Характеристики юзеров
user_features = pd.read_csv('data/hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)


# Важна схема обучения и валидации!
# -- давние покупки --для перфильтра  | -- 6 недель -- для обучения на бустинге  | -- 3 недель --  для тестирования

val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

top_popular_500 = data.groupby('item_id')['item_id'].count().sort_values(ascending=False).head(500).tolist()

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


### Сделаем различный ремсчеты использу только матрицу тьерацуий и implicit

In [3]:
# параметры предсказаний
params = {'filter_already_liked_items':False, 
                        'filter_items':[999999], 
                        "recalculate_user":True}
# параметры als
param_als = {'factors':1100, 'regularization':35, 'iterations':20, 
                             'num_threads':-1,'calculate_training_loss':True}

добавим еще один столбец для расчета весов

In [4]:
data_mod = data.copy()
data_mod['weight'] = data.week_no//10+1

In [5]:
score_1_level= pd.DataFrame(columns = ['agg_column','type_weight','type_technik','num_top','score'])

In [6]:
my_recomender = MainRecommender(data=data_mod,split_info=(3,'week_no'))

In [7]:
my_recomender.make_data(agg_column=('quantity','count'),filtr=[1],full =False,top = 5000)

{'status': True,
 'matrix': None,
 'params': {'agg_column': ('quantity', 'count'), 'filtr': [1], 'full': False},
 'uim_matrix_w': <2499x5001 sparse matrix of type '<class 'numpy.float64'>'
 	with 679365 stored elements in Compressed Sparse Row format>,
 'uim_matrix': <2499x5001 sparse matrix of type '<class 'numpy.float64'>'
 	with 679365 stored elements in Compressed Sparse Row format>,
 'ium_matrix_w_tfidf': <5001x2499 sparse matrix of type '<class 'numpy.float64'>'
 	with 679365 stored elements in COOrdinate format>,
 'ium_matrix_tfidf': <5001x2499 sparse matrix of type '<class 'numpy.float64'>'
 	with 679365 stored elements in COOrdinate format>,
 'ium_matrix_w_bm25': <5001x2499 sparse matrix of type '<class 'numpy.float64'>'
 	with 679365 stored elements in COOrdinate format>,
 'ium_matrix_bm25': <5001x2499 sparse matrix of type '<class 'numpy.float64'>'
 	with 679365 stored elements in COOrdinate format>}

In [8]:
# own_recommender
for n,i in enumerate([None,'tf_idf','bm25']):
    my_recomender.fit_own_recommender(i)
    score_1_level.loc[n,:] = ['quantity,count', i,'own_recommender','5000',\
                    my_recomender.validation_own_recommender(N=5,params=params)]
          
score_1_level

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




Unnamed: 0,agg_column,type_weight,type_technik,num_top,score
0,"quantity,count",,own_recommender,5000,0.197111
1,"quantity,count",tf_idf,own_recommender,5000,0.211019
2,"quantity,count",bm25,own_recommender,5000,0.0652057


In [9]:
# als
for n,i in enumerate([None,'tf_idf','bm25']):
    my_recomender.fit_als(params=param_als,weighting =i )
    score_1_level.loc[score_1_level.shape[0],:] = ['quantity,count', i,'als','5000', my_recomender.validation_als_recommender(N=5,params=params)]
          
score_1_level
# todo



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




Unnamed: 0,agg_column,type_weight,type_technik,num_top,score
0,"quantity,count",,own_recommender,5000,0.197111
1,"quantity,count",tf_idf,own_recommender,5000,0.211019
2,"quantity,count",bm25,own_recommender,5000,0.0652057
3,"quantity,count",,als,5000,0.279138
4,"quantity,count",tf_idf,als,5000,0.302938
5,"quantity,count",bm25,als,5000,0.344074


In [10]:
# similar_items_recommendation
for n,i in enumerate([None,'tf_idf','bm25']):
    my_recomender.fit_als(params=param_als,weighting =i )
    score_1_level.loc[score_1_level.shape[0],:] = ['quantity,count', i,'similar_items','5000',\
                                my_recomender.validation_similar_items_recommendation()]
          
score_1_level


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




Unnamed: 0,agg_column,type_weight,type_technik,num_top,score
0,"quantity,count",,own_recommender,5000,0.197111
1,"quantity,count",tf_idf,own_recommender,5000,0.211019
2,"quantity,count",bm25,own_recommender,5000,0.0652057
3,"quantity,count",,als,5000,0.279138
4,"quantity,count",tf_idf,als,5000,0.302938
5,"quantity,count",bm25,als,5000,0.344074
6,"quantity,count",,similar_items,5000,0.400269
7,"quantity,count",tf_idf,similar_items,5000,0.400269
8,"quantity,count",bm25,similar_items,5000,0.400269


#### поменяем топ популярных с 5 до 10 тысяч

In [11]:
my_recomender.make_data(agg_column=('quantity','count'),filtr=[1],full =False,top = 10000)

{'status': True,
 'matrix': None,
 'params': {'agg_column': ('quantity', 'count'), 'filtr': [1], 'full': False},
 'uim_matrix_w': <2499x10001 sparse matrix of type '<class 'numpy.float64'>'
 	with 892043 stored elements in Compressed Sparse Row format>,
 'uim_matrix': <2499x10001 sparse matrix of type '<class 'numpy.float64'>'
 	with 892043 stored elements in Compressed Sparse Row format>,
 'ium_matrix_w_tfidf': <10001x2499 sparse matrix of type '<class 'numpy.float64'>'
 	with 892043 stored elements in COOrdinate format>,
 'ium_matrix_tfidf': <10001x2499 sparse matrix of type '<class 'numpy.float64'>'
 	with 892043 stored elements in COOrdinate format>,
 'ium_matrix_w_bm25': <10001x2499 sparse matrix of type '<class 'numpy.float64'>'
 	with 892043 stored elements in COOrdinate format>,
 'ium_matrix_bm25': <10001x2499 sparse matrix of type '<class 'numpy.float64'>'
 	with 892043 stored elements in COOrdinate format>}

In [12]:
my_recomender.top

10000

In [13]:
# fit_own_recommender
for i in [None,'tf_idf','bm25']:
    my_recomender.fit_own_recommender(i)
    score_1_level.loc[score_1_level.shape[0],:] =\
        ['quantity,count', i,'own_recommender','10 000', my_recomender.validation_own_recommender(N=5,params=params)]
          
score_1_level

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10001.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10001.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10001.0), HTML(value='')))




Unnamed: 0,agg_column,type_weight,type_technik,num_top,score
0,"quantity,count",,own_recommender,5000,0.197111
1,"quantity,count",tf_idf,own_recommender,5000,0.211019
2,"quantity,count",bm25,own_recommender,5000,0.0652057
3,"quantity,count",,als,5000,0.279138
4,"quantity,count",tf_idf,als,5000,0.302938
5,"quantity,count",bm25,als,5000,0.344074
6,"quantity,count",,similar_items,5000,0.400269
7,"quantity,count",tf_idf,similar_items,5000,0.400269
8,"quantity,count",bm25,similar_items,5000,0.400269
9,"quantity,count",,own_recommender,10 000,0.202816


In [14]:
# als
for i in [None,'tf_idf','bm25']:
    my_recomender.fit_als(params=param_als,weighting =i )
    score_1_level.loc[score_1_level.shape[0],:] = ['quantity,count', i,'als','10 000', my_recomender.validation_als_recommender(N=5,params=params)]
          
score_1_level
# todo

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




Unnamed: 0,agg_column,type_weight,type_technik,num_top,score
0,"quantity,count",,own_recommender,5000,0.197111
1,"quantity,count",tf_idf,own_recommender,5000,0.211019
2,"quantity,count",bm25,own_recommender,5000,0.0652057
3,"quantity,count",,als,5000,0.279138
4,"quantity,count",tf_idf,als,5000,0.302938
5,"quantity,count",bm25,als,5000,0.344074
6,"quantity,count",,similar_items,5000,0.400269
7,"quantity,count",tf_idf,similar_items,5000,0.400269
8,"quantity,count",bm25,similar_items,5000,0.400269
9,"quantity,count",,own_recommender,10 000,0.202816


In [15]:
# similar_items_recommendation
for n,i in enumerate([None,'tf_idf','bm25']):
    my_recomender.fit_als(params=param_als,weighting =i )
    score_1_level.loc[score_1_level.shape[0],:] = ['quantity,count', i,'similar_items','10 000',\
                                my_recomender.validation_similar_items_recommendation()]
          
score_1_level


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




Unnamed: 0,agg_column,type_weight,type_technik,num_top,score
0,"quantity,count",,own_recommender,5000,0.197111
1,"quantity,count",tf_idf,own_recommender,5000,0.211019
2,"quantity,count",bm25,own_recommender,5000,0.0652057
3,"quantity,count",,als,5000,0.279138
4,"quantity,count",tf_idf,als,5000,0.302938
5,"quantity,count",bm25,als,5000,0.344074
6,"quantity,count",,similar_items,5000,0.400269
7,"quantity,count",tf_idf,similar_items,5000,0.400269
8,"quantity,count",bm25,similar_items,5000,0.400269
9,"quantity,count",,own_recommender,10 000,0.202816


#### Изменим agg_column

In [16]:
my_recomender.make_data(agg_column=('weight','sum',),filtr=[1],full =False,top = 5000)
# fit_own_recommender
for i in [None,'tf_idf','bm25']:
    my_recomender.fit_own_recommender(i)
    score_1_level.loc[score_1_level.shape[0]+1,:] = \
    ['weight,sum', i,'own_recommender','5000', my_recomender.validation_own_recommender(N=5,params=params)]
    
    
# similar_items_recommendation
for i in [None,'tf_idf','bm25']:
    my_recomender.fit_als(params=param_als,weighting =i )
    score_1_level.loc[score_1_level.shape[0]+1,:] = ['weight,sum', i,'similar_items','5000',\
                                my_recomender.validation_similar_items_recommendation()]
          
score_1_level

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




Unnamed: 0,agg_column,type_weight,type_technik,num_top,score
0,"quantity,count",,own_recommender,5000,0.197111
1,"quantity,count",tf_idf,own_recommender,5000,0.211019
2,"quantity,count",bm25,own_recommender,5000,0.0652057
3,"quantity,count",,als,5000,0.279138
4,"quantity,count",tf_idf,als,5000,0.302938
5,"quantity,count",bm25,als,5000,0.344074
6,"quantity,count",,similar_items,5000,0.400269
7,"quantity,count",tf_idf,similar_items,5000,0.400269
8,"quantity,count",bm25,similar_items,5000,0.400269
9,"quantity,count",,own_recommender,10 000,0.202816


In [17]:
# als
for i in [None,'tf_idf','bm25']:
    my_recomender.fit_als(params=param_als,weighting =i )
    score_1_level.loc[score_1_level.shape[0]+1,:] = \
            ['weight,sum', i,'als','5000', my_recomender.validation_als_recommender(N=5,params=params)]
          
score_1_level
# todo

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




Unnamed: 0,agg_column,type_weight,type_technik,num_top,score
0,"quantity,count",,own_recommender,5000,0.197111
1,"quantity,count",tf_idf,own_recommender,5000,0.211019
2,"quantity,count",bm25,own_recommender,5000,0.0652057
3,"quantity,count",,als,5000,0.279138
4,"quantity,count",tf_idf,als,5000,0.302938
5,"quantity,count",bm25,als,5000,0.344074
6,"quantity,count",,similar_items,5000,0.400269
7,"quantity,count",tf_idf,similar_items,5000,0.400269
8,"quantity,count",bm25,similar_items,5000,0.400269
9,"quantity,count",,own_recommender,10 000,0.202816


In [18]:
my_recomender.make_data(agg_column=('week_no','sum',),filtr=[1],full =False,top = 5000)
# fit_own_recommender
for i in [None,'tf_idf','bm25']:
    my_recomender.fit_own_recommender(i)
    score_1_level.loc[score_1_level.shape[0]+1,:] = \
    ['week_no,sum', i,'own_recommender','5000', my_recomender.validation_own_recommender(N=5,params=params)]
    
    
# similar_items_recommendation
for i in [None,'tf_idf','bm25']:
    my_recomender.fit_als(params=param_als,weighting =i )
    score_1_level.loc[score_1_level.shape[0]+1,:] = ['week_no,sum', i,'similar_items','5000',\
                                my_recomender.validation_similar_items_recommendation()]
          
score_1_level

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




Unnamed: 0,agg_column,type_weight,type_technik,num_top,score
0,"quantity,count",,own_recommender,5000,0.197111
1,"quantity,count",tf_idf,own_recommender,5000,0.211019
2,"quantity,count",bm25,own_recommender,5000,0.0652057
3,"quantity,count",,als,5000,0.279138
4,"quantity,count",tf_idf,als,5000,0.302938
5,"quantity,count",bm25,als,5000,0.344074
6,"quantity,count",,similar_items,5000,0.400269
7,"quantity,count",tf_idf,similar_items,5000,0.400269
8,"quantity,count",bm25,similar_items,5000,0.400269
9,"quantity,count",,own_recommender,10 000,0.202816


In [19]:
# als
for i in [None,'tf_idf','bm25']:
    my_recomender.fit_als(params=param_als,weighting =i )
    score_1_level.loc[score_1_level.shape[0]+1,:] = \
            ['week_no,sum', i,'als','5000', my_recomender.validation_als_recommender(N=5,params=params)]
          
score_1_level
# todo

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




Unnamed: 0,agg_column,type_weight,type_technik,num_top,score
0,"quantity,count",,own_recommender,5000,0.197111
1,"quantity,count",tf_idf,own_recommender,5000,0.211019
2,"quantity,count",bm25,own_recommender,5000,0.0652057
3,"quantity,count",,als,5000,0.279138
4,"quantity,count",tf_idf,als,5000,0.302938
5,"quantity,count",bm25,als,5000,0.344074
6,"quantity,count",,similar_items,5000,0.400269
7,"quantity,count",tf_idf,similar_items,5000,0.400269
8,"quantity,count",bm25,similar_items,5000,0.400269
9,"quantity,count",,own_recommender,10 000,0.202816


### 2 уровневая модель

In [20]:
my_recomender = MainRecommender(data=data_train_lvl_1,data_test=data_val_lvl_1)
my_recomender.make_data(agg_column=('quantity','count'),filtr=[1],full =False,top = 5000)
my_recomender.fit_als(params=param_als,weighting ='bm25' )

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




<implicit.als.AlternatingLeastSquares at 0x1e8b99c4430>

создадим набор  из 50 кандидатов для рекомендаций, из которых, 5  будет выбирать бустинг. 

In [21]:
# функция для добавления номенклатуры в список, если не хватает из топ 500 популярных
def add_items(x,n):
    delta = n- len(x)
    free = [i for i in top_popular_500 if i not in x]
    if delta:
        add = free[:delta]
        x = x+add
    return x  

In [22]:
def my_LGBMClassifier(data):
    score = pd.DataFrame()
    params_lgb.update(params_tune)
    lgb = LGBMClassifier(**params_lgb)
    lgb.fit(data[0], data[1],early_stopping_rounds = 90, eval_set=[data], eval_metric=['auc'],verbose=False)

    train_preds = lgb.predict_proba(data[0])[:,1]

    res_lgb_clas = pd.DataFrame({'user_id':data[0].user_id,'item_id':data[0].item_id,'predict':train_preds})
    res_lgb_clas = res_lgb_clas.sort_values(by=['user_id','predict'],ascending=[True,False])
    res_lgb_clas = res_lgb_clas.groupby('user_id').head(5)
    result_lvl_2['lgb_clas'] = result_lvl_2.user_id.apply(lambda x : res_lgb_clas[res_lgb_clas.user_id==x]['item_id'].tolist())
    score.loc['LGBMClassifier','precision@5'] =\
            result_lvl_2.apply(lambda row : precision_at_k(row['lgb_clas'],row['actual'],k=5),axis = 1).mean()
    return score

In [23]:
def my_LGBMRanker(data):
    score = pd.DataFrame()
    lgb = LGBMRanker(**lgb_params, silent=False)
    lgb.fit(X_train_als, y_train_als, group=grs,
            eval_set=[(X_train_als, y_train_als)], eval_group=[grs],  
            eval_metric=['ndcg'],
            eval_at=[5, 10 ], early_stopping_rounds=50,verbose=False)
    train_preds = lgb.predict(X_train_als)

    res_lgb_clas = pd.DataFrame({'user_id':X_train_als.user_id,'item_id':X_train_als.item_id,'predict':train_preds})
    res_lgb_clas = res_lgb_clas.sort_values(by=['user_id','predict'],ascending=[True,False])
    res_lgb_clas = res_lgb_clas.groupby('user_id').head(5)
    result_lvl_2['lgb_ranker_als'] = result_lvl_2.user_id.apply(lambda x : res_lgb_clas[res_lgb_clas.user_id==x]['item_id'].tolist())
    score.loc['lgb_ranker','precision@5'] =\
            result_lvl_2.apply(lambda row : precision_at_k(row['lgb_ranker_als'],row['actual'],k=5),axis = 1).mean()
    return score

In [24]:
res_recall = pd.DataFrame()

In [25]:
i = 50
result = my_recomender.data_validation['data'].drop(['full_train'],axis=1)
users_lev_1 = result.user_id.tolist()
# similar_item
col = 'similar_item'
predict = my_recomender.get_similar_items_recommendation(users = users_lev_1,N=i)['similar_recommendation'].tolist()
result[col] = predict
result[col] = result[col].apply(lambda x: add_items (x,i))
res_recall.loc[col,'recall'] = result.apply(lambda row : recall_at_k(row[col],row['test'],k=i),axis = 1).mean()

# als
col = 'als'
predict = my_recomender.predict_als(users=users_lev_1,N=i,params=params)['result'].tolist()
result[col] = predict
result[col] = result[col].apply(lambda x: add_items (x,i))
res_recall.loc[col,'recall'] = result.apply(lambda row : recall_at_k(row[col],row['test'],k=i),axis = 1).mean()
res_recall


Unnamed: 0,recall
similar_item,0.16746
als,0.167738


In [26]:
result

Unnamed: 0,user_id,test,train,similar_item,als
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[825123, 831447, 840361, 845307, 852014, 85498...","[856942, 1082185, 995242, 9527290, 940947, 557...","[1082185, 995242, 9527290, 856942, 940947, 934..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[854852, 930118, 1077555, 1098066, 5567388, 55...","[8090521, 5569230, 1133018, 1106523, 1040807, ...","[1133018, 5569230, 1082185, 1106523, 8090521, ..."
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67...","[836163, 857849, 877523, 878909, 883932, 89142...","[883932, 891423, 910109, 962229, 1121367, 1075...","[883932, 902172, 891423, 962229, 1075368, 9518..."
3,6,"[1024306, 1102949, 6548453, 835394, 940804, 96...","[840361, 851494, 851819, 851903, 863447, 87623...","[1082185, 840361, 1119051, 1037863, 863447, 55...","[1082185, 878996, 1024306, 1037863, 1119051, 9..."
4,7,"[836281, 843306, 845294, 914190, 920456, 93886...","[865569, 886703, 889731, 893400, 995436, 10205...","[1082185, 1122358, 6944571, 1022003, 828867, 1...","[1082185, 1122358, 828867, 6944571, 1022003, 8..."
...,...,...,...,...,...
2148,2496,"[831509, 867188, 1013623, 1048851, 5592734, 16...","[840361, 852159, 871756, 886703, 899624, 91612...","[981760, 916122, 883404, 5569230, 995876, 1056...","[1133018, 914190, 916122, 981760, 995876, 1106..."
2149,2497,"[820291, 824759, 838797, 859010, 859075, 86077...","[838220, 1037840, 1052294, 5569230, 8090537, 1...","[860776, 995785, 1066685, 965719, 896938, 8705...","[995785, 904360, 860776, 5569230, 845208, 8996..."
2150,2498,"[865511, 962991, 1076374, 1102358, 5564901, 15...","[824555, 835576, 901776, 904023, 911215, 91749...","[1082185, 1070820, 1126899, 961554, 1053690, 1...","[1082185, 1070820, 861272, 1053690, 1126899, 8..."
2151,2499,"[861282, 921744, 1050968, 13842089, 828837, 86...","[838186, 853197, 864143, 883665, 932949, 93383...","[1070820, 826249, 5570048, 944317, 1074405, 55...","[826249, 883404, 1070820, 1098066, 965766, 999..."


In [27]:
df = pd.DataFrame({'user_id':result.user_id.values.repeat(50),
                 'item_id':np.concatenate(result.similar_item.values)})

df_als = pd.DataFrame({'user_id':result.user_id.values.repeat(50),
                 'item_id':np.concatenate(result.als.values)})

In [28]:
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_2= targets_lvl_2.groupby(['user_id', 'item_id']).head(1)
targets_lvl_2['target'] = 1  # тут только покупки

In [29]:
# Посмотрим на результат бустинга без фичей

In [30]:
params_lgb = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "learning_rate": 0.1}

params_tune={    
    "n_estimators": 2000,
    "n_jobs": 15,
    "seed": 27,
    'reg_alpha': 0,
    'reg_lambda': 500,
    'max_depth': 4,
    'min_child_samples':6,
    'num_leaves':6
}

lgb_params = { 
    'objective':'lambdarank',
    'boosting_type': 'gbdt',
    'n_estimators': 3000,
    'learning_rate': 0.1,
    'max_depth': 4,
    'verbose': 1,
    'is_unbalance': True,
    "n_jobs": 15,
    "seed": 27,
    'reg_alpha': 0,
    'reg_lambda': 0,  
    'max_depth':6,
    'min_child_samples':10,
    'num_leaves':8
}

In [31]:
targets_lvl_2 = df.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')
targets_lvl_2_als = df_als.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')
targets_lvl_2['target'].fillna(0, inplace= True)
targets_lvl_2_als['target'].fillna(0, inplace= True)
targets_lvl_2.target.mean(),targets_lvl_2_als.target.mean()

(0.19477008824895495, 0.15124941941477008)

In [32]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2['target']
X_train_als = targets_lvl_2_als.drop('target', axis=1)
y_train_als = targets_lvl_2_als['target']

In [33]:
res_score=pd.DataFrame(columns=['precision@5','note'])

In [34]:
train_users = result['user_id'].unique()
result_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_2.columns=['user_id', 'actual']
result_lvl_2 = result_lvl_2[result_lvl_2['user_id'].isin(train_users)]
users_lev_2 = result_lvl_2.user_id.tolist()

In [35]:
res_ = my_LGBMClassifier((X_train, y_train))
res_score=pd.concat([res_score,res_])
res_score.tail(1).note = 'simular_item_no_featch'

In [36]:
res_ = my_LGBMClassifier((X_train_als, y_train_als))
res_score=pd.concat([res_score,res_])
res_score.tail(1).note = 'als_no_featch'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [37]:
res_score

Unnamed: 0,precision@5,note
LGBMClassifier,0.30705,simular_item_no_featch
LGBMClassifier,0.28846,als_no_featch


In [38]:
# Попробуем ранкер

In [39]:
grs = X_train.groupby(['user_id'], sort=False)['item_id'].count().to_numpy()

In [40]:
res_ = my_LGBMRanker((X_train, y_train))
res_score=pd.concat([res_score,res_])
res_score.tail(1).note = 'simular_item_no_featch'

res_ = my_LGBMRanker((X_train_als, y_train_als))
res_score=pd.concat([res_score,res_])
res_score.tail(1).note = 'als_no_featch'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [41]:
res_score

Unnamed: 0,precision@5,note
LGBMClassifier,0.30705,simular_item_no_featch
LGBMClassifier,0.28846,als_no_featch
lgb_ranker,0.301828,simular_item_no_featch
lgb_ranker,0.301828,als_no_featch


In [42]:
# Добавим в качестве фичей элементы матрицы bm52

In [43]:
id_to_item = my_recomender.id_to_itemid
id_to_user = my_recomender.id_to_userid
b = my_recomender.user_item_matrix['ium_matrix_bm25']
items_id = b.nonzero()[0]
users_id= b.nonzero()[1]
data_id = b.data
my_dict  ={ (id_to_user[users_id[i]],id_to_item[items_id[i]]):b.data[i] for i in range(len(items_id))}

In [44]:
%%time
targets_lvl_2['bm52']  = 0
for  i in targets_lvl_2.iterrows():
    row = i[1]
    try:
        targets_lvl_2.loc[i[0],'bm52']=my_dict[(row.user_id,row.item_id)]
    except:
        targets_lvl_2.loc[i[0],'bm52'] = 0

targets_lvl_2_als['bm52']  = 0
for  i in targets_lvl_2_als.iterrows():
    row = i[1]
    try:
        targets_lvl_2_als.loc[i[0],'bm52']=my_dict[(row.user_id,row.item_id)]
    except:
        targets_lvl_2_als.loc[i[0],'bm52'] = 0    

Wall time: 54.6 s


In [45]:
# Посмотрим на результаты

In [46]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2['target']
X_train_als = targets_lvl_2_als.drop('target', axis=1)
y_train_als = targets_lvl_2_als['target']

In [47]:
res_ = my_LGBMClassifier((X_train, y_train))
res_score=pd.concat([res_score,res_])
res_score.tail(1).note = 'simular_item_only_bm52'

res_ = my_LGBMClassifier((X_train_als, y_train_als))
res_score=pd.concat([res_score,res_])
res_score.tail(1).note = 'als_only_bm52'


res_ = my_LGBMRanker((X_train, y_train))
res_score=pd.concat([res_score,res_])
res_score.tail(1).note = 'simular_item_only_bm52'

res_ = my_LGBMRanker((X_train_als, y_train_als))
res_score=pd.concat([res_score,res_])
res_score.tail(1).note = 'als_no_only_bm52'
res_score

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

Unnamed: 0,precision@5,note
LGBMClassifier,0.30705,simular_item_no_featch
LGBMClassifier,0.28846,als_no_featch
lgb_ranker,0.301828,simular_item_no_featch
lgb_ranker,0.301828,als_no_featch
LGBMClassifier,0.312689,simular_item_only_bm52
LGBMClassifier,0.309034,als_only_bm52
lgb_ranker,0.340574,simular_item_only_bm52
lgb_ranker,0.340574,als_no_only_bm52


In [48]:
res_score

Unnamed: 0,precision@5,note
LGBMClassifier,0.30705,simular_item_no_featch
LGBMClassifier,0.28846,als_no_featch
lgb_ranker,0.301828,simular_item_no_featch
lgb_ranker,0.301828,als_no_featch
LGBMClassifier,0.312689,simular_item_only_bm52
LGBMClassifier,0.309034,als_only_bm52
lgb_ranker,0.340574,simular_item_only_bm52
lgb_ranker,0.340574,als_no_only_bm52


In [49]:
# Зафиксировали прирост по всем позициям

In [50]:
# Добавм данные о пользователяъ и товарах и еще раз проверим результаты

In [51]:
item_features

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ
...,...,...,...,...,...,...,...
92348,18293142,6384,DRUG GM,National,BOOKSTORE,PAPERBACK BOOKS,
92349,18293439,6393,DRUG GM,National,BOOKSTORE,CHILDRENS LOW END,
92350,18293696,6406,DRUG GM,National,BOOKSTORE,PAPERBACK BEST SELLER,
92351,18294080,6442,DRUG GM,National,BOOKSTORE,PAPERBACK BOOKS,


In [52]:
user_features

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7
2,25-34,U,25-34K,Unknown,2 Adults Kids,3,1,8
3,25-34,U,75-99K,Homeowner,2 Adults Kids,4,2,13
4,45-54,B,50-74K,Homeowner,Single Female,1,None/Unknown,16
...,...,...,...,...,...,...,...,...
796,35-44,U,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,2494
797,45-54,A,75-99K,Homeowner,Unknown,3,1,2496
798,45-54,U,35-49K,Unknown,Single Male,1,None/Unknown,2497
799,25-34,U,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,2498


In [53]:
targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')
targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left')

targets_lvl_2_als = targets_lvl_2_als.merge(item_features, on='item_id', how='left')
targets_lvl_2_als = targets_lvl_2_als.merge(user_features, on='user_id', how='left')


In [54]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2['target']
X_train_als = targets_lvl_2_als.drop('target', axis=1)
y_train_als = targets_lvl_2_als['target']

cat_feats = ['manufacturer', 'department', 'brand',
       'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product',
       'age_desc', 'marital_status_code', 'income_desc', 'homeowner_desc',
       'hh_comp_desc', 'household_size_desc', 'kid_category_desc']
X_train[cat_feats] = X_train[cat_feats].astype('category')
X_train_als[cat_feats] = X_train_als[cat_feats].astype('category')


res_ = my_LGBMClassifier((X_train, y_train))
res_score=pd.concat([res_score,res_])
res_score.tail(1).note = 'simular_item_bm52_iu'

res_ = my_LGBMClassifier((X_train_als, y_train_als))
res_score=pd.concat([res_score,res_])
res_score.tail(1).note = 'als_bm52_ui'


res_ = my_LGBMRanker((X_train, y_train))
res_score=pd.concat([res_score,res_])
res_score.tail(1).note = 'simular_item_bm52_ui'

res_ = my_LGBMRanker((X_train_als, y_train_als))
res_score=pd.concat([res_score,res_])
res_score.tail(1).note = 'als_no_bm52_ui'
res_score

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

Unnamed: 0,precision@5,note
LGBMClassifier,0.30705,simular_item_no_featch
LGBMClassifier,0.28846,als_no_featch
lgb_ranker,0.301828,simular_item_no_featch
lgb_ranker,0.301828,als_no_featch
LGBMClassifier,0.312689,simular_item_only_bm52
LGBMClassifier,0.309034,als_only_bm52
lgb_ranker,0.340574,simular_item_only_bm52
lgb_ranker,0.340574,als_no_only_bm52
LGBMClassifier,0.32,simular_item_bm52_iu
LGBMClassifier,0.323342,als_bm52_ui


In [55]:
# Добавим фичей

In [56]:
data_for_featch= data[data.week_no<(data.week_no.max()-val_lvl_2_size_weeks)]

In [57]:
total_user_purchase = data_for_featch.groupby('user_id').user_id.count()
total_item_purchase = data_for_featch.groupby('item_id').item_id.count()

In [58]:
user_average_check = data_for_featch.groupby(['user_id','basket_id'])['sales_value'].mean()
user_average_check = user_average_check.groupby('user_id').mean()
user_average_check = dict(user_average_check)

In [59]:
number_of_baskets_per_week = data_for_featch.groupby(['user_id','week_no'])['user_id'].count()
number_of_baskets_per_week = number_of_baskets_per_week.groupby('user_id').mean()
number_of_baskets_per_week = dict(number_of_baskets_per_week)

In [60]:
user_average_basket_count = data_for_featch.groupby(['user_id','basket_id'])['sales_value'].count()
user_average_basket_count = user_average_basket_count.groupby('user_id').mean()
user_average_basket_count = dict(user_average_check)

In [61]:
days_of_purchase = data_for_featch[['user_id','day']].copy()
days_of_purchase['week_day'] = days_of_purchase.day%7
days_of_purchase.loc[days_of_purchase.week_day==0,'week_day'] = 7
days_of_purchase = pd.pivot_table(days_of_purchase,index='user_id',columns='week_day',values = 'day',
                           aggfunc='count',   fill_value=0)
days_of_purchase['total'] = total_user_purchase

In [62]:
targets_lvl_2['user_average_check']=targets_lvl_2.user_id.map(user_average_check)
targets_lvl_2_als['user_average_check']=targets_lvl_2_als.user_id.map(user_average_check)

In [63]:
targets_lvl_2['number_of_baskets_per_week']=targets_lvl_2.user_id.map(number_of_baskets_per_week)
targets_lvl_2_als['number_of_baskets_per_week']=targets_lvl_2_als.user_id.map(number_of_baskets_per_week)

In [64]:
targets_lvl_2['user_average_basket_count']=targets_lvl_2.user_id.map(user_average_basket_count)
targets_lvl_2_als['user_average_basket_count']=targets_lvl_2_als.user_id.map(user_average_basket_count)

In [65]:
for i in range(1,8):
    targets_lvl_2[f'week_day{i}']= targets_lvl_2.user_id.map(days_of_purchase[i]/days_of_purchase['total'])
    targets_lvl_2_als[f'week_day{i}']= targets_lvl_2_als.user_id.map(days_of_purchase[i]/days_of_purchase['total'])

In [66]:
item_average_basket_count = data_for_featch.groupby(['item_id','basket_id'])['sales_value'].count()
item_average_basket_count = item_average_basket_count.groupby('item_id').mean()

In [67]:
days_of_purchase_item = data_for_featch[['item_id','day']].copy()
days_of_purchase_item['week_day'] = days_of_purchase_item.day%7
days_of_purchase_item.loc[days_of_purchase_item.week_day==0,'week_day'] = 7
days_of_purchase_item = pd.pivot_table(days_of_purchase_item,index='item_id',columns='week_day',values = 'day',
                           aggfunc='count',   fill_value=0)
days_of_purchase_item['total'] = total_item_purchase

In [68]:
item_price = data_for_featch[['item_id','quantity','sales_value']].copy()
item_price['price'] = item_price.sales_value/item_price.quantity
item_price = item_price.groupby('item_id')['price'].max()

In [69]:
targets_lvl_2['item_average_basket_count']=targets_lvl_2.item_id.map(item_average_basket_count)
targets_lvl_2_als['item_average_basket_count']=targets_lvl_2_als.item_id.map(item_average_basket_count)

In [70]:
targets_lvl_2['item_price']=targets_lvl_2.item_id.map(item_price)
targets_lvl_2_als['item_price']=targets_lvl_2_als.item_id.map(item_price)

In [71]:
for i in range(1,8):
    targets_lvl_2[f'week_day_{i}_item']= targets_lvl_2.item_id.map(days_of_purchase_item[i]/days_of_purchase_item['total'])
    targets_lvl_2_als[f'week_day_{i}_item']= targets_lvl_2_als.item_id.map(days_of_purchase_item[i]/days_of_purchase_item['total'])

In [72]:
targets_lvl_2.loc[targets_lvl_2.item_price==np.inf,'item_price']=0
targets_lvl_2_als.loc[targets_lvl_2_als.item_price==np.inf,'item_price']=0

In [73]:
# ПРоверим результаты

In [74]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2['target']
X_train_als = targets_lvl_2_als.drop('target', axis=1)
y_train_als = targets_lvl_2_als['target']

cat_feats = ['manufacturer', 'department', 'brand',
       'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product',
       'age_desc', 'marital_status_code', 'income_desc', 'homeowner_desc',
       'hh_comp_desc', 'household_size_desc', 'kid_category_desc']
X_train[cat_feats] = X_train[cat_feats].astype('category')
X_train_als[cat_feats] = X_train_als[cat_feats].astype('category')


res_ = my_LGBMClassifier((X_train, y_train))
res_score=pd.concat([res_score,res_])
res_score.tail(1).note = 'simular_item_bm52_iu_f'

res_ = my_LGBMClassifier((X_train_als, y_train_als))
res_score=pd.concat([res_score,res_])
res_score.tail(1).note = 'als_bm52_ui_f'


res_ = my_LGBMRanker((X_train, y_train))
res_score=pd.concat([res_score,res_])
res_score.tail(1).note = 'simular_item_bm52_ui_f'

res_ = my_LGBMRanker((X_train_als, y_train_als))
res_score=pd.concat([res_score,res_])
res_score.tail(1).note = 'als_no_bm52_ui_f'
res_score

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

Unnamed: 0,precision@5,note
LGBMClassifier,0.30705,simular_item_no_featch
LGBMClassifier,0.28846,als_no_featch
lgb_ranker,0.301828,simular_item_no_featch
lgb_ranker,0.301828,als_no_featch
LGBMClassifier,0.312689,simular_item_only_bm52
LGBMClassifier,0.309034,als_only_bm52
lgb_ranker,0.340574,simular_item_only_bm52
lgb_ranker,0.340574,als_no_only_bm52
LGBMClassifier,0.32,simular_item_bm52_iu
LGBMClassifier,0.323342,als_bm52_ui


### Выводы

В построении одноруровневой модели использовали подходы уменьшения числа популярных товаров сдо 5 и 10 тысяч, так же пробовали разные поля для взвешивания матрицы UIM/
Пробовали длеать расчеты без взвешивания, tf_idf и bm52. Для построоения предсказаний использовали подходы own_recommender	, item_simular(в том числе включая item уже купленный пользователем), а так же als.  
Из таблицы видно, что существенно лучший результат дает item_simular - 0.400269, причем на него практически не влияют ни поля по которым мы взвешиваем, ни тип взвешивания ни количество ограничений топ популярных item.  
Второй по значимости метод - als. На него уже сильно вдиет тип взыешивания, поля по которым расчитывается матрица,  и в меньшей мере количесвто топ популярных товаров.. Наилучший результат достигнут при использовании :
quantity,count	 - bm25	 - als	 - 10 000 - 	0.3476
Обучение  одноуровневой модели велось на данных минус 3 недели. Тест посленине три недели.

Для работы с двухуровневой моделью мы берем 50 лучших рекомендаций из двух лучших систем item_simular и als. С лучшими параметрами. Обучаем на данных минус 9 недель. Получаем recall примерно одинаковый.  
Далее используя модель бустинга LGBMClassifier и LGBMRanker обучаем на таргете полученом с 3 по 9 неделю с конца. Сверяем результаты на данных последних трех недель.  
ПРоверяем несколько вариантов расчетов:  
Только uaer_id и item_id + target
Далее добавляем коф взвешивания quantity,count	 - bm25
Дадее долбавляем информацию о товарах и пользователях  
Далнее разрабатываем новые фичи  
Результат показал, что LGBMRanker работает лучше, чем больше фич, тем выше результат.
    Максимальное качество полученно - 0.365849. Причем из какой выборки были получены кандидаты (als или item_simular) значения не имеет




Таким образом при исходных данных и схеме валидации, лучший результат показала одноуровневая модель item_simular