In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import precision_at_k, mean_average_precision_at_k, AUC_at_k, ndcg_at_k

from scipy.spatial.distance import cdist
import tqdm

# Модель второго уровня
from lightgbm import LGBMClassifier

In [2]:
CANDIDATS_NUMBER = 210

In [3]:
def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list[:k]
    
    flags = np.isin(bought_list, recommended_list)
    precision = flags.sum() / len(recommended_list)
        
    return precision

In [4]:
item_features = pd.read_csv('data/product.csv')
user_features = pd.read_csv('data/hh_demographic.csv')
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)


In [5]:
test = pd.read_csv('./data/test_users.csv')
test_users = test['user_id'].unique()
len(test_users)

1708

In [6]:
data = pd.read_csv('./data/retail_train.csv')
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [7]:
data['weighted'] = data['week_no'] ** 4

In [8]:

val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3


data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

#data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks))]


data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]



In [9]:
def get_alt_recommendation(data_train, data_test, n_recomendation = 5):
    #global fake_data, user_item_matrix, data_train_raw, result, id_list, userid_to_id
    
    k_neigbours=9 
    
    
    def get_N_niarest_vector(matrix, sample_vector, n=5):
        dist_line = cdist(sample_vector, matrix, metric='cosine')
        items = dist_line[0].argsort()[-n:][::-1]
        batch = matrix[items]
        dist_selected = dist_line[0][items]
        return batch, (dist_selected.sum()-1)/(n-1)    
    
    
    
    def get_user_recomrndations( user_item_matrix, sample_vector, k_neigbours=9, n_recomendation = 5):
        global rec_from_all
        batch, dist = get_N_niarest_vector(user_item_matrix.values, sample_vector, k_neigbours)

        #Вектор личных предпочтений
        rec_from_yourself = sample_vector

        #Вектор всех предпочтений.
        rec_from_all = user_item_matrix.values.sum(axis=0)/len(user_item_matrix)

        #Вектор ближайших предпочтений.
        rec_from_neighbours = batch.sum(axis=0)/len(batch)

        k1=(dist*5.5)   
        k2=(dist*0.001) 

        #Некий целевой вектор.
        rec = rec_from_yourself + rec_from_all/k1 # + rec_from_neighbours*k2

        #Отбор товаров с максимальными весами.
        items = rec[0].argsort()[-n_recomendation-1:][::-1]

        idx_to_zerro = itemid_to_id[999999]
        items = list(items)
        if idx_to_zerro in items:
            items.remove(idx_to_zerro)
        items =items[:n_recomendation]

                
        return items
    
    

    def get_common_recomrndations( user_item_matrix, sample_vector, k_neigbours=9, n_recomendation = 5):


        rec_from_all = user_item_matrix.values.sum(axis=0)/len(user_item_matrix)

        rec_from_all_items = rec_from_all.argsort()[-n_recomendation-1:][::-1]

        idx_to_zerro = itemid_to_id[999999]
        rec_from_all_items = list(rec_from_all_items)
        if idx_to_zerro in rec_from_all_items:
            rec_from_all_items.remove(idx_to_zerro)
        rec_from_all_items = rec_from_all_items[:n_recomendation] 


        common_items = [id_to_itemid[rec] for rec in rec_from_all_items]

        return common_items 
  
    
    
    
    result_users = data_train['user_id'].unique()
    data_test = data_test[data_test['user_id'].isin(result_users)]
    
    
    popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
    popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

    data_train_raw = data_train.copy()

    top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

    # Заведем фиктивный item_id (если юзер НЕ покупал товары из топ-5000, то он "купил" такой товар)
    data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 999999

    user_item_matrix = pd.pivot_table(data_train, 
                                      index='user_id', columns='item_id', 
                                      values='weighted', #       values='quantity',
                                      aggfunc='sum',   #aggfunc='count',
                                      fill_value=0
                                     )

    #user_item_matrix[user_item_matrix > 0] = 1 # так как в итоге хотим предсказать 
    user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

    # переведем в формат saprse matrix
    sparse_user_item = csr_matrix(user_item_matrix).tocsr()



    # перенумеруем пользователей и товары
    userids = user_item_matrix.index.values
    itemids = user_item_matrix.columns.values

    matrix_userids = np.arange(len(userids))
    matrix_itemids = np.arange(len(itemids))

    id_to_itemid = dict(zip(matrix_itemids, itemids))
    id_to_userid = dict(zip(matrix_userids, userids))

    itemid_to_id = dict(zip(itemids, matrix_itemids))
    userid_to_id = dict(zip(userids, matrix_userids))

    
    result = data_test.groupby('user_id')['item_id'].unique().reset_index()
    result.columns=['user_id', 'actual']
    result['actual'] = result['actual'].apply(lambda x: list(x))
    

    #k_neigbours=9 
    #n_recomendation = 5

    id_list = result['user_id'].map(userid_to_id).values

    rec_list=[]
    for i in tqdm.trange(len(id_list)):    
        sample_vector = np.reshape(user_item_matrix.values[id_list[i]],(1,-1))
        item = get_user_recomrndations( user_item_matrix, sample_vector, k_neigbours=k_neigbours, n_recomendation = n_recomendation)
        rec_list.append(item)
    

    result_list=[]
    for i in tqdm.trange(len(rec_list)):
        item = [id_to_itemid[rec] for rec in rec_list[i]]
        result_list.append(item)
       

    result['test'] = result_list
    
    rec_from_all_items  =  get_common_recomrndations( user_item_matrix, sample_vector, k_neigbours=9, n_recomendation = 5)
    
    return result,  rec_from_all_items 

In [10]:
result,  rec_from_all_items = get_alt_recommendation(data_train_lvl_1, data_val_lvl_1, n_recomendation = CANDIDATS_NUMBER)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
100%|██████████| 2153/2153 [09:10<00:00,  3.91it/s]
100%|██████████| 2153/2153 [00:01<00:00, 1362.40it/s]


In [11]:
result.apply(lambda x: precision_at_k(x['test'], x['actual'],  5), axis=1).mean()

0.47803065490013935

In [12]:
train_users = data_train_lvl_1['user_id'].unique()
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique(), columns = ['user_id']) 
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]
users_lvl_2['candidates'] = result['test']
users_lvl_2

Unnamed: 0,user_id,candidates
0,2070,"[856942, 1082185, 940947, 995242, 5577022, 952..."
1,2021,"[826784, 8090521, 1106523, 1133018, 951590, 10..."
2,1753,"[962229, 6773204, 951821, 990797, 998119, 8464..."
3,2120,"[1082185, 1037863, 840361, 1029743, 1119051, 9..."
4,1346,"[6944571, 1122358, 828867, 1082185, 1106523, 1..."
...,...,...
2149,1446,"[860776, 995785, 1066685, 897125, 834484, 8705..."
2150,1784,"[1070820, 1082185, 1022066, 1076580, 870515, 1..."
2151,436,"[1070820, 944317, 1082185, 5570048, 5568378, 8..."
2152,1697,"[1082185, 859237, 6534178, 951703, 10254382, 1..."


In [13]:
default_rec  =  rec_from_all_items

In [14]:
candidates = users_lvl_2['candidates'].values
for i in range(len(candidates)):
    if candidates[i] is np.nan:
        candidates[i] = default_rec
        
users_lvl_2['candidates'] = candidates
users_lvl_2

Unnamed: 0,user_id,candidates
0,2070,"[856942, 1082185, 940947, 995242, 5577022, 952..."
1,2021,"[826784, 8090521, 1106523, 1133018, 951590, 10..."
2,1753,"[962229, 6773204, 951821, 990797, 998119, 8464..."
3,2120,"[1082185, 1037863, 840361, 1029743, 1119051, 9..."
4,1346,"[6944571, 1122358, 828867, 1082185, 1106523, 1..."
...,...,...
2149,1446,"[860776, 995785, 1066685, 897125, 834484, 8705..."
2150,1784,"[1070820, 1082185, 1022066, 1076580, 870515, 1..."
2151,436,"[1070820, 944317, 1082185, 5570048, 5568378, 8..."
2152,1697,"[1082185, 859237, 6534178, 951703, 10254382, 1..."


In [15]:
s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
users_lvl_2.head(2)

Unnamed: 0,user_id,item_id
0,2070,856942.0
0,2070,1082185.0


In [16]:
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  

targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)
display(targets_lvl_2)
targets_lvl_2['target'].mean()

Unnamed: 0,user_id,item_id,target
0,2070,856942.0,0.0
1,2070,1082185.0,1.0
2,2070,940947.0,0.0
3,2070,995242.0,1.0
4,2070,5577022.0,0.0
...,...,...,...
459173,1745,1082185.0,0.0
459174,1745,6534178.0,0.0
459175,1745,1029743.0,0.0
459176,1745,995242.0,0.0


0.04885687032044218

In [17]:
def add_features(ds, data_ds=None):
    if data_ds is None:
        data_ds = ds
    
    
    ds = ds.merge(item_features, on='item_id', how='left')
    ds = ds.merge(user_features, on='user_id', how='left')
    ds = ds.merge(data_ds, on=['user_id', 'item_id'], how='left')
    
    ds = ds.groupby(['user_id', 'item_id']).first().reset_index()

    baskets_by_user = data_ds.groupby('user_id')['basket_id'].nunique().reset_index().rename(columns={'basket_id': 'baskets_by_user'})
    sales_value_by_user = data_ds.groupby('user_id')['sales_value'].sum().reset_index().rename(columns={'sales_value': 'sales_value_by_user'})

    ds = ds.merge(baskets_by_user, on='user_id', how='left').merge(sales_value_by_user, on='user_id', how='left')
    ds['user_mean_check'] = ds['sales_value_by_user'] / ds['baskets_by_user'] 
    # print(ds.shape)

    train_with_features = data_ds.merge(item_features, on='item_id', how='left')

    quantity_by_user_commodity_desc = train_with_features.groupby(['user_id', 'commodity_desc'])['quantity'].sum().reset_index().rename(columns={'quantity': 'quantity_by_user_commodity_desc'})
    ds = ds.merge(quantity_by_user_commodity_desc, on=['user_id', 'commodity_desc'], how='left').fillna({'quantity_by_user_commodity_desc':0})
    # print(ds.shape)

    weeks_by_user = data_ds.groupby('user_id')['week_no'].nunique().reset_index().rename(columns={'week_no': 'weeks_by_user'})
    ds = ds.merge(weeks_by_user, on='user_id', how='left')

    ds['baskets_per_week_by_user'] = ds['baskets_by_user'] / ds['weeks_by_user'] 
    # print(ds.shape)

    weeks_by_item = data_ds.groupby('item_id')['week_no'].nunique().reset_index().rename(columns={'week_no': 'weeks_by_item'})
    quanity_by_item = data_ds.groupby('item_id')['quantity'].sum().reset_index().rename(columns={'quantity': 'quantity_by_item'})
    ds = ds.merge(weeks_by_item, on='item_id', how='left').merge(quanity_by_item, on='item_id', how='left')

    ds['quanity_per_week_by_item'] = ds['quantity_by_item'] / ds['weeks_by_item'] 
    #print(ds.shape)

    cat_features = [f for f, t in zip(ds.dtypes.index, ds.dtypes) if t == 'object']    
    for c in cat_features:
        ds[c] = ds[c].astype('category')
    
    return ds

targets_lvl_2 = add_features(targets_lvl_2, data_train_lvl_2)

targets_lvl_2

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,weighted,baskets_by_user,sales_value_by_user,user_mean_check,quantity_by_user_commodity_desc,weeks_by_user,baskets_per_week_by_user,weeks_by_item,quantity_by_item,quanity_per_week_by_item
0,1,819330.0,0.0,69,GROCERY,Private,CONVENIENT BRKFST/WHLSM SNACKS,GRANOLA BARS,10 OZ,65+,...,,7,341.78,48.825714,0.0,6,1.166667,6.0,11.0,1.833333
1,1,822140.0,0.0,2557,GROCERY,National,CANNED JUICES,ASEPTIC PACK JUICE AND DRINKS,10/6.75 OZ,65+,...,,7,341.78,48.825714,0.0,6,1.166667,5.0,9.0,1.800000
2,1,825365.0,0.0,69,MEAT-PCKGD,Private,LUNCHMEAT,PEPPERONI/SALAMI,8 OZ,65+,...,,7,341.78,48.825714,2.0,6,1.166667,6.0,48.0,8.000000
3,1,825494.0,0.0,69,GROCERY,Private,FRZN VEGETABLE/VEG DSH,FRZN BAGGED VEGETABLES - PLAIN,16 OZ,65+,...,,7,341.78,48.825714,2.0,6,1.166667,5.0,21.0,4.200000
4,1,826249.0,0.0,69,GROCERY,Private,BAKED BREAD/BUNS/ROLLS,HAMBURGER BUNS,12 OZ,65+,...,,7,341.78,48.825714,9.0,6,1.166667,6.0,310.0,51.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
451920,2500,12324954.0,0.0,69,GROCERY,Private,BAKED BREAD/BUNS/ROLLS,HAMBURGER BUNS,16CT/24 OZ,,...,,10,417.00,41.700000,3.0,5,2.000000,6.0,8.0,1.333333
451921,2500,12384775.0,0.0,3650,MEAT,National,BEEF,PATTIES,,,...,,10,417.00,41.700000,2.0,5,2.000000,5.0,25.0,5.000000
451922,2500,12988031.0,0.0,544,GROCERY,National,BAG SNACKS,MISC BAG SNACKS,10.5 OZ,,...,,10,417.00,41.700000,0.0,5,2.000000,6.0,26.0,4.333333
451923,2500,13007846.0,0.0,759,GROCERY,National,YOGURT,YOGURT MULTI-PACKS,16 OZ,,...,,10,417.00,41.700000,21.0,5,2.000000,5.0,12.0,2.400000


In [18]:
cat_features = [f for f, t in zip(targets_lvl_2.dtypes.index, targets_lvl_2.dtypes) if str(t) == 'category']
cat_features

['department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

In [19]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2['target']

In [20]:
lgb = LGBMClassifier(objective='binary', max_depth=7, num_leaves=256)
lgb.fit(X_train, y_train, categorical_feature=cat_features)



LGBMClassifier(max_depth=7, num_leaves=256, objective='binary')

In [21]:
result_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index().copy()
result_lvl_2.columns=['user_id', 'actual']
result_lvl_2.shape

(2042, 2)

# data_train_lvl_2 update

In [22]:
users_lvl_2 = pd.DataFrame(data_val_lvl_2['user_id'].unique(), columns = ['user_id']) 
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

In [23]:
users_lvl_2.shape, candidates.shape

((2041, 1), (2153,))

In [24]:
users_lvl_2['candidates'] = result['test'] 

In [25]:
s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
X_val = add_features(users_lvl_2, data_val_lvl_2)
X_val

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,...,weighted,baskets_by_user,sales_value_by_user,user_mean_check,quantity_by_user_commodity_desc,weeks_by_user,baskets_per_week_by_user,weeks_by_item,quantity_by_item,quanity_per_week_by_item
0,1,820301,1276,GROCERY,National,WATER - CARBONATED/FLVRD DRINK,NON-CARB FLVRD DRNKNG/MNRL WAT,101.4 OZ,65+,A,...,,4,200.12,50.0300,0.0,3,1.333333,2.0,2.0,1.00
1,1,823704,2082,MEAT-PCKGD,National,BREAKFAST SAUSAGE/SANDWICHES,ROLLS - PORK,16 OZ,65+,A,...,,4,200.12,50.0300,0.0,3,1.333333,4.0,70.0,17.50
2,1,824005,673,PRODUCE,National,MUSHROOMS,MUSHROOMS WHITE SLICED PKG,8 OZ,65+,A,...,,4,200.12,50.0300,0.0,3,1.333333,4.0,93.0,23.25
3,1,826597,2082,MEAT-PCKGD,National,HEAT/SERVE,ENTREES,24 OZ,65+,A,...,,4,200.12,50.0300,1.0,3,1.333333,4.0,14.0,3.50
4,1,827667,1102,GROCERY,National,BAKED SWEET GOODS,SNACK CAKE - MULTI PACK,16 OZ,65+,A,...,,4,200.12,50.0300,0.0,3,1.333333,4.0,12.0,3.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428605,2500,9835606,4037,MEAT,National,PORK,RIBS,,,,...,,12,418.71,34.8925,0.0,4,3.000000,2.0,8.0,4.00
428606,2500,12301100,4001,MEAT,National,BEEF,LEAN,,,,...,,12,418.71,34.8925,0.0,4,3.000000,4.0,55.0,13.75
428607,2500,12301109,4008,MEAT,National,BEEF,PRIMAL,,,,...,,12,418.71,34.8925,0.0,4,3.000000,4.0,55.0,13.75
428608,2500,12810393,3490,MEAT,National,PORK,ENHANCED,,,,...,,12,418.71,34.8925,0.0,4,3.000000,4.0,92.0,23.00


In [26]:
pred_ds = X_val[['user_id', 'item_id']].copy()
pred_ds['proba'] = lgb.predict_proba(X_val)[:,1]
pred_ds = pred_ds.groupby(['user_id', 'item_id'])['proba'].mean().reset_index() # пары user_id-item_id могут быть не уникальны
pred_s = pred_ds.groupby('user_id').apply(lambda x: x.sort_values('proba', ascending=False)['item_id'].tolist())#.reset_index().rename(columns={0:'lvl_2'})

#top = pred_ds.groupby('item_id')['proba'].mean().reset_index().sort_values('proba', ascending=False)['item_id'].tolist()

top = X_train.groupby('item_id')['quantity'].sum().reset_index().sort_values('quantity', ascending=False)['item_id'].tolist()


def get_lvl_2_recommendations(user_id, N=5):
    r = pred_s[user_id]
    return r[:N]

def get_recommendations(fn, user_id, N=5):
    r = []
    try:
        r = fn(user_id, N)
    except:
        print(f'Нет рекомендации для user_id={user_id} из {fn}')
    if len(r) < N:
        print(f'Недостаточно рекомендации для user_id={user_id} из {fn}, добавляем {N - len(r)} из наилучших для lvl_2')
        r += top[:N - len(r)] 
    return r[:N]

In [27]:
result_lvl_2['lvl_2'] = result_lvl_2['user_id'].apply(lambda u: get_recommendations(get_lvl_2_recommendations, u, 5))
result_lvl_2.head(2)

Нет рекомендации для user_id=1984 из <function get_lvl_2_recommendations at 0x0000011D9F723318>
Недостаточно рекомендации для user_id=1984 из <function get_lvl_2_recommendations at 0x0000011D9F723318>, добавляем 5 из наилучших для lvl_2


Unnamed: 0,user_id,actual,lvl_2
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1031697, 1004906, 1082185, 995242, 990656]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1127831, 6534178, 1029743, 1082185, 15927403]"


In [28]:
candidates = result_lvl_2['lvl_2'].values
for i in range(len(candidates)):
    if candidates[i] is np.nan:
        candidates[i] = rec_from_all_items
        
result_lvl_2['lvl_2'] = candidates
result_lvl_2

Unnamed: 0,user_id,actual,lvl_2
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1031697, 1004906, 1082185, 995242, 990656]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1127831, 6534178, 1029743, 1082185, 15927403]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[1024306, 1071939, 995242, 1037863, 840361]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[1013321, 1082185, 5592931, 5568378, 866211]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[845208, 923746, 1029743, 859075, 1041259]"
...,...,...,...
2037,2496,[6534178],"[6534178, 819845, 13115579, 1127831, 12757316]"
2038,2497,"[1016709, 9835695, 1132298, 16809501, 845294, ...","[951590, 1070820, 6534178, 5569230, 938700]"
2039,2498,"[15716530, 834484, 901776, 914190, 958382, 972...","[1082185, 914190, 834484, 1070820, 13115493]"
2040,2499,"[867188, 877580, 902396, 914190, 951590, 95813...","[5569471, 914190, 976998, 1106091, 5568729]"


In [29]:
def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list[:k]
    
    flags = np.isin(bought_list, recommended_list)
    precision = flags.sum() / len(recommended_list)
        
    return precision

In [30]:
result_lvl_2.apply(lambda x: precision_at_k(x['lvl_2'], x['actual'], k=5), axis=1).mean()

0.6426052889324191

# 0.6426052889324191

# Step 2

In [31]:
test = pd.read_csv('./data/test_users.csv')
test_users = test['user_id'].unique()
len(test_users)


1708

In [32]:
id_to_pred_list = test['user_id'].values
data_to_pred = data[data['user_id'].isin(id_to_pred_list)]
data_to_pred.shape

(2045045, 13)

In [33]:
candidates,  rec_from_all_items = get_alt_recommendation(data_train_lvl_1, data_to_pred, n_recomendation = CANDIDATS_NUMBER)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
100%|██████████| 1707/1707 [07:16<00:00,  3.91it/s]
100%|██████████| 1707/1707 [00:01<00:00, 1592.20it/s]


In [34]:
users_lvl_3 = pd.DataFrame(test['user_id'].unique(), columns = ['user_id']) 
users_lvl_3['candidates'] = candidates['test']

In [35]:
s = users_lvl_3.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

users_lvl_3 = users_lvl_3.drop('candidates', axis=1).join(s)
X_val = add_features(users_lvl_3, data_to_pred)
X_val

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,...,weighted,baskets_by_user,sales_value_by_user,user_mean_check,quantity_by_user_commodity_desc,weeks_by_user,baskets_per_week_by_user,weeks_by_item,quantity_by_item,quanity_per_week_by_item
0,1,820165.0,2.0,PRODUCE,National,CITRUS,ORANGES NAVELS ALL,,65+,A,...,40960000.0,79.0,3959.91,50.125443,60.0,64.0,1.234375,81,4503,55.592593
1,1,823721.0,317.0,GROCERY,National,CHEESE,GRATED CHEESE,8 OZ,65+,A,...,3111696.0,79.0,3959.91,50.125443,57.0,64.0,1.234375,90,443,4.922222
2,1,826249.0,69.0,GROCERY,Private,BAKED BREAD/BUNS/ROLLS,HAMBURGER BUNS,12 OZ,65+,A,...,,79.0,3959.91,50.125443,122.0,64.0,1.234375,95,5363,56.452632
3,1,826695.0,135.0,GROCERY,National,FRZN POTATOES,FRZN FRENCH FRIES,20 OZ,65+,A,...,15752961.0,79.0,3959.91,50.125443,2.0,64.0,1.234375,70,144,2.057143
4,1,830503.0,1089.0,MEAT-PCKGD,National,LUNCHMEAT,HAM,9 OZ,65+,A,...,8503056.0,79.0,3959.91,50.125443,41.0,64.0,1.234375,81,247,3.049383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
358465,2499,13115493.0,1276.0,GROCERY,National,ISOTONIC DRINKS,ISOTONIC DRINKS SINGLE SERVE,32 OZ,25-34,U,...,,85.0,2874.39,33.816353,8.0,49.0,1.734694,51,512,10.039216
358466,2499,13512060.0,69.0,MEAT,Private,SMOKED MEATS,HAM- NET WEIGHT,8 OZ,25-34,U,...,,85.0,2874.39,33.816353,1.0,49.0,1.734694,38,119,3.131579
358467,2499,13842224.0,2224.0,GROCERY,National,SOFT DRINKS,SOFT DRINK BOTTLE NON-CARB (EX,,25-34,U,...,,85.0,2874.39,33.816353,54.0,49.0,1.734694,29,142,4.896552
358468,2499,15511891.0,69.0,GROCERY,Private,BAKED BREAD/BUNS/ROLLS,HOT DOG BUNS,16 OZ,25-34,U,...,31640625.0,85.0,2874.39,33.816353,51.0,49.0,1.734694,34,236,6.941176


In [36]:
pred_ds = X_val[['user_id', 'item_id']].copy()
pred_ds['proba'] = lgb.predict_proba(X_val)[:,1]
pred_ds = pred_ds.groupby(['user_id', 'item_id'])['proba'].mean().reset_index() # пары user_id-item_id могут быть не уникальны
pred_s = pred_ds.groupby('user_id').apply(lambda x: x.sort_values('proba', ascending=False)['item_id'].tolist())#.reset_index().rename(columns={0:'lvl_2'})

#top = pred_ds.groupby('item_id')['proba'].mean().reset_index().sort_values('proba', ascending=False)['item_id'].tolist()

top = X_train.groupby('item_id')['quantity'].sum().reset_index().sort_values('quantity', ascending=False)['item_id'].tolist()


def get_lvl_2_recommendations(user_id, N=5):
    r = pred_s[user_id]
    return r[:N]

def get_recommendations(fn, user_id, N=5):
    r = []
    try:
        r = fn(user_id, N)
    except:
        print(f'Нет рекомендации для user_id={user_id} из {fn}')
    if len(r) < N:
        print(f'Недостаточно рекомендации для user_id={user_id} из {fn}, добавляем {N - len(r)} из наилучших для lvl_2')
        r += top[:N - len(r)] 
    return r[:N]

In [37]:
test['Predicted'] = test['user_id'].apply(lambda u: get_recommendations(get_lvl_2_recommendations, u, 5))
test.head(2)

Нет рекомендации для user_id=2500 из <function get_lvl_2_recommendations at 0x0000011DA1597318>
Недостаточно рекомендации для user_id=2500 из <function get_lvl_2_recommendations at 0x0000011DA1597318>, добавляем 5 из наилучших для lvl_2


Unnamed: 0,user_id,Predicted
0,1,"[820165.0, 991268.0, 1042907.0, 1043064.0, 104..."
1,2,"[820301.0, 1068719.0, 1039549.0, 1040807.0, 10..."


In [38]:
candidates = test['Predicted'].values

for i in range(len(candidates)):
    if candidates[i] is np.nan:
        candidates[i] = rec_from_all_items
        
test['Predicted'] = candidates
test

Unnamed: 0,user_id,Predicted
0,1,"[820165.0, 991268.0, 1042907.0, 1043064.0, 104..."
1,2,"[820301.0, 1068719.0, 1039549.0, 1040807.0, 10..."
2,3,"[819255.0, 822407.0, 1068719.0, 1069979.0, 107..."
3,6,"[819308.0, 820165.0, 1055646.0, 1058779.0, 106..."
4,7,"[824915.0, 825618.0, 1070214.0, 1071377.0, 107..."
...,...,...
1703,2494,"[995785.0, 874972.0, 873203.0, 899624.0, 90436..."
1704,2496,"[1069003.0, 961554.0, 1051211.0, 1043301.0, 10..."
1705,2498,"[951197.0, 868764.0, 1138189.0, 944466.0, 8662..."
1706,2499,"[972931.0, 1132771.0, 1067419.0, 1068719.0, 10..."


In [39]:
result_list = test['Predicted'].values

for i in range(len(result_list)):
    for j in range(len(result_list[i])):       
        result_list[i][j] = str(int(result_list[i][j]))
    result_list[i] = ' '.join(result_list[i])
    #break

In [40]:
result_list[:5]

array(['820165 991268 1042907 1043064 1043301',
       '820301 1068719 1039549 1040807 1043301',
       '819255 822407 1068719 1069979 1071939',
       '819308 820165 1055646 1058779 1063207',
       '824915 825618 1070214 1071377 1072483'], dtype=object)

In [41]:
test['Predicted'] = result_list
test

Unnamed: 0,user_id,Predicted
0,1,820165 991268 1042907 1043064 1043301
1,2,820301 1068719 1039549 1040807 1043301
2,3,819255 822407 1068719 1069979 1071939
3,6,819308 820165 1055646 1058779 1063207
4,7,824915 825618 1070214 1071377 1072483
...,...,...
1703,2494,995785 874972 873203 899624 904360
1704,2496,1069003 961554 1051211 1043301 1034686
1705,2498,951197 868764 1138189 944466 866292
1706,2499,972931 1132771 1067419 1068719 1081177


In [42]:
test.columns = [['UserId', 'Predicted']]
test

Unnamed: 0,UserId,Predicted
0,1,820165 991268 1042907 1043064 1043301
1,2,820301 1068719 1039549 1040807 1043301
2,3,819255 822407 1068719 1069979 1071939
3,6,819308 820165 1055646 1058779 1063207
4,7,824915 825618 1070214 1071377 1072483
...,...,...
1703,2494,995785 874972 873203 899624 904360
1704,2496,1069003 961554 1051211 1043301 1034686
1705,2498,951197 868764 1138189 944466 866292
1706,2499,972931 1132771 1067419 1068719 1081177


In [43]:
test.to_csv('./data/result_pro2.csv', index=False)