In [1]:
import os

DIR = 'input'

TRAIN_VAL_PATH = os.path.join(DIR, 'train_val1.parquet')  
VAL_PATH = os.path.join(DIR, 'val1.parquet') 
RECS_VAL_PATH = os.path.join(DIR, 'recs_val1.parquet') 

TOPK_VAL_PATH = os.path.join(DIR, 'topk_val.parquet')
TOPK_VAL1_PATH = os.path.join(DIR, 'topk_val1.parquet')
TOPK_VAL2_PATH = os.path.join(DIR, 'topk_val2.parquet')

CLUSTERS_PATH = os.path.join(DIR, 'clusters.parquet')  
USER_DECODER_PATH = os.path.join(DIR, 'user_decoder.pkl') 
RANKER_MODEL_PATH = os.path.join(DIR, 'ranker_model.pkl')
RANKER_MODEL1_PATH = os.path.join(DIR, 'ranker_model1.pkl')
RANKER_MODEL2_PATH = os.path.join(DIR, 'ranker_model2.pkl')

RECS_NN_VAL_PATH = os.path.join(DIR, 'recs_nn_val1.parquet')

In [2]:
TOP_K_CLUSTERS = [ 
    937, 6849, 4873, 7052,  789, 4938, 5998, 5124, 4083,  345,  721,
    4018, 6995, 3334, 4327, 7401, 3684,  292, 7454, 5452, 1023, 6674,
    3366, 4236, 6983, 4647, 2214, 2895, 3205, 4031, 2578,   42, 7855,
    931, 3107, 2000, 7532, 6761, 1131, 3717, 2351, 2728, 4929, 3027,
    612,   21, 1902,  807, 4001, 3771, 1705,  602, 1020, 6428, 6699,
    6271,  554, 4308, 7589, 7002, 1997,  696,  595, 6675, 1751,  923,
    6711,  999, 1666, 1263,  919, 7602, 2285, 4543, 6051, 4540, 4828,
    3543, 6928, 1886, 6029, 5320, 2924, 7449, 4906, 7757, 1077, 5378,
    6189, 1747, 7691, 2595,  811,  103, 7043, 1339, 1574, 2570, 1249,
    735, 3173, 4739, 2152, 2226, 6021, 7739, 7777, 5187, 5299, 2604,
    6569, 5893,  466, 3483, 3640, 3870, 1442, 7114, 1338, 7747, 1867,
    2702, 3046, 1182, 1409, 4663, 4932, 1570, 6053, 6071, 3733,  712,
    3549, 6668, 1006, 4358, 4285, 3668,  885, 4129, 3293,  407, 4392,
    3555, 5812,  129,  163, 3018, 7752, 6998, 5949, 1266, 6656, 2786,
    2199, 2644, 4201, 3514, 6147, 4426, 7495, 5096, 5653,  341, 1826,
    5380,  587, 4062, 6069, 2881, 1377, 6548, 2685, 2629, 7028, 6831,
    7181, 3251, 3948, 1357, 4438, 1138, 7528, 6149, 7514, 4835, 3938,
    1932, 3358, 2503,   11, 1623, 4028, 1890, 6696,  354,  960, 1765,
    3699, 7636,
]

In [3]:
import lightgbm
import pandas as pd
import numpy as np
import pickle

from collections import defaultdict

def most_common(array):
    elements, counts = np.unique(array, return_counts=True)
    return elements[np.argpartition(counts, kth=-1)[-1]]

def apply_rank(col, df):
    if len(df) == 0:
        return []
    _, index, num_ranges = np.unique(df[col], return_counts=True, return_index=True)
    num_ranges = num_ranges[index.argsort()]
    arange = np.arange(num_ranges.max(), dtype=int)
    ranks = np.hstack([arange[:i] for i in num_ranges])
    return ranks

def get_mean_diff_dt(array):
    if len(array) == 1:
        return -1
    np_array = np.array(array)
    np_array[1:] - np_array[:-1]
    return (np_array[1:] - np_array[:-1]).mean()
    
def create_features_simple(table, train, users, clusters):
    table['count_item_id'] = (table.cluster_id.map(train['cluster_id'].value_counts()).fillna(0) / len(train)).astype(np.float32)

    table['num_orders'] = table['user_id'].map(
        train[['order_id', 'user_id']].drop_duplicates()['user_id'].value_counts()
    ).astype(np.int16)

    table['num_order_with_target_item'] = table['ui'].map(
        train[['order_id', 'ui']].drop_duplicates()['ui'].value_counts()
    ).fillna(0).astype(np.int16)
    
    last_order_ui = train[train.dt == \
          train['user_id'].map(
                train[['user_id', 'dt']].drop_duplicates().groupby('user_id').max()['dt']
    )].ui.unique()

    table['was_in_last_order'] = table['ui'].isin(last_order_ui).astype(np.int8)
    del last_order_ui

    prod_quantity = train.groupby('ui')['product_quantity'].sum()
    table['prod_quantity'] = table['ui'].map(prod_quantity).fillna(0).astype(np.int16)
    del prod_quantity
    
    prev_order_ui = train['dt'].max() - train.groupby('ui')['dt'].max()
    table['prev_order_ui'] = table['ui'].map(prev_order_ui).fillna(-1).astype(np.float32)
    del prev_order_ui
    
    mask = ~train[['user_id', 'order_id']].duplicated()
    
    table['user_retailer_most_common'] = table['user_id'].map(
        train[mask].groupby('user_id').retailer_id.apply(most_common)
    ).astype(np.int8)
    
    user_city_most_common = table['user_id'].map(
        train[mask].groupby('user_id').city_id.apply(most_common)
    ).astype(np.int16)
    
    del mask
    
    item_city_vc = (train['cluster_id'] * 100 + train['city_id']).value_counts()
    item_user_city = table['cluster_id'] * 100 + user_city_most_common
    table['user_item_city_vc'] = item_user_city.map(item_city_vc).fillna(0).astype(np.float32)
    del item_city_vc
    del item_user_city
    
    for col in ['cluster_size', 'd_mean', 'd_median']:
        table['cluster_' + col] = table['cluster_id'].map(
            clusters.set_index('cluster_id')[col]
        )
        table['cluster_' + col] = table['cluster_id'].map(
            clusters.set_index('cluster_id')[col]
        )
        
    short_train = train[train.user_id.isin(users)]
    
    table['product_quantity_sum'] = table.user_id.map(
          short_train.groupby('user_id').product_quantity.sum()
    )
    table['user_retailer_num'] = table.user_id.map(
        short_train.groupby('user_id').retailer_id.nunique()
    ).astype(np.int8)
    table['user_city_num'] = table.user_id.map(
        short_train.groupby('user_id').city_id.nunique()
    ).astype(np.int8)
    table['user_product_price_mean'] = table.user_id.map(
        short_train.groupby('user_id').product_price.mean()
    )
    table['user_product_discount_mean'] = table.user_id.map(
        (short_train.product_discount != 0).groupby(short_train.user_id).mean()
    ).astype(np.float16)
    table['user_num_clusters'] = table['user_id'].map(
        short_train[['cluster_id', 'user_id']].drop_duplicates()['user_id'].value_counts()
    ).astype(np.int16)
    table['last_user_city_id'] = table['user_id'].map(
        short_train.groupby('user_id').city_id.last()
    )
    table['last_user_retailer_id'] = table['user_id'].map(
        short_train.groupby('user_id').retailer_id.last()
    )
    
    table['user_most_common_cluster_id'] = table['user_id'].map(
        short_train.groupby('user_id').cluster_id.apply(most_common)
    )
    del short_train
    
    mask = ~train[['user_id', 'order_id', 'cluster_id']].duplicated()

    table['cluster_quantity_mean'] = table['cluster_id'].map(
        train.groupby('cluster_id').product_quantity.mean().astype(np.float16)
    )

    table['cluster_city_count'] = table['cluster_id'].map(
        train[mask].groupby('cluster_id').city_id.nunique()
    ).astype(np.float16)
    
    table['cluster_num_stores'] = table['cluster_id'].map(
        train[mask].groupby('cluster_id').store_id.nunique()
    ).astype(np.float16)
    del mask

    table['cluster_product_price_mean'] = table['cluster_id'].map(
        train.groupby('cluster_id').product_price.mean()
    ).astype(np.float16)

    table['cluster_mean_discount'] = table['cluster_id'].map(
        (train.product_discount == 0).groupby(train.cluster_id).mean().astype(np.float16)
    )

    table['num_users_bought_cluster'] = table['cluster_id'].map(
        train.groupby('cluster_id').user_id.nunique()
    ).fillna(0).astype(np.float16)

    table['num_orders_cluster'] = table['cluster_id'].map(
        train.groupby('cluster_id').order_id.nunique()
    ).fillna(0).astype(np.float16)
    
    mask = ~train[['order_id', 'cluster_id']].duplicated()
    short_train = train[mask]

    city_retailer = short_train.city_id.astype(np.int16) * 100 + short_train.retailer_id
    city_retailer_cluster = city_retailer.astype(np.int64) * 10000 + short_train.cluster_id

    city_retailer_user = user_city_most_common.astype(np.int16) * 100 + \
        table['user_retailer_most_common']
    city_retailer_cluster_user = city_retailer_user.astype(np.int64)*10000 + table.cluster_id

    table['f1'] = city_retailer_user.map(
        city_retailer.value_counts()
    ).fillna(0).astype(np.float32)

    table['f2'] = city_retailer_cluster_user.map(
        city_retailer_cluster.value_counts()
    ).fillna(0).astype(np.float32)

    table['f3'] = table['f2'] \
        / table['f1'] 
    
    
    del city_retailer_user
    del city_retailer_cluster_user

    city_retailer_user = table['last_user_city_id'].astype(np.int16) * 100 + \
        table['last_user_retailer_id']
    city_retailer_cluster_user = city_retailer_user.astype(np.int64)*10000 + table.cluster_id

    f4 = city_retailer_user.map(
        city_retailer.value_counts()
    ).fillna(0).astype(np.float32)

    table['f5'] = city_retailer_cluster_user.map(
        city_retailer_cluster.value_counts()
    ).fillna(0).astype(np.float32)

    table['f6'] = table['f5'] \
        / f4 
    del f4
    
    del city_retailer
    del city_retailer_user
    del city_retailer_cluster_user
    del city_retailer_cluster

    ui_vc = train.ui.value_counts()
    rnk_vc = train[['user_id', 'ui', 'cluster_id']].drop_duplicates()
    rnk_vc['vc'] = rnk_vc.ui.map(ui_vc)
    rnk_vc = rnk_vc.sort_values(['user_id', 'vc'], ascending=False)
    rnk_vc['rnk_user_id_ui'] = apply_rank('user_id', rnk_vc)
    table['rnk_user_id_ui'] = table.ui.map(rnk_vc.set_index('ui')['rnk_user_id_ui']
                                          ).fillna(10000).astype(np.int16)
    del ui_vc

    rnk_vc = rnk_vc.sort_values(['cluster_id', 'vc'], ascending=False)
    rnk_vc['rnk_cluster_id_ui'] = apply_rank('cluster_id', rnk_vc)
    table['rnk_cluster_id_ui'] = table.ui.map(rnk_vc.set_index('ui')['rnk_cluster_id_ui']
                                          ).fillna(10000).astype(np.int16)
    del rnk_vc

    rnk_vc = train['cluster_id'].value_counts().to_frame()
    rnk_vc['rnk_cluster_id'] = np.arange(len(rnk_vc))
    table['rnk_cluster_id'] = table.cluster_id.map(rnk_vc['rnk_cluster_id']
                                                  ).fillna(10000).astype(np.int16)
    del rnk_vc

    cluster_city_vc = (train['city_id'].astype(np.int32) * 10000 + train['cluster_id']
                      ).value_counts()
    rnk_vc = train[['city_id', 'cluster_id']].drop_duplicates()
    rnk_vc['cluster_city'] = rnk_vc['city_id'].astype(np.int32) * 10000 + rnk_vc['cluster_id']
    rnk_vc['vc'] = rnk_vc['cluster_city'].map(cluster_city_vc)
    rnk_vc = rnk_vc.sort_values(['city_id', 'vc'], ascending=False)
    rnk_vc['rnk_cluster_city'] = apply_rank('city_id', rnk_vc)
    user_city_cluster = table['last_user_city_id'].astype(np.int32) * 10000 \
        + table['cluster_id']
    table['rnk_cluster_city'] = user_city_cluster.map(
        rnk_vc.set_index('cluster_city')['rnk_cluster_city']
    ).fillna(10000).astype(np.int16)
    del cluster_city_vc
    del rnk_vc
    del user_city_cluster

    cluster_retailer_vc = (train['retailer_id'].astype(np.int32) * 10000 + train['cluster_id']
                      ).value_counts()
    rnk_vc = train[['retailer_id', 'cluster_id']].drop_duplicates()
    rnk_vc['cluster_retailer'] = rnk_vc['retailer_id'].astype(np.int32) * 10000 + rnk_vc['cluster_id']
    rnk_vc['vc'] = rnk_vc['cluster_retailer'].map(cluster_retailer_vc)
    rnk_vc = rnk_vc.sort_values(['retailer_id', 'vc'], ascending=False)
    rnk_vc['rnk_cluster_retailer'] = apply_rank('retailer_id', rnk_vc)
    user_retailer_cluster = table['last_user_retailer_id'].astype(np.int32) * 10000 \
        + table['cluster_id']
    table['rnk_cluster_retailer'] = user_retailer_cluster.map(
        rnk_vc.set_index('cluster_retailer')['rnk_cluster_retailer']
    ).fillna(10000).astype(np.int16)
    user_retailer_cluster = table['user_retailer_most_common'].astype(np.int32) * 10000 \
        + table['cluster_id']
    table['rnk_cluster_retailer2'] = user_retailer_cluster.map(
        rnk_vc.set_index('cluster_retailer')['rnk_cluster_retailer']
    ).fillna(10000).astype(np.int16)
    del cluster_retailer_vc
    del rnk_vc
    del user_retailer_cluster

    cluster_retailer_city_vc = (train['city_id'].astype(np.int64) * 10000000 + \
        train['retailer_id'].astype(np.int64) * 10000 + \
        train['cluster_id']).value_counts()
    rnk_vc = train[['retailer_id', 'cluster_id', 'city_id']].drop_duplicates()
    rnk_vc['cluster_retailer_city'] = (rnk_vc['city_id'].astype(np.int64) * 10000000 + \
        rnk_vc['retailer_id'].astype(np.int64) * 10000 + \
        rnk_vc['cluster_id'])
    rnk_vc['vc'] = rnk_vc['cluster_retailer_city'].map(cluster_retailer_city_vc)
    rnk_vc['retailer_city'] = (rnk_vc['city_id'].astype(np.int64) * 1000 + \
        rnk_vc['retailer_id'].astype(np.int64))
    rnk_vc = rnk_vc.sort_values(['retailer_city', 'vc'], ascending=False)
    rnk_vc['rnk_cluser_city_retailer'] = apply_rank('retailer_city', rnk_vc)
    user_retailer_city_cluster = (table['last_user_city_id'].astype(np.int64) * 10000000 + \
        table['last_user_retailer_id'].astype(np.int64) * 10000 + \
        table['cluster_id'])
    table['rnk_cluster_retailer_city'] = user_retailer_city_cluster.map(
        rnk_vc.set_index('cluster_retailer_city')['rnk_cluser_city_retailer']
    ).fillna(10000).astype(np.int16)
    user_retailer_city_cluster = (table['last_user_city_id'].astype(np.int64) * 10000000 + \
        table['user_retailer_most_common'].astype(np.int64) * 10000 + \
        table['cluster_id'])
    table['rnk_cluster_retailer_city2'] = user_retailer_city_cluster.map(
        rnk_vc.set_index('cluster_retailer_city')['rnk_cluser_city_retailer']
    ).fillna(10000).astype(np.int16)
    del cluster_retailer_city_vc
    del rnk_vc
    del user_retailer_city_cluster

    
    return table

def create_table(train, recs_nn, recs_mf, users):
    
    recs_nn['rnk'] = apply_rank('user_id', recs_nn)
    recs_mf['rnk'] = apply_rank('user_id', recs_mf)

    mask1 = recs_nn['user_id'].isin(users)
    mask2 = ~recs_mf.ui.isin(recs_nn.ui) & recs_mf['user_id'].isin(users)
    mask3 = ~(train.ui.isin(recs_nn.ui) | train.ui.isin(recs_mf.ui) \
              | train.ui.duplicated())  & train['user_id'].isin(users)
    
    table = pd.concat([
        recs_nn[['user_id', 'cluster_id']][mask1], 
        recs_mf[['user_id', 'cluster_id']][mask2], 
        train[['user_id', 'cluster_id']][mask3]
    ])
    table.reset_index(drop=True, inplace=True)
    del mask1
    del mask2
    del mask3
    table['ui'] = table['user_id'].astype(np.int64) * 10000 + table['cluster_id']
    
    table['rnk'] = table['ui'].map(
        recs_nn.set_index('ui')['rnk']
    ).fillna(10000).astype(np.int16)
    
    table['score'] = table['ui'].map(
        recs_nn.set_index('ui')['scores']
    ).fillna(-100).astype(np.float32)
    
    recs_nn = recs_nn[~recs_nn.ui.isin(train.ui)]
    recs_nn['rnk2'] = apply_rank('user_id', recs_nn)
    table['rnk2'] = table['ui'].map(
        recs_nn.set_index('ui')['rnk']
    ).fillna(10000).astype(np.int16)
    
    table['rnk3'] = table['ui'].map(
        recs_mf.set_index('ui')['rnk']
    ).fillna(10000).astype(np.int16)
    
    table['score2'] = table['ui'].map(
        recs_mf.set_index('ui')['score']
    ).fillna(-100).astype(np.float32)
    
    recs_mf = recs_mf[~recs_mf.ui.isin(train.ui)]
    recs_mf['rnk2'] = apply_rank('user_id', recs_mf)
    table['rnk4'] = table['ui'].map(
        recs_mf.set_index('ui')['rnk2']
    ).fillna(10000).astype(np.int16)
    
    return table
    

In [4]:
def get_recs(pred, users, items, already_bought, weights, num_recs=20):
    fix_pred = pred * (1.37 - already_bought) * (weights ** 1.5)
    indexes = (-fix_pred).argsort()
    recs = defaultdict(list)
    for user_id, item_id in zip(users[indexes], items[indexes]):
        if len(recs[user_id]) < num_recs:
            recs[user_id].append(item_id)
    return recs


def get_cluster_weights(dataset: pd.DataFrame) -> pd.DataFrame:

    cluster_popularity = dataset["cluster_id"].value_counts().sort_values(ascending=True).reset_index()
    cluster_popularity.columns = ["cluster_id", "cnt"]
    cluster_popularity["rank"] = cluster_popularity["cnt"].rank(method="dense") + 1
    cluster_popularity["w"] = 1 / np.log10(cluster_popularity["rank"])

    return cluster_popularity[["cluster_id", "w"]]

In [5]:
def get_table(train_path, recs_nn_path, recs_mf_path, 
              users, create_features_func, val_path=None):

    train = pd.read_parquet(train_path)
    train['product_price'] = train['product_price'].astype(np.float16)
    train['product_discount'] = train['product_discount'].astype(np.float16)

    recs_nn = pd.read_parquet(recs_nn_path)
    recs_mf = pd.read_parquet(recs_mf_path)
    clusters = pd.read_parquet(CLUSTERS_PATH)
    
    for df in [train, recs_nn, recs_mf]:
        df['ui'] = df['user_id'].astype(np.int64) * 10000 + df['cluster_id']
    
    table = create_table(train, recs_nn, recs_mf, users)
    del recs_nn
    del recs_mf

    table = create_features_func(table, train, users, clusters)
    del train
    del clusters

    X = table.drop(['user_id', 'ui'], axis=1).to_numpy(dtype=np.float32)

    if val_path is None:
        return X
    
    val = pd.read_parquet(val_path)
    val['ui'] = val['user_id'].astype(np.int64) * 10000 + val['cluster_id']
    y = np.array(table['ui'].isin(val['ui']))
    
    return X, y

In [6]:
def fit_first_ranker(ranker_model_path, users):

    X, y = get_table(TRAIN_VAL_PATH, RECS_NN_VAL_PATH, RECS_VAL_PATH, 
                         users, create_features_simple, VAL_PATH)
    
    lgb_table = lightgbm.Dataset(X, y)

    params = {
        'objective': 'binary',
        'num_leaves': 50,
        'learning_rate': 0.1,
        'verbose': 0,
        'seed': 1,
    }

    gbm = lightgbm.train(params, lgb_table, num_boost_round=1000)
    
    pickle.dump(gbm, open(ranker_model_path, 'wb'))

In [7]:
def create_first_layer_models():
    val = pd.read_parquet(VAL_PATH)
    users = val.user_id.unique()
    del val
    users1 = users[users % 2 == 0]
    users2 = users[users % 2 == 1]
    del users
    fit_first_ranker(RANKER_MODEL1_PATH, users1)
    fit_first_ranker(RANKER_MODEL2_PATH, users2)

In [8]:
!pip install memory_profiler



In [9]:
%load_ext memory_profiler

In [10]:
%%time
%%memit
create_first_layer_models()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
peak memory: 30635.54 MiB, increment: 30448.21 MiB
CPU times: user 6h 40min 19s, sys: 6min 44s, total: 6h 47min 3s
Wall time: 1h 16min 21s


In [10]:
def get_some_data(train_path, recs_nn_path, recs_mf_path, users):
    
    train = pd.read_parquet(train_path)
    recs_nn = pd.read_parquet(recs_nn_path)
    recs_mf = pd.read_parquet(recs_mf_path)
    clusters = pd.read_parquet(CLUSTERS_PATH)
    
    for df in [train, recs_nn, recs_mf]:
        df['ui'] = df['user_id'].astype(np.int64) * 10000 + df['cluster_id']
    
    table = create_table(train, recs_nn, recs_mf, users)
    
    already_bought = np.array(table['ui'].isin(train['ui']))
    cluster_weights = get_cluster_weights(train)
    weights = np.array(table.cluster_id.map(
        cluster_weights.set_index('cluster_id')['w']
    ).fillna(cluster_weights['w'].max()))
    del cluster_weights
    
    return (
        np.array(table['user_id']), 
        np.array(table['cluster_id']), 
        already_bought, 
        weights
    )

def _create_top_k(train_path, recs_nn_path, recs_mf_path, 
                 users, model_path, top_k_path, model_path2=None, k=60):
    
    X = get_table(train_path, recs_nn_path, recs_mf_path, users, create_features_simple)
    print(0)
    if model_path2 is None:
        ranker_model = pickle.load(open(model_path, 'rb'))
        pred = ranker_model.predict(X)
    else:
        ranker_model = pickle.load(open(model_path, 'rb'))
        pred1 = ranker_model.predict(X)
        ranker_model = pickle.load(open(model_path2, 'rb'))
        pred2 = ranker_model.predict(X)
        pred = np.mean([pred1, pred2], axis=0)
        
    del X
    print(1)
    users, items, already_bought, weights = get_some_data(
        train_path, recs_nn_path, recs_mf_path, users
    )
    recs = get_recs(pred, users, items, already_bought, weights, num_recs=k)
    
    users = []
    items = []
    for user_id in recs:
        users += [user_id] * len(recs[user_id])
        items += recs[user_id]
    del recs

    top_k = pd.DataFrame()
    top_k['user_id'] = users
    top_k['cluster_id'] = items
    top_k.to_parquet(top_k_path)

def create_top_k():
    
    val = pd.read_parquet(VAL_PATH)
    users = val.user_id.unique()
    del val
    users1 = users[users % 2 == 0]
    users2 = users[users % 2 == 1]
    del users
    
    _create_top_k(TRAIN_VAL_PATH, RECS_NN_VAL_PATH, RECS_VAL_PATH,
             users1, RANKER_MODEL2_PATH, TOPK_VAL1_PATH)
    _create_top_k(TRAIN_VAL_PATH, RECS_NN_VAL_PATH, RECS_VAL_PATH,
         users2, RANKER_MODEL1_PATH, TOPK_VAL2_PATH)
    
    topk_val1 = pd.read_parquet(TOPK_VAL1_PATH)
    topk_val2 = pd.read_parquet(TOPK_VAL2_PATH)
    topk_val = pd.concat([topk_val1, topk_val2])
    topk_val.to_parquet(TOPK_VAL_PATH, index=False)

In [12]:
%%time
%%memit
create_top_k()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0
1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0
1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


peak memory: 30802.16 MiB, increment: 29788.19 MiB
CPU times: user 3h 40min 29s, sys: 4min 19s, total: 3h 44min 48s
Wall time: 1h 3min 38s


In [11]:
def create_fit_table(train, table, clusters, recs_nn_path, recs_mf_path):
    
    users = table.user_id.unique()
    
    recs_nn = pd.read_parquet(recs_nn_path)
    recs_mf = pd.read_parquet(recs_mf_path)
    
    for df in [train, recs_nn, recs_mf]:
        df['ui'] = df['user_id'].astype(np.int64) * 10000 + df['cluster_id']
        
    recs_nn['rnk'] = apply_rank('user_id', recs_nn)
    recs_mf['rnk'] = apply_rank('user_id', recs_mf)
    
    table['rnk'] = table['ui'].map(
        recs_nn.set_index('ui')['rnk']
    ).fillna(10000).astype(np.int16)
    
    table['score'] = table['ui'].map(
        recs_nn.set_index('ui')['scores']
    ).fillna(-100).astype(np.float32)
    
    mask = recs_nn.ui.isin(train.ui)
    
    recs_short = recs_nn[~mask]
    recs_short['rnk'] = apply_rank('user_id', recs_short)
    table['rnk2'] = table['ui'].map(
        recs_short.set_index('ui')['rnk']
    ).fillna(10000).astype(np.int16)
    
#     recs_short = recs_nn[mask]
#     recs_short['rnk'] = apply_rank('user_id', recs_short)
#     table['rnk3'] = table['ui'].map(
#         recs_short.set_index('ui')['rnk']
#     ).fillna(10000).astype(np.int16)
    del recs_nn
    
    table['rnk4'] = table['ui'].map(
        recs_mf.set_index('ui')['rnk']
    ).fillna(10000).astype(np.int16)
    
#     table['score2'] = table['ui'].map(
#         recs_mf.set_index('ui')['score']
#     ).fillna(-100).astype(np.float32)
    
    mask = recs_mf.ui.isin(train.ui)
    
    recs_short = recs_mf[~mask]
    recs_short['rnk'] = apply_rank('user_id', recs_short)
    table['rnk5'] = table['ui'].map(
        recs_short.set_index('ui')['rnk']
    ).fillna(10000).astype(np.int16)
    
#     recs_short = recs_mf[mask]
#     recs_short['rnk'] = apply_rank('user_id', recs_short)
#     table['rnk6'] = table['ui'].map(
#         recs_short.set_index('ui')['rnk']
#     ).fillna(10000).astype(np.int16)
    del recs_mf
    
#     count_user_id = table.user_id.map(train['user_id'].value_counts()).fillna(0).astype(np.int16)
    table['count_item_id'] = (table.cluster_id.map(train['cluster_id'].value_counts()).fillna(0) / len(train)).astype(np.float32)

    table['num_orders'] = table['user_id'].map(
        train[['order_id', 'user_id']].drop_duplicates()['user_id'].value_counts()
    ).astype(np.int16)

    table['num_order_with_target_item'] = table['ui'].map(
        train[['order_id', 'ui']].drop_duplicates()['ui'].value_counts()
    ).fillna(0).astype(np.int16)
    
    last_order_ui = train[train.dt == \
          train['user_id'].map(
                train[['user_id', 'dt']].drop_duplicates().groupby('user_id').max()['dt']
    )].ui.unique()

    table['was_in_last_order'] = table['ui'].isin(last_order_ui).astype(np.int8)
    del last_order_ui

    prod_quantity = train.groupby('ui')['product_quantity'].sum()
    table['prod_quantity'] = table['ui'].map(prod_quantity).fillna(0).astype(np.int16)
    del prod_quantity
    
    mask = ~train[['user_id', 'order_id']].duplicated()
    
    table['user_retailer_most_common'] = table['user_id'].map(
        train[mask].groupby('user_id').retailer_id.apply(most_common)
    ).astype(np.int8)
    
    user_city_most_common = table['user_id'].map(
        train[mask].groupby('user_id').city_id.apply(most_common)
    ).astype(np.int16)
    
    del mask
    
#     item_retailer_vc = (train['cluster_id'] * 100 + train['retailer_id']).value_counts()
#     item_user_retailer = table['cluster_id'] * 100 + table['user_retailer_most_common']
#     table['user_item_retailer_vc'] = item_user_retailer.map(item_retailer_vc).fillna(0).astype(np.float32)
#     del item_retailer_vc
#     del item_user_retailer
    
    item_city_vc = (train['cluster_id'] * 100 + train['city_id']).value_counts()
    item_user_city = table['cluster_id'] * 100 + user_city_most_common
    table['user_item_city_vc'] = item_user_city.map(item_city_vc).fillna(0).astype(np.float32)
    del item_city_vc
    del item_user_city
    
    for col in ['cluster_size', 'd_mean', 'd_median']:
        table['cluster_' + col] = table['cluster_id'].map(
            clusters.set_index('cluster_id')[col]
        )
        table['cluster_' + col] = table['cluster_id'].map(
            clusters.set_index('cluster_id')[col]
        )
        
#     # user features 
    short_train = train[train.user_id.isin(users)]
    
    ui_dt = defaultdict(list)
    short_train3 = short_train[~short_train[['ui', 'order_id']].duplicated()]
    for ui, dt in zip(short_train3['ui'], short_train3['dt']):
        ui_dt[ui].append(dt)
    del short_train3
    table['ui_dt_diff_mean'] = table.ui.map(
        {key: get_mean_diff_dt(value) for key, value in ui_dt.items()}
    ).fillna(-1).astype(np.float32)
    del ui_dt
    
    table['product_quantity_sum'] = table.user_id.map(
          short_train.groupby('user_id').product_quantity.sum()
    )
    table['user_retailer_num'] = table.user_id.map(
        short_train.groupby('user_id').retailer_id.nunique()
    ).astype(np.int8)
#     table['user_city_num'] = table.user_id.map(
#         short_train.groupby('user_id').city_id.nunique()
#     ).astype(np.int8)
    table['user_product_price_mean'] = table.user_id.map(
        short_train.groupby('user_id').product_price.mean()
    )
#     table['user_product_price_sum'] = table.user_id.map(
#         short_train.product_price.astype(np.float32).groupby(short_train.user_id).sum()
#     )
    table['user_product_discount_mean'] = table.user_id.map(
        (short_train.product_discount != 0).groupby(short_train.user_id).mean()
    ).astype(np.float32)
    table['user_num_clusters'] = table['user_id'].map(
        short_train[['cluster_id', 'user_id']].drop_duplicates()['user_id'].value_counts()
    ).astype(np.int16)
    table['last_user_city_id'] = table['user_id'].map(
        short_train.groupby('user_id').city_id.last()
    )
    table['last_user_retailer_id'] = table['user_id'].map(
        short_train.groupby('user_id').retailer_id.last()
    )
#     table['user_mean_clusters_in_order'] = table['user_id'].map(
#         short_train.groupby(['user_id', 'order_id']).cluster_id.nunique().reset_index() \
#         .groupby('user_id').cluster_id.mean()
#     ).astype(np.float16)
    table['user_most_common_cluster_id'] = table['user_id'].map(
        short_train.groupby('user_id').cluster_id.apply(most_common)
    )
    del short_train
    
    # item features 
    
    mask = ~train[['user_id', 'order_id', 'cluster_id']].duplicated()
    
#     table['cluster_quantity_sum'] = table['cluster_id'].map(
#         train.groupby('cluster_id').product_quantity.sum().astype(np.float32)
#     )
    table['cluster_quantity_mean'] = table['cluster_id'].map(
        train.groupby('cluster_id').product_quantity.mean().astype(np.float32)
    )

    for retailer_id in [0, 1, 7]: # [1, 7, 0, 16, 6, 4, 19, 12, 15]
        table[f'cluster_retailer_{retailer_id}'] = table['cluster_id'].map(
            (train[mask].retailer_id == retailer_id).groupby(train[mask].cluster_id).mean(
            ).astype(np.float32)
        )
    
    table['cluster_city_count'] = table['cluster_id'].map(
        train[mask].groupby('cluster_id').city_id.nunique()
    ).astype(np.float32)
    

#     table['last_dt_delta'] = table['cluster_id'].map(
#         train.dt.max() - train.groupby('cluster_id').dt.max()
#     ).astype(np.float32)

    table['cluster_num_stores'] = table['cluster_id'].map(
        train[mask].groupby('cluster_id').store_id.nunique()
    ).astype(np.float32)
    del mask

    table['cluster_product_price_mean'] = table['cluster_id'].map(
        train.groupby('cluster_id').product_price.mean()
    ).astype(np.float32)

    table['cluster_mean_discount'] = table['cluster_id'].map(
        (train.product_discount == 0).groupby(train.cluster_id).mean().astype(np.float32)
    )

    table['num_users_bought_cluster'] = table['cluster_id'].map(
        train.groupby('cluster_id').user_id.nunique()
    ).fillna(0).astype(np.float32)

    table['num_orders_cluster'] = table['cluster_id'].map(
        train.groupby('cluster_id').order_id.nunique()
    ).fillna(0).astype(np.float32)
    
#     more features
    
    mask = ~train[['order_id', 'cluster_id']].duplicated()
    short_train = train[mask]

    city_retailer = short_train.city_id.astype(np.int16) * 100 + short_train.retailer_id
    city_retailer_cluster = city_retailer.astype(np.int64) * 10000 + short_train.cluster_id

    city_retailer_user = user_city_most_common.astype(np.int16) * 100 + \
        table['user_retailer_most_common']
    city_retailer_cluster_user = city_retailer_user.astype(np.int64)*10000 + table.cluster_id

    table['f1'] = city_retailer_user.map(
        city_retailer.value_counts()
    ).fillna(0).astype(np.float32)

    table['f2'] = city_retailer_cluster_user.map(
        city_retailer_cluster.value_counts()
    ).fillna(0).astype(np.float32)

    table['f3'] = table['f2'] \
        / table['f1'] 
    
    
    del city_retailer_user
    del city_retailer_cluster_user

    city_retailer_user = table['last_user_city_id'].astype(np.int16) * 100 + \
        table['last_user_retailer_id']
    city_retailer_cluster_user = city_retailer_user.astype(np.int64)*10000 + table.cluster_id

    f4 = city_retailer_user.map(
        city_retailer.value_counts()
    ).fillna(0).astype(np.float32)

    table['f5'] = city_retailer_cluster_user.map(
        city_retailer_cluster.value_counts()
    ).fillna(0).astype(np.float32)

    table['f6'] = table['f5'] \
        / f4 
    del f4
    
    del city_retailer
    del city_retailer_user
    del city_retailer_cluster_user
    del city_retailer_cluster
    
    #more and more features
    
    short_train = train[train.user_id.isin(users)]
    short_train2 = short_train[~short_train[['user_id', 'order_id']].duplicated()]
    
    table['time_from_order_with_target_item'] = table.ui.map(
        short_train.dt.max() - short_train.groupby('ui').dt.last()
    ).fillna(-1).astype(np.float32)
    
    user_dt = defaultdict(list)
    for user_id, dt in zip(short_train2['user_id'], short_train2['dt']):
        user_dt[user_id].append(dt)
    del short_train2
    
    table['user_dt_diff_mean'] = table.user_id.map(
        {key: get_mean_diff_dt(value) for key, value in user_dt.items()}
    ).fillna(-1).astype(np.float32)
    del user_dt
    
    table['share_order_with_target_item'] = (
        table['num_order_with_target_item'] / table['num_orders'] 
    ).astype(np.float32)
    
    table['ui_num'] = table.ui.map(short_train.ui.value_counts()).fillna(0).astype(np.int16)
#     table['share_clusters_with_target_item'] = (
#         table['ui_num']/ table['count_user_id']
#     ).astype(np.float32)

    table['share_quatity'] = (
        table['prod_quantity'] / table['product_quantity_sum']
    ).astype(np.float32)

    short_train4 = short_train[
        short_train.user_id.map(short_train.groupby('user_id').retailer_id.last()) == \
        short_train.retailer_id
    ]
        
    table['num_order_with_last_retailer'] = table['user_id'].map(
        short_train4[['user_id', 'order_id']].drop_duplicates()['user_id'].value_counts()
    ).astype(np.int16)

    table['num_order_with_target_item_last_retailer'] = table['ui'].map(
        short_train4[['order_id', 'ui']].drop_duplicates()['ui'].value_counts()
    ).fillna(0).astype(np.int16)
    del short_train4

    table['share_order_with_target_item_last_retailer'] = (
        table['num_order_with_target_item_last_retailer'] / table['num_order_with_last_retailer']
    ).astype(np.float32)
    

    
    
    ui_vc = train.ui.value_counts()
    rnk_vc = train[['user_id', 'ui', 'cluster_id']].drop_duplicates()
    rnk_vc['vc'] = rnk_vc.ui.map(ui_vc)
    rnk_vc = rnk_vc.sort_values(['user_id', 'vc'], ascending=False)
    rnk_vc['rnk_user_id_ui'] = apply_rank('user_id', rnk_vc)
    table['rnk_user_id_ui'] = table.ui.map(rnk_vc.set_index('ui')['rnk_user_id_ui']
                                          ).fillna(10000).astype(np.int16)
    del ui_vc

    rnk_vc = rnk_vc.sort_values(['cluster_id', 'vc'], ascending=False)
    rnk_vc['rnk_cluster_id_ui'] = apply_rank('cluster_id', rnk_vc)
    table['rnk_cluster_id_ui'] = table.ui.map(rnk_vc.set_index('ui')['rnk_cluster_id_ui']
                                          ).fillna(10000).astype(np.int16)
    del rnk_vc

    rnk_vc = train['cluster_id'].value_counts().to_frame()
    rnk_vc['rnk_cluster_id'] = np.arange(len(rnk_vc))
    table['rnk_cluster_id'] = table.cluster_id.map(rnk_vc['rnk_cluster_id']
                                                  ).fillna(10000).astype(np.int16)
    del rnk_vc

    cluster_city_vc = (train['city_id'].astype(np.int32) * 10000 + train['cluster_id']
                      ).value_counts()
    rnk_vc = train[['city_id', 'cluster_id']].drop_duplicates()
    rnk_vc['cluster_city'] = rnk_vc['city_id'].astype(np.int32) * 10000 + rnk_vc['cluster_id']
    rnk_vc['vc'] = rnk_vc['cluster_city'].map(cluster_city_vc)
    rnk_vc = rnk_vc.sort_values(['city_id', 'vc'], ascending=False)
    rnk_vc['rnk_cluster_city'] = apply_rank('city_id', rnk_vc)
    user_city_cluster = table['last_user_city_id'].astype(np.int32) * 10000 \
        + table['cluster_id']
    table['rnk_cluster_city'] = user_city_cluster.map(
        rnk_vc.set_index('cluster_city')['rnk_cluster_city']
    ).fillna(10000).astype(np.int16)
    del cluster_city_vc
    del rnk_vc
    del user_city_cluster

    cluster_retailer_vc = (train['retailer_id'].astype(np.int32) * 10000 + train['cluster_id']
                      ).value_counts()
    rnk_vc = train[['retailer_id', 'cluster_id']].drop_duplicates()
    rnk_vc['cluster_retailer'] = rnk_vc['retailer_id'].astype(np.int32) * 10000 + rnk_vc['cluster_id']
    rnk_vc['vc'] = rnk_vc['cluster_retailer'].map(cluster_retailer_vc)
    rnk_vc = rnk_vc.sort_values(['retailer_id', 'vc'], ascending=False)
    rnk_vc['rnk_cluster_retailer'] = apply_rank('retailer_id', rnk_vc)
    user_retailer_cluster = table['last_user_retailer_id'].astype(np.int32) * 10000 \
        + table['cluster_id']
    table['rnk_cluster_retailer'] = user_retailer_cluster.map(
        rnk_vc.set_index('cluster_retailer')['rnk_cluster_retailer']
    ).fillna(10000).astype(np.int16)
    user_retailer_cluster = table['user_retailer_most_common'].astype(np.int32) * 10000 \
        + table['cluster_id']
    table['rnk_cluster_retailer2'] = user_retailer_cluster.map(
        rnk_vc.set_index('cluster_retailer')['rnk_cluster_retailer']
    ).fillna(10000).astype(np.int16)
    del cluster_retailer_vc
    del rnk_vc
    del user_retailer_cluster

    cluster_retailer_city_vc = (train['city_id'].astype(np.int64) * 10000000 + \
        train['retailer_id'].astype(np.int64) * 10000 + \
        train['cluster_id']).value_counts()
    rnk_vc = train[['retailer_id', 'cluster_id', 'city_id']].drop_duplicates()
    rnk_vc['cluster_retailer_city'] = (rnk_vc['city_id'].astype(np.int64) * 10000000 + \
        rnk_vc['retailer_id'].astype(np.int64) * 10000 + \
        rnk_vc['cluster_id'])
    rnk_vc['vc'] = rnk_vc['cluster_retailer_city'].map(cluster_retailer_city_vc)
    rnk_vc['retailer_city'] = (rnk_vc['city_id'].astype(np.int64) * 1000 + \
        rnk_vc['retailer_id'].astype(np.int64))
    rnk_vc = rnk_vc.sort_values(['retailer_city', 'vc'], ascending=False)
    rnk_vc['rnk_cluser_city_retailer'] = apply_rank('retailer_city', rnk_vc)
    user_retailer_city_cluster = (table['last_user_city_id'].astype(np.int64) * 10000000 + \
        table['last_user_retailer_id'].astype(np.int64) * 10000 + \
        table['cluster_id'])
    table['rnk_cluster_retailer_city'] = user_retailer_city_cluster.map(
        rnk_vc.set_index('cluster_retailer_city')['rnk_cluser_city_retailer']
    ).fillna(10000).astype(np.int16)
    user_retailer_city_cluster = (table['last_user_city_id'].astype(np.int64) * 10000000 + \
        table['user_retailer_most_common'].astype(np.int64) * 10000 + \
        table['cluster_id'])
    table['rnk_cluster_retailer_city2'] = user_retailer_city_cluster.map(
        rnk_vc.set_index('cluster_retailer_city')['rnk_cluser_city_retailer']
    ).fillna(10000).astype(np.int16)
    del cluster_retailer_city_vc
    del rnk_vc
    del user_retailer_city_cluster
    
    short_train = train[['cluster_id', 'user_id']][
        train.user_id.isin(users) & (~train[['ui', 'order_id']].duplicated())
    ]
    
    vc = short_train['user_id'].value_counts()
    for cluster_id in TOP_K_CLUSTERS[:140]:
        table[f'f102_{cluster_id}'] = table.user_id.map(
            (short_train.cluster_id == cluster_id).groupby(short_train.user_id).sum() / vc
        ).astype(np.float16)
    
    return table

In [12]:
def get_lgb_table(path1, path2, path3, path4, path5):

    train_val = pd.read_parquet(path1)
    train_val['product_price'] = train_val['product_price'].astype(np.float32)
    train_val['product_discount'] = train_val['product_discount'].astype(np.float32)
    val = pd.read_parquet(path2)
    clusters = pd.read_parquet(CLUSTERS_PATH)
    table = pd.read_parquet(path5)

    for df in [train_val, val, table]:
        df['ui'] = df['user_id'].astype(np.int64) * 10000 + df['cluster_id']
    print(0)
    table = create_fit_table(train_val, table, clusters,
                            path3, path4)
    print(1)
    del train_val
    del clusters
    
    y  = np.array(table['ui'].isin(val['ui']))
    del val
    print(2)
    del table['user_id']
    del table['ui']
    X = table.to_numpy(dtype=np.float32)
    del table
    
    return X, y


def fit_ranker():

    X, y = get_lgb_table(TRAIN_VAL_PATH, VAL_PATH, RECS_NN_VAL_PATH, 
                         RECS_VAL_PATH, TOPK_VAL_PATH)
    
    lgb_table = lightgbm.Dataset(X, y)
    print(3)

    params = {
        'objective': 'binary',
        'num_leaves': 100,
        'learning_rate': 0.03,
        'verbose': 0,
        'seed': 1,
    }
    
    gbm = lightgbm.train(params, lgb_table, num_boost_round=5000)
    pickle.dump(gbm, open(RANKER_MODEL_PATH, 'wb'))

In [13]:
%%time
%memit fit_ranker()

0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

1
2
3
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
peak memory: 32202.47 MiB, increment: 32014.02 MiB
CPU times: user 9h 36min 28s, sys: 10min 39s, total: 9h 47min 7s
Wall time: 1h 30min 50s
