In [1]:
# Learning to rank
# 5 weeks of training data (including 1 week of validation data)
# Candidate generation over 12 weeks
# Candidate generation with trending
# Build articles and customers features per target_week
# Measure the quality of the candidates
# New: candidate generation with pairs ✅
# New: candidate generation from each customer's last week ✅
# New: parallelise inference ❌
# New: inference batch size of 50k customers ✅
# New: the model looks underfitted, so train it more
# New: candidate generation with kangol
# MAP@12 (all): 0.030636
# MAP@12 (cold start): 0.006791

# Experiment id; it prefixes every model / column-list / submission file written below.
EXP = '022'

In [2]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm
from joblib import Parallel, delayed
from psutil import cpu_count

from pathlib import Path
import pickle
import gc
import os
from time import time
import warnings

# Register tqdm's progress-bar helpers on pandas objects.
tqdm.pandas()
# Never truncate columns when displaying wide frames.
pd.options.display.max_columns = None
# Silence pandas PerformanceWarning and every UserWarning.
warnings.simplefilter('ignore', pd.errors.PerformanceWarning)
warnings.simplefilter('ignore', UserWarning)
# Directory the competition CSV files are read from.
data_path = Path('../input/h-and-m-personalized-fashion-recommendations/')

In [3]:
# Load the raw competition tables from data_path.
transactions = pd.read_csv(
    data_path / 'transactions_train.csv',
    # article_id is read as int32, which drops the id's leading '0'; the
    # prediction strings built later prepend it again.
    dtype={'article_id': 'int32'},
    parse_dates=['t_dat'])
customers = pd.read_csv(data_path / 'customers.csv')
articles = pd.read_csv(
    data_path / 'articles.csv',
    dtype={'article_id': 'int32'})

# Age of each transaction relative to the newest one: t_diff in days and week
# in 7-day buckets, so week 0 is the most recent week.
t_max = transactions['t_dat'].max()
transactions['t_diff'] = (t_max - transactions['t_dat']).dt.days
transactions['week'] = transactions['t_diff'] // 7

# Keep only the two listed frequency values; everything else becomes missing.
customers.loc[~customers['fashion_news_frequency'].isin(['Regularly', 'Monthly']), 'fashion_news_frequency'] = None

# Replace the string customer ids by the row index of `customers` in both
# tables; the two dicts convert in either direction.
id_to_index_dict = dict(zip(customers["customer_id"], customers.index))
index_to_id_dict = dict(zip(customers.index, customers["customer_id"]))
transactions["customer_id"] = transactions["customer_id"].map(id_to_index_dict).astype('int32')
customers['customer_id'] = customers['customer_id'].map(id_to_index_dict).astype('int32')

print(transactions.shape)
display(transactions.tail())

(31788324, 7)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,t_diff,week
31788319,2020-09-22,1371691,929511001,0.059305,2,0,0
31788320,2020-09-22,1371691,891322004,0.042356,2,0,0
31788321,2020-09-22,1371721,918325001,0.043203,1,0,0
31788322,2020-09-22,1371747,833459002,0.006763,1,0,0
31788323,2020-09-22,1371960,898573003,0.033881,2,0,0


In [4]:
def generate_candidates_trending(transactions: pd.DataFrame, customers: np.ndarray, target_week: int):
    """Score (customer, article) pairs by how strongly the article is trending.

    Only transactions of the given customers that are at least one week older
    than ``target_week`` are used. Each transaction contributes
    ``quotient * y`` where

    * ``quotient`` is the article's sales in relative week 1 divided by its
      sales in the transaction's own week (both counted on the filtered rows),
    * ``y`` is a decay of the transaction's age in days (at least 1) measured
      from the newest filtered transaction, floored at 0.

    Args:
        transactions: Frame with ``customer_id``, ``article_id``, ``t_dat``
            (datetime) and ``week`` columns. It is not modified.
        customers: Customer ids to build candidates for.
        target_week: Week being predicted; ``week`` is shifted by this value.

    Returns:
        Frame with columns ``customer_id``, ``article_id``, ``trending_value``
        and ``isin_trending`` (always 1), restricted to pairs whose summed
        value is positive.
    """
    df = transactions.query("customer_id in @customers").copy()
    df['week'] = df['week'] - target_week
    df = df.query('week >= 1')

    weekly_sales = df.groupby(['week', 'article_id'])['article_id'].count().rename('count').reset_index()
    df = df.merge(weekly_sales, on=['week', 'article_id'], how='left')

    # Sales of each article in the week right before the target week.
    latest_sales = weekly_sales.loc[weekly_sales['week'] == 1].set_index('article_id')[['count']]
    df = df.join(latest_sales, on='article_id', rsuffix='_targ')
    # Plain assignment instead of a chained inplace fillna, which does not
    # update the frame when pandas copy-on-write is active.
    df['count_targ'] = df['count_targ'].fillna(0)
    df['quotient'] = df['count_targ'] / df['count']

    t_max = df['t_dat'].max()
    df['x'] = ((t_max - df['t_dat']) / np.timedelta64(1, 'D')).astype(int)
    # Same-day purchases count as one day old so the square root below is defined.
    df['x'] = df['x'].clip(lower=1)
    a, b, c, d = 2.5e4, 1.5e5, 2e-1, 1e3
    df['y'] = a / np.sqrt(df['x']) + b * np.exp(-c*df['x']) - d
    df['y'] = df['y'].clip(lower=0)
    df['trending_value'] = df['quotient'] * df['y']
    df = df.groupby(['customer_id', 'article_id']).agg({'trending_value': 'sum'}).reset_index()
    df = df.loc[df['trending_value'] > 0].copy()
    df['isin_trending'] = 1

    return df[['customer_id', 'article_id', 'trending_value', 'isin_trending']]

In [5]:
def generate_candidates_recently(transactions: pd.DataFrame, customers: np.ndarray, target_week: int):
    """Collect every article the given customers bought in the 12 weeks before ``target_week``.

    Args:
        transactions: Frame with ``customer_id``, ``article_id`` and ``week``
            columns. It is not modified.
        customers: Customer ids to build candidates for.
        target_week: Week being predicted; ``week`` is shifted by this value
            and only relative weeks 1..12 are kept.

    Returns:
        One row per (customer_id, article_id) pair with a ``count_{w}w`` column
        for every relative week ``w`` present in the data (ascending; NaN when
        the pair was not bought that week) and ``isin_recently`` set to 1.
        With no matching transactions the result is empty and holds only the
        key columns and ``isin_recently``.
    """
    df = transactions.query("customer_id in @customers").copy()
    df['week'] = df['week'] - target_week
    df = df.query("week >= 1 and week <= 12")

    purchase_df = None
    # Iterate weeks in ascending order and seed the result with whichever week
    # comes first, so the outcome no longer depends on the row order of
    # `transactions` or on week 1 being present.
    for w in sorted(df['week'].unique()):
        weekly_counts = (
            df.loc[df['week'] == w]
            .groupby(['customer_id', 'article_id'])['article_id']
            .count()
            .rename(f'count_{w}w')
            .reset_index()
        )
        if purchase_df is None:
            purchase_df = weekly_counts
        else:
            purchase_df = purchase_df.merge(weekly_counts, how='outer', on=['customer_id', 'article_id'])

    if purchase_df is None:
        # No history at all: return an empty frame that still has the key columns.
        purchase_df = df[['customer_id', 'article_id']].drop_duplicates()

    purchase_df['isin_recently'] = 1

    return purchase_df

In [6]:
def generate_candidates_popular(transactions: pd.DataFrame, customers: np.ndarray, target_week: int):
    '''Pair every requested customer with the best-selling articles of the last 4 weeks.

    make_customers_feature already derives features from purchase counts, so
    this function only returns the candidate pairs.

    An article qualifies when its sales rank (ties share the best rank) is at
    most 12 in any of the relative weeks 1..4 before ``target_week``. Sales are
    counted over all transactions, not only those of ``customers``.

    Returns a frame with ``customer_id``, ``article_id`` and ``isin_popular``
    (always 1) holding the full customers x articles cross product.
    '''
    history = transactions.copy()
    history['week'] = history['week'] - target_week
    history = history.query("week >= 1").query("week <= 4")

    sales = history.groupby(['article_id', 'week'])['week'].count()
    sales = sales.rename('dummy_count').reset_index().copy()
    sales['rank_in_week'] = sales.groupby('week')['dummy_count'].rank(method='min', ascending=False)
    top_articles = sales.query('rank_in_week <= 12')['article_id'].unique()

    # Cross product: each customer id repeated once per article, next to the
    # article list repeated once per customer.
    customer_col = np.repeat(customers, repeats=len(top_articles)).reshape(-1, 1)
    article_col = np.repeat(top_articles[None, :], repeats=len(customers), axis=0).reshape(-1, 1)
    pairs = pd.DataFrame(
        np.concatenate([customer_col, article_col], axis=1),
        columns=['customer_id', 'article_id'],
    )
    pairs['isin_popular'] = 1

    return pairs

In [16]:
def generate_candidates_lastw(transactions: pd.DataFrame, customers: np.ndarray, target_week: int):
    '''Take the articles bought during each customer's own last active week.

    For every customer in ``customers`` the newest purchase date (among
    transactions at least one week older than ``target_week``) is located and
    all purchases made at most 6 days before that date are kept.

    Args:
        transactions: Frame with ``customer_id``, ``article_id``, ``t_dat``
            (datetime) and ``week`` columns. It is not modified.
        customers: Customer ids to build candidates for.
        target_week: Week being predicted; ``week`` is shifted by this value.

    Returns:
        One row per (customer_id, article_id) pair with ``count_lastw``
        (purchases of the pair inside that window), ``last_dat`` (newest
        purchase date of the pair as seconds since 1970-01-01) and
        ``isin_lastw`` (always 1).
    '''
    df = transactions.query("customer_id in @customers").copy()
    df['week'] = df['week'] - target_week
    df = df.query("week >= 1").copy()

    # transform keeps the datetime dtype even when no rows are left.
    df['max_dat'] = df.groupby('customer_id')['t_dat'].transform('max')
    df['max_diff'] = (df['max_dat'] - df['t_dat']).dt.days
    df = df.query('max_diff <= 6')
    pair_counts = df.groupby(['customer_id', 'article_id'])['article_id'].count().rename('count_lastw').reset_index()
    df = df.merge(pair_counts, on=['customer_id', 'article_id'])
    df = df.sort_values('t_dat', ascending=True)
    df = df.drop_duplicates(['customer_id', 'article_id'], keep='last')
    df = df.rename(columns={'t_dat': 'last_dat'})
    # Epoch seconds computed from a timedelta so the result does not depend on
    # the datetime column's resolution (ns, us, s, ...).
    df['last_dat'] = (df['last_dat'] - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
    df['isin_lastw'] = 1

    return df[['customer_id', 'article_id', 'count_lastw', 'last_dat', 'isin_lastw']]

In [17]:
def generate_candidates_pairs(transactions: pd.DataFrame, customers: np.ndarray, target_week: int):
    """Propose the paired article of everything a customer bought in the last 12 weeks.

    The pair table is read from
    ``../input/hmitempairs/pairs_cudf_fold{target_week}.parquet`` and must
    provide the columns ``article_id``, ``pair_article_id`` and ``pair_ratio``.

    Args:
        transactions: Frame with ``customer_id``, ``article_id`` and ``week``
            columns. It is not modified.
        customers: Customer ids to build candidates for.
        target_week: Week being predicted; selects the pair file and shifts
            ``week`` (relative weeks 1..12 are used).

    Returns:
        Frame with ``customer_id``, ``article_id`` (the paired article),
        ``count_pair`` (purchase count of the source article times
        ``pair_ratio``) and ``isin_pair`` (always 1). A pair can appear more
        than once per customer when several purchased articles map to it.
    """
    df = transactions.query("customer_id in @customers").copy()
    df['week'] = df['week'] - target_week
    df = df.query("week >= 1")
    df = df.query('week <= 12').copy()

    pair_counts = df.groupby(['customer_id', 'article_id'])['article_id'].count().rename('count').reset_index()
    df = df.merge(pair_counts, on=['customer_id', 'article_id']).drop_duplicates(['customer_id', 'article_id'], keep='last')
    # read_parquet has no `dtype` parameter (unknown keywords are forwarded to
    # the parquet engine), so load first and cast afterwards. Only the two id
    # columns are cast: pair_ratio is multiplied below and must stay as stored.
    pairs = pd.read_parquet(f'../input/hmitempairs/pairs_cudf_fold{target_week}.parquet')
    pairs = pairs.astype({'article_id': 'int32', 'pair_article_id': 'int32'})
    df = df.merge(pairs, on='article_id', how='inner')
    df = df.drop('article_id', axis=1).rename(columns={'pair_article_id': 'article_id'})
    df['count_pair'] = df['count'] * df['pair_ratio']
    df['isin_pair'] = 1

    return df[['customer_id', 'article_id', 'count_pair', 'isin_pair']]

In [10]:
def make_customers_feature(customers: pd.DataFrame, transactions: pd.DataFrame, target_week: int, debug: bool = False) -> pd.DataFrame:
    """Build per-customer features from purchases made before ``target_week``.

    Args:
        customers: Customer table; ``postal_code`` is dropped, every other
            column is carried into the result.
        transactions: Frame with ``customer_id``, ``article_id`` and ``week``
            columns. A copy is taken, so the input is not modified.
        target_week: Week being predicted. ``week`` is shifted by this value
            and only relative weeks >= 1 are used.
        debug: When exactly ``True``, restrict the history to relative weeks
            <= 24, which also bounds the number of per-week columns.

    Returns:
        One row per customer with the static columns plus:

        * ``purchase_{agg}_groupby_customer``: max/min/mean/sum of the weekly
          purchase counts, taken over the weeks in which the customer bought
          something.
        * ``purchase_{w}w`` and its ratio/diff against each aggregate, for
          every relative week ``w`` present in the data.
        * ``repurchase_*`` / ``purchase_*`` counts and their ratios.

        The per-week columns are created in reversed order of first appearance
        of the weeks in ``transactions``.
    """
    df = transactions.copy()
    customers_feature = customers.drop(['postal_code'], axis=1).copy()
    customers_feature.loc[~customers_feature['fashion_news_frequency'].isin(['Regularly', 'Monthly']), 'fashion_news_frequency'] = None
    customers_feature[['FN', 'Active']] = customers_feature[['FN', 'Active']].fillna(0)

    # Prevent leakage: keep only transactions older than the target week.
    df['week'] = df['week'] - target_week
    df = df.query('week >= 1')
    
    if debug == True:
        df = df.query('week <= 24')

    # Number of purchases per customer and week (weeks without purchases are absent).
    weekly_purchase = df.groupby(['customer_id', 'week'])['week'].count().rename('purchase').reset_index()
    
    for agg_name in ['max', 'min', 'mean', 'sum']:
        agg_sr = weekly_purchase.groupby('customer_id')['purchase'].agg(agg_name)
        customers_feature[f'purchase_{agg_name}_groupby_customer'] = customers_feature['customer_id'].map(agg_sr)
    
    # Per-week purchase count (0 when the customer bought nothing that week)
    # compared against each of the customer's aggregates.
    for w in df['week'].unique()[::-1]:
        tmp = weekly_purchase[weekly_purchase['week']==w]
        tmp = tmp[['customer_id', 'purchase']].set_index('customer_id')['purchase']
        customers_feature[f'purchase_{w}w'] = customers_feature['customer_id'].map(tmp).fillna(0)
        for agg_name in ['max', 'min', 'mean', 'sum']:
            customers_feature[f'purchase_{agg_name}_groupby_customer_ratio_{w}w'] = customers_feature[f'purchase_{w}w'] / customers_feature[f'purchase_{agg_name}_groupby_customer']
            customers_feature[f'purchase_{agg_name}_groupby_customer_diff_{w}w'] = customers_feature[f'purchase_{w}w'] - customers_feature[f'purchase_{agg_name}_groupby_customer']

    # One row per (customer, article, week). Larger week values are older, so
    # the descending dense rank gives 1 to the earliest week a customer bought
    # an article and >= 2 to every later week in which it was bought again.
    unique_transactions = df[['customer_id', 'article_id', 'week']].drop_duplicates()
    unique_transactions['rank'] = unique_transactions.groupby(['customer_id', 'article_id'])['week'].rank(method='dense', ascending=False)

    # Distinct articles bought again / bought at all, and their ratio.
    customers_feature['repurchase_article'] = customers_feature['customer_id'].map(
        unique_transactions.query('rank >= 2').drop_duplicates(subset=['customer_id', 'article_id']).groupby('customer_id')['article_id'].count()).fillna(0)
    customers_feature['purchase_article'] = customers_feature['customer_id'].map(unique_transactions.drop_duplicates(subset=['customer_id', 'article_id']).groupby('customer_id')['article_id'].count())
    customers_feature['repurchase_article_percent'] = customers_feature['repurchase_article'] / customers_feature['purchase_article']

    # Distinct weeks containing a repeat purchase / any purchase, and their ratio.
    customers_feature['repurchase_week'] = customers_feature['customer_id'].map(
        unique_transactions.query('rank >= 2').drop_duplicates(subset=['customer_id', 'week']).groupby('customer_id')['week'].count()).fillna(0)
    customers_feature['purchase_week'] = customers_feature['customer_id'].map(
        unique_transactions.drop_duplicates(subset=['customer_id', 'week']).groupby('customer_id')['week'].count())
    customers_feature['repurchase_week_percent'] = customers_feature['repurchase_week'] / customers_feature['purchase_week']

    # (article, week) combinations that are repeats / all combinations, and their ratio.
    customers_feature['repurchase_article_and_week'] = customers_feature['customer_id'].map(
        unique_transactions.query('rank >= 2').groupby('customer_id')['customer_id'].count()).fillna(0)
    customers_feature['purchase_article_and_week'] = customers_feature['customer_id'].map(
        unique_transactions.groupby('customer_id')['customer_id'].count())
    customers_feature['repurchase_article_and_week_percent'] = customers_feature['repurchase_article_and_week'] / customers_feature['purchase_article_and_week']
        
    return customers_feature

In [11]:
def make_articles_feature(articles: pd.DataFrame, transactions: pd.DataFrame, target_week: int, debug: bool = False) -> pd.DataFrame:
    """Build per-article features from sales made before ``target_week``.

    Args:
        articles: Article table; the free-text / name columns listed below are
            dropped, every other column is carried into the result.
        transactions: Frame with ``customer_id``, ``article_id`` and ``week``
            columns. A copy is taken, so the input is not modified.
        target_week: Week being predicted. ``week`` is shifted by this value
            and only relative weeks >= 1 are used.
        debug: When exactly ``True``, restrict the history to relative weeks
            <= 24, which also bounds the number of per-week columns.

    Returns:
        One row per article with the remaining static columns plus:

        * ``sale_{agg}_groupby_article``: max/min/mean/sum of the weekly sales
          counts, taken over the weeks in which the article was sold.
        * ``sale_{w}w`` and its ratio/diff against each aggregate, for every
          relative week ``w`` present in the data.
        * ``resale_*`` / ``sale_*`` counts and their ratios.

        The per-week columns are created in reversed order of first appearance
        of the weeks in ``transactions``.
    """
    df = transactions.copy()
    articles_feature = articles.drop(
        ['prod_name', 'product_type_name', 'graphical_appearance_name', 'colour_group_name', 'perceived_colour_value_name', 'perceived_colour_master_name', 'index_name', 'index_group_name', 'section_name', 'garment_group_name', 'prod_name', 'department_name', 'detail_desc'], 
        axis=1).copy()
    
    # Prevent leakage: keep only transactions older than the target week.
    df['week'] = df['week'] - target_week
    df = df.query('week >= 1')
    
    if debug == True:
        df = df.query('week <= 24')

    # Number of sales per article and week (weeks without sales are absent).
    weekly_sale = df.groupby(['article_id', 'week'])['week'].count().rename('sale').reset_index()
    
    for agg_name in ['max', 'min', 'mean', 'sum']:
        agg_sr = weekly_sale.groupby('article_id')['sale'].agg(agg_name)
        articles_feature[f'sale_{agg_name}_groupby_article'] = articles_feature['article_id'].map(agg_sr)
    
    # Per-week sales count (0 when the article was not sold that week)
    # compared against each of the article's aggregates.
    for w in df['week'].unique()[::-1]:
        tmp = weekly_sale[weekly_sale['week']==w]
        tmp = tmp[['article_id', 'sale']].set_index('article_id')['sale']
        articles_feature[f'sale_{w}w'] = articles_feature['article_id'].map(tmp).fillna(0)
        for agg_name in ['max', 'min', 'mean', 'sum']:
            articles_feature[f'sale_{agg_name}_groupby_article_ratio_{w}w'] = articles_feature[f'sale_{w}w'] / articles_feature[f'sale_{agg_name}_groupby_article']
            articles_feature[f'sale_{agg_name}_groupby_article_diff_{w}w'] = articles_feature[f'sale_{w}w'] - articles_feature[f'sale_{agg_name}_groupby_article']

    # One row per (article, customer, week). Larger week values are older, so
    # the descending dense rank gives 1 to the earliest week a customer bought
    # the article and >= 2 to every later week in which they bought it again.
    unique_transactions = df[['article_id', 'customer_id', 'week']].drop_duplicates()
    unique_transactions['rank'] = unique_transactions.groupby(['article_id', 'customer_id'])['week'].rank(method='dense', ascending=False)

    # Distinct customers who bought the article again / at all, and their ratio.
    articles_feature['resale_customer'] = articles_feature['article_id'].map(
        unique_transactions.query('rank >= 2').drop_duplicates(subset=['article_id', 'customer_id']).groupby('article_id')['customer_id'].count()).fillna(0)
    articles_feature['sale_customer'] = articles_feature['article_id'].map(unique_transactions.drop_duplicates(subset=['article_id', 'customer_id']).groupby('article_id')['customer_id'].count())
    articles_feature['resale_customer_percent'] = articles_feature['resale_customer'] / articles_feature['sale_customer']

    # Distinct weeks containing a repeat sale / any sale, and their ratio.
    articles_feature['resale_week'] = articles_feature['article_id'].map(
        unique_transactions.query('rank >= 2').drop_duplicates(subset=['article_id', 'week']).groupby('article_id')['week'].count()).fillna(0)
    articles_feature['sale_week'] = articles_feature['article_id'].map(
        unique_transactions.drop_duplicates(subset=['article_id', 'week']).groupby('article_id')['week'].count())
    articles_feature['resale_week_percent'] = articles_feature['resale_week'] / articles_feature['sale_week']

    # (customer, week) combinations that are repeats / all combinations, and their ratio.
    articles_feature['resale_customer_and_week'] = articles_feature['article_id'].map(
        unique_transactions.query('rank >= 2').groupby('article_id')['article_id'].count()).fillna(0)
    articles_feature['sale_customer_and_week'] = articles_feature['article_id'].map(
        unique_transactions.groupby('article_id')['article_id'].count())
    articles_feature['resale_customer_and_week_percent'] = articles_feature['resale_customer_and_week'] / articles_feature['sale_customer_and_week']
    
    return articles_feature

In [12]:
def compress_df(
    df: pd.DataFrame, 
    category_columns: list =['club_member_status', 'fashion_news_frequency', 'product_group_name', 'index_code'], 
    verbose: bool =True):
    """Shrink a frame in place by downcasting its columns, and return it.

    Columns named in ``category_columns`` become ``category``. int16/32/64 and
    float16/32/64 columns are cast to the narrowest type of the same kind
    whose limits strictly contain the column's min and max; floats that fit
    neither float16 nor float32 (including all-NaN columns) end up as float64.
    Every other column is left as is.

    With ``verbose`` a tqdm bar shows the column being processed and the
    memory reduction is printed at the end.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    mem_before = df.memory_usage().sum() / 1024**2
    columns = tqdm(df.columns, leave=False) if verbose else df.columns

    for name in columns:
        dtype = df[name].dtypes

        if name in category_columns:
            if verbose:
                columns.set_description(f"{name}(category)")
            df[name] = df[name].astype('category')
            continue

        if dtype not in numerics:
            continue

        if verbose:
            columns.set_description(f"{name}(num)")
        lowest, highest = df[name].min(), df[name].max()

        if str(dtype)[:3] == 'int':
            # First integer width whose open range contains the data wins;
            # when none does the column keeps its dtype.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if lowest > limits.min and highest < limits.max:
                    df[name] = df[name].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if lowest > limits.min and highest < limits.max:
                    df[name] = df[name].astype(candidate)
                    break
            else:
                df[name] = df[name].astype(np.float64)

    mem_after = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(mem_after, 100 * (mem_before - mem_after) / mem_before))
    return df

In [28]:
def make_data_df(transactions: pd.DataFrame, week: int, is_labeled: bool, use_customers: np.ndarray = None, metric_verbose: bool = True, compress_verbose: bool = False):
    """Assemble the ranking dataset (candidates + features, optionally labels) for one week.

    Candidates from the trending, recently, popular and lastw generators are
    outer-joined on (customer_id, article_id); customer and article features
    built from the module-level ``customers`` and ``articles`` tables are then
    attached.

    Args:
        transactions: Full transaction history with a ``week`` column.
        week: Target week. Must be >= 0 when ``is_labeled`` is set or when
            ``use_customers`` is not given.
        is_labeled: Add a ``label`` column that is 1 for pairs actually bought
            in ``week`` and 0 otherwise.
        use_customers: Customers to generate candidates for; defaults to the
            customers who bought something in ``week``.
        metric_verbose: Print shape, candidates per customer, strategy shares
            and (when labelled) precision / recall of the candidates.
        compress_verbose: Forwarded to ``compress_df`` as ``verbose``.

    Raises:
        ValueError: If labels are requested for a negative week, or if the
            week is negative and ``use_customers`` is missing.
    """
    strategy_flags = ['isin_trending', 'isin_recently', 'isin_popular', 'isin_lastw']
    pair_keys = ['customer_id', 'article_id']
    kwargs = {'how': 'outer', 'on': pair_keys, 'copy': True}

    if is_labeled:
        if week < 0:
            raise ValueError(f"can't label when week={week}.")
        data_actual = transactions.query("week == @week")[pair_keys].drop_duplicates()
        data_actual['label'] = 1

    if use_customers is not None:
        data_customers = use_customers
    elif week >= 0:
        data_customers = transactions.query("week == @week")['customer_id'].unique()
    else:
        raise ValueError('set use_customers as something when week=-1.')

    data_candidates_trending = generate_candidates_trending(transactions, data_customers, week)
    data_candidates_recently = generate_candidates_recently(transactions, data_customers, week)
    data_candidates_popular = generate_candidates_popular(transactions, data_customers, week)
    data_candidates_lastw = generate_candidates_lastw(transactions, data_customers, week)

    # Same nesting as trending ⋈ (recently ⋈ (popular ⋈ lastw)), spelled out step by step.
    popular_lastw = data_candidates_popular.merge(data_candidates_lastw, **kwargs)
    recently_rest = data_candidates_recently.merge(popular_lastw, **kwargs)
    data_df = data_candidates_trending.merge(recently_rest, **kwargs)
    # A pair missing from a strategy gets flag 0 instead of NaN.
    data_df[strategy_flags] = data_df[strategy_flags].fillna(0)

    if is_labeled:
        data_df = data_df.merge(data_actual, how='left', on=pair_keys)
        data_df['label'] = data_df['label'].fillna(0)
    data_df = compress_df(data_df, verbose=compress_verbose)

    data_customers_feature = compress_df(
        make_customers_feature(customers, transactions, target_week=week, debug=True),
        verbose=compress_verbose)
    data_articles_feature = compress_df(
        make_articles_feature(articles, transactions, target_week=week, debug=True),
        verbose=compress_verbose)
    data_df = data_df.merge(data_customers_feature, how='left', on=['customer_id'], copy=True)
    data_df = data_df.merge(data_articles_feature, how='left', on=['article_id'], copy=True)

    if not metric_verbose:
        return data_df

    print(f"[Info] data shape: {data_df.shape}")
    print(f"[Info] candidates per a customer: {len(data_df) / len(data_customers):.1f}")
    display(data_df[strategy_flags].astype(float).mean())
    if is_labeled:
        # Precision: share of candidates that were bought; recall: share of
        # actual purchases covered by the candidates.
        print(f"[Info] Precision: {data_df['label'].sum() / len(data_df):.5f}")
        print(f"[Info] Recall: {data_df['label'].sum() / len(data_actual):.5f}")

    return data_df

In [24]:
# Learning to rank: LightGBM parameters passed to lgb.train below.
params = {
    'objective': 'lambdarank',
    'boosting': 'gbdt',
    'num_iterations': 1000,
    'learning_rate': 0.1,
    'num_leaves': 31,
    'num_threads': os.cpu_count(),
    'min_data_in_leaf': 20,
    'max_depth': -1,
    'bagging_freq': 5,
    'bagging_fraction': 0.75,
    'metric': ['ndcg'],
    'eval_at': [12],  # how many top-ranked items are used when computing nDCG and MAP
    'random_state': 41,
    'verbosity': 0  # 0: warnings, 1: info
}

In [23]:
# train lgb ranker
# One model per fold: train on target week w, early-stop on the following
# (more recent) week w-1.
best_iterations = []
feature_importance_dfs = []
oof_weeks = [4, 3, 2, 1]

for i, w in enumerate(tqdm(oof_weeks)):
    print(f"\ntarget_week(fold): {w}")
    # Only the first fold prints the memory-compression report.
    if i == 0:
        compress_verbose=True
    else:
        compress_verbose=False
    
    tr_df = make_data_df(transactions, w, is_labeled=True, metric_verbose=True, compress_verbose=compress_verbose)
    val_df = make_data_df(transactions, w-1, is_labeled=True, metric_verbose=False, compress_verbose=compress_verbose)
        
    # The feature list is fixed from the first fold's frame, saved to disk and
    # reused for every later fold and for prediction.
    if i == 0:
        exclude_columns = ['target_week', 'customer_id', 'article_id', 'label']
        cols = [c for c in tr_df.columns.tolist() if c not in exclude_columns]
        with open(f'../models/lgb_rank/{EXP}_cols.pkl', 'wb') as f:
            pickle.dump(cols, f)

    # Rows are sorted by customer so that each customer's candidates are
    # contiguous; the group sizes passed to LightGBM are the per-customer row
    # counts in that same order.
    tr_df = tr_df.sort_values('customer_id').reset_index(drop=True)
    train_query = tr_df.groupby('customer_id')['customer_id'].count().to_list()
    dtrain = lgb.Dataset(tr_df[cols], label=tr_df['label'], group=train_query)
    val_df = val_df.sort_values('customer_id').reset_index(drop=True)    
    val_query = val_df.groupby('customer_id')['customer_id'].count().to_list()
    dval = lgb.Dataset(val_df[cols], reference=dtrain, label=val_df['label'], group=val_query)

    # Stop once the validation metric has not improved for 10 rounds; log every 10 rounds.
    model = lgb.train(
        params, dtrain, valid_sets=[dtrain, dval], 
        callbacks=[lgb.early_stopping(10, first_metric_only=True), lgb.log_evaluation(10)])
    with open(f'../models/lgb_rank/{EXP}_model_fold{w}.pkl', 'wb') as f:
        pickle.dump(model, f)
    
    # Kept for the final fit (number of rounds) and the importance summary.
    best_iterations.append(model.best_iteration)
    feature_importance_dfs.append(pd.DataFrame({'feature': model.feature_name(), 'importance(gain)': model.feature_importance('gain'), 'fold': w}))

  0%|          | 0/4 [00:00<?, ?it/s]


target_week(fold): 4


  0%|          | 0/22 [00:00<?, ?it/s]

Mem. usage decreased to 245.65 Mb (67.4% reduction)


  0%|          | 0/235 [00:00<?, ?it/s]

Mem. usage decreased to 614.96 Mb (74.9% reduction)


  0%|          | 0/242 [00:00<?, ?it/s]

Mem. usage decreased to 48.62 Mb (75.0% reduction)
[Info] data shape: (4292962, 497)
[Info] candidates per a customer: 59.6


isin_trending    0.268328
isin_recently    0.138038
isin_popular     0.704751
isin_lastw       0.060702
dtype: float64

[Info] Precision: 0.00375
[Info] Recall: 0.06973


  0%|          | 0/22 [00:00<?, ?it/s]

Mem. usage decreased to 282.39 Mb (67.4% reduction)


  0%|          | 0/235 [00:00<?, ?it/s]

Mem. usage decreased to 614.96 Mb (74.9% reduction)


  0%|          | 0/242 [00:00<?, ?it/s]

Mem. usage decreased to 48.62 Mb (75.0% reduction)
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93564
[LightGBM] [Info] Number of data points in the train set: 4292962, number of used features: 494
Training until validation scores don't improve for 10 rounds
[10]	training's ndcg@12: 0.888205	valid_1's ndcg@12: 0.890693
[20]	training's ndcg@12: 0.890291	valid_1's ndcg@12: 0.891493
[30]	training's ndcg@12: 0.892328	valid_1's ndcg@12: 0.892025
Early stopping, best iteration is:
[28]	training's ndcg@12: 0.891872	valid_1's ndcg@12: 0.892184
Evaluated only: ndcg@12

target_week(fold): 3
[Info] data shape: (4935164, 497)
[Info] candidates per a customer: 61.5


isin_trending    0.257627
isin_recently    0.126501
isin_popular     0.715504
isin_lastw       0.057123
dtype: float64

[Info] Precision: 0.00326
[Info] Recall: 0.06302
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93846
[LightGBM] [Info] Number of data points in the train set: 4935164, number of used features: 494
Training until validation scores don't improve for 10 rounds
[10]	training's ndcg@12: 0.898978	valid_1's ndcg@12: 0.888552
[20]	training's ndcg@12: 0.901736	valid_1's ndcg@12: 0.88925
Early stopping, best iteration is:
[19]	training's ndcg@12: 0.901578	valid_1's ndcg@12: 0.889374
Evaluated only: ndcg@12

target_week(fold): 2
[Info] data shape: (4256778, 497)
[Info] candidates per a customer: 56.1


isin_trending    0.265981
isin_recently    0.138183
isin_popular     0.694670
isin_lastw       0.061846
dtype: float64

[Info] Precision: 0.00386
[Info] Recall: 0.06909
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93637
[LightGBM] [Info] Number of data points in the train set: 4256778, number of used features: 494
Training until validation scores don't improve for 10 rounds
[10]	training's ndcg@12: 0.896805	valid_1's ndcg@12: 0.893054
[20]	training's ndcg@12: 0.898992	valid_1's ndcg@12: 0.893345
Early stopping, best iteration is:
[17]	training's ndcg@12: 0.898463	valid_1's ndcg@12: 0.893662
Evaluated only: ndcg@12

target_week(fold): 1
[Info] data shape: (3465389, 497)
[Info] candidates per a customer: 48.1


isin_trending    0.305411
isin_recently    0.163816
isin_popular     0.644254
isin_lastw       0.072046
dtype: float64

[Info] Precision: 0.00438
[Info] Recall: 0.06655
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93861
[LightGBM] [Info] Number of data points in the train set: 3465389, number of used features: 494
Training until validation scores don't improve for 10 rounds
[10]	training's ndcg@12: 0.898925	valid_1's ndcg@12: 0.872744
[20]	training's ndcg@12: 0.900882	valid_1's ndcg@12: 0.873022
[30]	training's ndcg@12: 0.902383	valid_1's ndcg@12: 0.873488
[40]	training's ndcg@12: 0.90404	valid_1's ndcg@12: 0.873422
Early stopping, best iteration is:
[32]	training's ndcg@12: 0.902724	valid_1's ndcg@12: 0.873584
Evaluated only: ndcg@12


In [27]:
# predict val data
# Score the week-0 candidates with every fold model and average the scores.
val_df = make_data_df(transactions, 0, is_labeled=False, metric_verbose=True, compress_verbose=False)
val_pred = np.zeros(len(val_df))
with tqdm(oof_weeks) as pbar:
    for w in pbar:
        pbar.set_description(f"model's target_week(fold): {w}")
        # Models are the pickles written by the training loop above.
        with open(f"../models/lgb_rank/{EXP}_model_fold{w}.pkl", 'rb') as f:
            model = pickle.load(f)
        val_pred += model.predict(val_df[cols], num_iteration=model.best_iteration)
val_pred = val_pred/len(oof_weeks)
print(np.sort(val_pred))

[Info] data shape: (3297798, 496)
[Info] candidates per a customer: 47.8


isin_trending    0.280896
isin_recently    0.150857
isin_popular     0.669382
isin_lastw       0.070936
dtype: float64

  0%|          | 0/4 [00:00<?, ?it/s]

[-1.96575959 -1.96422425 -1.96343822 ...  1.87612017  1.87854642
  1.89461048]


In [29]:
# val top rank articles
# Keep each customer's 12 best-scored candidates and join them into one
# prediction string per customer.
val_df2 = val_df.copy()
val_df2['predict_score'] = val_pred
val_df2 = val_df2.sort_values('predict_score', ascending=False).drop_duplicates(['customer_id', 'article_id'], keep='first').reset_index(drop=True)
# method='first' breaks score ties by row order, so at most 12 rows per
# customer survive the filter (a dense rank lets tied rows exceed 12).
val_df2['rank'] = val_df2.groupby('customer_id')['predict_score'].rank('first', ascending=False)
val_df2 = val_df2[val_df2['rank'] <= 12].copy()
# Every id gets ' 0' prepended, so summing the strings per customer yields
# ' 0<id> 0<id> ...' in descending score order.
val_df2['article_id'] = ' 0' + val_df2['article_id'].astype(str)
val_pred_sr = val_df2.groupby('customer_id')['article_id'].sum()
display(val_pred_sr.head(3))

customer_id
80      0671607001 0436261001 0448509014 0706016001 0...
86      0621381012 0889036004 0640021012 0880017001 0...
107     0556255001 0399136061 0732842014 0732842021 0...
Name: article_id, dtype: object

In [31]:
# most popular items
# Fallback recommendations: the 12 best sellers of week 1, overall and per age bin.
transactions_last_week = transactions.loc[transactions.week == 1]
top12 = ' 0' + ' 0'.join(transactions_last_week.article_id.value_counts().index.astype('str')[:12])
print("Top 12 popular items:")
print( top12 )

# NOTE(review): this adds an 'age_bin' column to the module-level `customers`
# frame. make_customers_feature keeps every customers column except
# postal_code, so frames built by make_data_df after this cell carry one extra
# column compared with frames built before it.
customers['age_bin'] = pd.cut(customers['age'], bins=[10, 20, 30, 40, 50, 60, 70, 100], labels=False)
transactions_last_week = transactions_last_week.merge(customers[['customer_id', 'age', 'age_bin']], how='left')
popular_items = transactions_last_week.groupby('age_bin')['article_id'].value_counts()
popular_items_dict = {}
# One ' 0<id> 0<id> ...' string per age bin, most sold article first.
for index in popular_items.index.levels[0]:
    popular_items_dict[index] = ' 0'+' 0'.join(popular_items[index][:12].index.astype('str'))
popular_items_sr = pd.Series(popular_items_dict, name='top_12_popular_items', dtype='str')
display(popular_items_sr)

Top 12 popular items:
 0909370001 0865799006 0918522001 0924243001 0448509014 0751471001 0809238001 0918292001 0762846027 0809238005 0673677002 0923758001


0.0     0685814003 0448509014 0918522001 0715624001 0...
1.0     0909370001 0865799006 0924243001 0809238001 0...
2.0     0909370001 0865799006 0918525001 0909371001 0...
3.0     0909370001 0751471001 0673677002 0910601003 0...
4.0     0918522001 0751471001 0751471043 0910601003 0...
5.0     0918522001 0908799002 0896152002 0924243001 0...
6.0     0736870001 0796210001 0908799002 0865799006 0...
Name: top_12_popular_items, dtype: object

In [32]:
# val sub
# Build the validation submission: model predictions first, popular items as filler.
submission = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

# Customers without model predictions get an empty string.
submission['prediction_lgb'] = submission['customer_id'].map(id_to_index_dict).map(val_pred_sr)
submission['prediction_lgb'] = submission['prediction_lgb'].fillna('')

# Popular items of the customer's age bin, or the overall top 12 when the bin is missing.
submission['age_bin'] = submission['customer_id'].map(id_to_index_dict).map(customers.set_index('customer_id')['age_bin'])
submission['prediction_popular'] = submission['age_bin'].map(popular_items_sr)
submission['prediction_popular'] = submission['prediction_popular'].fillna(top12).astype('str')

submission['prediction'] = submission['prediction_lgb'] + submission['prediction_popular']
submission['prediction'] = submission['prediction'].str.strip()
# 131 characters = 12 ids of 10 characters plus 11 separating spaces
# (assumes every id string is 10 characters long).
# NOTE(review): an article present in both parts is not de-duplicated.
submission['prediction'] = submission['prediction'].str[:131]
display(submission.head(3))
submission[['customer_id', 'prediction']].to_csv(f'../submissions/{EXP}_submission_fold1.csv', index=False)

Unnamed: 0,customer_id,prediction,prediction_lgb,age_bin,prediction_popular
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0909370001 0751471001 0673677002 0910601003 07...,,3.0,0909370001 0751471001 0673677002 0910601003 0...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0909370001 0865799006 0924243001 0809238001 04...,,1.0,0909370001 0865799006 0924243001 0809238001 0...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0909370001 0865799006 0924243001 0809238001 04...,,1.0,0909370001 0865799006 0924243001 0809238001 0...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0918522001 0751471001 0751471043 0910601003 09...,,4.0,0918522001 0751471001 0751471043 0910601003 0...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0918522001 0751471001 0751471043 0910601003 09...,,4.0,0918522001 0751471001 0751471043 0910601003 0...


In [33]:
# Drop the validation-stage objects and run the garbage collector before the final fit.
del val_df, val_query, dval, val_pred, model
del transactions_last_week, top12, popular_items, popular_items_dict, popular_items_sr
del val_df2, val_pred_sr, 
del submission
gc.collect()

30887

In [35]:
# train last target_week(=0) data
# Final model: fit on week 0 without a validation set.
tr_df = make_data_df(transactions, week=0, is_labeled=True, metric_verbose=True, compress_verbose=False)

# Sort by customer so the group sizes line up with the row order.
tr_df = tr_df.sort_values('customer_id').reset_index(drop=True)
train_query = tr_df.groupby('customer_id')['customer_id'].count().to_list()
dtrain = lgb.Dataset(tr_df[cols], label=tr_df['label'], group=train_query)

# No early stopping here: train for the mean best iteration of the earlier folds.
params['num_iterations'] = int(np.mean(best_iterations))
model = lgb.train(
    params, dtrain, valid_sets=[dtrain], callbacks=[lgb.log_evaluation(10)])
with open(f"../models/lgb_rank/{EXP}_model_fold0.pkl", 'wb') as f:
    pickle.dump(model, f)
    
feature_importance_dfs.append(pd.DataFrame({'feature': model.feature_name(), 'importance(gain)': model.feature_importance('gain'), 'fold': 0}))

[Info] data shape: (3297798, 498)
[Info] candidates per a customer: 47.8


isin_trending    0.280896
isin_recently    0.150857
isin_popular     0.669382
isin_lastw       0.070936
dtype: float64

[Info] Precision: 0.00497
[Info] Recall: 0.07673
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[10]	training's ndcg@12: 0.883283
[20]	training's ndcg@12: 0.88527


In [36]:
# Stack the per-fold importances and show the 20 features with the highest mean gain.
feature_importance_df = pd.concat(feature_importance_dfs, ignore_index=True, axis=0)
display(feature_importance_df.groupby(['feature'])[['importance(gain)']].mean().sort_values('importance(gain)', ascending=False).head(20))

Unnamed: 0_level_0,importance(gain)
feature,Unnamed: 1_level_1
trending_value,26358.976263
sale_max_groupby_article_ratio_1w,9115.912945
last_dat,8841.514284
purchase_sum_groupby_customer_ratio_1w,4210.994539
purchase_max_groupby_customer_ratio_1w,3634.056604
sale_mean_groupby_article_ratio_1w,2263.607624
sale_1w,2229.343816
count_lastw,2137.828874
purchase_mean_groupby_customer_ratio_1w,1983.360392
age,1898.039464


In [37]:
# Drop the training-stage objects (including `params`) and run the garbage collector before inference.
del tr_df, train_query, dtrain, params, model, best_iterations, feature_importance_dfs
gc.collect()

445

In [45]:
# predict test data
# Number of customers scored per call to process().
BATCH_SIZE = 50_000
submission = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')
# Integer ids (row index of `customers`) of every customer in the sample submission.
test_customers = submission['customer_id'].map(id_to_index_dict).unique()

def process(i):
    if i == (len(test_customers)//BATCH_SIZE):
        test_customers_batch = test_customers[i*BATCH_SIZE : ]
    else:
        test_customers_batch = test_customers[i*BATCH_SIZE : (i+1)*BATCH_SIZE]

    test_df = make_data_df(transactions, week=-1, is_labeled=False, use_customers=test_customers_batch, metric_verbose=False, compress_verbose=False)

    pred = np.zeros(len(test_df))
    all_weeks = oof_weeks + [0]
    for w in all_weeks:
        with open(f"../models/lgb_rank/{EXP}_model_fold{w}.pkl", 'rb') as f:
            model = pickle.load(f)
        pred += model.predict(test_df[cols], num_iteration=model.best_iteration)    
    pred = pred/len(all_weeks)
    
    test_df['predict_score'] = pred
    test_df = test_df.sort_values('predict_score', ascending=False).drop_duplicates(['customer_id', 'article_id'], keep='first').reset_index(drop=True)
    test_df['rank'] = test_df.groupby('customer_id')['predict_score'].rank('min', ascending=False)
    test_df = test_df[test_df['rank'] <= 12]
    
    # test_df['article_id'] = le.inverse_transform(test_df['article_id'])
    test_df['article_id'] = ' 0' + test_df['article_id'].astype(str)
    return test_df.groupby('customer_id')['article_id'].sum()

# single process execution
preds = []
for i in tqdm(range(len(test_customers)//BATCH_SIZE + 1)):
    preds.append(process(i))

# # multi process execution
# # cpus = cpu_count(logical=False)
# cpus = 4
# print('cpu(core): ', cpus)
# preds = Parallel(n_jobs=cpus, verbose=0)( [delayed(process)(i) for i in range(len(test_customers)//BATCH_SIZE + 1)] )
pred_sr = pd.concat(preds, axis=0)
display(pred_sr.head())

  0%|          | 0/28 [00:00<?, ?it/s]

customer_id
0     0568601043 0568601006 0924243001 0866731001 0...
1     0924243001 0866731001 0714790020 0448509014 0...
2     0794321007 0924243001 0866731001 0924243002 0...
3     0924243001 0915529005 0866731001 0714790020 0...
4     0730683050 0896152002 0791587015 0927530004 0...
Name: article_id, dtype: object

In [46]:
del preds
gc.collect()

NameError: name 'test_df' is not defined

In [47]:
# # most popular items
# transactions_last_week = transactions.loc[transactions.week == 0]
# top12 = ' 0' + ' 0'.join(transactions_last_week.article_id.value_counts().index.astype('str')[:12])
# print("Top 12 popular items:")
# print( top12 )

# customers['age_bin'] = pd.cut(customers['age'], bins=[10, 20, 30, 40, 50, 60, 70, 100], labels=False)
# transactions_last_week = transactions_last_week.merge(customers[['customer_id', 'age', 'age_bin']], how='left')
# popular_items = transactions_last_week.groupby('age_bin')['article_id'].value_counts()
# popular_items_dict = {}
# for index in popular_items.index.levels[0]:
#     popular_items_dict[index] = ' 0'+' 0'.join(popular_items[index][:12].index.astype('str'))
# popular_items_sr = pd.Series(popular_items_dict, name='top_12_popular_items', dtype='str')
# popular_items_sr

In [48]:
# test sub
TOP_K = 12  # maximum number of article ids written per customer
submission = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

# Customers without any prediction in pred_sr get an empty string.
submission['prediction_lgb'] = submission['customer_id'].map(id_to_index_dict).map(pred_sr)
submission['prediction_lgb'] = submission['prediction_lgb'].fillna('')

# submission['age_bin'] = submission['customer_id'].map(id_to_index_dict).map(customers.set_index('customer_id')['age_bin'])
# submission['prediction_popular'] = submission['age_bin'].map(popular_items_sr)
# submission['prediction_popular'] = submission['prediction_popular'].fillna(top12).astype('str')

# Keep the first TOP_K ids by token rather than by character count, so the
# cut does not depend on every id having the same width. Splitting on
# whitespace also removes the leading space of each prediction string.
submission['prediction'] = submission['prediction_lgb'].str.split().str[:TOP_K].str.join(' ')
display(submission.head())

# Create the output directory if needed so the final write cannot fail on a
# missing folder after the long inference run.
submission_path = Path(f'../submissions/{EXP}_submission.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission[['customer_id', 'prediction']].to_csv(submission_path, index=False)

Unnamed: 0,customer_id,prediction,prediction_lgb
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0568601006 0924243001 0866731001 09...,0568601043 0568601006 0924243001 0866731001 0...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0924243001 0866731001 0714790020 0448509014 09...,0924243001 0866731001 0714790020 0448509014 0...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0924243001 0866731001 0924243002 07...,0794321007 0924243001 0866731001 0924243002 0...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0924243001 0915529005 0866731001 0714790020 09...,0924243001 0915529005 0866731001 0714790020 0...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0730683050 0896152002 0791587015 0927530004 09...,0730683050 0896152002 0791587015 0927530004 0...
