In [1]:
# ランク学習
# 学習データ5週分(検証データ1週含む)
# 候補作り12週分
# trendingで候補作り
# articlesとcustomersの特徴量をtarget_weekごとに作る
# 候補の良さを測る
EXP = '021'

In [2]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm

from pathlib import Path
import pickle
import gc
from time import time
import warnings

tqdm.pandas()
pd.options.display.max_columns = None
warnings.simplefilter('ignore', pd.errors.PerformanceWarning)
warnings.simplefilter('ignore', UserWarning)
data_path = Path('../input/h-and-m-personalized-fashion-recommendations/')

In [3]:
transactions = pd.read_csv(
    data_path / f'transactions_train.csv',
    # set dtype or pandas will drop the leading '0' and convert to int
    dtype={'article_id': 'int32'},
    parse_dates=['t_dat'])
customers = pd.read_csv(data_path / 'customers.csv')
articles = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/articles.csv', 
    dtype={'article_id': 'int32'})

t_max = transactions['t_dat'].max()
transactions['t_diff'] = (t_max - transactions['t_dat']).dt.days
transactions['week'] = transactions['t_diff'] // 7

customers.loc[~customers['fashion_news_frequency'].isin(['Regularly', 'Monthly']), 'fashion_news_frequency'] = None

id_to_index_dict = dict(zip(customers["customer_id"], customers.index))
index_to_id_dict = dict(zip(customers.index, customers["customer_id"]))
transactions["customer_id"] = transactions["customer_id"].map(id_to_index_dict).astype('int32')
customers['customer_id'] = customers['customer_id'].map(id_to_index_dict).astype('int32')

print(transactions.shape)
display(transactions.tail())

(31788324, 7)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,t_diff,week
31788319,2020-09-22,1371691,929511001,0.059305,2,0,0
31788320,2020-09-22,1371691,891322004,0.042356,2,0,0
31788321,2020-09-22,1371721,918325001,0.043203,1,0,0
31788322,2020-09-22,1371747,833459002,0.006763,1,0,0
31788323,2020-09-22,1371960,898573003,0.033881,2,0,0


In [4]:
def generate_candidates_trending(transactions: pd.DataFrame, customers: np.ndarray, target_week: int):
    df = transactions.query("customer_id in @customers").copy()
    df['week'] = df['week'] - target_week
    df = df.query('week >= 1')
    
    weekly_sales = df.groupby(['week', 'article_id'])['article_id'].count().rename('count').reset_index()
    df = df.merge(weekly_sales, on=['week', 'article_id'], how='left')
    
    weekly_sales = weekly_sales.reset_index().set_index('article_id')
    df = df.join(weekly_sales.loc[weekly_sales['week']==1, ['count']], on='article_id', rsuffix='_targ')
    df['count_targ'].fillna(0, inplace=True)
    df['quotient'] = df['count_targ'] / df['count']
    
    t_max = df['t_dat'].max()
    df['x'] = ((t_max - df['t_dat']) / np.timedelta64(1, 'D')).astype(int)
    df['dummy_1'] = 1
    df['x'] = df[['x', 'dummy_1']].max(axis=1)    
    a, b, c, d = 2.5e4, 1.5e5, 2e-1, 1e3
    df['y'] = a / np.sqrt(df['x']) + b * np.exp(-c*df['x']) - d
    df['dummy_0'] = 0 
    df['y'] = df[["y", "dummy_0"]].max(axis=1)
    df['trending_value'] = df['quotient'] * df['y']
    df = df.groupby(['customer_id', 'article_id']).agg({'trending_value': 'sum'}).reset_index()
    df = df.loc[df['trending_value'] > 0]
    # df['rank'] = df.groupby("customer_id")["value"].rank("dense", ascending=False)
    # df = df.loc[df['rank'] <= 12]
    df = df[['customer_id', 'article_id', 'trending_value']]
    df['isin_trending'] = 1

    return df

In [5]:
def generate_candidates_recently(transactions: pd.DataFrame, customers: np.ndarray, target_week: int):
    df = transactions.query("customer_id in @customers").copy()
    df['week'] = df['week'] - target_week
    df = df.query("week >= 1")
    df = df.query("week <= 12")
    
    for w in df['week'].unique()[::-1]:
        tmp = df.query('week == @w').groupby(['customer_id', 'article_id'])['article_id'].count().rename(f'count_{w}w').reset_index().copy()
        if w == 1:
            purchase_df = tmp
            continue
        purchase_df = purchase_df.merge(tmp, how='outer', on=['customer_id', 'article_id'])
        
    purchase_df['isin_recently'] = 1
    
    return purchase_df

In [6]:
def generate_candidates_popular(transactions: pd.DataFrame, customers: np.ndarray, target_week: int):
    '''
    make_customers_featureが購入数から特徴量を作ってくれるから、この関数は候補のペアだけ返す
    '''
    df = transactions.copy()
    df['week'] = df['week'] - target_week
    df = df.query("week >= 1")
    df = df.query("week <= 4")

    dummy_count_df = df.groupby(['article_id', 'week'])['week'].count().rename('dummy_count').reset_index().copy()
    dummy_count_df['rank_in_week'] = dummy_count_df.groupby('week')['dummy_count'].rank(method='min', ascending=False)
    dummy_articles = dummy_count_df.query('rank_in_week <= 12')['article_id'].unique()

    dummy_df = pd.DataFrame(
        np.concatenate(
            [np.repeat(customers, repeats=len(dummy_articles)).reshape(-1, 1),
            np.repeat(dummy_articles[None, :], repeats=len(customers), axis=0).reshape(-1, 1)],
            axis=1),
        columns = ['customer_id', 'article_id']
    )
    
    dummy_df['isin_popular'] = 1
    
    return dummy_df

In [7]:
def make_customers_feature(customers: pd.DataFrame, transactions: pd.DataFrame, target_week: int, debug: bool = False):
    df = transactions.copy()
    customers_feature = customers.drop(['postal_code'], axis=1).copy()
    customers_feature.loc[~customers_feature['fashion_news_frequency'].isin(['Regularly', 'Monthly']), 'fashion_news_frequency'] = None
    customers_feature[['FN', 'Active']] = customers_feature[['FN', 'Active']].fillna(0)

    # リーク防止
    df['week'] = df['week'] - target_week
    df = df.query('week >= 1')
    
    if debug == True:
        df = df.query('week <= 24')

    weekly_purchase = df.groupby(['customer_id', 'week'])['week'].count().rename('purchase').reset_index()
    
    for agg_name in ['max', 'min', 'mean', 'sum']:
        agg_sr = weekly_purchase.groupby('customer_id')['purchase'].agg(agg_name)
        customers_feature[f'purchase_{agg_name}_groupby_customer'] = customers_feature['customer_id'].map(agg_sr)
    
    for w in df['week'].unique()[::-1]:
        tmp = weekly_purchase[weekly_purchase['week']==w]
        tmp = tmp[['customer_id', 'purchase']].set_index('customer_id')['purchase']
        customers_feature[f'purchase_{w}w'] = customers_feature['customer_id'].map(tmp).fillna(0)
        for agg_name in ['max', 'min', 'mean', 'sum']:
            customers_feature[f'purchase_{agg_name}_groupby_customer_ratio_{w}w'] = customers_feature[f'purchase_{w}w'] / customers_feature[f'purchase_{agg_name}_groupby_customer']
            customers_feature[f'purchase_{agg_name}_groupby_customer_diff_{w}w'] = customers_feature[f'purchase_{w}w'] - customers_feature[f'purchase_{agg_name}_groupby_customer']

    unique_transactions = df[['customer_id', 'article_id', 'week']].drop_duplicates()
    unique_transactions['rank'] = unique_transactions.groupby(['customer_id', 'article_id'])['week'].rank(method='dense', ascending=False)

    customers_feature['repurchase_article'] = customers_feature['customer_id'].map(
        unique_transactions.query('rank >= 2').drop_duplicates(subset=['customer_id', 'article_id']).groupby('customer_id')['article_id'].count()).fillna(0)
    customers_feature['purchase_article'] = customers_feature['customer_id'].map(unique_transactions.drop_duplicates(subset=['customer_id', 'article_id']).groupby('customer_id')['article_id'].count())
    customers_feature['repurchase_article_percent'] = customers_feature['repurchase_article'] / customers_feature['purchase_article']

    customers_feature['repurchase_week'] = customers_feature['customer_id'].map(
        unique_transactions.query('rank >= 2').drop_duplicates(subset=['customer_id', 'week']).groupby('customer_id')['week'].count()).fillna(0)
    customers_feature['purchase_week'] = customers_feature['customer_id'].map(
        unique_transactions.drop_duplicates(subset=['customer_id', 'week']).groupby('customer_id')['week'].count())
    customers_feature['repurchase_week_percent'] = customers_feature['repurchase_week'] / customers_feature['purchase_week']

    customers_feature['repurchase_article_and_week'] = customers_feature['customer_id'].map(
        unique_transactions.query('rank >= 2').groupby('customer_id')['customer_id'].count()).fillna(0)
    customers_feature['purchase_article_and_week'] = customers_feature['customer_id'].map(
        unique_transactions.groupby('customer_id')['customer_id'].count())
    customers_feature['repurchase_article_and_week_percent'] = customers_feature['repurchase_article_and_week'] / customers_feature['purchase_article_and_week']
        
    return customers_feature

In [8]:
def make_articles_feature(articles: pd.DataFrame, transactions: pd.DataFrame, target_week: int, debug: bool = False):
    df = transactions.copy()
    articles_feature = articles.drop(
        ['prod_name', 'product_type_name', 'graphical_appearance_name', 'colour_group_name', 'perceived_colour_value_name', 'perceived_colour_master_name', 'index_name', 'index_group_name', 'section_name', 'garment_group_name', 'prod_name', 'department_name', 'detail_desc'], 
        axis=1).copy()
    
    # リーク防止
    df['week'] = df['week'] - target_week
    df = df.query('week >= 1')
    
    if debug == True:
        df = df.query('week <= 24')

    weekly_sale = df.groupby(['article_id', 'week'])['week'].count().rename('sale').reset_index()
    
    for agg_name in ['max', 'min', 'mean', 'sum']:
        agg_sr = weekly_sale.groupby('article_id')['sale'].agg(agg_name)
        articles_feature[f'sale_{agg_name}_groupby_article'] = articles_feature['article_id'].map(agg_sr)
    
    for w in df['week'].unique()[::-1]:
        tmp = weekly_sale[weekly_sale['week']==w]
        tmp = tmp[['article_id', 'sale']].set_index('article_id')['sale']
        articles_feature[f'sale_{w}w'] = articles_feature['article_id'].map(tmp).fillna(0)
        for agg_name in ['max', 'min', 'mean', 'sum']:
            articles_feature[f'sale_{agg_name}_groupby_article_ratio_{w}w'] = articles_feature[f'sale_{w}w'] / articles_feature[f'sale_{agg_name}_groupby_article']
            articles_feature[f'sale_{agg_name}_groupby_article_diff_{w}w'] = articles_feature[f'sale_{w}w'] - articles_feature[f'sale_{agg_name}_groupby_article']

    unique_transactions = df[['article_id', 'customer_id', 'week']].drop_duplicates()
    unique_transactions['rank'] = unique_transactions.groupby(['article_id', 'customer_id'])['week'].rank(method='dense', ascending=False)

    articles_feature['resale_customer'] = articles_feature['article_id'].map(
        unique_transactions.query('rank >= 2').drop_duplicates(subset=['article_id', 'customer_id']).groupby('article_id')['customer_id'].count()).fillna(0)
    articles_feature['sale_customer'] = articles_feature['article_id'].map(unique_transactions.drop_duplicates(subset=['article_id', 'customer_id']).groupby('article_id')['customer_id'].count())
    articles_feature['resale_customer_percent'] = articles_feature['resale_customer'] / articles_feature['sale_customer']

    articles_feature['resale_week'] = articles_feature['article_id'].map(
        unique_transactions.query('rank >= 2').drop_duplicates(subset=['article_id', 'week']).groupby('article_id')['week'].count()).fillna(0)
    articles_feature['sale_week'] = articles_feature['article_id'].map(
        unique_transactions.drop_duplicates(subset=['article_id', 'week']).groupby('article_id')['week'].count())
    articles_feature['resale_week_percent'] = articles_feature['resale_week'] / articles_feature['sale_week']

    articles_feature['resale_customer_and_week'] = articles_feature['article_id'].map(
        unique_transactions.query('rank >= 2').groupby('article_id')['article_id'].count()).fillna(0)
    articles_feature['sale_customer_and_week'] = articles_feature['article_id'].map(
        unique_transactions.groupby('article_id')['article_id'].count())
    articles_feature['resale_customer_and_week_percent'] = articles_feature['resale_customer_and_week'] / articles_feature['sale_customer_and_week']
    
    return articles_feature

In [9]:
def compress_df(
    df: pd.DataFrame, 
    category_columns: list =['club_member_status', 'fashion_news_frequency', 'product_group_name', 'index_code'], 
    verbose: bool =True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        bar = tqdm(df.columns, leave=False)
    else:
        bar = df.columns
    for col in bar:
        col_type = df[col].dtypes
        if col in category_columns:
            if verbose:
                bar.set_description(f"{col}(category)")
            df[col] = df[col].astype('category')
        elif col_type in numerics:
            if verbose:
                bar.set_description(f"{col}(num)")
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [10]:
# ランク学習
params = {
    'objective': 'lambdarank',
    'boosting': 'gbdt',
    'num_iterations': 1000,
    'learning_rate': 0.1,
    'num_leaves': 31,
    'num_threads': 4,  # for M1 Mac
    'min_data_in_leaf': 20,
    'max_depth': -1,
    'bagging_freq': 5,
    'bagging_fraction': 0.75,
    'metric': ['ndcg'],
    'eval_at': [12],  # 上位何件のランキングをnDCGとMAPの算出に用いるか
    'random_state': 41,
    'verbosity': 1  # 0: warnings, 1: info
}

In [11]:
oof_weeks = [4, 3, 2, 1]
strategy_flags = ['isin_trending', 'isin_recently', 'isin_popular']
kwargs = {'how': 'outer', 'on': ['customer_id', 'article_id'], 'copy': True}

In [None]:
# train lgb ranker
tr_dfs = []
feature_importance_dfs = []
for i, w in enumerate(tqdm(oof_weeks)):
    print(f"\ntarget_week(fold): {w}")
    if i == 0:
        verbose=True
    else:
        verbose=False
    
    tr_actual = transactions.query("week == @w")[['customer_id', 'article_id']].drop_duplicates()
    tr_actual['label'] = 1
    tr_customers = transactions.query("week == @w")['customer_id'].unique()
    tr_candidates_trending = generate_candidates_trending(transactions, tr_customers, w)
    tr_candidates_recently = generate_candidates_recently(transactions, tr_customers, w)
    tr_candidates_popular = generate_candidates_popular(transactions, tr_customers, w)    
    tr_df = tr_candidates_trending.merge(tr_candidates_recently.merge(tr_candidates_popular, **kwargs), **kwargs)
    tr_df[strategy_flags] = tr_df[strategy_flags].fillna(0)
    tr_df = tr_df.merge(tr_actual, how='left', on=['customer_id', 'article_id'])
    tr_df['label'] = tr_df['label'].fillna(0)
    tr_df['target_week'] = w
    tr_df = compress_df(tr_df, verbose=verbose)
    tr_customers_feature = compress_df(make_customers_feature(customers, transactions, target_week=w, debug=True), verbose=verbose)
    tr_articles_feature = compress_df(make_articles_feature(articles, transactions, target_week=w, debug=True), verbose=verbose)
    tr_df = tr_df.merge(tr_customers_feature, how='left', on=['customer_id'], copy=False)
    tr_df = tr_df.merge(tr_articles_feature, how='left', on=['article_id'], copy=False)
    tr_dfs.append(tr_df)

tr_df = pd.concat(tr_dfs, axis=0)
tr_df = compress_df(tr_df, verbose=True)
print(f"[Info] candidates per a customer: {tr_df.groupby('target_week', 'customer_id')['customer_id'].count().mean():.1f}")
display(tr_df[strategy_flags].astype(float).mean())
print(f"[Info] Precision: {tr_df['label'].astype(float).mean():.5f}")
print(f"[Info] Recall(fold1): {tr_df.query('target_week == 1')['label'].sum() / len(tr_actual):.5f}")

val_actual = transactions.query("week == 0")[['customer_id', 'article_id']].drop_duplicates()
val_actual['label'] = 1
val_customers = transactions.query("week == 0")['customer_id'].unique()
val_candidates_trending = generate_candidates_trending(transactions, val_customers, 0)
val_candidates_recently = generate_candidates_recently(transactions, val_customers, 0)
val_candidates_popular = generate_candidates_popular(transactions, val_customers, 0)
val_df = val_candidates_trending.merge(val_candidates_recently.merge(val_candidates_popular, **kwargs), **kwargs)
val_df[strategy_flags] = val_df[strategy_flags].fillna(0)
val_df = val_df.merge(val_actual, how='left', on=['customer_id', 'article_id'])
val_df['label'] = val_df['label'].fillna(0)
val_df = compress_df(val_df, verbose=False)
val_customers_feature = compress_df(make_customers_feature(customers, transactions, target_week=0, debug=True), verbose=False)
val_articles_feature = compress_df(make_articles_feature(articles, transactions, target_week=0, debug=True), verbose=False)
val_df = val_df.merge(val_customers_feature, how='left', on=['customer_id'], copy=False)
val_df = val_df.merge(val_articles_feature, how='left', on=['article_id'], copy=False)

exclude_columns = ['target_week', 'customer_id', 'article_id', 'label']
cols = [c for c in tr_df.columns.tolist() if c not in exclude_columns]
with open(f'../models/lgb_rank/{EXP}_cols.pkl', 'wb') as f:
    pickle.dump(cols, f)

tr_df = tr_df.sort_values(['target_week', 'customer_id']).reset_index(drop=True)
train_query = tr_df.groupby(['target_week', 'customer_id'])['customer_id'].count().to_list()
dtrain = lgb.Dataset(tr_df[cols], label=tr_df['label'], group=train_query)
val_df = val_df.sort_values('customer_id').reset_index(drop=True)    
val_query = val_df.groupby('customer_id')['customer_id'].count().to_list()
dval = lgb.Dataset(val_df[cols], reference=dtrain, label=val_df['label'], group=val_query)

model = lgb.train(
    params, dtrain, valid_sets=[dtrain, dval], 
    callbacks=[lgb.early_stopping(10, first_metric_only=True), lgb.log_evaluation(10)])
with open(f'../models/lgb_rank/{EXP}_model_fold{w}.pkl', 'wb') as f:
    pickle.dump(model, f)

feature_importance_dfs.append(pd.DataFrame({'feature': model.feature_name(), 'importance(gain)': model.feature_importance('gain'), 'fold': w}))

  0%|          | 0/4 [00:00<?, ?it/s]


target_week(fold): 4


  0%|          | 0/20 [00:00<?, ?it/s]

Mem. usage decreased to 215.71 Mb (68.5% reduction)


  0%|          | 0/235 [00:00<?, ?it/s]

Mem. usage decreased to 614.96 Mb (74.9% reduction)


  0%|          | 0/242 [00:00<?, ?it/s]

Mem. usage decreased to 48.62 Mb (75.0% reduction)

target_week(fold): 3

target_week(fold): 2

target_week(fold): 1


  0%|          | 0/495 [00:00<?, ?it/s]

In [99]:
val_pred = np.zeros(len(val_df))
with tqdm(oof_weeks) as pbar:
    for w in pbar:
        pbar.set_description(f"model's target_week(fold): {w}")
        with open(f"../models/lgb_rank/{EXP}_model_fold{w}.pkl", 'rb') as f:
            model = pickle.load(f)
        val_pred += model.predict(val_df[cols], num_iteration=model.best_iteration)
val_pred = val_pred/len(oof_weeks)
print(np.sort(val_pred))

  0%|          | 0/1 [00:00<?, ?it/s]

[-0.72599849 -0.72573982 -0.72318887 ...  0.62941608  0.62941608
  0.63092939]


In [100]:
# most popular items
transactions_last_week = transactions.loc[transactions.week == 1]
top12 = ' 0' + ' 0'.join(transactions_last_week.article_id.value_counts().index.astype('str')[:12])
print("Top 12 popular items:")
print( top12 )

customers['age_bin'] = pd.cut(customers['age'], bins=[10, 20, 30, 40, 50, 60, 70, 100], labels=False)
transactions_last_week = transactions_last_week.merge(customers[['customer_id', 'age', 'age_bin']], how='left')
popular_items = transactions_last_week.groupby('age_bin')['article_id'].value_counts()
popular_items_dict = {}
for index in popular_items.index.levels[0]:
    popular_items_dict[index] = ' 0'+' 0'.join(popular_items[index][:12].index.astype('str'))
popular_items_sr = pd.Series(popular_items_dict, name='top_12_popular_items', dtype='str')
display(popular_items_sr)

Top 12 popular items:
 0909370001 0865799006 0918522001 0924243001 0448509014 0751471001 0809238001 0918292001 0762846027 0809238005 0673677002 0923758001


0.0     0685814003 0448509014 0918522001 0715624001 0...
1.0     0909370001 0865799006 0924243001 0809238001 0...
2.0     0909370001 0865799006 0918525001 0909371001 0...
3.0     0909370001 0751471001 0673677002 0910601003 0...
4.0     0918522001 0751471001 0751471043 0910601003 0...
5.0     0918522001 0908799002 0896152002 0924243001 0...
6.0     0736870001 0796210001 0908799002 0865799006 0...
Name: top_12_popular_items, dtype: object

In [101]:
# predict val data
val_df2 = val_df.copy()
val_df2['predict_score'] = val_pred
val_df2 = val_df2.sort_values('predict_score', ascending=False).drop_duplicates(['customer_id', 'article_id'], keep='first').reset_index(drop=True)
val_df2['rank'] = val_df2.groupby('customer_id')['predict_score'].rank('dense', ascending=False)
val_df2 = val_df2[val_df2['rank'] <= 12]
# val_df2['article_id'] = le.inverse_transform(val_df2['article_id'])
val_df2['article_id'] = ' 0' + val_df2['article_id'].astype(str)
val_pred_sr = val_df2.groupby('customer_id')['article_id'].sum()
display(val_pred_sr.head())

customer_id
80      0671607001 0436261001 0918292001 0448509014 0...
86      0913367001 0905914002 0904026001 0920012001 0...
107     0706016001 0372860068 0706016002 0762846031 0...
117     0448509014 0918292001 0866731001 0915529003 0...
179     0806388001 0776179001 0448509014 0806388002 0...
Name: article_id, dtype: object

In [102]:
# val sub
submission = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

submission['prediction_lgb'] = submission['customer_id'].map(id_to_index_dict).map(val_pred_sr)
submission['prediction_lgb'] = submission['prediction_lgb'].fillna('')

submission['age_bin'] = submission['customer_id'].map(id_to_index_dict).map(customers.set_index('customer_id')['age_bin'])
submission['prediction_popular'] = submission['age_bin'].map(popular_items_sr)
submission['prediction_popular'] = submission['prediction_popular'].fillna(top12).astype('str')

submission['prediction'] = submission['prediction_lgb'] + submission['prediction_popular']
submission['prediction'] = submission['prediction'].str.strip()
submission['prediction'] = submission['prediction'].str[:131]
display(submission.head())
submission[['customer_id', 'prediction']].to_csv(f'../submissions/{EXP}_submission_fold1.csv', index=False)

Unnamed: 0,customer_id,prediction,prediction_lgb,age_bin,prediction_popular
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0909370001 0751471001 0673677002 0910601003 07...,,3.0,0909370001 0751471001 0673677002 0910601003 0...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0909370001 0865799006 0924243001 0809238001 04...,,1.0,0909370001 0865799006 0924243001 0809238001 0...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0909370001 0865799006 0924243001 0809238001 04...,,1.0,0909370001 0865799006 0924243001 0809238001 0...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0918522001 0751471001 0751471043 0910601003 09...,,4.0,0918522001 0751471001 0751471043 0910601003 0...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0918522001 0751471001 0751471043 0910601003 09...,,4.0,0918522001 0751471001 0751471043 0910601003 0...


In [None]:
del val_actual, val_customers, val_candidates_trending, val_candidates_recently, val_candidates_popular, val_df, val_customers_feature, val_articles_feature
del val_query, dval, val_pred
del transactions_last_week, top12, popular_items, popular_items_dict, popular_items_sr
del val_df2, val_pred_sr, submission
gc.collect()

In [38]:
# train last target_week data
tr_actual = transactions.query("week == 0")[['customer_id', 'article_id']].drop_duplicates()
tr_actual['label'] = 1
tr_customers = transactions.query("week == 0")['customer_id'].unique()
tr_candidates_trending = generate_candidates_trending(transactions, tr_customers, 0)
tr_candidates_recently = generate_candidates_recently(transactions, tr_customers, 0)
tr_candidates_popular = generate_candidates_popular(transactions, tr_customers, 0)
tr_df = tr_candidates_trending.merge(tr_candidates_recently.merge(tr_candidates_popular, **kwargs), **kwargs)
tr_df[strategy_flags] = tr_df[strategy_flags].fillna(0)
tr_df = tr_df.merge(tr_actual, how='left', on=['customer_id', 'article_id'])
tr_df['label'] = tr_df['label'].fillna(0)
tr_df = compress_df(tr_df, verbose=False)
tr_cusotmers_feature = compress_df(make_customers_feature(customers, transactions, target_week=0, debug=True), verbose=False)
tr_articles_feature = compress_df(make_articles_feature(articles, transactions, target_week=0, debug=True), verbose=False)
tr_df = tr_df.merge(tr_customers_feature, how='left', on=['customer_id'], copy=False)
tr_df = tr_df.merge(tr_articles_feature, how='left', on=['article_id'], copy=False)
tr_df = compress_df(tr_df, verbose=False)
print(f"[Info] candidates per a customer: {len(tr_df) / len(tr_customers):.1f}")
print(f"[Info] Precision: {tr_df['label'].sum() / len(tr_df):.5f}")
print(f"[Info] Recall: {tr_df['label'].sum() / len(tr_actual):.5f}")

tr_df = tr_df.sort_values('customer_id').reset_index(drop=True)
train_query = tr_df.groupby('customer_id')['customer_id'].count().to_list()
dtrain = lgb.Dataset(tr_df[cols], label=tr_df['label'], group=train_query)

params['num_iterations'] = model.best_iteration
model = lgb.train(
    params, dtrain, valid_sets=[dtrain], callbacks=[lgb.log_evaluation(10)])
with open(f"../models/lgb_rank/{EXP}_model_fold0.pkl", 'wb') as f:
    pickle.dump(model, f)
    
feature_importance_dfs.append(pd.DataFrame({'feature': model.feature_name(), 'importance(gain)': model.feature_importance('gain'), 'fold': last_week}))

[Info] candidates per a customer: 38.5
[Info] Precision: 0.003
[Info] Recall: 0.042


NameError: name 'tr_purchase_df' is not defined

In [26]:
feature_importance_df = pd.concat(feature_importance_dfs, ignore_index=True, axis=0)
display(feature_importance_df.groupby(['feature'])[['importance(gain)']].mean().sort_values('importance(gain)', ascending=False).head(20))

In [26]:
del tr_actual, tr_customers, tr_candidates_trending, tr_candidates_recently, tr_candidates_popular, tr_df, tr_customers_feature, tr_articles_feature
del train_query, dtrain, params, model
del feature_importance_dfs
gc.collect()

In [None]:
# predict test data
BATCH_SIZE = 1000
submission = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')
test_customers = submission['customer_id'].map(id_to_index_dict).unique()
preds = []

for i in tqdm(range(len(test_customers)//BATCH_SIZE + 1), desc="Mini Batch"):
    if i == (len(test_customers)//BATCH_SIZE):
        test_customers_batch = test_customers[i*BATCH_SIZE : ]
    else:
        test_customers_batch = test_customers[i*BATCH_SIZE : (i+1)*BATCH_SIZE]

    test_candidates_trending = generate_candidates_trending(transactions, test_customers_batch, -1)
    test_candidates_recently = generate_candidates_recently(transactions, test_customers_batch, -1)
    test_candidates_popular = generate_candidates_popular(transactions, test_customers_batch, -1)    
    test_df = compress_df(test_candidates_trending.merge(test_candidates_recently.merge(test_candidates_popular, **kwargs), **kwargs), verbose=False)
    test_df[strategy_flags] = test_df[strategy_flags].fillna(0)    
    test_customers_feature = compress_df(make_customers_feature(customers, transactions, target_week=-1 debug=True), verbose=False)
    test_articles_feature = compress_df(make_articles_feature(articles, transactions, target_week=-1, debug=True), verbose=False)
    test_df = test_df.merge(test_customers_feature, how='left', on=['customer_id'], copy=False)
    test_df = test_df.merge(test_articles_feature, how='left', on=['article_id'], copy=False)
    test_df = compress_df(test_df, verbose=False)
    
    pred = np.zeros(len(test_df))
    all_weeks = oof_weeks + [0]
    for w in all_weeks:
        with open(f"../models/lgb_rank/{EXP}_model_fold{w}.pkl", 'rb') as f:
            model = pickle.load(f)
        pred += model.predict(test_df[cols], num_iteration=model.best_iteration)    
    pred = pred/len(all_weeks)
    
    test_df['predict_score'] = pred
    test_df = test_df.sort_values('predict_score', ascending=False).drop_duplicates(['customer_id', 'article_id'], keep='first').reset_index(drop=True)
    test_df['rank'] = test_df.groupby('customer_id')['predict_score'].rank('min', ascending=False)
    test_df = test_df[test_df['rank'] <= 12]
    
    # test_df['article_id'] = le.inverse_transform(test_df['article_id'])
    test_df['article_id'] = ' 0' + test_df['article_id'].astype(str)
    preds.append(test_df.groupby('customer_id')['article_id'].sum())
    
pred_sr = pd.concat(preds, axis=0)
display(pred_sr.head())

In [None]:
del test_candidates_trending, test_candidates_recently, test_candidates_popular, test_df, test_customers_feature, test_articles_feature, pred, preds
gc.collect()

Mini Batch:   0%|          | 0/1372 [00:00<?, ?it/s]

In [None]:
# most popular items
transactions_last_week = transactions.loc[transactions.week == 0]
top12 = ' 0' + ' 0'.join(transactions_last_week.article_id.value_counts().index.astype('str')[:12])
print("Top 12 popular items:")
print( top12 )

customers['age_bin'] = pd.cut(customers['age'], bins=[10, 20, 30, 40, 50, 60, 70, 100], labels=False)
transactions_last_week = transactions_last_week.merge(customers[['customer_id', 'age', 'age_bin']], how='left')
popular_items = transactions_last_week.groupby('age_bin')['article_id'].value_counts()
popular_items_dict = {}
for index in popular_items.index.levels[0]:
    popular_items_dict[index] = ' 0'+' 0'.join(popular_items[index][:12].index.astype('str'))
popular_items_sr = pd.Series(popular_items_dict, name='top_12_popular_items', dtype='str')
popular_items_sr

In [None]:
# test sub
submission = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

submission['prediction_lgb'] = submission['customer_id'].map(id_to_index_dict).map(pred_sr)
submission['prediction_lgb'] = submission['prediction_lgb'].fillna('')

submission['age_bin'] = submission['customer_id'].map(id_to_index_dict).map(customers.set_index('customer_id')['age_bin'])
submission['prediction_popular'] = submission['age_bin'].map(popular_items_sr)
submission['prediction_popular'] = submission['prediction_popular'].fillna(top12).astype('str')

submission['prediction'] = submission['prediction_lgb'] + submission['prediction_popular']
submission['prediction'] = submission['prediction'].str.strip()
submission['prediction'] = submission['prediction'].str[:131]
submission = submission[['customer_id', 'prediction']]
display(submission.head())
submission.to_csv(f'../submissions/{EXP}_submission.csv', index=False)