In [1]:
# **colab008に移した**
# ランク学習
# tiobfをベースにbucketsを作る
# New: 特徴量エンジニアリング
# MAP@12 (all): 0.027606
# MAP@12 (cold start): 0.008750

EXP = '018'

In [2]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

from pathlib import Path
import pickle
import gc
from time import time
import warnings

pd.options.display.max_columns = None
warnings.simplefilter('ignore', pd.errors.PerformanceWarning)
data_path = Path('../input/h-and-m-personalized-fashion-recommendations/')

In [3]:
transactions = pd.read_csv(
    data_path / f'transactions_train.csv',
    # set dtype or pandas will drop the leading '0' and convert to int
    dtype={'article_id': 'int32'},
    parse_dates=['t_dat'])
customers = pd.read_csv(data_path / 'customers.csv')
articles = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/articles.csv', 
    dtype={'article_id': 'int32'})

t_max = transactions['t_dat'].max()
transactions['t_diff'] = (t_max - transactions['t_dat']).dt.days
transactions['week'] = transactions['t_diff'] // 7

customers.loc[~customers['fashion_news_frequency'].isin(['Regularly', 'Monthly']), 'fashion_news_frequency'] = None

id_to_index_dict = dict(zip(customers["customer_id"], customers.index))
index_to_id_dict = dict(zip(customers.index, customers["customer_id"]))
transactions["customer_id"] = transactions["customer_id"].map(id_to_index_dict).astype('int32')
customers['customer_id'] = customers['customer_id'].map(id_to_index_dict).astype('int32')

print(transactions.shape)
transactions.tail()

(31788324, 7)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,t_diff,week
31788319,2020-09-22,1371691,929511001,0.059305,2,0,0
31788320,2020-09-22,1371691,891322004,0.042356,2,0,0
31788321,2020-09-22,1371721,918325001,0.043203,1,0,0
31788322,2020-09-22,1371747,833459002,0.006763,1,0,0
31788323,2020-09-22,1371960,898573003,0.033881,2,0,0


In [4]:
def make_purchase_df(transactions: pd.DataFrame, phase: str, debug: bool = False):
    df = transactions.copy()

    if phase == 'train':
        labels = transactions.query("week == 1")[['customer_id', 'article_id']].drop_duplicates().copy()
        labels['is_purchased'] = 1
        df = df.query('week >= 2')
        # target_weekにラベル1（購入）が一つもないユーザは除く
        df = df.query("customer_id in @labels['customer_id'].unique()")
        df = df.copy()
        df['week'] = df['week'] - 2
    elif phase == 'val':
        labels = transactions.query("week == 0")[['customer_id', 'article_id']].drop_duplicates().copy()
        labels['is_purchased'] = 1
        df = df.query('week >= 1')
        # target_weekにラベル1（購入）が一つもないユーザは除く
        df = df.query("customer_id in @labels['customer_id'].unique()")
        df = df.copy()
        df['week'] = df['week'] - 1
    elif phase == 'test':
        pass
    else:
        raise ValueError("phase must be 'train', 'val', or 'test'")

    if debug == True:
        # デバッグ時は4週間分だけ使う
        df = df.query('week < 4')
    
    if phase == 'train' or phase == 'val':
        use_customers = np.intersect1d(df['customer_id'].unique(), labels['customer_id'].unique())
    elif phase == 'test':
        use_customers = df['customer_id'].unique()
    else:
        raise ValueError("phase must be 'train', 'val', or 'test'")

    dummy_count_df = df.groupby(['article_id', 'week'])['week'].count().rename('dummy_count').reset_index().copy()
    dummy_count_df['rank_in_week'] = dummy_count_df.groupby('week')['dummy_count'].rank(method='min', ascending=False)
    dummy_articles = dummy_count_df.query('rank_in_week <= 12')['article_id'].unique()
    dummy_count_df = dummy_count_df[dummy_count_df['article_id'].isin(dummy_articles)]
    
    for w in df['week'].unique()[::-1]:
        tmp = df.query('week == @w').groupby(['customer_id', 'article_id'])['article_id'].count().rename(f'count_{w}w').reset_index().copy()
        tmp_dummy = dummy_count_df.query('week == @w')[['article_id', 'dummy_count']].rename(columns={'dummy_count': f'count_{w}w'})
        if w == 0:
            purchase_df = tmp
            dummy_df = tmp_dummy
            continue
        purchase_df = purchase_df.merge(tmp, how='outer', on=['customer_id', 'article_id'])
        dummy_df = dummy_df.merge(tmp_dummy, how='outer', on=['article_id'])

    del df, dummy_count_df, dummy_articles, tmp, tmp_dummy
    gc.collect()

    dummy_df = pd.DataFrame(
        np.concatenate(
            [np.repeat(use_customers, repeats=len(dummy_df)).reshape(-1, 1),
            np.repeat(np.expand_dims(dummy_df.copy().to_numpy(), axis=0), axis=0, repeats=len(use_customers)).reshape(-1, dummy_df.shape[1])],
            axis=-1),
        columns = ['customer_id'] + list(dummy_df.columns),
    )
    dummy_df = dummy_df.astype({'customer_id': 'int32', 'article_id': 'int32'})

    purchase_df['is_dummy'] = 0
    dummy_df['is_dummy'] = 1

    # print(purchase_df.shape)
    # print(f"{purchase_df.__sizeof__() // 1_000_000} MB")
    # display(purchase_df.head())
    # print(dummy_df.shape)
    # print(f"{dummy_df.__sizeof__() // 1_000_000} MB")
    # display(dummy_df.head())

    purchase_df = pd.concat([purchase_df, dummy_df], axis=0)
    purchase_df = purchase_df.sort_values('customer_id').reset_index(drop=True)

    if phase == 'train' or phase == 'val':
        purchase_df = purchase_df.merge(labels, how='left', on=['customer_id', 'article_id'])
        purchase_df['is_purchased'] = purchase_df['is_purchased'].fillna(0)
        
        return purchase_df
    else:
        return purchase_df

In [None]:
train_purchase_df = make_purchase_df(transactions, phase='train', debug=True)
val_purchase_df = make_purchase_df(transactions, phase='val', debug=True)
print('saving...')
train_purchase_df.to_csv(f'../input/ranking_features/train_purchase_df.csv', index=False)
val_purchase_df.to_csv(f'../input/ranking_features/val_purchase_df.csv', index=False)

In [None]:
train_purchase_df = pd.read_csv(
    f'../input/ranking_features/train_purchase_df.csv', 
    dtype={'customer_id': 'int32', 'article_id': 'int32'})
print('全て欠損値の行（バグ）：', train_purchase_df.drop(['customer_id', 'article_id', 'is_purchased'], axis=1).isna().all(axis=1).sum())
print(train_purchase_df.shape)
print(f"{train_purchase_df.__sizeof__() // 1_000_000} MB")
train_purchase_df.head()

In [None]:
val_purchase_df = pd.read_csv(
    f'../input/ranking_features/val_purchase_df.csv', 
    dtype={'customer_id': 'int32', 'article_id': 'int32'})
print('全て欠損値の行（バグ）：', val_purchase_df.drop(['customer_id', 'article_id', 'is_purchased'], axis=1).isna().all(axis=1).sum())
print(val_purchase_df.shape)
print(f"{val_purchase_df.__sizeof__() // 1_000_000} MB")
val_purchase_df.head()

In [5]:
def make_customers_feature(customers: pd.DataFrame, transactions: pd.DataFrame, phase: str, debug: bool = False):
    df = transactions.copy()
    customers_feature = customers.drop(['postal_code'], axis=1).copy()
    customers_feature.loc[~customers_feature['fashion_news_frequency'].isin(['Regularly', 'Monthly']), 'fashion_news_frequency'] = None
    customers_feature[['FN', 'Active']] = customers_feature[['FN', 'Active']].fillna(0)

    if phase == 'train':
        df = df.query('week >= 2').copy()
        df['week'] = df['week'] - 2
    elif phase == 'val':
        df = df.query('week >= 1').copy()
        df['week'] = df['week'] - 1      
    elif phase == 'test':
        pass
    else:
        raise ValueError("phase must be 'train', 'val', or 'test'")
    
    if debug == True:
        df = df.query('week < 12')

    weekly_purchase = df.groupby(['customer_id', 'week'])['week'].count().rename('purchase').reset_index()
    
    for agg_name in ['max', 'min', 'mean', 'sum']:
        agg_sr = weekly_purchase.groupby('customer_id')['purchase'].agg(agg_name)
        customers_feature[f'purchase_{agg_name}_groupby_customer'] = customers_feature['customer_id'].map(agg_sr)
    
    for w in df['week'].unique()[::-1]:
        tmp = weekly_purchase[weekly_purchase['week']==w]
        tmp = tmp[['customer_id', 'purchase']].set_index('customer_id')['purchase']
        customers_feature[f'purchase_{w}w'] = customers_feature['customer_id'].map(tmp).fillna(0)
        for agg_name in ['max', 'min', 'mean', 'sum']:
            customers_feature[f'purchase_{agg_name}_groupby_customer_ratio_{w}w'] = customers_feature[f'purchase_{w}w'] / customers_feature[f'purchase_{agg_name}_groupby_customer']
            customers_feature[f'purchase_{agg_name}_groupby_customer_diff_{w}w'] = customers_feature[f'purchase_{w}w'] - customers_feature[f'purchase_{agg_name}_groupby_customer']

    unique_transactions = df[['customer_id', 'article_id', 'week']].drop_duplicates()
    unique_transactions['rank'] = unique_transactions.groupby(['customer_id', 'article_id'])['week'].rank(method='dense', ascending=False)

    customers_feature['repurchase_article'] = customers_feature['customer_id'].map(
        unique_transactions.query('rank >= 2').drop_duplicates(subset=['customer_id', 'article_id']).groupby('customer_id')['article_id'].count()).fillna(0)
    customers_feature['purchase_article'] = customers_feature['customer_id'].map(unique_transactions.drop_duplicates(subset=['customer_id', 'article_id']).groupby('customer_id')['article_id'].count())
    customers_feature['repurchase_article_percent'] = customers_feature['repurchase_article'] / customers_feature['purchase_article']

    customers_feature['repurchase_week'] = customers_feature['customer_id'].map(
        unique_transactions.query('rank >= 2').drop_duplicates(subset=['customer_id', 'week']).groupby('customer_id')['week'].count()).fillna(0)
    customers_feature['purchase_week'] = customers_feature['customer_id'].map(
        unique_transactions.drop_duplicates(subset=['customer_id', 'week']).groupby('customer_id')['week'].count())
    customers_feature['repurchase_week_percent'] = customers_feature['repurchase_week'] / customers_feature['purchase_week']

    customers_feature['repurchase_article_and_week'] = customers_feature['customer_id'].map(
        unique_transactions.query('rank >= 2').groupby('customer_id')['customer_id'].count()).fillna(0)
    customers_feature['purchase_article_and_week'] = customers_feature['customer_id'].map(
        unique_transactions.groupby('customer_id')['customer_id'].count())
    customers_feature['repurchase_article_and_week_percent'] = customers_feature['repurchase_article_and_week'] / customers_feature['purchase_article_and_week']
    
    return customers_feature

In [None]:
train_customers_feature = make_customers_feature(customers, transactions, phase='train', debug=True)
val_customers_feature = make_customers_feature(customers, transactions, phase='val', debug=True)
print('saving...')
train_customers_feature.to_csv(f'../input/ranking_features/train_customers_feature.csv', index=False)
val_customers_feature.to_csv(f'../input/ranking_features/val_customers_feature.csv', index=False)

In [None]:
train_customers_feature = pd.read_csv(
    f'../input/ranking_features/train_customers_feature.csv',
    dtype={'customer_id': 'int32'})
print(train_customers_feature.shape)
print(f"{train_customers_feature.__sizeof__() // 1_000_000} MB")
train_customers_feature.head()

In [None]:
val_customers_feature = pd.read_csv(
    f'../input/ranking_features/val_customers_feature.csv',
    dtype={'customer_id': 'int32'})
print(val_customers_feature.shape)
print(f"{val_customers_feature.__sizeof__() // 1_000_000} MB")
val_customers_feature.head()

In [6]:
def make_articles_feature(articles: pd.DataFrame, transactions: pd.DataFrame, phase: str, debug: bool = False):
    df = transactions.copy()
    articles_feature = articles.drop(
        ['prod_name', 'product_type_name', 'graphical_appearance_name', 'colour_group_name', 'perceived_colour_value_name', 'perceived_colour_master_name', 'index_name', 'index_group_name', 'section_name', 'garment_group_name', 'prod_name', 'department_name', 'detail_desc'], 
        axis=1).copy()
    
    if phase == 'train':
        df = df.query('week >= 2').copy()
        df['week'] = df['week'] - 2
    elif phase == 'val':
        df = df.query('week >= 1').copy()
        df['week'] = df['week'] - 1      
    elif phase == 'test':
        pass
    else:
        raise ValueError("phase must be 'train', 'val', or 'test'")

    if debug == True:
        df = df.query('week <= 12')

    weekly_sale = df.groupby(['article_id', 'week'])['week'].count().rename('sale').reset_index()
    
    for agg_name in ['max', 'min', 'mean', 'sum']:
        agg_sr = weekly_sale.groupby('article_id')['sale'].agg(agg_name)
        articles_feature[f'sale_{agg_name}_groupby_article'] = articles_feature['article_id'].map(agg_sr)
    
    for w in df['week'].unique()[::-1]:
        tmp = weekly_sale[weekly_sale['week']==w]
        tmp = tmp[['article_id', 'sale']].set_index('article_id')['sale']
        articles_feature[f'sale_{w}w'] = articles_feature['article_id'].map(tmp).fillna(0)
        for agg_name in ['max', 'min', 'mean', 'sum']:
            articles_feature[f'sale_{agg_name}_groupby_article_ratio_{w}w'] = articles_feature[f'sale_{w}w'] / articles_feature[f'sale_{agg_name}_groupby_article']
            articles_feature[f'sale_{agg_name}_groupby_article_diff_{w}w'] = articles_feature[f'sale_{w}w'] - articles_feature[f'sale_{agg_name}_groupby_article']

    unique_transactions = df[['article_id', 'customer_id', 'week']].drop_duplicates()
    unique_transactions['rank'] = unique_transactions.groupby(['article_id', 'customer_id'])['week'].rank(method='dense', ascending=False)

    articles_feature['resale_customer'] = articles_feature['article_id'].map(
        unique_transactions.query('rank >= 2').drop_duplicates(subset=['article_id', 'customer_id']).groupby('article_id')['customer_id'].count()).fillna(0)
    articles_feature['sale_customer'] = articles_feature['article_id'].map(unique_transactions.drop_duplicates(subset=['article_id', 'customer_id']).groupby('article_id')['customer_id'].count())
    articles_feature['resale_customer_percent'] = articles_feature['resale_customer'] / articles_feature['sale_customer']

    articles_feature['resale_week'] = articles_feature['article_id'].map(
        unique_transactions.query('rank >= 2').drop_duplicates(subset=['article_id', 'week']).groupby('article_id')['week'].count()).fillna(0)
    articles_feature['sale_week'] = articles_feature['article_id'].map(
        unique_transactions.drop_duplicates(subset=['article_id', 'week']).groupby('article_id')['week'].count())
    articles_feature['resale_week_percent'] = articles_feature['resale_week'] / articles_feature['sale_week']

    articles_feature['resale_customer_and_week'] = articles_feature['article_id'].map(
        unique_transactions.query('rank >= 2').groupby('article_id')['article_id'].count()).fillna(0)
    articles_feature['sale_customer_and_week'] = articles_feature['article_id'].map(
        unique_transactions.groupby('article_id')['article_id'].count())
    articles_feature['resale_customer_and_week_percent'] = articles_feature['resale_customer_and_week'] / articles_feature['sale_customer_and_week']

    return articles_feature

In [None]:
train_articles_feature = make_articles_feature(articles, transactions, phase='train', debug=True)
val_articles_feature = make_articles_feature(articles, transactions, phase='val', debug=True)
print('saving...')
train_articles_feature.to_csv(f'../input/ranking_features/train_articles_feature.csv', index=False)
val_articles_feature.to_csv(f'../input/ranking_features/val_articles_feature.csv', index=False)

In [None]:
train_articles_feature = pd.read_csv(
    f'../input/ranking_features/train_articles_feature.csv', 
    dtype={'article_id': 'int32'})
print(train_articles_feature.shape)
print(f"{train_articles_feature.__sizeof__() // 1_000_000} MB")
train_articles_feature.head()

In [None]:
val_articles_feature = pd.read_csv(
    f'../input/ranking_features/val_articles_feature.csv', 
    dtype={'article_id': 'int32'})
print(val_articles_feature.shape)
print(f"{val_articles_feature.__sizeof__() // 1_000_000} MB")
val_articles_feature.head()

In [None]:
train_purchase_df = train_purchase_df.merge(train_customers_feature, how='left', on=['customer_id'])
train_purchase_df = train_purchase_df.merge(train_articles_feature, how='left', on=['article_id'])

val_purchase_df = val_purchase_df.merge(val_customers_feature, how='left', on=['customer_id'])
val_purchase_df = val_purchase_df.merge(val_articles_feature, how='left', on=['article_id'])

print('saving...')
train_purchase_df.to_csv('../input/ranking_features/train_purchase_df_merged.csv', index=False)
val_purchase_df.to_csv('../input/ranking_features/val_purchase_df_merged.csv', index=False)

del train_customers_feature, train_articles_feature, val_customers_feature, val_articles_feature
gc.collect()

In [None]:
train_purchase_df = pd.read_csv(
    '../input/ranking_features/train_purchase_df_merged.csv',
    dtype={'customer_id': 'int32', 'article_id': 'int32'})

print(train_purchase_df.shape)
print(f"{train_purchase_df.__sizeof__() // 1_000_000} MB")
display(train_purchase_df.head())

In [None]:
val_purchase_df = pd.read_csv(
    '../input/ranking_features/val_purchase_df_merged.csv',
    dtype={'customer_id': 'int32', 'article_id': 'int32'})

print(val_purchase_df.shape)
print(f"{val_purchase_df.__sizeof__() // 1_000_000} MB")
display(val_purchase_df.head())

In [None]:
train_purchase_df.select_dtypes(exclude='number').dtypes

In [None]:
for c in ['club_member_status', 'fashion_news_frequency', 'product_group_name', 'index_code']:
    print(train_purchase_df[c].unique())

In [None]:
exclude_columns = ['customer_id', 'article_id', 'is_purchased']
category_columns = ['club_member_status', 'fashion_news_frequency', 'product_group_name', 'index_code']
cols = [c for c in train_purchase_df.columns if c not in exclude_columns]

with tqdm(cols) as bar:
    for c in bar:
        if c in category_columns:
            bar.set_description(f"{c}(category)")
            train_purchase_df[c] = train_purchase_df[c].astype('category')
            val_purchase_df[c] = val_purchase_df[c].astype('category')
        else:
            bar.set_description(f"{c}(float)")
            train_purchase_df[c] = train_purchase_df[c].astype(float)
            val_purchase_df[c] = val_purchase_df[c].astype(float)

with open(f'../models/lgb_rank/{EXP}_category_columns.pkl', 'wb') as f:
    pickle.dump(category_columns, f)
with open(f'../models/lgb_rank/{EXP}_cols.pkl', 'wb') as f:
    pickle.dump(cols, f)

print(len(cols))

In [8]:
# ランク学習
params = {
    'objective': 'lambdarank',
    'boosting': 'gbdt',
    'num_iterations': 10000,
    'learning_rate': 0.1,
    'num_leaves': 31,
    'num_threads': 4,  # for M1 Mac
    'min_data_in_leaf': 20,
    'max_depth': -1,
    'bagging_freq': 5,
    'bagging_fraction': 0.75,
    'metric': ['ndcg', 'map'],
    'eval_at': [12],  # 上位何件のランキングをnDCGとMAPの算出に用いるか
    'random_state': 41,
}

In [None]:
train_query = train_purchase_df.groupby('customer_id')['customer_id'].count().to_list()
dtrain = lgb.Dataset(train_purchase_df[cols], label=train_purchase_df['is_purchased'], group=train_query)
val_query = val_purchase_df.groupby('customer_id')['customer_id'].count().to_list()
dval = lgb.Dataset(val_purchase_df[cols], reference=dtrain, label=val_purchase_df['is_purchased'], group=val_query)

model = lgb.train(
    params, dtrain, valid_sets=[dtrain, dval], 
    callbacks=[lgb.early_stopping(500, first_metric_only=True), lgb.log_evaluation(500)])

with open(f'../models/lgb_rank/{EXP}_model_fold1.pkl', 'wb') as f:
    pickle.dump(model, f)

In [24]:
pd.DataFrame({'feature': model.feature_name(), 'importance(gain)': model.feature_importance('gain')}).sort_values('importance(gain)', ascending=False).head(20)

Unnamed: 0,feature,importance(gain)
4,is_dummy,61178.624136
0,count_0w,11859.639997
147,sale_0w,6981.868315
14,purchase_0w,6553.53278
1,count_1w,2395.191624
123,purchase_article,2392.24301
266,resale_customer_percent,1451.156986
132,product_type_no,1300.875978
9,age,1214.680098
21,purchase_sum_groupby_customer_ratio_0w,1131.096905


In [None]:
val_pred = model.predict(val_purchase_df[cols], num_iteration=model.best_iteration)
np.sort(val_pred)

In [None]:
# most popular items
transactions_last_week = transactions.loc[transactions.week == 1]
top12 = ' 0' + ' 0'.join(transactions_last_week.article_id.value_counts().index.astype('str')[:12])
print("Top 12 popular items:")
print( top12 )

customers['age_bin'] = pd.cut(customers['age'], bins=[10, 20, 30, 40, 50, 60, 70, 100], labels=False)
transactions_last_week = transactions_last_week.merge(customers[['customer_id', 'age', 'age_bin']], how='left')
popular_items = transactions_last_week.groupby('age_bin')['article_id'].value_counts()
popular_items_dict = {}
for index in popular_items.index.levels[0]:
    popular_items_dict[index] = ' 0'+' 0'.join(popular_items[index][:12].index.astype('str'))
popular_items_sr = pd.Series(popular_items_dict, name='top_12_popular_items', dtype='str')
popular_items_sr

In [None]:
submission = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

val_purchase_df2 = val_purchase_df.copy()
val_purchase_df2['predict_score'] = val_pred
val_purchase_df2['rank'] = val_purchase_df2.groupby('customer_id')['predict_score'].rank('dense', ascending=False)
val_purchase_df2 = val_purchase_df2[val_purchase_df2['rank'] <= 12]
val_purchase_df2 = val_purchase_df2.sort_values('rank').reset_index(drop=True)
# val_purchase_df2['article_id'] = le.inverse_transform(val_purchase_df2['article_id'])
val_purchase_df2['article_id'] = ' 0' + val_purchase_df2['article_id'].astype(str)
submission['prediction_lgb'] = submission['customer_id'].map(id_to_index_dict).map(val_purchase_df2.groupby('customer_id')['article_id'].sum())
submission['prediction_lgb'] = submission['prediction_lgb'].fillna('')

submission['age_bin'] = submission['customer_id'].map(id_to_index_dict).map(customers.set_index('customer_id')['age_bin'])
submission['prediction_popular'] = submission['age_bin'].map(popular_items_sr)
submission['prediction_popular'] = submission['prediction_popular'].fillna(top12).astype('str')

submission['prediction'] = submission['prediction_lgb'] + submission['prediction_popular']
submission['prediction'] = submission['prediction'].str.strip()
submission['prediction'] = submission['prediction'].str[:131]
submission = submission[['customer_id', 'prediction']]
submission.head()

In [None]:
submission.to_csv(f'../submissions/{EXP}_submission_fold1.csv', index=False)

In [None]:
del train_query, dtrain, val_query, dval, val_pred
del transactions_last_week, top12, popular_items, popular_items_dict, popular_items_sr
del val_purchase_df2, submission
gc.collect()

In [None]:
# train all datas
train_purchase_df_all = pd.concat([train_purchase_df, val_purchase_df], axis=0)

del train_purchase_df, val_purchase_df
gc.collect()

In [None]:
with tqdm(cols) as bar:
    for c in bar:
        if c in category_columns:
            bar.set_description(f"{c}(category)")
            train_purchase_df_all[c] = train_purchase_df_all[c].astype('category')
        else:
            bar.set_description(f"{c}(float)")
            train_purchase_df_all[c] = train_purchase_df_all[c].astype(float)

train_query = train_purchase_df_all.groupby('customer_id')['customer_id'].count().to_list()
dtrain = lgb.Dataset(train_purchase_df_all[cols], label=train_purchase_df_all['is_purchased'], group=train_query)

params['num_iterations'] = model.best_iteration
model = lgb.train(
    params, dtrain, valid_sets=[dtrain],
    callbacks=[lgb.log_evaluation(10)])

with open(f'../models/lgb_rank/{EXP}_model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [None]:
del train_purchase_df_all, train_query, dtrain
gc.collect()

In [14]:
BATCH_SIZE = 1000
test_customers = transactions.query("week < 4")['customer_id'].unique()
test_customers_feature = make_customers_feature(customers, transactions, phase='test', debug=True)
test_articles_feature = make_articles_feature(articles, transactions, phase='test', debug=True)

with open(f'../models/lgb_rank/{EXP}_category_columns.pkl', 'rb') as f:
    category_columns = pickle.load(f)
with open(f'../models/lgb_rank/{EXP}_cols.pkl', 'rb') as f:
    cols = pickle.load(f)    
with open(f'../models/lgb_rank/{EXP}_model.pkl', 'rb') as f:
    model = pickle.load(f)
    
preds = []

for i in tqdm(range(len(test_customers)//BATCH_SIZE + 1)):
    if i == (len(test_customers)//BATCH_SIZE):
        transactions_batch = transactions[transactions['customer_id'].isin(test_customers[i*BATCH_SIZE:])]
    else:
        transactions_batch = transactions[transactions['customer_id'].isin(test_customers[i*BATCH_SIZE : (i+1)*BATCH_SIZE])]
    test_purchase_df = make_purchase_df(transactions_batch, phase='test', debug=True)
    test_purchase_df = test_purchase_df.merge(test_customers_feature, how='left', on=['customer_id'])
    test_purchase_df = test_purchase_df.merge(test_articles_feature, how='left', on=['article_id'])

    for c in cols:
        if c in category_columns:
            test_purchase_df[c] = test_purchase_df[c].astype('category')
        elif c not in test_purchase_df.columns:
            test_purchase_df[c] = np.nan
            test_purchase_df[c] = test_purchase_df[c].astype(float)
        else:
            test_purchase_df[c] = test_purchase_df[c].astype(float)

    pred = model.predict(test_purchase_df[cols], num_iteration=model.best_iteration)
    
    test_purchase_df['predict_score'] = pred
    test_purchase_df['rank'] = test_purchase_df.groupby('customer_id')['predict_score'].rank('dense', ascending=False)
    test_purchase_df = test_purchase_df[test_purchase_df['rank'] <= 12]
    test_purchase_df = test_purchase_df.sort_values('rank').reset_index(drop=True)
    # test_purchase_df['article_id'] = le.inverse_transform(test_purchase_df['article_id'])
    test_purchase_df['article_id'] = ' 0' + test_purchase_df['article_id'].astype(str)
    preds.append(test_purchase_df.groupby('customer_id')['article_id'].sum())
    
pred_sr = pd.concat(preds, axis=0)
display(pred_sr.head())

del test_purchase_df, pred, preds
gc.collect()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 234/234 [07:11<00:00,  1.84s/it]


customer_id
341      0928206001 0929275001 0924243001 0889550002 0...
381      0610776002 0399256001 0882882010 0827968022 0...
782      0902265003 0921671001 0885951001 0835704001 0...
926      0928206001 0929275001 0918292001 0909370001 0...
1034     0835348011 0928206001 0929275001 0918292001 0...
Name: article_id, dtype: object

4

In [15]:
# most popular items
transactions_last_week = transactions.loc[transactions.week == 0]
top12 = ' 0' + ' 0'.join(transactions_last_week.article_id.value_counts().index.astype('str')[:12])
print("Top 12 popular items:")
print( top12 )

customers['age_bin'] = pd.cut(customers['age'], bins=[10, 20, 30, 40, 50, 60, 70, 100], labels=False)
transactions_last_week = transactions_last_week.merge(customers[['customer_id', 'age', 'age_bin']], how='left')
popular_items = transactions_last_week.groupby('age_bin')['article_id'].value_counts()
popular_items_dict = {}
for index in popular_items.index.levels[0]:
    popular_items_dict[index] = ' 0'+' 0'.join(popular_items[index][:12].index.astype('str'))
popular_items_sr = pd.Series(popular_items_dict, name='top_12_popular_items', dtype='str')
popular_items_sr

Top 12 popular items:
 0924243001 0924243002 0918522001 0923758001 0866731001 0909370001 0751471001 0915529003 0915529005 0448509014 0762846027 0714790020


0.0     0685813003 0918522001 0715624001 0850917001 0...
1.0     0924243001 0866731001 0909370001 0918522001 0...
2.0     0923758001 0909370001 0924243001 0935541001 0...
3.0     0751471001 0928206001 0924243001 0924243002 0...
4.0     0924243001 0928206001 0930380001 0924243002 0...
5.0     0930380001 0924243001 0751471043 0910601003 0...
6.0     0751471043 0930380001 0865799006 0714790030 0...
Name: top_12_popular_items, dtype: object

In [16]:
submission = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

submission['prediction_lgb'] = submission['customer_id'].map(id_to_index_dict).map(pred_sr)
submission['prediction_lgb'] = submission['prediction_lgb'].fillna('')

submission['age_bin'] = submission['customer_id'].map(id_to_index_dict).map(customers.set_index('customer_id')['age_bin'])
submission['prediction_popular'] = submission['age_bin'].map(popular_items_sr)
submission['prediction_popular'] = submission['prediction_popular'].fillna(top12).astype('str')

submission['prediction'] = submission['prediction_lgb'] + submission['prediction_popular']
submission['prediction'] = submission['prediction'].str.strip()
submission['prediction'] = submission['prediction'].str[:131]
submission = submission[['customer_id', 'prediction']]
submission.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0909370001 0889550002 0898573003 08...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0924243001 0866731001 0909370001 0918522001 09...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0924243001 0827968001 0909370001 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0924243001 0928206001 0930380001 0924243002 09...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0924243001 0928206001 0930380001 0924243002 09...


In [17]:
submission.to_csv(f'../submissions/{EXP}_submission.csv', index=False)