In [1]:
# **Moved to colab008**
# Learning to rank
# Build buckets based on tiobf
# Feature engineering
# New: 4 weeks of training data
# New: 12 weeks used for candidate generation
# MAP@12 (all): 0.026817
# MAP@12 (cold start): 0.008750

# Experiment identifier (not referenced in the cells visible here).
EXP = '019'

In [2]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm

from pathlib import Path
import pickle
import gc
from time import time
import warnings

# Notebook-wide configuration: progress bars, display width, warning filters, data root.

# Hook tqdm into pandas.
tqdm.pandas()

# Show every column when displaying wide feature frames.
pd.set_option('display.max_columns', None)

# Silence pandas performance warnings and generic UserWarnings.
warnings.simplefilter('ignore', category=pd.errors.PerformanceWarning)
warnings.simplefilter('ignore', category=UserWarning)

# Directory holding the raw competition CSV files.
data_path = Path('../input/h-and-m-personalized-fashion-recommendations/')

In [3]:
# Load the raw competition tables. Every file is resolved against `data_path`
# so the input location is defined in exactly one place.
transactions = pd.read_csv(
    data_path / 'transactions_train.csv',
    # article_id is stored as int32 to save memory; note that this does NOT
    # keep the zero-padded string form of the id.
    dtype={'article_id': 'int32'},
    parse_dates=['t_dat'])
customers = pd.read_csv(data_path / 'customers.csv')
articles = pd.read_csv(
    data_path / 'articles.csv',
    dtype={'article_id': 'int32'})

# Age of every transaction relative to the most recent date in the data:
# `t_diff` in days, `week` in whole 7-day blocks (week 0 = the latest 7 days).
t_max = transactions['t_dat'].max()
transactions['t_diff'] = (t_max - transactions['t_dat']).dt.days
transactions['week'] = transactions['t_diff'] // 7

# Any value other than 'Regularly' / 'Monthly' is treated as missing.
customers.loc[~customers['fashion_news_frequency'].isin(['Regularly', 'Monthly']), 'fashion_news_frequency'] = None

# Replace the string customer ids by the row index of `customers` (int32);
# `index_to_id_dict` keeps the reverse mapping.
id_to_index_dict = dict(zip(customers["customer_id"], customers.index))
index_to_id_dict = dict(zip(customers.index, customers["customer_id"]))
transactions["customer_id"] = transactions["customer_id"].map(id_to_index_dict).astype('int32')
customers['customer_id'] = customers['customer_id'].map(id_to_index_dict).astype('int32')

print(transactions.shape)
display(transactions.tail())

(31788324, 7)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,t_diff,week
31788319,2020-09-22,1371691,929511001,0.059305,2,0,0
31788320,2020-09-22,1371691,891322004,0.042356,2,0,0
31788321,2020-09-22,1371721,918325001,0.043203,1,0,0
31788322,2020-09-22,1371747,833459002,0.006763,1,0,0
31788323,2020-09-22,1371960,898573003,0.033881,2,0,0


In [4]:
def make_purchase_df(transactions: pd.DataFrame, target_week: int, debug: bool = False):
    """Build the (customer, article) candidate table for one target week.

    Weeks are re-based so that ``target_week`` becomes week 0. Candidates are
    (a) every (customer, article) pair the customer bought in weeks >= 1 and
    (b) "dummy" rows pairing every used customer with each article that was
    among the 12 best sellers of any single week. For each re-based week ``w``
    a column ``count_{w}w`` holds the customer's purchase count (real rows) or
    the article's total sales count (dummy rows).

    Args:
        transactions: Frame with at least ``customer_id``, ``article_id`` and
            ``week`` columns.
        target_week: ``>= 0`` builds training/validation data: week 0 supplies
            the ``is_purchased`` labels and only customers with at least one
            label are kept. ``-1`` builds test data (no labels).
        debug: When true, only re-based weeks ``< 13`` are used.

    Returns:
        DataFrame with ``customer_id``, ``article_id``, the ``count_{w}w``
        columns in ascending week order, ``is_dummy``, ``target_week`` and,
        for ``target_week >= 0``, ``is_purchased``.

    Raises:
        ValueError: If ``target_week`` is neither ``>= 0`` nor ``-1``.
        RuntimeError: If no history rows remain after filtering.
    """
    is_train = target_week >= 0
    # Validate before touching the (large) transaction table.
    if not (is_train or target_week == -1):
        raise ValueError("target_week is incorrect. >= 0(train and valid), == -1(test)")

    # Boolean indexing already produces a new frame; one explicit copy is
    # enough to make the in-place `week` shift safe.
    df = transactions[transactions['week'] >= target_week].copy()
    df['week'] = df['week'] - target_week

    if debug:
        # Debug mode uses 13 weeks only (one of them supplies the labels).
        df = df[df['week'] < 13]

    if is_train:  # training / validation data
        labels = df.loc[df['week'] == 0, ['customer_id', 'article_id']].drop_duplicates().copy()
        labels['is_purchased'] = 1
        label_customers = labels['customer_id'].unique()
        # Keep history only, and drop customers without any positive label
        # in the target week.
        df = df[(df['week'] >= 1) & df['customer_id'].isin(label_customers)]
        use_customers = np.intersect1d(df['customer_id'].unique(), label_customers)
    else:  # test data
        use_customers = df['customer_id'].unique()

    if len(df) == 0:
        raise RuntimeError(f"candidates are empty.")

    # Weekly sales per article; articles ranked in the top 12 of any week
    # become dummy candidates for every customer.
    weekly_sales = df.groupby(['article_id', 'week']).size().rename('dummy_count').reset_index()
    weekly_sales['rank_in_week'] = weekly_sales.groupby('week')['dummy_count'].rank(method='min', ascending=False)
    popular_articles = weekly_sales.loc[weekly_sales['rank_in_week'] <= 12, 'article_id'].unique()
    weekly_sales = weekly_sales[weekly_sales['article_id'].isin(popular_articles)]

    # Iterate over the weeks in sorted order and seed the accumulators with
    # the first week actually present. This no longer relies on week 1 being
    # the first element of `unique()[::-1]`, which depended on the row order
    # of `transactions`.
    purchase_df = None
    dummy_df = None
    for w in np.sort(df['week'].unique()):
        count_col = f'count_{w}w'
        week_rows = df[df['week'] == w]
        user_counts = week_rows.groupby(['customer_id', 'article_id']).size().rename(count_col).reset_index()
        dummy_counts = weekly_sales.loc[weekly_sales['week'] == w, ['article_id', 'dummy_count']].rename(
            columns={'dummy_count': count_col})
        if purchase_df is None:
            purchase_df, dummy_df = user_counts, dummy_counts
            continue
        purchase_df = purchase_df.merge(user_counts, how='outer', on=['customer_id', 'article_id'])
        dummy_df = dummy_df.merge(dummy_counts, how='outer', on=['article_id'])

    del df, weekly_sales, popular_articles, week_rows, user_counts, dummy_counts
    gc.collect()

    # Cross join customers x dummy articles: each customer id is repeated once
    # per dummy row while the dummy block is tiled once per customer.
    dummy_columns = ['customer_id'] + list(dummy_df.columns)
    dummy_values = dummy_df.to_numpy()
    dummy_df = pd.DataFrame(
        np.concatenate(
            [np.repeat(use_customers, repeats=len(dummy_values)).reshape(-1, 1),
             np.tile(dummy_values, (len(use_customers), 1))],
            axis=-1),
        columns=dummy_columns,
    )
    dummy_df = dummy_df.astype({'customer_id': 'int32', 'article_id': 'int32'})

    purchase_df['is_dummy'] = 0
    dummy_df['is_dummy'] = 1

    purchase_df = pd.concat([purchase_df, dummy_df], axis=0)
    purchase_df = purchase_df.sort_values('customer_id').reset_index(drop=True)

    purchase_df['target_week'] = target_week

    if is_train:
        purchase_df = purchase_df.merge(labels, how='left', on=['customer_id', 'article_id'])
        purchase_df['is_purchased'] = purchase_df['is_purchased'].fillna(0)

    return purchase_df

In [5]:
def make_customers_feature(customers: pd.DataFrame, transactions: pd.DataFrame, debug: bool = False):
    """Build per-customer features from purchase history.

    Only transactions with ``week > 4`` are used (provisional leak
    prevention); with ``debug`` the history is further limited to
    ``week <= 24``.

    Features appended to the customer attributes (``postal_code`` dropped,
    ``FN``/``Active`` NaN -> 0, ``fashion_news_frequency`` restricted to
    'Regularly'/'Monthly'):

    * ``purchase_{max,min,mean,sum}_groupby_customer``: aggregates of the
      customer's weekly purchase counts,
    * ``purchase_{w}w`` per history week (ascending ``w``) plus its ratio and
      difference against each aggregate,
    * ``repurchase_*`` / ``purchase_*`` counts and ``*_percent`` ratios, where
      a "repurchase" row is a (customer, article, week) whose dense rank by
      descending week within the (customer, article) pair is >= 2.

    Args:
        customers: Customer table containing ``customer_id``, ``postal_code``,
            ``FN``, ``Active`` and ``fashion_news_frequency``.
        transactions: Frame with ``customer_id``, ``article_id``, ``week``.
        debug: Restrict the history to ``week <= 24`` when true.

    Returns:
        A new DataFrame with one row per row of ``customers``.
    """
    customers_feature = customers.drop(['postal_code'], axis=1).copy()
    customers_feature.loc[~customers_feature['fashion_news_frequency'].isin(['Regularly', 'Monthly']), 'fashion_news_frequency'] = None
    customers_feature[['FN', 'Active']] = customers_feature[['FN', 'Active']].fillna(0)

    # Provisional leak prevention. Boolean indexing returns a new frame, so
    # the full transaction table does not need to be copied first.
    history = transactions[transactions['week'] > 4]
    if debug:
        history = history[history['week'] <= 24]

    customer_ids = customers_feature['customer_id']
    agg_names = ['max', 'min', 'mean', 'sum']
    # New columns are collected here and attached with a single concat, which
    # avoids fragmenting the frame with ~200 individual column insertions.
    new_columns = {}

    weekly_purchase = history.groupby(['customer_id', 'week']).size().rename('purchase').reset_index()
    purchase_by_customer = weekly_purchase.groupby('customer_id')['purchase']
    for agg_name in agg_names:
        new_columns[f'purchase_{agg_name}_groupby_customer'] = customer_ids.map(purchase_by_customer.agg(agg_name))

    # Sorted weeks give a column order that is independent of the row order
    # of `transactions`.
    for w in np.sort(history['week'].unique()):
        week_counts = weekly_purchase.loc[weekly_purchase['week'] == w].set_index('customer_id')['purchase']
        purchase_w = customer_ids.map(week_counts).fillna(0)
        new_columns[f'purchase_{w}w'] = purchase_w
        for agg_name in agg_names:
            overall = new_columns[f'purchase_{agg_name}_groupby_customer']
            new_columns[f'purchase_{agg_name}_groupby_customer_ratio_{w}w'] = purchase_w / overall
            new_columns[f'purchase_{agg_name}_groupby_customer_diff_{w}w'] = purchase_w - overall

    unique_transactions = history[['customer_id', 'article_id', 'week']].drop_duplicates()
    unique_transactions = unique_transactions.assign(
        rank=unique_transactions.groupby(['customer_id', 'article_id'])['week'].rank(method='dense', ascending=False))
    # Rows whose (customer, article) pair also appears in a later week (smaller week number).
    repeat_transactions = unique_transactions[unique_transactions['rank'] >= 2]

    pair_keys = ['customer_id', 'article_id']
    new_columns['repurchase_article'] = customer_ids.map(
        repeat_transactions.drop_duplicates(subset=pair_keys).groupby('customer_id')['article_id'].count()).fillna(0)
    new_columns['purchase_article'] = customer_ids.map(
        unique_transactions.drop_duplicates(subset=pair_keys).groupby('customer_id')['article_id'].count())
    new_columns['repurchase_article_percent'] = new_columns['repurchase_article'] / new_columns['purchase_article']

    week_keys = ['customer_id', 'week']
    new_columns['repurchase_week'] = customer_ids.map(
        repeat_transactions.drop_duplicates(subset=week_keys).groupby('customer_id')['week'].count()).fillna(0)
    new_columns['purchase_week'] = customer_ids.map(
        unique_transactions.drop_duplicates(subset=week_keys).groupby('customer_id')['week'].count())
    new_columns['repurchase_week_percent'] = new_columns['repurchase_week'] / new_columns['purchase_week']

    new_columns['repurchase_article_and_week'] = customer_ids.map(
        repeat_transactions.groupby('customer_id').size()).fillna(0)
    new_columns['purchase_article_and_week'] = customer_ids.map(
        unique_transactions.groupby('customer_id').size())
    new_columns['repurchase_article_and_week_percent'] = (
        new_columns['repurchase_article_and_week'] / new_columns['purchase_article_and_week'])

    return pd.concat([customers_feature, pd.DataFrame(new_columns)], axis=1)

In [6]:
def make_articles_feature(articles: pd.DataFrame, transactions: pd.DataFrame, debug: bool = False):
    """Build per-article features from sales history.

    Only transactions with ``week > 4`` are used (provisional leak
    prevention); with ``debug`` the history is further limited to
    ``week <= 24``.

    Features appended to the article attributes (free-text / ``*_name``
    columns listed below are dropped):

    * ``sale_{max,min,mean,sum}_groupby_article``: aggregates of the
      article's weekly sales counts,
    * ``sale_{w}w`` per history week (ascending ``w``) plus its ratio and
      difference against each aggregate,
    * ``resale_*`` / ``sale_*`` counts and ``*_percent`` ratios, where a
      "resale" row is an (article, customer, week) whose dense rank by
      descending week within the (article, customer) pair is >= 2.

    Args:
        articles: Article table containing ``article_id`` and the dropped
            descriptive columns.
        transactions: Frame with ``customer_id``, ``article_id``, ``week``.
        debug: Restrict the history to ``week <= 24`` when true.

    Returns:
        A new DataFrame with one row per row of ``articles``.
    """
    articles_feature = articles.drop(
        ['prod_name', 'product_type_name', 'graphical_appearance_name', 'colour_group_name',
         'perceived_colour_value_name', 'perceived_colour_master_name', 'index_name', 'index_group_name',
         'section_name', 'garment_group_name', 'department_name', 'detail_desc'],
        axis=1).copy()

    # Provisional leak prevention. Boolean indexing returns a new frame, so
    # the full transaction table does not need to be copied first.
    history = transactions[transactions['week'] > 4]
    if debug:
        history = history[history['week'] <= 24]

    article_ids = articles_feature['article_id']
    agg_names = ['max', 'min', 'mean', 'sum']
    # New columns are collected here and attached with a single concat, which
    # avoids fragmenting the frame with ~200 individual column insertions.
    new_columns = {}

    weekly_sale = history.groupby(['article_id', 'week']).size().rename('sale').reset_index()
    sale_by_article = weekly_sale.groupby('article_id')['sale']
    for agg_name in agg_names:
        new_columns[f'sale_{agg_name}_groupby_article'] = article_ids.map(sale_by_article.agg(agg_name))

    # Sorted weeks give a column order that is independent of the row order
    # of `transactions`.
    for w in np.sort(history['week'].unique()):
        week_counts = weekly_sale.loc[weekly_sale['week'] == w].set_index('article_id')['sale']
        sale_w = article_ids.map(week_counts).fillna(0)
        new_columns[f'sale_{w}w'] = sale_w
        for agg_name in agg_names:
            overall = new_columns[f'sale_{agg_name}_groupby_article']
            new_columns[f'sale_{agg_name}_groupby_article_ratio_{w}w'] = sale_w / overall
            new_columns[f'sale_{agg_name}_groupby_article_diff_{w}w'] = sale_w - overall

    unique_transactions = history[['article_id', 'customer_id', 'week']].drop_duplicates()
    unique_transactions = unique_transactions.assign(
        rank=unique_transactions.groupby(['article_id', 'customer_id'])['week'].rank(method='dense', ascending=False))
    # Rows whose (article, customer) pair also appears in a later week (smaller week number).
    repeat_transactions = unique_transactions[unique_transactions['rank'] >= 2]

    pair_keys = ['article_id', 'customer_id']
    new_columns['resale_customer'] = article_ids.map(
        repeat_transactions.drop_duplicates(subset=pair_keys).groupby('article_id')['customer_id'].count()).fillna(0)
    new_columns['sale_customer'] = article_ids.map(
        unique_transactions.drop_duplicates(subset=pair_keys).groupby('article_id')['customer_id'].count())
    new_columns['resale_customer_percent'] = new_columns['resale_customer'] / new_columns['sale_customer']

    week_keys = ['article_id', 'week']
    new_columns['resale_week'] = article_ids.map(
        repeat_transactions.drop_duplicates(subset=week_keys).groupby('article_id')['week'].count()).fillna(0)
    new_columns['sale_week'] = article_ids.map(
        unique_transactions.drop_duplicates(subset=week_keys).groupby('article_id')['week'].count())
    new_columns['resale_week_percent'] = new_columns['resale_week'] / new_columns['sale_week']

    new_columns['resale_customer_and_week'] = article_ids.map(
        repeat_transactions.groupby('article_id').size()).fillna(0)
    new_columns['sale_customer_and_week'] = article_ids.map(
        unique_transactions.groupby('article_id').size())
    new_columns['resale_customer_and_week_percent'] = (
        new_columns['resale_customer_and_week'] / new_columns['sale_customer_and_week'])

    return pd.concat([articles_feature, pd.DataFrame(new_columns)], axis=1)

In [7]:
# Build one candidate table per target week (0-4, debug-sized history) and
# stack them into a single training frame.
purchase_dfs = [
    make_purchase_df(transactions, target_week=w, debug=True)
    for w in tqdm(range(5), leave=False)
]
train_purchase_df = pd.concat(purchase_dfs, axis=0, ignore_index=True)

print(train_purchase_df.shape)
print(f"{train_purchase_df.__sizeof__() // 1_000_000} MB")
display(train_purchase_df.head())

# The per-week frames are no longer needed once concatenated.
del purchase_dfs
gc.collect()

  0%|          | 0/5 [00:00<?, ?it/s]

(34299808, 17)
4664 MB


Unnamed: 0,customer_id,article_id,count_1w,count_2w,count_3w,count_4w,count_5w,count_6w,count_7w,count_8w,count_9w,count_10w,count_11w,count_12w,is_dummy,target_week,is_purchased
0,86,621381012,1.0,,,,,,,,,,,,0,0,0.0
1,86,909921001,21.0,7.0,12.0,12.0,19.0,37.0,96.0,2.0,,,,,1,0,0.0
2,86,909370001,358.0,20.0,,,,,,,,,,,1,0,0.0
3,86,906794003,17.0,17.0,29.0,34.0,51.0,86.0,9.0,,,,,,1,0,0.0
4,86,905945001,12.0,117.0,2.0,,,,,,,,,,1,0,0.0


32

In [8]:
# Build the customer feature table (debug mode: history limited to weeks 5-24)
# and report its shape and in-memory size.
train_customers_feature = make_customers_feature(customers, transactions, debug=True)

print(train_customers_feature.shape)
print(f"{train_customers_feature.__sizeof__() // 1_000_000} MB")
display(train_customers_feature.head())

(1371980, 199)
2296 MB


Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,purchase_max_groupby_customer,purchase_min_groupby_customer,purchase_mean_groupby_customer,purchase_sum_groupby_customer,purchase_5w,purchase_max_groupby_customer_ratio_5w,purchase_max_groupby_customer_diff_5w,purchase_min_groupby_customer_ratio_5w,purchase_min_groupby_customer_diff_5w,purchase_mean_groupby_customer_ratio_5w,purchase_mean_groupby_customer_diff_5w,purchase_sum_groupby_customer_ratio_5w,purchase_sum_groupby_customer_diff_5w,purchase_6w,purchase_max_groupby_customer_ratio_6w,purchase_max_groupby_customer_diff_6w,purchase_min_groupby_customer_ratio_6w,purchase_min_groupby_customer_diff_6w,purchase_mean_groupby_customer_ratio_6w,purchase_mean_groupby_customer_diff_6w,purchase_sum_groupby_customer_ratio_6w,purchase_sum_groupby_customer_diff_6w,purchase_7w,purchase_max_groupby_customer_ratio_7w,purchase_max_groupby_customer_diff_7w,purchase_min_groupby_customer_ratio_7w,purchase_min_groupby_customer_diff_7w,purchase_mean_groupby_customer_ratio_7w,purchase_mean_groupby_customer_diff_7w,purchase_sum_groupby_customer_ratio_7w,purchase_sum_groupby_customer_diff_7w,purchase_8w,purchase_max_groupby_customer_ratio_8w,purchase_max_groupby_customer_diff_8w,purchase_min_groupby_customer_ratio_8w,purchase_min_groupby_customer_diff_8w,purchase_mean_groupby_customer_ratio_8w,purchase_mean_groupby_customer_diff_8w,purchase_sum_groupby_customer_ratio_8w,purchase_sum_groupby_customer_diff_8w,purchase_9w,purchase_max_groupby_customer_ratio_9w,purchase_max_groupby_customer_diff_9w,purchase_min_groupby_customer_ratio_9w,purchase_min_groupby_customer_diff_9w,purchase_mean_groupby_customer_ratio_9w,purchase_mean_groupby_customer_diff_9w,purchase_sum_groupby_customer_ratio_9w,purchase_sum_groupby_customer_diff_9w,purchase_10w,purchase_max_groupby_customer_ratio_10w,purchase_max_groupby_customer_diff_10w,purchase_min_groupby_customer_ratio_10w,purchase_min_groupby_customer_diff_10w,purchase_mean_group
by_customer_ratio_10w,purchase_mean_groupby_customer_diff_10w,purchase_sum_groupby_customer_ratio_10w,purchase_sum_groupby_customer_diff_10w,purchase_11w,purchase_max_groupby_customer_ratio_11w,purchase_max_groupby_customer_diff_11w,purchase_min_groupby_customer_ratio_11w,purchase_min_groupby_customer_diff_11w,purchase_mean_groupby_customer_ratio_11w,purchase_mean_groupby_customer_diff_11w,purchase_sum_groupby_customer_ratio_11w,purchase_sum_groupby_customer_diff_11w,purchase_12w,purchase_max_groupby_customer_ratio_12w,purchase_max_groupby_customer_diff_12w,purchase_min_groupby_customer_ratio_12w,purchase_min_groupby_customer_diff_12w,purchase_mean_groupby_customer_ratio_12w,purchase_mean_groupby_customer_diff_12w,purchase_sum_groupby_customer_ratio_12w,purchase_sum_groupby_customer_diff_12w,purchase_13w,purchase_max_groupby_customer_ratio_13w,purchase_max_groupby_customer_diff_13w,purchase_min_groupby_customer_ratio_13w,purchase_min_groupby_customer_diff_13w,purchase_mean_groupby_customer_ratio_13w,purchase_mean_groupby_customer_diff_13w,purchase_sum_groupby_customer_ratio_13w,purchase_sum_groupby_customer_diff_13w,purchase_14w,purchase_max_groupby_customer_ratio_14w,purchase_max_groupby_customer_diff_14w,purchase_min_groupby_customer_ratio_14w,purchase_min_groupby_customer_diff_14w,purchase_mean_groupby_customer_ratio_14w,purchase_mean_groupby_customer_diff_14w,purchase_sum_groupby_customer_ratio_14w,purchase_sum_groupby_customer_diff_14w,purchase_15w,purchase_max_groupby_customer_ratio_15w,purchase_max_groupby_customer_diff_15w,purchase_min_groupby_customer_ratio_15w,purchase_min_groupby_customer_diff_15w,purchase_mean_groupby_customer_ratio_15w,purchase_mean_groupby_customer_diff_15w,purchase_sum_groupby_customer_ratio_15w,purchase_sum_groupby_customer_diff_15w,purchase_16w,purchase_max_groupby_customer_ratio_16w,purchase_max_groupby_customer_diff_16w,purchase_min_groupby_customer_ratio_16w,purchase_min_groupby_customer_diff_16w,purchase_mean_groupby_customer_ra
tio_16w,purchase_mean_groupby_customer_diff_16w,purchase_sum_groupby_customer_ratio_16w,purchase_sum_groupby_customer_diff_16w,purchase_17w,purchase_max_groupby_customer_ratio_17w,purchase_max_groupby_customer_diff_17w,purchase_min_groupby_customer_ratio_17w,purchase_min_groupby_customer_diff_17w,purchase_mean_groupby_customer_ratio_17w,purchase_mean_groupby_customer_diff_17w,purchase_sum_groupby_customer_ratio_17w,purchase_sum_groupby_customer_diff_17w,purchase_18w,purchase_max_groupby_customer_ratio_18w,purchase_max_groupby_customer_diff_18w,purchase_min_groupby_customer_ratio_18w,purchase_min_groupby_customer_diff_18w,purchase_mean_groupby_customer_ratio_18w,purchase_mean_groupby_customer_diff_18w,purchase_sum_groupby_customer_ratio_18w,purchase_sum_groupby_customer_diff_18w,purchase_19w,purchase_max_groupby_customer_ratio_19w,purchase_max_groupby_customer_diff_19w,purchase_min_groupby_customer_ratio_19w,purchase_min_groupby_customer_diff_19w,purchase_mean_groupby_customer_ratio_19w,purchase_mean_groupby_customer_diff_19w,purchase_sum_groupby_customer_ratio_19w,purchase_sum_groupby_customer_diff_19w,purchase_20w,purchase_max_groupby_customer_ratio_20w,purchase_max_groupby_customer_diff_20w,purchase_min_groupby_customer_ratio_20w,purchase_min_groupby_customer_diff_20w,purchase_mean_groupby_customer_ratio_20w,purchase_mean_groupby_customer_diff_20w,purchase_sum_groupby_customer_ratio_20w,purchase_sum_groupby_customer_diff_20w,purchase_21w,purchase_max_groupby_customer_ratio_21w,purchase_max_groupby_customer_diff_21w,purchase_min_groupby_customer_ratio_21w,purchase_min_groupby_customer_diff_21w,purchase_mean_groupby_customer_ratio_21w,purchase_mean_groupby_customer_diff_21w,purchase_sum_groupby_customer_ratio_21w,purchase_sum_groupby_customer_diff_21w,purchase_22w,purchase_max_groupby_customer_ratio_22w,purchase_max_groupby_customer_diff_22w,purchase_min_groupby_customer_ratio_22w,purchase_min_groupby_customer_diff_22w,purchase_mean_groupby_customer_ratio_22w,purcha
se_mean_groupby_customer_diff_22w,purchase_sum_groupby_customer_ratio_22w,purchase_sum_groupby_customer_diff_22w,purchase_23w,purchase_max_groupby_customer_ratio_23w,purchase_max_groupby_customer_diff_23w,purchase_min_groupby_customer_ratio_23w,purchase_min_groupby_customer_diff_23w,purchase_mean_groupby_customer_ratio_23w,purchase_mean_groupby_customer_diff_23w,purchase_sum_groupby_customer_ratio_23w,purchase_sum_groupby_customer_diff_23w,purchase_24w,purchase_max_groupby_customer_ratio_24w,purchase_max_groupby_customer_diff_24w,purchase_min_groupby_customer_ratio_24w,purchase_min_groupby_customer_diff_24w,purchase_mean_groupby_customer_ratio_24w,purchase_mean_groupby_customer_diff_24w,purchase_sum_groupby_customer_ratio_24w,purchase_sum_groupby_customer_diff_24w,repurchase_article,purchase_article,repurchase_article_percent,repurchase_week,purchase_week,repurchase_week_percent,repurchase_article_and_week,purchase_article_and_week,repurchase_article_and_week_percent
0,0,0.0,0.0,ACTIVE,,49.0,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,0.0,,,0.0,,
1,1,0.0,0.0,ACTIVE,,25.0,14.0,1.0,7.5,15.0,0.0,0.0,-14.0,0.0,-1.0,0.0,-7.5,0.0,-15.0,0.0,0.0,-14.0,0.0,-1.0,0.0,-7.5,0.0,-15.0,0.0,0.0,-14.0,0.0,-1.0,0.0,-7.5,0.0,-15.0,0.0,0.0,-14.0,0.0,-1.0,0.0,-7.5,0.0,-15.0,0.0,0.0,-14.0,0.0,-1.0,0.0,-7.5,0.0,-15.0,1.0,0.071429,-13.0,1.0,0.0,0.133333,-6.5,0.066667,-14.0,0.0,0.0,-14.0,0.0,-1.0,0.0,-7.5,0.0,-15.0,0.0,0.0,-14.0,0.0,-1.0,0.0,-7.5,0.0,-15.0,0.0,0.0,-14.0,0.0,-1.0,0.0,-7.5,0.0,-15.0,0.0,0.0,-14.0,0.0,-1.0,0.0,-7.5,0.0,-15.0,0.0,0.0,-14.0,0.0,-1.0,0.0,-7.5,0.0,-15.0,0.0,0.0,-14.0,0.0,-1.0,0.0,-7.5,0.0,-15.0,0.0,0.0,-14.0,0.0,-1.0,0.0,-7.5,0.0,-15.0,0.0,0.0,-14.0,0.0,-1.0,0.0,-7.5,0.0,-15.0,0.0,0.0,-14.0,0.0,-1.0,0.0,-7.5,0.0,-15.0,0.0,0.0,-14.0,0.0,-1.0,0.0,-7.5,0.0,-15.0,14.0,1.0,0.0,14.0,13.0,1.866667,6.5,0.933333,-1.0,0.0,0.0,-14.0,0.0,-1.0,0.0,-7.5,0.0,-15.0,0.0,0.0,-14.0,0.0,-1.0,0.0,-7.5,0.0,-15.0,0.0,0.0,-14.0,0.0,-1.0,0.0,-7.5,0.0,-15.0,0.0,9.0,0.0,0.0,2.0,0.0,0.0,9.0,0.0
2,2,0.0,0.0,ACTIVE,,24.0,8.0,2.0,5.0,10.0,0.0,0.0,-8.0,0.0,-2.0,0.0,-5.0,0.0,-10.0,0.0,0.0,-8.0,0.0,-2.0,0.0,-5.0,0.0,-10.0,0.0,0.0,-8.0,0.0,-2.0,0.0,-5.0,0.0,-10.0,0.0,0.0,-8.0,0.0,-2.0,0.0,-5.0,0.0,-10.0,0.0,0.0,-8.0,0.0,-2.0,0.0,-5.0,0.0,-10.0,0.0,0.0,-8.0,0.0,-2.0,0.0,-5.0,0.0,-10.0,0.0,0.0,-8.0,0.0,-2.0,0.0,-5.0,0.0,-10.0,0.0,0.0,-8.0,0.0,-2.0,0.0,-5.0,0.0,-10.0,0.0,0.0,-8.0,0.0,-2.0,0.0,-5.0,0.0,-10.0,0.0,0.0,-8.0,0.0,-2.0,0.0,-5.0,0.0,-10.0,0.0,0.0,-8.0,0.0,-2.0,0.0,-5.0,0.0,-10.0,0.0,0.0,-8.0,0.0,-2.0,0.0,-5.0,0.0,-10.0,0.0,0.0,-8.0,0.0,-2.0,0.0,-5.0,0.0,-10.0,0.0,0.0,-8.0,0.0,-2.0,0.0,-5.0,0.0,-10.0,0.0,0.0,-8.0,0.0,-2.0,0.0,-5.0,0.0,-10.0,0.0,0.0,-8.0,0.0,-2.0,0.0,-5.0,0.0,-10.0,0.0,0.0,-8.0,0.0,-2.0,0.0,-5.0,0.0,-10.0,8.0,1.0,0.0,4.0,6.0,1.6,3.0,0.8,-2.0,0.0,0.0,-8.0,0.0,-2.0,0.0,-5.0,0.0,-10.0,2.0,0.25,-6.0,1.0,0.0,0.4,-3.0,0.2,-8.0,0.0,8.0,0.0,0.0,2.0,0.0,0.0,8.0,0.0
3,3,0.0,0.0,ACTIVE,,54.0,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,0.0,,,0.0,,
4,4,1.0,1.0,ACTIVE,Regularly,52.0,4.0,3.0,3.5,7.0,4.0,1.0,0.0,1.333333,1.0,1.142857,0.5,0.571429,-3.0,0.0,0.0,-4.0,0.0,-3.0,0.0,-3.5,0.0,-7.0,0.0,0.0,-4.0,0.0,-3.0,0.0,-3.5,0.0,-7.0,0.0,0.0,-4.0,0.0,-3.0,0.0,-3.5,0.0,-7.0,0.0,0.0,-4.0,0.0,-3.0,0.0,-3.5,0.0,-7.0,0.0,0.0,-4.0,0.0,-3.0,0.0,-3.5,0.0,-7.0,0.0,0.0,-4.0,0.0,-3.0,0.0,-3.5,0.0,-7.0,0.0,0.0,-4.0,0.0,-3.0,0.0,-3.5,0.0,-7.0,0.0,0.0,-4.0,0.0,-3.0,0.0,-3.5,0.0,-7.0,0.0,0.0,-4.0,0.0,-3.0,0.0,-3.5,0.0,-7.0,3.0,0.75,-1.0,1.0,0.0,0.857143,-0.5,0.428571,-4.0,0.0,0.0,-4.0,0.0,-3.0,0.0,-3.5,0.0,-7.0,0.0,0.0,-4.0,0.0,-3.0,0.0,-3.5,0.0,-7.0,0.0,0.0,-4.0,0.0,-3.0,0.0,-3.5,0.0,-7.0,0.0,0.0,-4.0,0.0,-3.0,0.0,-3.5,0.0,-7.0,0.0,0.0,-4.0,0.0,-3.0,0.0,-3.5,0.0,-7.0,0.0,0.0,-4.0,0.0,-3.0,0.0,-3.5,0.0,-7.0,0.0,0.0,-4.0,0.0,-3.0,0.0,-3.5,0.0,-7.0,0.0,0.0,-4.0,0.0,-3.0,0.0,-3.5,0.0,-7.0,0.0,0.0,-4.0,0.0,-3.0,0.0,-3.5,0.0,-7.0,0.0,7.0,0.0,0.0,2.0,0.0,0.0,7.0,0.0


In [9]:
# Build the article feature table (debug mode: history limited to weeks 5-24)
# and report its shape and in-memory size.
train_articles_feature = make_articles_feature(articles, transactions, debug=True)

print(train_articles_feature.shape)
print(f"{train_articles_feature.__sizeof__() // 1_000_000} MB")
display(train_articles_feature.head())

(105542, 206)
185 MB


Unnamed: 0,article_id,product_code,product_type_no,product_group_name,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no,sale_max_groupby_article,sale_min_groupby_article,sale_mean_groupby_article,sale_sum_groupby_article,sale_5w,sale_max_groupby_article_ratio_5w,sale_max_groupby_article_diff_5w,sale_min_groupby_article_ratio_5w,sale_min_groupby_article_diff_5w,sale_mean_groupby_article_ratio_5w,sale_mean_groupby_article_diff_5w,sale_sum_groupby_article_ratio_5w,sale_sum_groupby_article_diff_5w,sale_6w,sale_max_groupby_article_ratio_6w,sale_max_groupby_article_diff_6w,sale_min_groupby_article_ratio_6w,sale_min_groupby_article_diff_6w,sale_mean_groupby_article_ratio_6w,sale_mean_groupby_article_diff_6w,sale_sum_groupby_article_ratio_6w,sale_sum_groupby_article_diff_6w,sale_7w,sale_max_groupby_article_ratio_7w,sale_max_groupby_article_diff_7w,sale_min_groupby_article_ratio_7w,sale_min_groupby_article_diff_7w,sale_mean_groupby_article_ratio_7w,sale_mean_groupby_article_diff_7w,sale_sum_groupby_article_ratio_7w,sale_sum_groupby_article_diff_7w,sale_8w,sale_max_groupby_article_ratio_8w,sale_max_groupby_article_diff_8w,sale_min_groupby_article_ratio_8w,sale_min_groupby_article_diff_8w,sale_mean_groupby_article_ratio_8w,sale_mean_groupby_article_diff_8w,sale_sum_groupby_article_ratio_8w,sale_sum_groupby_article_diff_8w,sale_9w,sale_max_groupby_article_ratio_9w,sale_max_groupby_article_diff_9w,sale_min_groupby_article_ratio_9w,sale_min_groupby_article_diff_9w,sale_mean_groupby_article_ratio_9w,sale_mean_groupby_article_diff_9w,sale_sum_groupby_article_ratio_9w,sale_sum_groupby_article_diff_9w,sale_10w,sale_max_groupby_article_ratio_10w,sale_max_groupby_article_diff_10w,sale_min_groupby_article_ratio_10w,sale_min_groupby_article_diff_10w,sale_mean_groupby_article_ratio_10w,sale_mean_groupby_article_diff_10w,sale_sum_groupby_article_ratio_10w,sale_sum_groupby_articl
e_diff_10w,sale_11w,sale_max_groupby_article_ratio_11w,sale_max_groupby_article_diff_11w,sale_min_groupby_article_ratio_11w,sale_min_groupby_article_diff_11w,sale_mean_groupby_article_ratio_11w,sale_mean_groupby_article_diff_11w,sale_sum_groupby_article_ratio_11w,sale_sum_groupby_article_diff_11w,sale_12w,sale_max_groupby_article_ratio_12w,sale_max_groupby_article_diff_12w,sale_min_groupby_article_ratio_12w,sale_min_groupby_article_diff_12w,sale_mean_groupby_article_ratio_12w,sale_mean_groupby_article_diff_12w,sale_sum_groupby_article_ratio_12w,sale_sum_groupby_article_diff_12w,sale_13w,sale_max_groupby_article_ratio_13w,sale_max_groupby_article_diff_13w,sale_min_groupby_article_ratio_13w,sale_min_groupby_article_diff_13w,sale_mean_groupby_article_ratio_13w,sale_mean_groupby_article_diff_13w,sale_sum_groupby_article_ratio_13w,sale_sum_groupby_article_diff_13w,sale_14w,sale_max_groupby_article_ratio_14w,sale_max_groupby_article_diff_14w,sale_min_groupby_article_ratio_14w,sale_min_groupby_article_diff_14w,sale_mean_groupby_article_ratio_14w,sale_mean_groupby_article_diff_14w,sale_sum_groupby_article_ratio_14w,sale_sum_groupby_article_diff_14w,sale_15w,sale_max_groupby_article_ratio_15w,sale_max_groupby_article_diff_15w,sale_min_groupby_article_ratio_15w,sale_min_groupby_article_diff_15w,sale_mean_groupby_article_ratio_15w,sale_mean_groupby_article_diff_15w,sale_sum_groupby_article_ratio_15w,sale_sum_groupby_article_diff_15w,sale_16w,sale_max_groupby_article_ratio_16w,sale_max_groupby_article_diff_16w,sale_min_groupby_article_ratio_16w,sale_min_groupby_article_diff_16w,sale_mean_groupby_article_ratio_16w,sale_mean_groupby_article_diff_16w,sale_sum_groupby_article_ratio_16w,sale_sum_groupby_article_diff_16w,sale_17w,sale_max_groupby_article_ratio_17w,sale_max_groupby_article_diff_17w,sale_min_groupby_article_ratio_17w,sale_min_groupby_article_diff_17w,sale_mean_groupby_article_ratio_17w,sale_mean_groupby_article_diff_17w,sale_sum_groupby_article_ratio_17w,sale_sum_group
by_article_diff_17w,sale_18w,sale_max_groupby_article_ratio_18w,sale_max_groupby_article_diff_18w,sale_min_groupby_article_ratio_18w,sale_min_groupby_article_diff_18w,sale_mean_groupby_article_ratio_18w,sale_mean_groupby_article_diff_18w,sale_sum_groupby_article_ratio_18w,sale_sum_groupby_article_diff_18w,sale_19w,sale_max_groupby_article_ratio_19w,sale_max_groupby_article_diff_19w,sale_min_groupby_article_ratio_19w,sale_min_groupby_article_diff_19w,sale_mean_groupby_article_ratio_19w,sale_mean_groupby_article_diff_19w,sale_sum_groupby_article_ratio_19w,sale_sum_groupby_article_diff_19w,sale_20w,sale_max_groupby_article_ratio_20w,sale_max_groupby_article_diff_20w,sale_min_groupby_article_ratio_20w,sale_min_groupby_article_diff_20w,sale_mean_groupby_article_ratio_20w,sale_mean_groupby_article_diff_20w,sale_sum_groupby_article_ratio_20w,sale_sum_groupby_article_diff_20w,sale_21w,sale_max_groupby_article_ratio_21w,sale_max_groupby_article_diff_21w,sale_min_groupby_article_ratio_21w,sale_min_groupby_article_diff_21w,sale_mean_groupby_article_ratio_21w,sale_mean_groupby_article_diff_21w,sale_sum_groupby_article_ratio_21w,sale_sum_groupby_article_diff_21w,sale_22w,sale_max_groupby_article_ratio_22w,sale_max_groupby_article_diff_22w,sale_min_groupby_article_ratio_22w,sale_min_groupby_article_diff_22w,sale_mean_groupby_article_ratio_22w,sale_mean_groupby_article_diff_22w,sale_sum_groupby_article_ratio_22w,sale_sum_groupby_article_diff_22w,sale_23w,sale_max_groupby_article_ratio_23w,sale_max_groupby_article_diff_23w,sale_min_groupby_article_ratio_23w,sale_min_groupby_article_diff_23w,sale_mean_groupby_article_ratio_23w,sale_mean_groupby_article_diff_23w,sale_sum_groupby_article_ratio_23w,sale_sum_groupby_article_diff_23w,sale_24w,sale_max_groupby_article_ratio_24w,sale_max_groupby_article_diff_24w,sale_min_groupby_article_ratio_24w,sale_min_groupby_article_diff_24w,sale_mean_groupby_article_ratio_24w,sale_mean_groupby_article_diff_24w,sale_sum_groupby_article_ratio_24w,sale_
sum_groupby_article_diff_24w,resale_customer,sale_customer,resale_customer_percent,resale_week,sale_week,resale_week_percent,resale_customer_and_week,sale_customer_and_week,resale_customer_and_week_percent
0,108775015,108775,253,Garment Upper body,1010016,9,4,5,1676,A,1,16,1002,9.0,1.0,3.0,21.0,0.0,0.0,-9.0,0.0,-1.0,0.0,-3.0,0.0,-21.0,0.0,0.0,-9.0,0.0,-1.0,0.0,-3.0,0.0,-21.0,0.0,0.0,-9.0,0.0,-1.0,0.0,-3.0,0.0,-21.0,1.0,0.111111,-8.0,1.0,0.0,0.333333,-2.0,0.047619,-20.0,2.0,0.222222,-7.0,2.0,1.0,0.666667,-1.0,0.095238,-19.0,0.0,0.0,-9.0,0.0,-1.0,0.0,-3.0,0.0,-21.0,0.0,0.0,-9.0,0.0,-1.0,0.0,-3.0,0.0,-21.0,0.0,0.0,-9.0,0.0,-1.0,0.0,-3.0,0.0,-21.0,0.0,0.0,-9.0,0.0,-1.0,0.0,-3.0,0.0,-21.0,0.0,0.0,-9.0,0.0,-1.0,0.0,-3.0,0.0,-21.0,1.0,0.111111,-8.0,1.0,0.0,0.333333,-2.0,0.047619,-20.0,1.0,0.111111,-8.0,1.0,0.0,0.333333,-2.0,0.047619,-20.0,0.0,0.0,-9.0,0.0,-1.0,0.0,-3.0,0.0,-21.0,0.0,0.0,-9.0,0.0,-1.0,0.0,-3.0,0.0,-21.0,0.0,0.0,-9.0,0.0,-1.0,0.0,-3.0,0.0,-21.0,0.0,0.0,-9.0,0.0,-1.0,0.0,-3.0,0.0,-21.0,0.0,0.0,-9.0,0.0,-1.0,0.0,-3.0,0.0,-21.0,2.0,0.222222,-7.0,2.0,1.0,0.666667,-1.0,0.095238,-19.0,9.0,1.0,0.0,9.0,8.0,3.0,6.0,0.428571,-12.0,5.0,0.555556,-4.0,5.0,4.0,1.666667,2.0,0.238095,-16.0,0.0,15.0,0.0,0.0,7.0,0.0,0.0,15.0,0.0
1,108775044,108775,253,Garment Upper body,1010016,10,3,9,1676,A,1,16,1002,34.0,3.0,12.8,256.0,5.0,0.147059,-29.0,1.666667,2.0,0.390625,-7.8,0.019531,-251.0,5.0,0.147059,-29.0,1.666667,2.0,0.390625,-7.8,0.019531,-251.0,6.0,0.176471,-28.0,2.0,3.0,0.46875,-6.8,0.023438,-250.0,8.0,0.235294,-26.0,2.666667,5.0,0.625,-4.8,0.03125,-248.0,3.0,0.088235,-31.0,1.0,0.0,0.234375,-9.8,0.011719,-253.0,17.0,0.5,-17.0,5.666667,14.0,1.328125,4.2,0.066406,-239.0,6.0,0.176471,-28.0,2.0,3.0,0.46875,-6.8,0.023438,-250.0,20.0,0.588235,-14.0,6.666667,17.0,1.5625,7.2,0.078125,-236.0,34.0,1.0,0.0,11.333333,31.0,2.65625,21.2,0.132812,-222.0,20.0,0.588235,-14.0,6.666667,17.0,1.5625,7.2,0.078125,-236.0,7.0,0.205882,-27.0,2.333333,4.0,0.546875,-5.8,0.027344,-249.0,18.0,0.529412,-16.0,6.0,15.0,1.40625,5.2,0.070312,-238.0,18.0,0.529412,-16.0,6.0,15.0,1.40625,5.2,0.070312,-238.0,15.0,0.441176,-19.0,5.0,12.0,1.171875,2.2,0.058594,-241.0,4.0,0.117647,-30.0,1.333333,1.0,0.3125,-8.8,0.015625,-252.0,14.0,0.411765,-20.0,4.666667,11.0,1.09375,1.2,0.054688,-242.0,13.0,0.382353,-21.0,4.333333,10.0,1.015625,0.2,0.050781,-243.0,13.0,0.382353,-21.0,4.333333,10.0,1.015625,0.2,0.050781,-243.0,21.0,0.617647,-13.0,7.0,18.0,1.640625,8.2,0.082031,-235.0,9.0,0.264706,-25.0,3.0,6.0,0.703125,-3.8,0.035156,-247.0,4.0,191.0,0.020942,2.0,20.0,0.1,4.0,195.0,0.020513
2,108775051,108775,253,Garment Upper body,1010017,11,1,9,1676,A,1,16,1002,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,,,,,,,0.0,,,0.0,,,0.0,,
3,110065001,110065,306,Underwear,1010016,9,4,5,1339,B,1,61,1017,4.0,1.0,1.7,17.0,0.0,0.0,-4.0,0.0,-1.0,0.0,-1.7,0.0,-17.0,0.0,0.0,-4.0,0.0,-1.0,0.0,-1.7,0.0,-17.0,1.0,0.25,-3.0,1.0,0.0,0.588235,-0.7,0.058824,-16.0,0.0,0.0,-4.0,0.0,-1.0,0.0,-1.7,0.0,-17.0,1.0,0.25,-3.0,1.0,0.0,0.588235,-0.7,0.058824,-16.0,0.0,0.0,-4.0,0.0,-1.0,0.0,-1.7,0.0,-17.0,2.0,0.5,-2.0,2.0,1.0,1.176471,0.3,0.117647,-15.0,1.0,0.25,-3.0,1.0,0.0,0.588235,-0.7,0.058824,-16.0,4.0,1.0,0.0,4.0,3.0,2.352941,2.3,0.235294,-13.0,2.0,0.5,-2.0,2.0,1.0,1.176471,0.3,0.117647,-15.0,3.0,0.75,-1.0,3.0,2.0,1.764706,1.3,0.176471,-14.0,1.0,0.25,-3.0,1.0,0.0,0.588235,-0.7,0.058824,-16.0,0.0,0.0,-4.0,0.0,-1.0,0.0,-1.7,0.0,-17.0,1.0,0.25,-3.0,1.0,0.0,0.588235,-0.7,0.058824,-16.0,1.0,0.25,-3.0,1.0,0.0,0.588235,-0.7,0.058824,-16.0,0.0,0.0,-4.0,0.0,-1.0,0.0,-1.7,0.0,-17.0,0.0,0.0,-4.0,0.0,-1.0,0.0,-1.7,0.0,-17.0,0.0,0.0,-4.0,0.0,-1.0,0.0,-1.7,0.0,-17.0,0.0,0.0,-4.0,0.0,-1.0,0.0,-1.7,0.0,-17.0,0.0,0.0,-4.0,0.0,-1.0,0.0,-1.7,0.0,-17.0,0.0,17.0,0.0,0.0,10.0,0.0,0.0,17.0,0.0
4,110065002,110065,306,Underwear,1010016,10,3,9,1339,B,1,61,1017,2.0,1.0,1.5,9.0,0.0,0.0,-2.0,0.0,-1.0,0.0,-1.5,0.0,-9.0,1.0,0.5,-1.0,1.0,0.0,0.666667,-0.5,0.111111,-8.0,2.0,1.0,0.0,2.0,1.0,1.333333,0.5,0.222222,-7.0,0.0,0.0,-2.0,0.0,-1.0,0.0,-1.5,0.0,-9.0,1.0,0.5,-1.0,1.0,0.0,0.666667,-0.5,0.111111,-8.0,2.0,1.0,0.0,2.0,1.0,1.333333,0.5,0.222222,-7.0,0.0,0.0,-2.0,0.0,-1.0,0.0,-1.5,0.0,-9.0,0.0,0.0,-2.0,0.0,-1.0,0.0,-1.5,0.0,-9.0,0.0,0.0,-2.0,0.0,-1.0,0.0,-1.5,0.0,-9.0,2.0,1.0,0.0,2.0,1.0,1.333333,0.5,0.222222,-7.0,0.0,0.0,-2.0,0.0,-1.0,0.0,-1.5,0.0,-9.0,0.0,0.0,-2.0,0.0,-1.0,0.0,-1.5,0.0,-9.0,1.0,0.5,-1.0,1.0,0.0,0.666667,-0.5,0.111111,-8.0,0.0,0.0,-2.0,0.0,-1.0,0.0,-1.5,0.0,-9.0,0.0,0.0,-2.0,0.0,-1.0,0.0,-1.5,0.0,-9.0,0.0,0.0,-2.0,0.0,-1.0,0.0,-1.5,0.0,-9.0,0.0,0.0,-2.0,0.0,-1.0,0.0,-1.5,0.0,-9.0,0.0,0.0,-2.0,0.0,-1.0,0.0,-1.5,0.0,-9.0,0.0,0.0,-2.0,0.0,-1.0,0.0,-1.5,0.0,-9.0,0.0,0.0,-2.0,0.0,-1.0,0.0,-1.5,0.0,-9.0,0.0,9.0,0.0,0.0,6.0,0.0,0.0,9.0,0.0


In [10]:
# Cache the candidate table as parquet (row index not written).
train_purchase_df.to_parquet(f'../input/ranking_features/train_purchase_df.parquet', index=False)

In [11]:
# Cache the customer feature table as parquet (row index not written).
train_customers_feature.to_parquet(f'../input/ranking_features/train_customers_feature.parquet', index=False)

In [12]:
# Cache the article feature table as parquet (row index not written).
train_articles_feature.to_parquet(f'../input/ranking_features/train_articles_feature.parquet', index=False)

In [None]:
# Reload the cached candidate table. `pd.read_parquet` has no `dtype`
# argument (unknown keywords are forwarded to the parquet engine and raise
# TypeError), so the id columns are cast after loading instead.
train_purchase_df = pd.read_parquet(
    '../input/ranking_features/train_purchase_df.parquet',
).astype({'customer_id': 'int32', 'article_id': 'int32'})
print(train_purchase_df.shape)
print(f"{train_purchase_df.__sizeof__() // 1_000_000} MB")
display(train_purchase_df.head())

In [None]:
# Reload the cached customer features. `pd.read_parquet` has no `dtype`
# argument (unknown keywords are forwarded to the parquet engine and raise
# TypeError), so the id column is cast after loading instead.
train_customers_feature = pd.read_parquet(
    '../input/ranking_features/train_customers_feature.parquet',
).astype({'customer_id': 'int32'})
print(train_customers_feature.shape)
print(f"{train_customers_feature.__sizeof__() // 1_000_000} MB")
display(train_customers_feature.head())

In [None]:
# Reload the cached article features. `pd.read_parquet` has no `dtype`
# argument (unknown keywords are forwarded to the parquet engine and raise
# TypeError), so the id column is cast after loading instead.
train_articles_feature = pd.read_parquet(
    '../input/ranking_features/train_articles_feature.parquet',
).astype({'article_id': 'int32'})
print(train_articles_feature.shape)
print(f"{train_articles_feature.__sizeof__() // 1_000_000} MB")
display(train_articles_feature.head())

In [13]:
def compress_df(
    df: pd.DataFrame, 
    category_columns: list =['club_member_status', 'fashion_news_frequency', 'product_group_name', 'index_code', 'strategy'], 
    verbose: bool =True):
    """Downcast the columns of ``df`` to smaller dtypes to reduce memory usage.

    The frame is modified in place and also returned.

    * Columns named in ``category_columns`` are converted to ``category``.
    * Integer columns (int16/32/64) get the narrowest of int8/int16/int32/int64
      whose range contains the column's min and max (bounds are inclusive).
    * Float columns get float16 if their min/max fit, else float32, else float64.
      Only the value *range* is checked, so float16/float32 may lose precision.
    * Every other column is left untouched.

    Args:
        df: Frame to compress (mutated).
        category_columns: Column names to convert to ``category``; never mutated here.
        verbose: If True, show a tqdm progress bar and print the memory reduction.

    Returns:
        The same ``df`` object with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidates are ordered from narrowest to widest; the first that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    bar = tqdm(df.columns, leave=False) if verbose else df.columns
    for col in bar:
        col_type = df[col].dtypes
        if col in category_columns:
            if verbose:
                bar.set_description(f"{col}(category)")
            df[col] = df[col].astype('category')
        elif col_type in numerics:
            if verbose:
                bar.set_description(f"{col}(num)")
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                # If min/max are NaN (empty column) nothing matches and the dtype is kept.
                for candidate in int_candidates:
                    limits = np.iinfo(candidate)
                    if limits.min <= c_min and c_max <= limits.max:
                        df[col] = df[col].astype(candidate)
                        break
            else:
                for candidate in float_candidates:
                    limits = np.finfo(candidate)
                    if limits.min <= c_min and c_max <= limits.max:
                        df[col] = df[col].astype(candidate)
                        break
                else:
                    # Out of float32 range, infinite, or all-NaN: fall back to float64.
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [14]:
# Downcast the three training frames; compress_df mutates and returns each one.
train_purchase_df, train_customers_feature, train_articles_feature = [
    compress_df(frame)
    for frame in (train_purchase_df, train_customers_feature, train_articles_feature)
]

  0%|          | 0/17 [00:00<?, ?it/s]

Mem. usage decreased to 1177.59 Mb (73.5% reduction)


  0%|          | 0/199 [00:00<?, ?it/s]

Mem. usage decreased to 520.75 Mb (74.9% reduction)


  0%|          | 0/206 [00:00<?, ?it/s]

Mem. usage decreased to 41.37 Mb (75.0% reduction)


In [15]:
# Build the list of model feature columns: every column of the three training
# frames except the keys and the label.
exclude_columns = ['target_week', 'customer_id', 'article_id', 'is_purchased']
all_columns = (
    train_purchase_df.columns.tolist()
    + train_customers_feature.columns.tolist()
    + train_articles_feature.columns.tolist()
)
# dict.fromkeys de-duplicates while keeping first-seen order. Iterating a set of
# strings gives a different order in every Python session (hash randomisation),
# so models pickled in one session could be fed mis-ordered features in another.
cols = [c for c in dict.fromkeys(all_columns) if c not in exclude_columns]
with open(f'../models/lgb_rank/{EXP}_cols.pkl', 'wb') as f:
    pickle.dump(cols, f)
print(len(cols))

416


In [23]:
# Learning to rank: LightGBM parameters shared by all folds.
params = dict(
    objective='lambdarank',
    boosting='gbdt',
    num_iterations=1000,
    learning_rate=0.1,
    num_leaves=31,
    num_threads=4,  # for M1 Mac
    min_data_in_leaf=20,
    max_depth=-1,
    bagging_freq=5,
    bagging_fraction=0.75,
    metric=['ndcg'],
    eval_at=[12],  # how many top-ranked items are used when computing nDCG and MAP
    random_state=41,
    verbosity=0,
)

In [19]:
def _fold_frame(week):
    """Rows of `train_purchase_df` for one target week, left-joined with the customer
    and article features and ordered by (target_week, customer_id)."""
    joined = train_purchase_df.query("target_week == @week")
    for feature_df, key in ((train_customers_feature, 'customer_id'), (train_articles_feature, 'article_id')):
        joined = joined.merge(feature_df, how='left', on=[key], copy=False)
    return joined.sort_values(['target_week', 'customer_id']).reset_index(drop=True)


def _query_sizes(frame):
    """Number of rows per (target_week, customer_id) group, in sorted key order."""
    return frame.groupby(['target_week', 'customer_id'])['customer_id'].count().to_list()


# One model per fold: train on target week w, early-stop on target week w-1.
oof_weeks = [4, 3, 2, 1]
feature_importance_dfs = []
for w in tqdm(oof_weeks):
    print(f"\ntarget_week(fold): {w}")
    tr_purchase_df = _fold_frame(w)
    val_purchase_df = _fold_frame(w - 1)

    # Group sizes line up with the row order because both frames are sorted by the group keys.
    train_query = _query_sizes(tr_purchase_df)
    dtrain = lgb.Dataset(tr_purchase_df[cols], label=tr_purchase_df['is_purchased'], group=train_query)
    val_query = _query_sizes(val_purchase_df)
    dval = lgb.Dataset(val_purchase_df[cols], reference=dtrain, label=val_purchase_df['is_purchased'], group=val_query)

    model = lgb.train(
        params,
        dtrain,
        valid_sets=[dtrain, dval],
        callbacks=[lgb.early_stopping(10, first_metric_only=True), lgb.log_evaluation(10)],
    )
    with open(f'../models/lgb_rank/{EXP}_model_fold{w}.pkl', 'wb') as f:
        pickle.dump(model, f)

    fold_importance = pd.DataFrame({
        'feature': model.feature_name(),
        'importance(gain)': model.feature_importance('gain'),
        'fold': w,
    })
    feature_importance_dfs.append(fold_importance)

  0%|          | 0/4 [00:00<?, ?it/s]


target_week(fold): 4
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 100 rounds
[100]	training's ndcg@12: 0.873107	valid_1's ndcg@12: 0.859832
Early stopping, best iteration is:
[97]	training's ndcg@12: 0.872795	valid_1's ndcg@12: 0.85994
Evaluated only: ndcg@12

target_week(fold): 3
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 100 rounds
[100]	training's ndcg@12: 0.8829	valid_1's ndcg@12: 0.852721
Early stopping, best iteration is:
[35]	training's ndcg@12: 0.874061	valid_1's ndcg@12: 0.85413
Evaluated only: ndcg@12

target_week(fold): 2
You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 100 rounds
[100]	training's ndcg@12: 0.878776	valid_1's ndcg@12: 0.869982
Early stopping, bes

In [20]:
# Score the validation frame left over from the last fold iteration with every
# saved fold model and average the scores.
val_pred = np.zeros(len(val_purchase_df))
with tqdm(oof_weeks) as progress:
    for w in progress:
        progress.set_description(f"model's target_week(fold): {w}")
        with open(f"../models/lgb_rank/{EXP}_model_fold{w}.pkl", 'rb') as f:
            model = pickle.load(f)
        fold_scores = model.predict(val_purchase_df[cols], num_iteration=model.best_iteration)
        val_pred += fold_scores
val_pred /= len(oof_weeks)
np.sort(val_pred)

  0%|          | 0/4 [00:00<?, ?it/s]

array([-5.14059591, -5.12218101, -5.12010449, ...,  2.90538833,
        2.91990481,  2.92301746])

In [21]:
# most popular items (week == 1): overall top 12 and top 12 per age bin,
# each encoded as a ' 0'-prefixed, space-separated article id string
transactions_last_week = transactions[transactions['week'] == 1]
overall_top_ids = transactions_last_week['article_id'].value_counts().index[:12].astype('str')
top12 = ' 0' + ' 0'.join(overall_top_ids)
print("Top 12 popular items:")
print( top12 )

customers['age_bin'] = pd.cut(customers['age'], bins=[10, 20, 30, 40, 50, 60, 70, 100], labels=False)
customer_ages = customers[['customer_id', 'age', 'age_bin']]
transactions_last_week = transactions_last_week.merge(customer_ages, how='left')
popular_items = transactions_last_week.groupby('age_bin')['article_id'].value_counts()
# value_counts is ordered by descending count inside each age bin, so the first 12 are the best sellers.
popular_items_dict = {
    age_bin: ' 0' + ' 0'.join(popular_items[age_bin][:12].index.astype('str'))
    for age_bin in popular_items.index.levels[0]
}
popular_items_sr = pd.Series(popular_items_dict, name='top_12_popular_items', dtype='str')
popular_items_sr

Top 12 popular items:
 0909370001 0865799006 0918522001 0924243001 0448509014 0751471001 0809238001 0918292001 0762846027 0809238005 0673677002 0923758001


0.0     0685814003 0448509014 0918522001 0715624001 0...
1.0     0909370001 0865799006 0924243001 0809238001 0...
2.0     0909370001 0865799006 0918525001 0909371001 0...
3.0     0909370001 0751471001 0673677002 0910601003 0...
4.0     0918522001 0751471001 0751471043 0910601003 0...
5.0     0918522001 0908799002 0896152002 0924243001 0...
6.0     0736870001 0796210001 0908799002 0865799006 0...
Name: top_12_popular_items, dtype: object

In [22]:
# predict val data

submission = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

# Attach the averaged scores, keep the best-scored row per (customer, article),
# then rank each customer's candidates by score.
val_purchase_df2 = (
    val_purchase_df
    .assign(predict_score=val_pred)
    .sort_values('predict_score', ascending=False)
    .drop_duplicates(['customer_id', 'article_id'], keep='first')
    .reset_index(drop=True)
)
val_purchase_df2['rank'] = val_purchase_df2.groupby('customer_id')['predict_score'].rank('min', ascending=False)
# .copy() detaches the filtered rows so the column assignment below does not
# operate on a slice of the ranked frame (SettingWithCopyWarning).
val_purchase_df2 = val_purchase_df2[val_purchase_df2['rank'] <= 12].copy()
val_purchase_df2['article_id'] = ' 0' + val_purchase_df2['article_id'].astype(str)

# Rows are still in descending score order, so the per-customer string
# concatenation lists the articles best-first.
customer_index = submission['customer_id'].map(id_to_index_dict)
lgb_predictions = val_purchase_df2.groupby('customer_id')['article_id'].sum()
submission['prediction_lgb'] = customer_index.map(lgb_predictions)
submission['prediction_lgb'] = submission['prediction_lgb'].fillna('')

# Fallback list: popular items of the customer's age bin, or the overall top 12.
submission['age_bin'] = customer_index.map(customers.set_index('customer_id')['age_bin'])
submission['prediction_popular'] = submission['age_bin'].map(popular_items_sr)
submission['prediction_popular'] = submission['prediction_popular'].fillna(top12).astype('str')

submission['prediction'] = submission['prediction_lgb'] + submission['prediction_popular']
submission['prediction'] = submission['prediction'].str.strip()
# 12 ids of 10 characters plus 11 separating spaces = 131 characters.
submission['prediction'] = submission['prediction'].str[:131]
display(submission.head())
submission[['customer_id', 'prediction']].to_csv(f'../submissions/{EXP}_submission_fold1.csv', index=False)

Unnamed: 0,customer_id,prediction,prediction_lgb,age_bin,prediction_popular
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0909370001 0751471001 0673677002 0910601003 07...,,3.0,0909370001 0751471001 0673677002 0910601003 0...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0909370001 0865799006 0924243001 0809238001 04...,,1.0,0909370001 0865799006 0924243001 0809238001 0...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0909370001 0865799006 0924243001 0809238001 04...,,1.0,0909370001 0865799006 0924243001 0809238001 0...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0918522001 0751471001 0751471043 0910601003 09...,,4.0,0918522001 0751471001 0751471043 0910601003 0...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0918522001 0751471001 0751471043 0910601003 09...,,4.0,0918522001 0751471001 0751471043 0910601003 0...


In [24]:
# Drop the fold-training and validation-submission intermediates, then collect garbage.
del (
    tr_purchase_df, val_purchase_df,
    train_query, dtrain, val_query, dval, val_pred,
    transactions_last_week, top12, popular_items, popular_items_dict, popular_items_sr,
    val_purchase_df2, submission,
)
gc.collect()

924

In [25]:
# train last target_week data

last_week = 0

tr_purchase_df = train_purchase_df.query("target_week == @last_week")
tr_purchase_df = tr_purchase_df.merge(train_customers_feature, how='left', on=['customer_id'], copy=False)
tr_purchase_df = tr_purchase_df.merge(train_articles_feature, how='left', on=['article_id'], copy=False)
tr_purchase_df = tr_purchase_df.sort_values(['target_week', 'customer_id']).reset_index(drop=True)

train_query = tr_purchase_df.groupby(['target_week', 'customer_id'])['customer_id'].count().to_list()
dtrain = lgb.Dataset(tr_purchase_df[cols], label=tr_purchase_df['is_purchased'], group=train_query)

# There is no later week to early-stop on, so reuse the best iteration of the most
# recent fold model. It is loaded explicitly instead of reading whatever `model`
# object the kernel happens to hold from an earlier cell.
with open(f"../models/lgb_rank/{EXP}_model_fold{oof_weeks[-1]}.pkl", 'rb') as f:
    reference_model = pickle.load(f)
# Work on a copy so the shared `params` keeps its original iteration budget.
# Falls back to that budget if best_iteration is 0 or None.
final_params = {**params, 'num_iterations': reference_model.best_iteration or params['num_iterations']}
model = lgb.train(
    final_params, dtrain, valid_sets=[dtrain], callbacks=[lgb.log_evaluation(10)])
with open(f"../models/lgb_rank/{EXP}_model_fold0.pkl", 'wb') as f:
    pickle.dump(model, f)
    
feature_importance_dfs.append(pd.DataFrame({'feature': model.feature_name(), 'importance(gain)': model.feature_importance('gain'), 'fold': last_week}))
    
del tr_purchase_df, train_purchase_df, train_customers_feature, train_articles_feature, train_query, dtrain
gc.collect()

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[10]	training's ndcg@12: 0.853453
[20]	training's ndcg@12: 0.856642
[30]	training's ndcg@12: 0.858497
[40]	training's ndcg@12: 0.861115
[50]	training's ndcg@12: 0.863395


149

In [26]:
# Stack the per-fold importances and show the 20 features with the highest mean gain.
feature_importance_df = pd.concat(feature_importance_dfs, ignore_index=True, axis=0)
mean_gain = feature_importance_df.groupby(['feature'])[['importance(gain)']].mean()
display(mean_gain.sort_values('importance(gain)', ascending=False).head(20))
del feature_importance_dfs
gc.collect()

Unnamed: 0_level_0,importance(gain)
feature,Unnamed: 1_level_1
count_1w,35880.338607
is_dummy,25893.217059
count_4w,11238.661017
count_2w,6925.640986
sale_max_groupby_article_ratio_5w,6305.905348
count_3w,5919.487007
count_6w,4434.231122
count_5w,3872.94002
age,2863.763897
purchase_article,1646.186861


0

In [27]:
# predict test data

BATCH_SIZE = 1000
submission = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')
test_customers = submission['customer_id'].map(id_to_index_dict).unique()
test_customers_feature = make_customers_feature(customers, transactions, debug=True)
test_articles_feature = make_articles_feature(articles, transactions, debug=True)
preds = []

# Load every fold model once up front; they do not change between mini-batches.
all_weeks = oof_weeks + [0]
fold_models = []
for w in all_weeks:
    with open(f"../models/lgb_rank/{EXP}_model_fold{w}.pkl", 'rb') as f:
        fold_models.append(pickle.load(f))

# Stepping by BATCH_SIZE covers the short final batch and never yields an empty one.
for start in tqdm(range(0, len(test_customers), BATCH_SIZE), desc="Mini Batch"):
    batch_customers = test_customers[start:start + BATCH_SIZE]
    transactions_batch = transactions[transactions['customer_id'].isin(batch_customers)]
    try:
        test_purchase_df = make_purchase_df(transactions_batch, target_week=-1, debug=True)
    except RuntimeError as e:
        # Skipped customers fall back to the popular-item list when the submission is built.
        print(f"batch {start // BATCH_SIZE}: {e}")
        continue
    test_purchase_df = test_purchase_df.merge(test_customers_feature, how='left', on=['customer_id'], copy=False)
    test_purchase_df = test_purchase_df.merge(test_articles_feature, how='left', on=['article_id'], copy=False)
    test_purchase_df = compress_df(test_purchase_df, verbose=False)
    
    # Average the scores of all fold models.
    pred = np.zeros(len(test_purchase_df))
    for model in fold_models:
        pred += model.predict(test_purchase_df[cols], num_iteration=model.best_iteration)
    pred = pred/len(all_weeks)
    
    test_purchase_df['predict_score'] = pred
    test_purchase_df = test_purchase_df.sort_values('predict_score', ascending=False).drop_duplicates(['customer_id', 'article_id'], keep='first').reset_index(drop=True)
    test_purchase_df['rank'] = test_purchase_df.groupby('customer_id')['predict_score'].rank('min', ascending=False)
    # .copy() detaches the filtered rows so the assignment below does not hit a slice.
    test_purchase_df = test_purchase_df[test_purchase_df['rank'] <= 12].copy()
    
    test_purchase_df['article_id'] = ' 0' + test_purchase_df['article_id'].astype(str)
    # Rows are in descending score order, so the concatenated ids are best-first.
    preds.append(test_purchase_df.groupby('customer_id')['article_id'].sum())
    
pred_sr = pd.concat(preds, axis=0)
display(pred_sr.head())

del test_purchase_df, test_customers_feature, test_articles_feature, pred, preds, fold_models
gc.collect()

Mini Batch:   0%|          | 0/1372 [00:00<?, ?it/s]

customer_id
0     0568601043 0850917001 0897221001 0779781013 0...
1     0850917001 0891591007 0707269004 0914441004 0...
2     0794321007 0850917001 0891591007 0867966009 0...
4     0730683050 0791587015 0896152002 0927530004 0...
6     0719530003 0448509014 0850917001 0897221001 0...
Name: article_id, dtype: object

19

In [28]:
# most popular items (week == 0): overall top 12 and top 12 per age bin,
# each encoded as a ' 0'-prefixed, space-separated article id string
transactions_last_week = transactions[transactions['week'] == 0]
overall_top_ids = transactions_last_week['article_id'].value_counts().index[:12].astype('str')
top12 = ' 0' + ' 0'.join(overall_top_ids)
print("Top 12 popular items:")
print( top12 )

customers['age_bin'] = pd.cut(customers['age'], bins=[10, 20, 30, 40, 50, 60, 70, 100], labels=False)
customer_ages = customers[['customer_id', 'age', 'age_bin']]
transactions_last_week = transactions_last_week.merge(customer_ages, how='left')
popular_items = transactions_last_week.groupby('age_bin')['article_id'].value_counts()
# value_counts is ordered by descending count inside each age bin, so the first 12 are the best sellers.
popular_items_dict = {
    age_bin: ' 0' + ' 0'.join(popular_items[age_bin][:12].index.astype('str'))
    for age_bin in popular_items.index.levels[0]
}
popular_items_sr = pd.Series(popular_items_dict, name='top_12_popular_items', dtype='str')
popular_items_sr

Top 12 popular items:
 0924243001 0924243002 0918522001 0923758001 0866731001 0909370001 0751471001 0915529003 0915529005 0448509014 0762846027 0714790020


0.0     0685813003 0918522001 0715624001 0850917001 0...
1.0     0924243001 0866731001 0909370001 0918522001 0...
2.0     0923758001 0909370001 0924243001 0935541001 0...
3.0     0751471001 0928206001 0924243001 0924243002 0...
4.0     0924243001 0928206001 0930380001 0924243002 0...
5.0     0930380001 0924243001 0751471043 0910601003 0...
6.0     0751471043 0930380001 0865799006 0714790030 0...
Name: top_12_popular_items, dtype: object

In [29]:
# Build the final submission: model predictions first, padded with popular items.
submission = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')
customer_index = submission['customer_id'].map(id_to_index_dict)

# Customers without model predictions get an empty string here.
submission['prediction_lgb'] = customer_index.map(pred_sr).fillna('')

# Fallback list: popular items of the customer's age bin, or the overall top 12.
age_bin_by_customer = customers.set_index('customer_id')['age_bin']
submission['age_bin'] = customer_index.map(age_bin_by_customer)
submission['prediction_popular'] = (
    submission['age_bin'].map(popular_items_sr).fillna(top12).astype('str')
)

# 12 ids of 10 characters plus 11 separating spaces = 131 characters.
combined = submission['prediction_lgb'] + submission['prediction_popular']
submission['prediction'] = combined.str.strip().str[:131]
submission = submission[['customer_id', 'prediction']]
display(submission.head())
submission.to_csv(f'../submissions/{EXP}_submission.csv', index=False)

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0850917001 0897221001 0779781013 08...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0850917001 0891591007 0707269004 0914441004 08...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0850917001 0891591007 0867966009 08...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0924243001 0928206001 0930380001 0924243002 09...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0730683050 0791587015 0896152002 0927530004 08...
