In [1]:
# ランク学習
# tiobfをベースにbucketsを作る
# MAP@12 (all): 0.018444
# MAP@12 (cold start): 0.007597

EXP = '016'
FOLD = '_fold1'

In [2]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder

from pathlib import Path
import pickle
import gc

data_path = Path('../input/h-and-m-personalized-fashion-recommendations/')

In [3]:
transactions = pd.read_csv(
    data_path / f'transactions_train{FOLD}.csv',
    # set dtype or pandas will drop the leading '0' and convert to int
    dtype={'article_id': str},
    parse_dates=['t_dat']
)
t_max = transactions['t_dat'].max()
transactions['t_diff'] = (t_max - transactions['t_dat']).dt.days
# transactions = transactions.drop_duplicates(subset=['t_dat', 'customer_id', 'article_id'])
print(transactions.shape)
transactions.tail()

(31548013, 6)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,t_diff
31548008,2020-09-15,ffe41634ff990908faacbb465063e027e7c39499f8dfc1...,850917001,0.025407,1,0
31548009,2020-09-15,ffef8aec5cf011fa1393b40337a5993ce0b7b81af6b322...,853316001,0.008458,1,0
31548010,2020-09-15,ffef8aec5cf011fa1393b40337a5993ce0b7b81af6b322...,296366006,0.000847,1,0
31548011,2020-09-15,ffef8aec5cf011fa1393b40337a5993ce0b7b81af6b322...,789769001,0.013542,1,0
31548012,2020-09-15,fff5bd112051feb2367276df143f79bc69126814c73e21...,728156001,0.043203,1,0


In [4]:
submission = pd.read_csv(data_path / 'sample_submission.csv')
print(submission.shape)
submission.head()

(1371980, 2)


Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0706016001 0706016002 0372860001 0610776002 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0706016001 0706016002 0372860001 0610776002 07...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0706016001 0706016002 0372860001 0610776002 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0706016001 0706016002 0372860001 0610776002 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0706016001 0706016002 0372860001 0610776002 07...


In [5]:
transactions_3w = transactions[(transactions['t_diff'] >= 7) & (transactions['t_diff'] < 28)].copy()
transactions_2w = transactions[(transactions['t_diff'] >= 7) & (transactions['t_diff'] < 21)].copy()
transactions_1w = transactions[(transactions['t_diff'] >= 7) & (transactions['t_diff'] < 14)].copy()
transactions_0w = transactions[transactions['t_diff'] < 7].copy()

labels = transactions[transactions['t_diff'] < 7][['customer_id', 'article_id']].drop_duplicates().copy()
labels['is_purchased'] = 1

In [53]:
def make_purchase_df(transactions_1w, transactions_2w, transactions_3w):
    purchase_df_3w = transactions_3w.groupby(['customer_id', 'article_id'])['article_id'].count().rename('count_3w').reset_index().copy()
    purchase_df_2w = transactions_2w.groupby(['customer_id', 'article_id'])['article_id'].count().rename('count_2w').reset_index().copy()
    purchase_df_1w = transactions_1w.groupby(['customer_id', 'article_id'])['article_id'].count().rename('count_1w').reset_index().copy()

    purchase_df = purchase_df_1w.merge(purchase_df_2w, how='outer', on=['customer_id', 'article_id'])
    purchase_df = purchase_df.merge(purchase_df_3w, how='outer', on=['customer_id', 'article_id'])

    dummy_df_3w = (transactions_3w.groupby('article_id')['article_id'].count().rename('dummy_count').sort_values(ascending=False))[:12].reset_index().copy()
    dummy_df_2w = (transactions_2w.groupby('article_id')['article_id'].count().rename('dummy_count').sort_values(ascending=False))[:12].reset_index().copy()
    dummy_df_1w = (transactions_1w.groupby('article_id')['article_id'].count().rename('dummy_count').sort_values(ascending=False))[:12].reset_index().copy()

    dummy_df = dummy_df_1w.merge(dummy_df_2w, how='outer', on=['article_id'], suffixes=['_1w', ''])
    dummy_df = dummy_df.merge(dummy_df_3w, how='outer', on=['article_id'], suffixes=['_2w', '_3w'])

    # dummy_dfs = []
    # for customer_id in submission['customer_id']:
    #     tmp = dummy_df.copy()
    #     tmp['customer_id'] = customer_id
    #     dummy_dfs.append(tmp)
    # dummy_df = pd.concat(dummy_dfs, axis=0)

    dummy_df = pd.DataFrame(
            np.concatenate(
                [np.repeat(submission['customer_id'].copy().to_numpy(), repeats=len(dummy_df)).reshape(-1, 1),
                np.repeat(np.expand_dims(dummy_df.copy().to_numpy(), axis=0), axis=0, repeats=len(submission['customer_id'])).reshape(-1, 4)],
                axis=-1),
            columns = ['customer_id'] + list(dummy_df.columns),
        )

    display(purchase_df.head())
    display(dummy_df.head())
    purchase_df = purchase_df.merge(dummy_df, how='outer', on=['customer_id', 'article_id'])

    purchase_df = purchase_df.merge(labels, how='left', on=['customer_id', 'article_id'])
    purchase_df['is_purchased'] = purchase_df['is_purchased'].fillna(0)
    purchase_df = purchase_df.sort_values('customer_id').reset_index(drop=True)

    for c in ['count_1w', 'count_2w', 'count_3w', 'dummy_count_1w', 'dummy_count_2w', 'dummy_count_3w']:
        purchase_df[c] = purchase_df[c].astype(float)

    return purchase_df

In [54]:
train_purchase_df = make_purchase_df(transactions_1w, transactions_2w, transactions_3w)
print('全て欠損値の行（バグ）：', train_purchase_df.drop(['customer_id', 'article_id', 'is_purchased'], axis=1).isna().all(axis=1).sum())
train_purchase_df.head()

Unnamed: 0,customer_id,article_id,count_1w,count_2w,count_3w
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,568601043,1.0,1.0,1
1,000172a9c322560c849754ffbdfdb2180d408aa7176b94...,685814001,3.0,3.0,3
2,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,572998013,1.0,1.0,1
3,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,888024005,1.0,1.0,1
4,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,909869004,1.0,1.0,1


Unnamed: 0,customer_id,article_id,dummy_count_1w,dummy_count_2w,dummy_count_3w
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,915526001,779.0,1576.0,1940.0
1,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,751471043,757.0,1410.0,1893.0
2,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,751471001,724.0,1570.0,2151.0
3,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,706016001,683.0,1384.0,1955.0
4,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,919365008,614.0,,


0


Unnamed: 0,customer_id,article_id,count_1w,count_2w,count_3w,dummy_count_1w,dummy_count_2w,dummy_count_3w,is_purchased
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,568601043,1.0,1.0,1.0,,,,0.0
1,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,896152003,,,,,,1358.0,0.0
2,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,916468003,,,,,,1732.0,0.0
3,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,915529001,,,,,911.0,,0.0
4,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,915526002,,,,,925.0,1381.0,0.0


In [55]:
articles = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/articles.csv', dtype={'article_id': str},)

le = LabelEncoder()
le.fit(articles['article_id'].unique())

del articles
gc.collect()

with open(f'../input/label_encoder/{EXP}_label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)

train_purchase_df['article_id'] = le.transform(train_purchase_df['article_id'])

train_purchase_df.head()

Unnamed: 0,customer_id,article_id,count_1w,count_2w,count_3w,dummy_count_1w,dummy_count_2w,dummy_count_3w,is_purchased
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,16023,1.0,1.0,1.0,,,,0.0
1,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,101368,,,,,,1358.0,0.0
2,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,103864,,,,,,1732.0,0.0
3,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,103795,,,,,911.0,,0.0
4,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,103794,,,,,925.0,1381.0,0.0


In [56]:
# ランク学習

params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'ndcg_eval_at': [6, 12],
    'boosting_type': 'gbdt',
}

train_query = train_purchase_df.groupby('customer_id')['customer_id'].count().to_list()
dtrain = lgb.Dataset(train_purchase_df.drop(['customer_id', 'is_purchased'], axis=1), label=train_purchase_df['is_purchased'], group=train_query)
# dval = lgb.Dataset(test, reference=dtrain, group=test_query)
model = lgb.train(params, dtrain)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 320
[LightGBM] [Info] Number of data points in the train set: 25389787, number of used features: 7


In [71]:
test_purchase_df = make_purchase_df(transactions_0w, transactions_1w, transactions_2w)
print('全て欠損値の行（バグ）：', test_purchase_df.drop(['customer_id', 'article_id', 'is_purchased'], axis=1).isna().all(axis=1).sum())
test_purchase_df.head()

Unnamed: 0,customer_id,article_id,count_1w,count_2w,count_3w
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,794321007,1.0,,
1,0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d...,448509014,1.0,,
2,0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d...,719530003,1.0,,
3,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,734592001,1.0,,
4,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,621381012,1.0,,


Unnamed: 0,customer_id,article_id,dummy_count_1w,dummy_count_2w,dummy_count_3w
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,909370001,1283.0,,
1,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,865799006,768.0,,
2,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,918522001,729.0,,
3,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,924243001,704.0,,
4,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,448509014,609.0,529.0,


全て欠損値の行（バグ）： 0


Unnamed: 0,customer_id,article_id,count_1w,count_2w,count_3w,dummy_count_1w,dummy_count_2w,dummy_count_3w,is_purchased
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,751471043,,,,,757.0,1410.0,0.0
1,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,933706001,,,,,,1005.0,0.0
2,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,898694001,,,,,,1386.0,0.0
3,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,762846031,,,,,450.0,,0.0
4,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,909916001,,,,,459.0,,0.0


In [72]:
test_purchase_df['article_id'] = le.transform(test_purchase_df['article_id'])

test_purchase_df.head()

Unnamed: 0,customer_id,article_id,count_1w,count_2w,count_3w,dummy_count_1w,dummy_count_2w,dummy_count_3w,is_purchased
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,67543,,,,,757.0,1410.0,0.0
1,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,105146,,,,,,1005.0,0.0
2,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,101718,,,,,,1386.0,0.0
3,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,71110,,,,,450.0,,0.0
4,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,103186,,,,,459.0,,0.0


In [73]:
pred = model.predict(test_purchase_df.drop(['customer_id', 'is_purchased'], axis=1), num_iteration=model.best_iteration)
np.sort(pred)

array([-2.65990777, -2.65990777, -2.65990777, ...,  3.81929863,
        3.82447934,  4.00649312])

In [75]:
test_purchase_df['predict_score'] = pred
test_purchase_df['rank'] = test_purchase_df.groupby('customer_id')['predict_score'].rank('dense', ascending=False)
test_purchase_df = test_purchase_df[test_purchase_df['rank'] <= 12]
test_purchase_df = test_purchase_df.sort_values('rank').reset_index(drop=True)
test_purchase_df['article_id'] = le.inverse_transform(test_purchase_df['article_id'])
submission = test_purchase_df.groupby('customer_id')['article_id'].apply(list).rename('prediction').reset_index()
submission['prediction'] = submission['prediction'].map(lambda l: (' ').join(l))
submission['prediction'] = submission['prediction'].str.strip()
submission.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0909370001 0918522001 0924243001 08...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0909370001 0918522001 0924243001 0809238001 08...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0909370001 0794321007 0918522001 0924243001 08...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0909370001 0918522001 0924243001 0809238001 08...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0909370001 0918522001 0924243001 0809238001 08...


In [93]:
test_purchase_df.query("customer_id == @test_purchase_df.iloc[47]['customer_id']")

Unnamed: 0,customer_id,article_id,count_1w,count_2w,count_3w,dummy_count_1w,dummy_count_2w,dummy_count_3w,is_purchased,predict_score,rank
47,add20240eb4c5d3c2c3e21e3e8928432d411947e931d53...,706016019,1.0,,,,,,1.0,0.370573,1.0
2257964,add20240eb4c5d3c2c3e21e3e8928432d411947e931d53...,864668002,1.0,,,,,,1.0,0.1924,2.0
3006161,add20240eb4c5d3c2c3e21e3e8928432d411947e931d53...,909370001,,,,1283.0,,,0.0,0.138514,3.0
5174123,add20240eb4c5d3c2c3e21e3e8928432d411947e931d53...,785060003,1.0,,,,,,1.0,0.107927,4.0
6902615,add20240eb4c5d3c2c3e21e3e8928432d411947e931d53...,123173001,1.0,,,,,,1.0,0.080154,5.0
6902625,add20240eb4c5d3c2c3e21e3e8928432d411947e931d53...,179208001,1.0,,,,,,1.0,0.080154,5.0
7363158,add20240eb4c5d3c2c3e21e3e8928432d411947e931d53...,918522001,,,,729.0,,,0.0,0.053361,6.0
8601620,add20240eb4c5d3c2c3e21e3e8928432d411947e931d53...,924243001,,,,704.0,,,0.0,0.017068,7.0
10363881,add20240eb4c5d3c2c3e21e3e8928432d411947e931d53...,809238001,,,,563.0,,,0.0,-0.070024,8.0
11943011,add20240eb4c5d3c2c3e21e3e8928432d411947e931d53...,918836001,1.0,,,,,,1.0,-0.086176,9.0


In [76]:
submission.to_csv(f'../submissions/{EXP}_submission{FOLD}.csv', index=False)