In [65]:
# https://www.kaggle.com/code/hervind/h-m-faster-trending-products-weekly/notebook をベースに
# コールドスタートは年齢毎に
# together の戦略を入れる
# New: pair weight を色々試す

# 0.1
# MAP@12 (all): 0.027895
# MAP@12 (cold start): 0.008798

# 0.2
# MAP@12 (all): 0.027914
# MAP@12 (cold start): 0.008798

# 0.3
# MAP@12 (all): 0.027844
# MAP@12 (cold start): 0.008798

# 0.5
# MAP@12 (all): 0.027063
# MAP@12 (cold start): 0.008798

# 0.8
# MAP@12 (all): 0.026310
# MAP@12 (cold start): 0.008798

EXP = '012'
FOLD = '' # '_fold1' のように指定、全データ学習時は'' を指定

In [66]:
import numpy as np
import pandas as pd

from math import sqrt
from pathlib import Path
from tqdm import tqdm
tqdm.pandas()

In [67]:
data_path = Path('../input/h-and-m-personalized-fashion-recommendations/')
N = 12

### Read the transactions data

In [68]:
df = pd.read_csv(data_path / f'transactions_train{FOLD}.csv',
                 usecols = ['t_dat', 'customer_id', 'article_id'],
                 dtype={'article_id': 'int32'})
df['customer_id'] = df['customer_id'].apply(lambda x: int(x[-16:],16) ).astype('int64')
df['t_dat'] = pd.to_datetime(df['t_dat'])
last_ts = df['t_dat'].max()

### Add the last day of billing week

In [69]:
tmp = df[['t_dat']].copy()
tmp['dow'] = tmp['t_dat'].dt.dayofweek
tmp['ldbw'] = tmp['t_dat'] - pd.TimedeltaIndex(tmp['dow'] - 1, unit='D')
tmp.loc[tmp['dow'] >=2 , 'ldbw'] = tmp.loc[tmp['dow'] >=2 , 'ldbw'] + pd.TimedeltaIndex(np.ones(len(tmp.loc[tmp['dow'] >=2])) * 7, unit='D')

df['ldbw'] = tmp['ldbw'].values

### Count the number of transactions per week 

In [70]:
weekly_sales = df.drop('customer_id', axis=1).groupby(['ldbw', 'article_id']).count().reset_index()
weekly_sales = weekly_sales.rename(columns={'t_dat': 'count'})

In [71]:
df = df.merge(weekly_sales, on=['ldbw', 'article_id'], how = 'left')

### Let's assume that in the target week sales will be similar to the last week of the training data

In [72]:
weekly_sales = weekly_sales.reset_index().set_index('article_id')

df = df.join(
    weekly_sales.loc[weekly_sales['ldbw']==last_ts, ['count']],
    on='article_id', rsuffix="_targ")

df['count_targ'].fillna(0, inplace=True)
del weekly_sales

### Calculate sales rate adjusted for changes in product popularity 

In [73]:
df['quotient'] = df['count_targ'] / df['count']

### Take supposedly popular products

In [74]:
df['diff_dat'] = (last_ts - df['t_dat']).map(lambda delta: delta.days)
target_sales = df.query('diff_dat < 14').drop('customer_id', axis=1).groupby('article_id')['quotient'].sum()
general_pred = target_sales.nlargest(N).index.tolist()
general_pred = ['0' + str(article_id) for article_id in general_pred]
general_pred_str =  ' '.join(general_pred)
del target_sales
print(general_pred)

# --------------------------------
# 年齢ごとの上位12アイテムも求める
customers = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/customers.csv')
customers['customer_id'] = customers['customer_id'].apply(lambda x: int(x[-16:],16) ).astype('int64')
customers['age_bin'] = pd.cut(customers['age'], bins=[10, 20, 30, 40, 50, 60, 70, 100], labels=False)
train = df.query('diff_dat < 14').merge(customers[['customer_id', 'age', 'age_bin']], how='left')
popular_items = train.groupby(['age_bin', 'article_id'])['quotient'].sum().sort_values(ascending=False)
popular_items_dict = {}
for index in popular_items.index.levels[0]:
    popular_items_dict[index] = '0'+' 0'.join(popular_items[index][:12].index.astype('str'))
popular_items_sr = pd.Series(popular_items_dict, name='top_12_popular_items', dtype='str')
popular_items_sr
# ---------------------------------

['0909370001', '0865799006', '0918522001', '0924243001', '0448509014', '0751471001', '0809238001', '0918292001', '0762846027', '0673677002', '0923758001', '0706016001']


0.0    0448509014 0685814003 0715624001 0918522001 07...
1.0    0909370001 0865799006 0924243001 0918522001 08...
2.0    0909370001 0924243002 0865799006 0908799002 08...
3.0    0923758001 0909370001 0909371001 0751471001 08...
4.0    0918522001 0751471001 0751471043 0910601003 07...
5.0    0909370001 0896152002 0924243001 0918522001 09...
6.0    0926285001 0796210001 0751471043 0924243001 08...
Name: top_12_popular_items, dtype: object

### Fill in purchase dictionary

In [75]:
tmp = df.copy()
tmp['x'] = ((last_ts - tmp['t_dat']) / np.timedelta64(1, 'D')).astype(int)
tmp['dummy_1'] = 1 
tmp['x'] = tmp[["x", "dummy_1"]].max(axis=1)

a, b, c, d = 2.5e4, 1.5e5, 2e-1, 1e3
tmp['y'] = a / np.sqrt(tmp['x']) + b * np.exp(-c*tmp['x']) - d

tmp['dummy_0'] = 0 
tmp['y'] = tmp[["y", "dummy_0"]].max(axis=1)
tmp['value'] = tmp['quotient'] * tmp['y'] 


# -------------------------------------
# ペアアイテムも0.2倍の重みづけで混ぜる
pair_weight = 0.2
pairs = np.load(f'../input/hmitempairs/pairs_cudf{FOLD}_1item.npy',allow_pickle=True).item()
tmp2 = tmp.copy()
tmp2['article_id'] = tmp2['article_id'].map(pairs)
tmp2 = tmp2.dropna(subset=['article_id'])
tmp2['article_id'] = tmp2['article_id'].astype('int')
# tmp2 = tmp2.drop_duplicates(['customer_id', 'article_id2'])
tmp2['value'] = tmp2['value'] * pair_weight
tmp = pd.concat([tmp, tmp2], axis=0, ignore_index=True)
# ---------------------------------------


tmp = tmp.groupby(['customer_id', 'article_id']).agg({'value': 'sum'})
tmp = tmp.reset_index()

tmp = tmp.loc[tmp['value'] > 0]
tmp['rank'] = tmp.groupby("customer_id")["value"].rank("dense", ascending=False)
tmp = tmp.loc[tmp['rank'] <= 12]

purchase_df = tmp.sort_values(['customer_id', 'value'], ascending = False).reset_index(drop = True)
purchase_df['prediction'] = '0' + purchase_df['article_id'].astype(str) + ' '
purchase_df = purchase_df.groupby('customer_id').agg({'prediction': sum}).reset_index()
purchase_df['prediction'] = purchase_df['prediction'].str.strip()
# purchase_df = cudf.DataFrame(purchase_df)

### Make a submission

In [76]:
sub  = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv',
                            usecols= ['customer_id'], 
                            dtype={'customer_id': 'string'})

sub['customer_id2'] = sub['customer_id'].apply(lambda x: int(x[-16:],16) ).astype('int64')

sub = sub.merge(purchase_df, left_on = 'customer_id2', right_on = 'customer_id', how = 'left',
               suffixes = ('', '_ignored'))

# ------------------------------------------------------
# 年齢ごとのアイテムを推薦する
sub = sub.merge(customers[['customer_id', 'age_bin']].rename(columns={'customer_id': 'customer_id2'}), on='customer_id2', how='left')
sub['top12_popular_items'] = sub['age_bin'].map(popular_items_sr)
sub['top12_popular_items'] = sub['top12_popular_items'].fillna(general_pred_str).astype('str')
# -------------------------------------------------------
sub['prediction'] = sub['prediction'].fillna('')
sub['prediction'] = sub['prediction'] + ' ' + sub['top12_popular_items']
sub['prediction'] = sub['prediction'].str.strip()
sub['prediction'] = sub['prediction'].str[:131]
sub = sub[['customer_id', 'prediction']]
sub.to_csv(f'../submissions/{EXP}_submission{FOLD}.csv',index=False)
sub.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0568601006 0859416011 0568597006 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0826211002 0706016001 0739590027 0764280001 05...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0794321011 0852643003 0852643001 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0918522001 0751471001 0751471043 0910601003 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0896152002 0791587015 0730683050 0927530004 08...
