In [2]:
# Install TensorFlow Addons into the kernel environment (imported below as `tfa`).
# NOTE(review): the version is not pinned — consider pinning it for reproducibility.
%pip install tensorflow_addons



In [3]:
# Mount Google Drive at /content/drive; the project code, data and model paths used
# below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import sys
# Add the project root to the import path so the `from src...` imports below can
# resolve (assumes the `src` package lives in this directory — TODO confirm).
sys.path.append("/content/drive/MyDrive/HM-new/")

In [5]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Embedding, Input, Dense, Dropout, BatchNormalization, Concatenate, Activation
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.utils import plot_model
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
import gc

In [6]:
import tensorflow_addons as tfa

In [8]:
from src.data import DataHelper
from src.data.metrics import map_at_k, recall_at_k

In [9]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [10]:
from pathlib import Path
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [11]:
# * Uncomment this when predicting on the test set to avoid GPU memory errors
# os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [12]:
# Hyper-parameter constants.
# NOTE(review): none of these names is referenced again in the visible notebook —
# the model.fit call passes the literals batch_size=2048 and epochs=30 instead.
# Confirm whether they are still needed or should be wired into training.
RANK_EMBEDDING_DIM = 64
BATCH_SIZE = 2**12
NEPOCH = 20

In [13]:
# WEEK_NUM is the exclusive upper bound of the week indices loaded below via
# range(1, WEEK_NUM): week 1 is held out for validation and the remaining
# TRAIN_WEEK_NUM weeks (2 .. WEEK_NUM-1) are used for training.
TRAIN_WEEK_NUM = 4
WEEK_NUM = TRAIN_WEEK_NUM + 2

# Sub-directory of data/processed from which candidate and label files are read.
VERSION_NAME = "LargeRecall"
# NOTE(review): TEST is not referenced anywhere else in the visible notebook.
TEST = False # * Set to `False` when running local experiments to save time

In [14]:
# Root directories on the mounted Drive: `data_dir` holds the raw/external/processed/interim
# data used below, `model_dir` receives the model checkpoint.
data_dir = Path("/content/drive/MyDrive/HM-new/data/")
model_dir = Path("/content/drive/MyDrive/HM-new/models/")

In [15]:
# Load the "encoded_full" dataset through the project's DataHelper. The result is
# indexed below with the keys 'inter', 'item' and 'user'.
dh = DataHelper(data_dir)
data = dh.load_data(name="encoded_full")

In [16]:
# Keep only interactions dated on or before 2020-08-19; this filtered frame is the
# source for the TF-IDF embeddings computed below.
_inter_cutoff_mask = data['inter']['t_dat'] <= "2020-08-19"
inter = data['inter'].loc[_inter_cutoff_mask]

## Calculate & Load Embeddings

In [17]:
# article description - TFIDF - SVD
# The embedding is cached on disk: load it when present, otherwise rebuild and store it.
tfidf_item_path = data_dir/'external/tfidf_item_embd.npy'
if os.path.exists(tfidf_item_path):
    tfidf_item = np.load(tfidf_item_path, allow_pickle=True)
else:
    articles = pd.read_csv(data_dir/'raw/articles.csv')

    # One text document per article: every column whose name contains "name",
    # plus the free-text `detail_desc`, joined with spaces.
    text_cols = [col for col in articles.columns if 'name' in col] + ['detail_desc']
    corpus = articles[text_cols].T.apply(lambda x: ' '.join(map(str,x))).T

    vectorizer = TfidfVectorizer(min_df=3)
    X = vectorizer.fit_transform(corpus)
    svd = TruncatedSVD(n_components=256, random_state=0)
    article_vectors = svd.fit_transform(X)
    # Prepend a single all-ones row so the article vectors start at row index 1.
    tfidf_item = np.concatenate([np.ones((1,256)), article_vectors], axis=0)
    tfidf_item.dump(tfidf_item_path)

In [18]:
# article_id - customer_id TFIDF + SVD
# Each article is described by the space-joined ids of the customers who bought it.
tfidf_item2_path = data_dir/'external/tfidf_item_embd2.npy'
if os.path.exists(tfidf_item2_path):
    tfidf_item2 = np.load(tfidf_item2_path, allow_pickle=True)
else:
    corpus = inter.groupby('article_id').customer_id.apply(lambda x: ' '.join(map(str, x)))
    article_ids = np.array(list(corpus.index))

    vectorizer = TfidfVectorizer(min_df=3)
    X = vectorizer.fit_transform(corpus)
    svd = TruncatedSVD(n_components=128, random_state=0)
    X_svd = svd.fit_transform(X)

    item_num = data['item']['article_id'].nunique()
    # Rows for articles absent from `inter` keep the constant default 1/128.
    tfidf_item2 = np.ones((item_num+1, 128)) / 128
    # The groupby index is unique, so one indexed assignment places row i of X_svd
    # at row article_ids[i], exactly like a per-row loop.
    tfidf_item2[article_ids, :] = X_svd

    tfidf_item2.dump(tfidf_item2_path)

In [19]:
# customer_id - product_code TFIDF + SVD
# Each customer is described by the space-joined product codes they bought.
if not os.path.exists(data_dir/'external/tfidf_user_embd.npy'):
    # Build the merged frame under its own name instead of rebinding `inter`.
    # Rebinding made this cell non-idempotent: a second run (e.g. after a failure
    # before the dump) would merge again, produce product_code_x/product_code_y
    # and then fail on `.product_code`.
    inter_with_product = inter.merge(
        data['item'][['article_id','product_code']], on=['article_id'], how='left'
    )
    corpus = inter_with_product.groupby('customer_id').product_code.apply(lambda x: ' '.join(map(str, x)))
    del inter_with_product  # only needed to build the corpus
    customer_ids = np.array(list(corpus.index))

    vectorizer = TfidfVectorizer(min_df=3)
    X = vectorizer.fit_transform(corpus)
    svd = TruncatedSVD(n_components=128, random_state=0)
    X_svd = svd.fit_transform(X)

    user_num = data['user']['customer_id'].nunique()
    # Rows for customers absent from `inter` keep the constant default 1/128.
    tfidf_user = np.ones((user_num+1, 128)) / 128
    # The groupby index is unique, so row i of X_svd goes to row customer_ids[i].
    tfidf_user[customer_ids, :] = X_svd

    tfidf_user.dump(data_dir/'external/tfidf_user_embd.npy')
else:
    tfidf_user = np.load(data_dir/'external/tfidf_user_embd.npy', allow_pickle=True)

In [20]:
# * Load pre-trained embeddings
# All matrices live in data/external and are later used as frozen Embedding weights.
external_dir = data_dir/'external'

# Embeddings from the `w2v_*` files
w2v_user_embd = np.load(external_dir/'w2v_user_embd.npy', allow_pickle=True)
w2v_item_embd = np.load(external_dir/'w2v_item_embd.npy', allow_pickle=True)
w2v_product_embd = np.load(external_dir/'w2v_product_embd.npy', allow_pickle=True)

# Image-based article embedding
image_item_embd = np.load(external_dir/'image_embd.npy', allow_pickle=True)

# Embeddings from the `w2v_skipgram_*` files
w2v_sg_user_embd = np.load(external_dir/'w2v_skipgram_user_embd.npy', allow_pickle=True)
w2v_sg_item_embd = np.load(external_dir/'w2v_skipgram_item_embd.npy', allow_pickle=True)
w2v_sg_product_embd = np.load(external_dir/'w2v_skipgram_product_embd.npy', allow_pickle=True)

## Load Candidates & Prepare Data

In [None]:
# Per-week candidate and label frames, keyed by week index (1 .. WEEK_NUM-1).
candidates, labels = {}, {}
processed_dir = data_dir/"processed"/VERSION_NAME
for week in tqdm(range(1, WEEK_NUM)):
    week_candidates = pd.read_parquet(processed_dir/f"week{week}_candidate.pqt")
    # Keep only candidates whose `rank` is at most 80.
    candidates[week] = week_candidates[week_candidates['rank']<=80]
    labels[week] = pd.read_parquet(processed_dir/f"week{week}_label.pqt")

 20%|██        | 1/5 [00:11<00:45, 11.46s/it]

In [None]:
# Candidate columns that are NOT used as model inputs: the target, bookkeeping
# columns, and a set of `p_*` / `product_type_*` aggregates. Names that are
# commented out here are not excluded, i.e. they stay in `feats` when present.
_excluded_cols = {
    "label",
    "sales_channel_id",
    "t_dat",
    "week",

    # 'i_w_full_sale_ratio',
    # 'i_2w_full_sale_ratio',
    'p_w_full_sale_ratio',
    'p_2w_full_sale_ratio',
    # 'i_week_above_daily_sale',
    'p_week_above_full_sale',
    # 'i_2w_week_above_daily_sale',
    'p_2w_week_above_daily_sale',
    'product_type_no_daily_sale',
    # 'i_product_type_no_daily_sale_ratio',
    'p_product_type_no_daily_sale_ratio',

    # 'i_3w_sale',
    # 'i_3w_sale_rank',
    # 'i_3w_sale_norm',
    'p_3w_sale',
    'p_3w_sale_rank',
    'p_3w_sale_norm',
    # 'i_4w_sale',
    # 'i_4w_sale_rank',
    # 'i_4w_sale_norm',
    'p_4w_sale',
    'p_4w_sale_rank',
    'p_4w_sale_norm',
    # "rank",
    # "score",
    # "prob"
}

# Column order follows the week-1 candidate frame.
feats = [col for col in candidates[1].columns if col not in _excluded_cols]

# Id columns go through embedding lookups; everything else is a dense input.
ids = ["customer_id", "article_id", "product_code"]
dense_feats = [col for col in feats if col not in ids]
# feats = ids + cat_features + dense_feats

In [None]:
# for f in tqdm(dense_feats):
#     for i in range(1,WEEK_NUM):
#         if f in candidates[i].columns:
#             candidates[i][f] = candidates[i][f].astype('float16')

In [None]:
# Stack all weekly candidate frames and keep only model features plus week/label.
full_data = pd.concat(
    (candidates[week] for week in range(1, WEEK_NUM)),
    ignore_index=True,
)
full_data = full_data.loc[:, feats + ['week', 'label']]
gc.collect()
# for f in tqdm(rule_feats):
#     full_data[f] = full_data.groupby(['week','customer_id'])[f].rank()

# Week 1 is held out for validation; every later week is used for training.
is_train_week = full_data['week'] > 1
is_valid_week = full_data['week'] == 1
train = full_data[is_train_week]
valid = full_data[is_valid_week]

In [None]:
# The per-week frames are no longer needed once `full_data` has been built; drop
# the reference and trigger a collection to release memory.
del candidates
gc.collect()

In [None]:
# Standardize
# for feat in dense_feats:
    # mask = train[feat].notnull()
    # value = train.loc[mask, feat].mean()
    # train[feat] = train[feat].fillna(value)
    # valid[feat] = valid[feat].fillna(value)
    # scaler = MinMaxScaler().fit(train[feat].values.reshape(-1,1))
    # train[feat] = scaler.transform(train[feat].values.reshape(-1,1))
    # valid[feat] = scaler.transform(valid[feat].values.reshape(-1,1))

In [None]:
# Size of each id embedding table: largest id value + 1 (ids are used as row indices).
feat_dim = {}
for feat in ids:
    # Look the id up in the user table first, then the item table, and fall back
    # to the candidate frame only when neither table has the column.
    if feat in data['user'].columns:
        id_source = data['user']
    elif feat in data['item'].columns:
        id_source = data['item']
    else:
        id_source = full_data
    feat_dim[feat] = int(id_source[feat].max()) + 1

In [None]:
# `train` and `valid` have already been sliced out of `full_data`, and `feat_dim`
# has been computed, so the combined frame can be released.
del full_data
gc.collect()

In [None]:
# Id inputs in the column order the model expects: 0=customer, 1=article, 2=product.
X_train1 = train[['customer_id', 'article_id', 'product_code']].values.astype('int32')
# Dense inputs: one float32 column per entry of `dense_feats`, NaNs replaced by 0.
X_train2 = np.zeros((X_train1.shape[0], len(dense_feats)), dtype='float32')
# Wrap the list (not the enumerate object) in tqdm so the bar knows its total.
for i, f in enumerate(tqdm(dense_feats)):
    X_train2[:, i] = np.nan_to_num(train[f].values).astype('float32')
    # Drop each column as soon as it has been copied to keep peak memory down.
    del train[f]
y_train = train['label'].values

In [None]:
# Id inputs in the column order the model expects: 0=customer, 1=article, 2=product.
X_test1 = valid[['customer_id', 'article_id', 'product_code']].values.astype('int32')
# Dense inputs: one float32 column per entry of `dense_feats`, NaNs replaced by 0.
X_test2 = np.zeros((X_test1.shape[0], len(dense_feats)), dtype='float32')
# Wrap the list (not the enumerate object) in tqdm so the bar knows its total.
for i, f in enumerate(tqdm(dense_feats)):
    X_test2[:, i] = np.nan_to_num(valid[f].values).astype('float32')
    # Drop each column as soon as it has been copied to keep peak memory down.
    del valid[f]
y_test = valid['label'].values

## Train Model

In [None]:
# Frozen (non-trainable) customer embedding tables initialised from the
# pre-computed matrices loaded above.
customer_embd_layer_1 = Embedding(
    input_dim=feat_dim["customer_id"],
    output_dim=64,
    weights=[w2v_sg_user_embd],
    trainable=False,
)
customer_embd_layer_2 = Embedding(
    input_dim=feat_dim["customer_id"],
    output_dim=64,
    weights=[w2v_user_embd],
    trainable=False,
)
# NOTE(review): the model cell computes this layer's output (`x_c_id3`) but does not
# include it in the Concatenate — confirm whether that is intentional.
customer_embd_layer_3 = Embedding(
    input_dim=feat_dim["customer_id"],
    output_dim=128,
    weights=[tfidf_user],
    trainable=False,
)

In [None]:
# Frozen (non-trainable) article embedding tables, one per pre-computed matrix.
article_embd_layer_1 = Embedding(
    input_dim=feat_dim["article_id"],
    output_dim=64,
    weights=[w2v_sg_item_embd],
    trainable=False,
)

article_embd_layer_2 = Embedding(
    input_dim=feat_dim["article_id"],
    output_dim=64,
    weights=[w2v_item_embd],
    trainable=False,
)

# Text TF-IDF/SVD embedding (256-d)
article_embd_layer_3 = Embedding(
    input_dim=feat_dim["article_id"],
    output_dim=256,
    weights=[tfidf_item],
    trainable=False,
)

# Article-by-customer TF-IDF/SVD embedding (128-d)
article_embd_layer_4 = Embedding(
    input_dim=feat_dim["article_id"],
    output_dim=128,
    weights=[tfidf_item2],
    trainable=False,
)

# Image embedding (512-d)
article_embd_layer_5 = Embedding(
    input_dim=feat_dim["article_id"],
    output_dim=512,
    weights=[image_item_embd],
    trainable=False,
)

In [None]:
# Frozen (non-trainable) product-code embedding tables.
product_embd_layer_1 = Embedding(
    input_dim=feat_dim["product_code"],
    output_dim=64,
    weights=[w2v_sg_product_embd],
    trainable=False,
)
product_embd_layer_2 = Embedding(
    input_dim=feat_dim["product_code"],
    output_dim=64,
    weights=[w2v_product_embd],
    trainable=False,
)

In [None]:
# Two inputs: the integer id triple (customer, article, product) and the dense features.
inputs1 = Input(shape=X_train1.shape[1:], dtype=tf.int64)
inputs2 = Input(shape=X_train2.shape[1:], dtype=tf.float32)
input1 = tf.cast(inputs1, dtype=tf.int64)

# Customer embeddings (column 0 of the id input).
x_c_id1 = customer_embd_layer_1(input1[:,0])
x_c_id2 = customer_embd_layer_2(input1[:,0])
# NOTE(review): x_c_id3 is computed but not part of the Concatenate below, so the
# TF-IDF customer embedding does not reach the output. Left as-is because adding it
# would change the architecture and invalidate saved weights — confirm intent.
x_c_id3 = customer_embd_layer_3(input1[:,0])

# Article embeddings (column 1). The 256-d and 512-d tables are projected to 128-d.
x_a_id1 = article_embd_layer_1(input1[:,1])
x_a_id2 = article_embd_layer_2(input1[:,1])
x_a_id3 = article_embd_layer_3(input1[:,1])
x_a_id3 = Dense(128)(x_a_id3)
x_a_id4 = article_embd_layer_4(input1[:,1])
x_a_id5 = article_embd_layer_5(input1[:,1])
x_a_id5 = Dense(128)(x_a_id5)

# Product-code embeddings (column 2).
x_p_id1 = product_embd_layer_1(input1[:,2])
x_p_id2 = product_embd_layer_2(input1[:,2])


x_id = Concatenate(axis=-1)([
    x_c_id1, x_c_id2,
    x_a_id1, x_a_id2, x_a_id3, x_a_id4, x_a_id5,
    x_p_id1, x_p_id2,
])

# Dense features are batch-normalised before being joined with the id embeddings.
x0 = Concatenate(axis=-1)([x_id, BatchNormalization()(inputs2)])
# x = Dropout(0.2)(x0)
# x = Dense(1024, activation='swish')(x)
x = Dropout(0.2)(x0)
x = Dense(512, activation='swish')(x)
x = Dropout(0.2)(x)
x = Dense(256, activation='swish')(x)

# Skip connection: the MLP output is concatenated with its own input.
x = Concatenate(axis=-1)([x, x0])
x = Dropout(0.2)(x)

output = Dense(1, activation='sigmoid')(x)

model = tf.keras.Model(inputs=[inputs1, inputs2], outputs=[output])
model.summary()

model.compile(
    tfa.optimizers.AdamW(learning_rate=0.001, weight_decay=1e-4),
    loss = 'binary_crossentropy',
    # Pin the metric name. With the string 'AUC', Keras auto-generates the name and
    # appends a suffix ('auc_1', 'auc_2', ...) each time this cell is re-run in the
    # same kernel, which silently breaks the callbacks that monitor 'val_auc'.
    metrics=[tf.keras.metrics.AUC(name='auc')]
)

In [None]:
# Both callbacks watch validation AUC (higher is better).
best_weights_path = model_dir/'large_model_nn.h5'
monitor_kwargs = dict(monitor='val_auc', mode='max')

# Stop after 10 epochs without improvement.
early_stop = tf.keras.callbacks.EarlyStopping(patience=10, **monitor_kwargs)
# Keep only the weights of the best epoch on disk.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=best_weights_path,
    save_weights_only=True,
    save_best_only=True,
    **monitor_kwargs,
)

train_inputs = [X_train1, X_train2]
valid_inputs = [X_test1, X_test2]
history = model.fit(
    x=train_inputs,
    y=y_train.astype(int),
    validation_data=(valid_inputs, y_test.astype(int)),
    batch_size=2048,
    epochs=30,
    shuffle=True,
    callbacks=[checkpoint, early_stop],
)

# 0.7565

In [None]:
# Restore the weights written by the ModelCheckpoint callback (best `val_auc` epoch).
model.load_weights(model_dir/'large_model_nn.h5')

In [None]:
# Score the validation candidates; the model ends in a single sigmoid unit, so
# `probs` holds one value per candidate row.
probs = model.predict([X_test1, X_test2], batch_size=4096)

In [None]:
# Ground truth for validation: per customer, the list of articles bought on or
# after 2020-09-16 (taken from the unfiltered interaction table).
_valid_period = data['inter']['t_dat'] >= '2020-09-16'
label = (
    data['inter'][_valid_period]
    .groupby('customer_id')['article_id']
    .apply(list)
    .reset_index()
)

In [None]:
# Attach the model scores, then build one ranked article list per customer:
# rows are sorted by descending score before grouping, so each list is best-first.
valid['prob'] = probs
ranked_valid = valid.sort_values(by='prob',ascending=False).reset_index(drop=True)
pred = (
    ranked_valid.groupby('customer_id')['article_id']
    .apply(list)
    .reset_index()
    .rename(columns={'article_id': 'prediction'})
)

In [None]:
# Keep only the key columns and the score; this is what gets written to disk next.
valid = valid[['customer_id','article_id','prob']]

In [None]:
# Persist the validation scores of this model under data/external.
valid.to_parquet(data_dir/'external'/'large_nn_valid.pqt')

In [None]:
# Left join keeps every customer with ground truth; customers without any
# candidate get a missing `prediction` value.
label = label.merge(pred, on='customer_id', how='left')

In [None]:
# Evaluate the ranked predictions against the purchased articles with map_at_k (k=12).
# The numbers below are presumably scores recorded from earlier runs.
map_at_k(label['article_id'], label['prediction'], k=12)
# 0.028500554033301987
# 0.029904528760153

# 0.031648009478868075
# 0.031309369857160076

# 0.031769005497044554

## Test

In [None]:
# Reload the checkpointed weights before test-time prediction (same file as the
# load in the training section, so the test section can be run on its own once
# the model has been built).
model.load_weights(model_dir/'large_model_nn.h5')

In [None]:
class TQDMPredictCallback(tf.keras.callbacks.Callback):
    """Keras callback that shows a tqdm progress bar while ``model.predict`` runs.

    Args:
        custom_tqdm_instance: Optional, already-constructed progress bar. When given
            it is used as-is and is NOT closed by this callback.
        tqdm_cls: Progress-bar class instantiated when no custom instance is given.
        **tqdm_params: Extra keyword arguments forwarded to ``tqdm_cls``.
    """

    def __init__(self, custom_tqdm_instance=None, tqdm_cls=tqdm, **tqdm_params):
        super().__init__()
        self.tqdm_cls = tqdm_cls
        self.tqdm_progress = None
        # Number of batches already reported to the progress bar.
        self.prev_predict_batch = None
        self.custom_tqdm_instance = custom_tqdm_instance
        self.tqdm_params = tqdm_params

    def on_predict_batch_begin(self, batch, logs=None):
        """No-op: progress is only advanced when a batch has finished."""
        pass

    def on_predict_batch_end(self, batch, logs=None):
        """Advance the bar by the number of batches completed since the last call."""
        # `batch` is a zero-based index, so `batch + 1` batches are done here. The
        # previous code advanced by index deltas, which never counted the first
        # batch and left a caller-supplied bar one step short of its total.
        completed = batch + 1
        self.tqdm_progress.update(completed - self.prev_predict_batch)
        self.prev_predict_batch = completed

    def on_predict_begin(self, logs=None):
        """Reset the counter and select or create the progress bar."""
        self.prev_predict_batch = 0
        # Identity check instead of truthiness: bool(tqdm) is False for total == 0
        # and raises TypeError when both total and iterable are None.
        if self.custom_tqdm_instance is not None:
            self.tqdm_progress = self.custom_tqdm_instance
            return

        # `steps` may be missing or None; tqdm then shows a bar without a total.
        total = (self.params or {}).get('steps')

        self.tqdm_progress = self.tqdm_cls(total=total, **self.tqdm_params)

    def on_predict_end(self, logs=None):
        """Close the bar if this callback created it."""
        if self.tqdm_progress is not None and self.custom_tqdm_instance is None:
            self.tqdm_progress.close()

In [None]:
# Training/validation frames and matrices are not needed for test prediction;
# release them before the (large) test candidate chunks are loaded.
del train, valid, X_train1, X_train2, X_test1, X_test2
gc.collect()

In [None]:
# Index of the week-0 candidate chunk processed by the next cell.
chunk = 0

In [None]:
# Score one chunk of week-0 (test) candidates and store the result under data/interim.
test_candidates = pd.read_parquet(data_dir/"processed"/VERSION_NAME/f"week0_candidate_{chunk}.pqt")
test1 = test_candidates[['customer_id', 'article_id', 'product_code']].values.astype('int32')
test2 = np.zeros((test1.shape[0], len(dense_feats)), dtype='float32')
# Convert dense features straight to float32, exactly as for X_train2 / X_test2.
# The previous intermediate float16 cast was applied at inference only: it rounded
# the inputs and turned values above 65504 into inf, so the model saw different
# feature values than it was trained on.
# NOTE(review): if host RAM becomes tight without the float16 step, lower the
# memory footprint in a way that is also applied at training time.
for i, f in enumerate(tqdm(dense_feats)):
    test2[:, i] = np.nan_to_num(test_candidates[f].values).astype('float32')
    # Drop each column as soon as it has been copied to keep peak memory down.
    del test_candidates[f]
gc.collect()

probs = model.predict([test1, test2], batch_size=2048, callbacks=[TQDMPredictCallback()])
test_candidates["prob"] = probs
# Build the output frame without mutating a slice of `test_candidates` in place.
pred_lgb = (
    test_candidates[['customer_id','article_id','prob']]
    .rename(columns={'article_id':'prediction'})
    .astype({'customer_id': int})
)
pred_lgb.to_parquet(data_dir/'interim'/f'large_nn_test_{chunk}.pqt')

In [None]:
# Index of the week-0 candidate chunk processed by the next cell.
chunk = 1

In [None]:
# Score one chunk of week-0 (test) candidates and store the result under data/interim.
test_candidates = pd.read_parquet(data_dir/"processed"/VERSION_NAME/f"week0_candidate_{chunk}.pqt")
test1 = test_candidates[['customer_id', 'article_id', 'product_code']].values.astype('int32')
test2 = np.zeros((test1.shape[0], len(dense_feats)), dtype='float32')
# Convert dense features straight to float32, exactly as for X_train2 / X_test2.
# The previous intermediate float16 cast was applied at inference only: it rounded
# the inputs and turned values above 65504 into inf, so the model saw different
# feature values than it was trained on.
# NOTE(review): if host RAM becomes tight without the float16 step, lower the
# memory footprint in a way that is also applied at training time.
for i, f in enumerate(tqdm(dense_feats)):
    test2[:, i] = np.nan_to_num(test_candidates[f].values).astype('float32')
    # Drop each column as soon as it has been copied to keep peak memory down.
    del test_candidates[f]
gc.collect()

probs = model.predict([test1, test2], batch_size=2048, callbacks=[TQDMPredictCallback()])
test_candidates["prob"] = probs
# Build the output frame without mutating a slice of `test_candidates` in place.
pred_lgb = (
    test_candidates[['customer_id','article_id','prob']]
    .rename(columns={'article_id':'prediction'})
    .astype({'customer_id': int})
)
pred_lgb.to_parquet(data_dir/'interim'/f'large_nn_test_{chunk}.pqt')

---

In [None]:
# Read back the per-chunk predictions written by the two cells above. Those cells
# write `large_nn_test_{chunk}.pqt`; the previous `nn_test_{chunk}.pqt` names did
# not match, so this cell either failed or silently picked up files produced by a
# different notebook.
test_pred1 = pd.read_parquet(data_dir/'interim'/'large_nn_test_0.pqt')
test_pred2 = pd.read_parquet(data_dir/'interim'/'large_nn_test_1.pqt')

In [None]:
# Combine both chunks, rank all rows by descending score, and keep only the
# highest-scoring row for each (customer, predicted article) pair.
test_pred = (
    pd.concat([test_pred1, test_pred2], ignore_index=True)
    .sort_values(by=["prob"], ascending=False)
    .reset_index(drop=True)
    .drop_duplicates(['customer_id', 'prediction'], keep='first')
)

In [None]:
# Write the merged test predictions to data/processed.
# NOTE(review): every other artefact of this notebook carries a `large_` prefix
# (large_model_nn.h5, large_nn_valid.pqt, large_nn_test_{chunk}.pqt) but this file
# does not — confirm it is not meant to be `large_nn_test.pqt` and that it does not
# overwrite another model's output.
test_pred.to_parquet(data_dir/'processed'/'nn_test.pqt')