In [None]:
# Colab setup: tensorflow_addons is imported below as `tfa` and supplies the
# AdamW optimizer used when the model is compiled.
# NOTE(review): the version is not pinned — consider pinning for reproducibility.
%pip install tensorflow_addons

In [1]:
from google.colab import drive

# Mount Google Drive; every path used by this notebook lives under this mount.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import sys

# Make the project's `src` package importable from Drive.
# Guarded so that re-running the cell does not keep appending duplicates.
PROJECT_ROOT = "/content/drive/MyDrive/HM-new/"
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [3]:
import tensorflow_addons as tfa
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Embedding, Input, Dense, Dropout, BatchNormalization, Concatenate
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.utils import plot_model
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
import gc

In [5]:
from src.data import DataHelper
from src.data.metrics import map_at_k, recall_at_k

In [6]:
from pathlib import Path
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [8]:
# Model / training hyper-parameters (names suggest embedding width, batch size
# and epoch count — confirm against the cells that consume them).
NEPOCH = 20
BATCH_SIZE = 1 << 12  # 4096
RANK_EMBEDDING_DIM = 64

In [9]:
# Sub-folder of data/processed that holds the per-week candidate/label files.
VERSION_NAME = "pivot"
TEST = False  # * Set to `False` when running local experiments to save time

# Weeks 1 .. WEEK_NUM-1 are loaded; week 1 is held out for validation and the
# later weeks are used for training.
TRAIN_WEEK_NUM = 4
WEEK_NUM = 2 + TRAIN_WEEK_NUM

In [10]:
# Root folders on the mounted Drive: input data and saved model weights.
data_dir, model_dir = (
    Path("/content/drive/MyDrive/HM-new/data/"),
    Path("/content/drive/MyDrive/HM-new/models/"),
)

In [11]:
# Build the project's data helper on top of data_dir and load the dataset
# stored under the name "encoded_full". Later cells read the 'inter', 'user'
# and 'item' entries of `data`.
dh = DataHelper(data_dir)
data = dh.load_data(name="encoded_full")

In [12]:
# Keep only the interactions dated on or before 2020-08-19.
inter = data['inter'].loc[lambda frame: frame.t_dat <= "2020-08-19"]

## Load Candidates & Prepare Data

In [13]:
# Read the per-week candidate and label tables for weeks 1 .. WEEK_NUM-1,
# keyed by week number.
candidates, labels = {}, {}
processed_dir = data_dir / "processed" / VERSION_NAME
for week in tqdm(range(1, WEEK_NUM)):
    candidates[week] = pd.read_parquet(processed_dir / f"week{week}_candidate.pqt")
    labels[week] = pd.read_parquet(processed_dir / f"week{week}_label.pqt")

100%|██████████| 5/5 [00:07<00:00,  1.55s/it]


In [14]:
# Columns of the candidate table that are NOT model inputs.
_non_feature_cols = ("label", "sales_channel_id", "t_dat", "week")

# Every remaining column, in the candidate table's own column order.
feats = [col for col in candidates[1].columns if col not in _non_feature_cols]

cat_features = []
ids = ["customer_id", "article_id"]

# Dense features are whatever is left once ids and categoricals are removed.
_sparse_cols = cat_features + ids
dense_feats = [col for col in feats if col not in _sparse_cols]

In [34]:
# Stack all loaded weeks into one frame, then split by week number:
# week 1 is the validation set, every later week is training data.
full_data = pd.concat(
    (candidates[week] for week in range(1, WEEK_NUM)),
    ignore_index=True,
)
week_no = full_data['week']
train = full_data[week_no > 1]
valid = full_data[week_no == 1]

In [17]:
# * Load pre-trained embeddings (used below as frozen Embedding weights)
# NOTE(review): allow_pickle=True will unpickle arbitrary objects; only load
# files from a trusted source, and drop the flag if the arrays are plain numeric.
_external_dir = data_dir / 'external'
w2v_user_embd = np.load(_external_dir / 'w2v_user_embd.npy', allow_pickle=True)
w2v_item_embd = np.load(_external_dir / 'w2v_item_embd.npy', allow_pickle=True)

In [18]:
# Vocabulary size (max encoded value + 1) for every id / categorical feature.
# The value range is taken from the user table if the column lives there, else
# from the item table, else from the candidate frame itself.
feat_dim = {}
for feat in cat_features + ids:
    source = next(
        (table for table in (data['user'], data['item']) if feat in table.columns),
        full_data,
    )
    feat_dim[feat] = int(source[feat].max()) + 1

In [19]:
# Drop the concatenated frame now that `train`, `valid` and `feat_dim` have
# been derived from it, then force a garbage-collection pass.
# NOTE: after this cell, any cell that reads `full_data` cannot be re-run
# without rebuilding it first.
del full_data
gc.collect()

150

In [20]:
def _to_model_inputs(frame):
    """Split a candidate frame into the arrays the network consumes.

    Returns a tuple ``(id_array, dense_array, labels)``:
    * id_array    - the `ids` columns (customer_id, article_id) as int32
    * dense_array - the `dense_feats` columns as float32, with NaN replaced by 0
    * labels      - the raw 'label' column values

    Columns are selected by name (`ids` / `dense_feats`) rather than by the
    positional slice `feats[2:]`, so the result does not depend on the id
    columns happening to be the first two columns of the candidate table.
    """
    id_array = frame[ids].values.astype('int32')
    dense_array = np.nan_to_num(frame[dense_feats].values).astype('float32')
    return id_array, dense_array, frame['label'].values


X_train1, X_train2, y_train = _to_model_inputs(train)
X_test1, X_test2, y_test = _to_model_inputs(valid)

## Train Model

In [21]:
# Frozen customer embedding initialised from the pre-trained w2v user vectors.
# (A trainable twin, `customer_embd_layer_1`, was tried and is currently disabled.)
customer_embd_layer_2 = Embedding(
    input_dim=feat_dim["customer_id"],
    output_dim=64,
    weights=[w2v_user_embd],
    trainable=False,
)

In [22]:
# Frozen article embedding initialised from the pre-trained w2v item vectors.
# (A trainable twin, `article_embd_layer_1`, was tried and is currently disabled.)
article_embd_layer_2 = Embedding(
    input_dim=feat_dim["article_id"],
    output_dim=64,
    weights=[w2v_item_embd],
    trainable=False,
)

In [26]:
# Two inputs: the (customer_id, article_id) pair and the dense feature vector.
inputs1 = Input(shape=X_train1.shape[1:], dtype=tf.int64)
inputs2 = Input(shape=X_train2.shape[1:], dtype=tf.float32)

# No-op given the Input dtype above; kept so the layer graph is unchanged.
input1 = tf.cast(inputs1, dtype=tf.int64)

# Look up the frozen pre-trained embeddings: column 0 is the customer id,
# column 1 the article id. (The trainable "_1" embedding branch is disabled.)
x_c_id2 = customer_embd_layer_2(input1[:,0])
x_a_id2 = article_embd_layer_2(input1[:,1])

x_id = Concatenate(axis=-1)([x_c_id2, x_a_id2]) # x_a_id1, x_c_id1, 

# Batch-normalise the dense features, join them with the id embeddings and
# feed the result through a 256 -> 128 swish MLP with dropout around each layer.
x = Concatenate(axis=-1)([x_id, BatchNormalization()(inputs2)])
x = Dropout(0.5)(x)
x = Dense(256, activation='swish', )(x)
x = Dropout(0.5)(x)
x = Dense(128, activation='swish', )(x)
x = Dropout(0.5)(x)

# Single sigmoid unit: probability of the candidate's label being positive.
output = Dense(1, activation='sigmoid')(x)

model = tf.keras.Model(inputs=[inputs1, inputs2], outputs=[output])
model.summary()

model.compile(
    tfa.optimizers.AdamW(learning_rate=0.001, weight_decay=1e-4),
    loss = 'binary_crossentropy',
    # Name the metric explicitly. The string shortcut 'AUC' gets an
    # auto-uniquified name ('auc', then 'auc_1', ... each time this cell is
    # re-run), which would silently break the 'val_auc' monitors used by the
    # EarlyStopping / ModelCheckpoint callbacks.
    metrics=[tf.keras.metrics.AUC(name='auc')]
)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 2)]          0           []                               
                                                                                                  
 tf.cast (TFOpLambda)           (None, 2)            0           ['input_1[0][0]']                
                                                                                                  
 tf.__operators__.getitem (Slic  (None,)             0           ['tf.cast[0][0]']                
 ingOpLambda)                                                                                     
                                                                                                  
 tf.__operators__.getitem_1 (Sl  (None,)             0           ['tf.cast[0][0]']            

In [28]:
# Stop once validation AUC has not improved for 5 epochs, and keep only the
# weights of the best epoch on disk (they are reloaded before prediction).
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_auc', patience=5, mode='max')
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=model_dir/'model_nn.h5',
    save_weights_only=True,
    monitor='val_auc',
    mode='max',
    save_best_only=True)

# NOTE(review): the config cell defines BATCH_SIZE = 2**12, but training uses
# 2048 — confirm which value is intended before unifying them.
history = model.fit(
    [X_train1, X_train2], y_train.astype(int), 
    shuffle=True,
    batch_size=2048,
    validation_data=([X_test1, X_test2], y_test.astype(int)),
    epochs=NEPOCH,  # epoch budget comes from the config cell instead of a duplicated literal
    callbacks=[checkpoint, early_stop]
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [27]:
# Restore the weights written by the ModelCheckpoint callback.
best_weights_path = model_dir/'model_nn.h5'
model.load_weights(best_weights_path)

In [28]:
# Score every validation candidate; rows of `probs` follow the row order of X_test1/X_test2.
valid_inputs = [X_test1, X_test2]
probs = model.predict(valid_inputs, batch_size=4096)

In [29]:
# Ground truth for validation: for each customer, the list of articles they
# interacted with on or after 2020-09-16.
transactions = data['inter']
label = (
    transactions[transactions['t_dat'] >= '2020-09-16']
    .groupby('customer_id')['article_id']
    .apply(list)
    .reset_index()
)

In [35]:
# Attach the model score to each validation candidate and rank by it.
# `assign` builds a new frame instead of writing a column into a slice of the
# concatenated candidate frame (which raises SettingWithCopyWarning, currently
# hidden by the global warnings filter); `ravel()` turns the (n, 1) model
# output into an explicit 1-D score vector.
# NOTE: `probs` is in the row order `valid` had when X_test1/X_test2 were
# built. Re-running this cell without rebuilding `valid` first would pair
# scores with the wrong (already sorted) rows.
valid = (
    valid
    .assign(prob=probs.ravel())
    .sort_values(by='prob', ascending=False)
    .reset_index(drop=True)
)

# Per customer: article ids ordered from highest to lowest score.
pred = (
    valid.groupby('customer_id')['article_id']
    .apply(list)
    .reset_index()
    .rename(columns={'article_id': 'prediction'})
)

In [41]:
# Keep only the identifiers and the score for export.
export_cols = ['customer_id','article_id','prob']
valid = valid.loc[:, export_cols]

In [43]:
# Persist the scored validation candidates (no index column) for later use.
nn_valid_path = data_dir/'external'/'nn_valid.csv'
valid.to_csv(nn_valid_path, index=False)

In [36]:
# Left-join the ranked predictions onto the ground-truth table, one row per
# customer that has labels.
# NOTE: this overwrites `label`, so the cell is not idempotent — running it a
# second time yields `prediction_x` / `prediction_y` instead of `prediction`.
label = pd.merge(label, pred, how='left', on='customer_id')

In [37]:
# Peek at the first rows of the merged label / prediction table.
label.head(n=5)

Unnamed: 0,customer_id,article_id,prediction_x,prediction_y
0,81,[28968],"[73340, 42130, 74, 44033, 82629, 67523, 104046...","[73340, 42130, 74, 44033, 82629, 67523, 104046..."
1,87,[87372],"[100229, 98607, 27906, 33869, 104074, 53893, 7...","[100229, 98607, 27906, 33869, 104074, 53893, 7..."
2,108,"[69712, 77257, 33873]","[13043, 2220, 61304, 61305, 61306, 53893, 5389...","[13043, 2220, 61304, 61305, 61306, 53893, 5389..."
3,118,[97392],"[82629, 67523, 103797, 94657, 104046, 104073, ...","[82629, 67523, 103797, 94657, 104046, 104073, ..."
4,180,"[102398, 98410, 74, 95785, 103797, 105104, 103...","[104987, 95217, 105181, 3511, 95218, 103794, 1...","[104987, 95217, 105181, 3511, 95218, 103794, 1..."


In [39]:
# MAP@12 of the ranked predictions against the validation labels.
# A single merge of `pred` into `label` produces the column `prediction`;
# `prediction_y` only exists when the merge cell has been executed twice, so
# referencing it raises KeyError on a fresh Restart & Run All.
map_at_k(label['article_id'], label['prediction'], k=12)

0.029260257132958767