In [1]:
# ! git clone https://github.com/Wp-Zhang/HandyRec.git

In [2]:
%pip install tensorflow_addons



In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import sys
sys.path.append("/content/drive/MyDrive/HM-new/")
# sys.path.append("./HandyRec/")

In [5]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Embedding, Input, Dense, Dropout, BatchNormalization, Concatenate, Activation
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.utils import plot_model
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
import gc

In [6]:
import tensorflow_addons as tfa

In [7]:
# from handyrec.layers.core import DNN
# from handyrec.layers.utils import concat
# from handyrec.features import DenseFeature, SparseFeature, FeatureGroup, FeaturePool

In [8]:
from src.data import DataHelper
from src.data.metrics import map_at_k, recall_at_k

In [9]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [10]:
from pathlib import Path
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [11]:
# tf.compat.v1.disable_eager_execution()

In [12]:
# Model / training hyper-parameters.
# NOTE(review): in the visible cells `model.fit` uses batch_size=2048 / epochs=30 and
# `model.predict` uses batch_size=4096 as literals, and RANK_EMBEDDING_DIM is never
# referenced — confirm whether these constants are still meant to drive training.
RANK_EMBEDDING_DIM = 64
BATCH_SIZE = 2**12
NEPOCH = 20

In [13]:
# Weekly files are loaded for weeks 1 .. WEEK_NUM-1; week 1 is held out for validation
# and weeks > 1 are used for training.
TRAIN_WEEK_NUM = 4
WEEK_NUM = TRAIN_WEEK_NUM + 2

# Sub-folder of data_dir/"processed" that holds the weekly candidate/label parquet files.
VERSION_NAME = "pivot"
TEST = False # * Set to `False` when doing local experiments to save time

In [14]:
# Project folders on the mounted Google Drive: datasets/caches and model checkpoints.
data_dir = Path("/content/drive/MyDrive/HM-new/data/")
model_dir = Path("/content/drive/MyDrive/HM-new/models/")

In [15]:
# Load the pre-processed dataset through the project helper.
# `data` is indexed below with the keys 'inter', 'item' and 'user'
# (presumably interactions / article table / customer table — confirm against DataHelper).
dh = DataHelper(data_dir)
data = dh.load_data(name="encoded_full")

In [16]:
# Interactions up to a fixed cutoff date; these feed the TF-IDF corpora built below.
# NOTE(review): the bound is inclusive — confirm that 2020-08-19 itself is meant to be
# part of the embedding data and does not overlap the earliest training-label week.
inter = data['inter'].loc[lambda df: df.t_dat <= "2020-08-19"]

## Calculate & Load Embeddings

In [17]:
# Item text embedding: TF-IDF over the article name columns + description, reduced by SVD.
# The result is cached on disk and only recomputed when the cache file is missing.
tfidf_item_path = data_dir/'external/tfidf_item_embd.npy'
if os.path.exists(tfidf_item_path):
    # NOTE: the cache is written with ndarray.dump (pickle), hence allow_pickle=True.
    tfidf_item = np.load(tfidf_item_path, allow_pickle=True)
else:
    articles = pd.read_csv(data_dir/'raw/articles.csv')

    # One document per article: every column whose name contains 'name', plus 'detail_desc'.
    text_columns = [col for col in articles.columns if 'name' in col] + ['detail_desc']
    corpus = articles[text_columns].apply(lambda row: ' '.join(map(str, row)), axis=1)

    X = TfidfVectorizer(min_df=3).fit_transform(corpus)
    tfidf_item = TruncatedSVD(n_components=256, random_state=0).fit_transform(X)
    # Prepend one row of ones so the matrix has an extra leading row
    # (presumably for a reserved/padding id 0 — confirm against the id encoding).
    tfidf_item = np.concatenate([np.ones((1,256)), tfidf_item], axis=0)
    tfidf_item.dump(tfidf_item_path)

In [18]:
# Item embedding from purchase history: each article is a "document" made of the ids of
# the customers who bought it; TF-IDF followed by SVD. Cached on disk.
tfidf_item2_path = data_dir/'external/tfidf_item_embd2.npy'
if os.path.exists(tfidf_item2_path):
    tfidf_item2 = np.load(tfidf_item2_path, allow_pickle=True)
else:
    corpus = inter.groupby('article_id').customer_id.apply(lambda x: ' '.join(map(str, x)))
    article_ids = np.array(list(corpus.index))

    X = TfidfVectorizer(min_df=3).fit_transform(corpus)
    X_svd = TruncatedSVD(n_components=128, random_state=0).fit_transform(X)

    item_num = data['item']['article_id'].nunique()
    # Articles with no interaction before the cutoff keep the constant default row 1/128.
    tfidf_item2 = np.full((item_num+1, 128), 1 / 128)
    # Row index == article_id; the groupby index is unique, so one vectorised write suffices.
    tfidf_item2[article_ids, :] = X_svd

    tfidf_item2.dump(tfidf_item2_path)

In [19]:
# customer_id - product_code TFIDF + SVD
# User embedding from purchase history: each customer is a "document" made of the
# product codes they bought; TF-IDF followed by SVD. Cached on disk.
tfidf_user_path = data_dir/'external/tfidf_user_embd.npy'
if not os.path.exists(tfidf_user_path):
    # Attach product_code on a *local* frame instead of rebinding `inter`:
    # rebinding made the cell non-idempotent (a second merge would create
    # product_code_x / product_code_y) and silently changed `inter` for later cells.
    # Only the two columns needed for the corpus are merged, which also saves memory.
    inter_product = inter[['customer_id', 'article_id']].merge(
        data['item'][['article_id','product_code']], on=['article_id'], how='left'
    )
    corpus = inter_product.groupby('customer_id').product_code.apply(lambda x: ' '.join(map(str, x)))
    del inter_product
    customer_ids = np.array(list(corpus.index))

    vectorizer = TfidfVectorizer(min_df=3)
    X = vectorizer.fit_transform(corpus)
    svd = TruncatedSVD(n_components=128, random_state=0)
    X_svd = svd.fit_transform(X)

    user_num = data['user']['customer_id'].nunique()
    # Customers with no interaction before the cutoff keep the constant default row 1/128.
    tfidf_user = np.ones((user_num+1, 128)) / 128
    # Row index == customer_id; the groupby index is unique, so one vectorised write suffices.
    tfidf_user[customer_ids, :] = X_svd

    tfidf_user.dump(tfidf_user_path)
else:
    # NOTE: the cache is written with ndarray.dump (pickle), hence allow_pickle=True.
    tfidf_user = np.load(tfidf_user_path, allow_pickle=True)

In [20]:
# * Load pre-trained embeddings
external_dir = data_dir/'external'


def _load_embedding(file_name):
    """Load one pre-computed embedding matrix from the external data folder."""
    return np.load(external_dir/file_name, allow_pickle=True)


w2v_user_embd = _load_embedding('w2v_user_embd.npy')
w2v_item_embd = _load_embedding('w2v_item_embd.npy')
w2v_product_embd = _load_embedding('w2v_product_embd.npy')
image_item_embd = _load_embedding('image_embd.npy')
w2v_sg_user_embd = _load_embedding('w2v_skipgram_user_embd.npy')
w2v_sg_item_embd = _load_embedding('w2v_skipgram_item_embd.npy')
w2v_sg_product_embd = _load_embedding('w2v_skipgram_product_embd.npy')

## Load Candidates & Prepare Data

In [21]:
# Read the weekly candidate and label tables (weeks 1 .. WEEK_NUM-1), keyed by week number.
processed_dir = data_dir/"processed"/VERSION_NAME
candidates = {}
labels = {}
for week in tqdm(range(1, WEEK_NUM)):
    candidates[week] = pd.read_parquet(processed_dir/f"week{week}_candidate.pqt")
    labels[week] = pd.read_parquet(processed_dir/f"week{week}_label.pqt")

100%|██████████| 5/5 [00:09<00:00,  1.83s/it]


In [22]:
# Columns of the candidate table that are not model inputs.
non_feature_columns = {"label", "sales_channel_id", "t_dat", "week"}
feats = [col for col in candidates[1].columns if col not in non_feature_columns]

# Categorical features are currently all disabled; the candidates are kept for reference.
cat_features = [
    # "product_type_no",
    # "product_group_name",
    # "graphical_appearance_no",
    # "colour_group_code",
    # "perceived_colour_value_id",
    # "perceived_colour_master_id",
    # "FN",
    # "Active",
    # "club_member_status",
    # "fashion_news_frequency",
    # "user_gender",
    # "article_gender",
    # "season_type",
    # "age",
]
ids = ["customer_id", "article_id", "product_code"]

# Everything that is neither an id nor a categorical feature is treated as dense.
non_dense_columns = set(cat_features + ids)
dense_feats = [col for col in feats if col not in non_dense_columns]
# feats = ids + cat_features + dense_feats

In [23]:
# Stack all weekly candidate tables, then split by week:
# week 1 is the validation set, every later week number is training data.
weekly_frames = [candidates[week] for week in range(1, WEEK_NUM)]
full_data = pd.concat(weekly_frames, ignore_index=True)
del weekly_frames

week_number = full_data['week']
train = full_data[week_number > 1]
valid = full_data[week_number == 1]
del week_number

In [24]:
# The per-week frames have been concatenated into full_data; release them to save memory.
del candidates
gc.collect()

50

In [25]:
# Standardize
# for feat in dense_feats:
    # mask = train[feat].notnull()
    # value = train.loc[mask, feat].mean()
    # train[feat] = train[feat].fillna(value)
    # valid[feat] = valid[feat].fillna(value)
    # scaler = MinMaxScaler().fit(train[feat].values.reshape(-1,1))
    # train[feat] = scaler.transform(train[feat].values.reshape(-1,1))
    # valid[feat] = scaler.transform(valid[feat].values.reshape(-1,1))

In [26]:
# Vocabulary size (max id + 1) for every id / categorical feature.
# The user table is checked first, then the item table, then the candidate data.
feat_dim = {}
for feat in cat_features + ids:
    if feat in data['user'].columns:
        source_frame = data['user']
    elif feat in data['item'].columns:
        source_frame = data['item']
    else:
        source_frame = full_data
    feat_dim[feat] = int(source_frame[feat].max()) + 1

In [27]:
# train/valid and feat_dim have been derived from full_data; release it to save memory.
del full_data
gc.collect()

150

In [28]:
# Training arrays: integer id triplet, float32 dense feature matrix, and labels.
# NOTE(review): feats[3:] assumes the first three entries of `feats` are the id columns;
# `dense_feats` expresses the same selection by name — confirm the column order.
train_dense_columns = feats[3:]
X_train1 = train[['customer_id', 'article_id', 'product_code']].values.astype('int32')
X_train2 = np.zeros((X_train1.shape[0], len(train_dense_columns)), dtype='float32')
for col_idx, col_name in tqdm(enumerate(train_dense_columns)):
    # NaNs are replaced by 0; the source column is dropped right away to limit peak memory.
    X_train2[:, col_idx] = np.nan_to_num(train[col_name].values).astype('float32')
    del train[col_name]
y_train = train['label'].values

95it [00:18,  5.07it/s]


In [29]:
# Validation arrays, built exactly like the training arrays.
# NOTE(review): feats[3:] assumes the first three entries of `feats` are the id columns;
# `dense_feats` expresses the same selection by name — confirm the column order.
valid_dense_columns = feats[3:]
X_test1 = valid[['customer_id', 'article_id', 'product_code']].values.astype('int32')
X_test2 = np.zeros((X_test1.shape[0], len(valid_dense_columns)), dtype='float32')
for col_idx, col_name in tqdm(enumerate(valid_dense_columns)):
    # NaNs are replaced by 0; the source column is dropped right away to limit peak memory.
    X_test2[:, col_idx] = np.nan_to_num(valid[col_name].values).astype('float32')
    del valid[col_name]
y_test = valid['label'].values

95it [00:05, 16.94it/s]


## Train Model

In [30]:
# Frozen customer embedding tables, initialised from the pre-computed matrices.
n_customers = feat_dim["customer_id"]
customer_embd_layer_1 = Embedding(
    input_dim=n_customers, output_dim=64, weights=[w2v_sg_user_embd], trainable=False
)
customer_embd_layer_2 = Embedding(
    input_dim=n_customers, output_dim=64, weights=[w2v_user_embd], trainable=False
)
customer_embd_layer_3 = Embedding(
    input_dim=n_customers, output_dim=128, weights=[tfidf_user], trainable=False
)

In [31]:
# Frozen article embedding tables, initialised from the pre-computed matrices.
n_articles = feat_dim["article_id"]

article_embd_layer_1 = Embedding(
    input_dim=n_articles, output_dim=64, weights=[w2v_sg_item_embd], trainable=False
)

article_embd_layer_2 = Embedding(
    input_dim=n_articles, output_dim=64, weights=[w2v_item_embd], trainable=False
)

# Text TF-IDF/SVD embedding (256-d)
article_embd_layer_3 = Embedding(
    input_dim=n_articles, output_dim=256, weights=[tfidf_item], trainable=False
)

# Customer-history TF-IDF/SVD embedding (128-d)
article_embd_layer_4 = Embedding(
    input_dim=n_articles, output_dim=128, weights=[tfidf_item2], trainable=False
)

# Embedding loaded from image_embd.npy (512-d)
article_embd_layer_5 = Embedding(
    input_dim=n_articles, output_dim=512, weights=[image_item_embd], trainable=False
)

In [32]:
# Frozen product-code embedding tables, initialised from the pre-computed matrices.
n_products = feat_dim["product_code"]
product_embd_layer_1 = Embedding(
    input_dim=n_products, output_dim=64, weights=[w2v_sg_product_embd], trainable=False
)
product_embd_layer_2 = Embedding(
    input_dim=n_products, output_dim=64, weights=[w2v_product_embd], trainable=False
)

In [33]:
class FM(tf.keras.layers.Layer):
    """Factorization Machine layer.

    Input:  stacked field embeddings, (batch_size, num_of_fields, embedding_dim).
    Output: (batch_size, 1) = global bias + first-order term + second-order
    pairwise-interaction term.
    """

    def __init__(self, **kwargs):
        # Placeholders; the actual sub-layer and bias weight are created in `build`.
        self.linear = None
        self.w_0 = None

        super().__init__(**kwargs)

    def build(self, input_shape):
        super().build(input_shape)
        # Bias-free projection of every field embedding to a scalar (first-order weights).
        self.linear = Dense(1, use_bias=False)
        # Global bias, initialised to zero.
        self.w_0 = self.add_weight(
            name="W_0",
            shape=(1,),
            dtype=tf.float32,
            initializer=tf.keras.initializers.Zeros(),
            trainable=True,
        )

    def call(self, inputs, mask=None, *args, **kwargs):
        # First-order term: per-field scalar, summed over fields -> (batch_size, 1)
        first_order = tf.reduce_sum(self.linear(inputs), axis=1)

        # Second-order term, using 0.5 * ((sum_f v_f)^2 - sum_f v_f^2):
        # both intermediates are (batch_size, embedding_dim)
        squared_of_sum = tf.square(tf.reduce_sum(inputs, axis=1))
        sum_of_squares = tf.reduce_sum(tf.square(inputs), axis=1)
        interaction = squared_of_sum - sum_of_squares

        # Collapse the embedding axis -> (batch_size, 1)
        second_order = 0.5 * tf.reduce_sum(interaction, axis=1, keepdims=True)

        return tf.nn.bias_add(first_order + second_order, self.w_0)

    def compute_output_shape(self, input_shape):
        # One scalar per example, whatever the number of fields / embedding size.
        return (None, 1)

In [34]:
# Build the ranking network.
# inputs1: integer id triplet (customer_id, article_id, product_code), in the column order of X_train1.
# inputs2: float32 dense feature vector, same width as X_train2.
inputs1 = Input(shape=X_train1.shape[1:], dtype=tf.int64)
inputs2 = Input(shape=X_train2.shape[1:], dtype=tf.float32)
# NOTE(review): inputs1 is already declared as int64, so this cast looks redundant.
input1 = tf.cast(inputs1, dtype=tf.int64)

# Customer embeddings (id column 0).
x_c_id1 = customer_embd_layer_1(input1[:,0])
x_c_id2 = customer_embd_layer_2(input1[:,0])
# NOTE(review): x_c_id3 is looked up here but is not part of the Concatenate below —
# confirm whether leaving the TF-IDF user embedding out is intentional.
x_c_id3 = customer_embd_layer_3(input1[:,0])

# Article embeddings (id column 1); the 256-d and 512-d tables are projected to 128-d
# by trainable Dense layers.
x_a_id1 = article_embd_layer_1(input1[:,1])
x_a_id2 = article_embd_layer_2(input1[:,1])
x_a_id3 = article_embd_layer_3(input1[:,1])
x_a_id3 = Dense(128)(x_a_id3)
x_a_id4 = article_embd_layer_4(input1[:,1])
x_a_id5 = article_embd_layer_5(input1[:,1])
x_a_id5 = Dense(128)(x_a_id5)

# Product-code embeddings (id column 2).
x_p_id1 = product_embd_layer_1(input1[:,2])
x_p_id2 = product_embd_layer_2(input1[:,2])


# All id embeddings side by side.
x_id = Concatenate(axis=-1)([
    x_c_id1, x_c_id2,
    x_a_id1, x_a_id2, x_a_id3, x_a_id4, x_a_id5,
    x_p_id1, x_p_id2,
])

# x0 = id embeddings + batch-normalised dense features; it feeds the MLP and is also
# concatenated with the MLP output before the final layer.
x0 = Concatenate(axis=-1)([x_id, BatchNormalization()(inputs2)])
# x = Dropout(0.2)(x0)
# x = Dense(1024, activation='swish')(x)
x = Dropout(0.2)(x0)
x = Dense(512, activation='swish')(x)
x = Dropout(0.2)(x)
x = Dense(256, activation='swish')(x)

x = Concatenate(axis=-1)([x, x0])
x = Dropout(0.2)(x)

# Single sigmoid unit, trained with binary cross-entropy below.
output = Dense(1, activation='sigmoid')(x)

# Disabled experiment: add an FM term over the w2v embeddings before the sigmoid.
# x_c_id2_expand = tf.expand_dims(x_c_id2, axis=1)
# x_a_id2_expand = tf.expand_dims(x_a_id2, axis=1)
# x_p_id2_expand = tf.expand_dims(x_p_id2, axis=1)
# fm_output = FM()(Concatenate(axis=1)([x_c_id2_expand, x_a_id2_expand, x_p_id2_expand]))
# output = output + fm_output
# output = Activation('sigmoid')(output)

model = tf.keras.Model(inputs=[inputs1, inputs2], outputs=[output])
model.summary()
    
# AdamW optimiser; the 'AUC' metric is what the callbacks in the training cell monitor as 'val_auc'.
model.compile(
    tfa.optimizers.AdamW(learning_rate=0.001, weight_decay=1e-4),
    loss = 'binary_crossentropy',
    metrics=['AUC']
)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 3)]          0           []                               
                                                                                                  
 tf.cast (TFOpLambda)           (None, 3)            0           ['input_1[0][0]']                
                                                                                                  
 tf.__operators__.getitem_5 (Sl  (None,)             0           ['tf.cast[0][0]']                
 icingOpLambda)                                                                                   
                                                                                                  
 tf.__operators__.getitem_7 (Sl  (None,)             0           ['tf.cast[0][0]']            

In [35]:
# Keep only the weights of the epoch with the best validation AUC, and stop once
# validation AUC has not improved for 10 epochs.
checkpoint_path = model_dir/'model_nn.h5'
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc',
    mode='max',
    patience=10,
)
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_auc',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
)

history = model.fit(
    x=[X_train1, X_train2],
    y=y_train.astype(int),
    validation_data=([X_test1, X_test2], y_test.astype(int)),
    epochs=30,
    batch_size=2048,
    shuffle=True,
    callbacks=[checkpoint, early_stop],
)
# Author's log of earlier results (presumably validation AUC — confirm):
# 0.7114
# 0.7294
# 0.7382
# 0.7565

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [36]:
# Restore the weights written by the ModelCheckpoint callback (best val_auc epoch).
model.load_weights(model_dir/'model_nn.h5')

In [37]:
# Score every validation candidate (one sigmoid output per row).
probs = model.predict([X_test1, X_test2], batch_size=4096)

In [38]:
# Ground truth for validation: per customer, the list of articles bought on/after 2020-09-16.
label = (
    data['inter']
    .loc[lambda df: df['t_dat'] >= '2020-09-16']
    .groupby('customer_id')['article_id']
    .apply(list)
    .reset_index()
)

In [39]:
# Attach the scores, then build per-customer article lists ordered by descending score.
valid['prob'] = probs.reshape(-1)
pred = (
    valid.sort_values(by='prob', ascending=False)
    .reset_index(drop=True)
    .groupby('customer_id')['article_id']
    .apply(list)
    .reset_index()
    .rename(columns={'article_id': 'prediction'})
)

In [40]:
# valid = valid[['customer_id','article_id','prob']]

In [41]:
# valid.to_csv(data_dir/'external'/'nn_valid.csv', index=None)

In [42]:
# Attach each customer's ranked predictions to the ground-truth frame.
# Any `prediction` column left over from a previous run of this cell is dropped first:
# without that, re-running the cell yields `prediction_x` / `prediction_y` and the metric
# cell fails with a KeyError. On a first run the drop is a no-op.
# NOTE(review): customers without any candidate get NaN in `prediction` (left join) —
# confirm that map_at_k handles missing predictions.
label = label.drop(columns='prediction', errors='ignore').merge(pred, on='customer_id', how='left')

In [43]:
# Sanity check: purchased articles next to the ranked predictions for a few customers.
label.head()

Unnamed: 0,customer_id,article_id,prediction
0,81,[28968],"[42130, 74, 67523, 104046, 44033, 71108, 94657..."
1,87,[87372],"[100229, 27906, 98607, 104074, 103584, 33869, ..."
2,108,"[69712, 77257, 33873]","[13043, 61304, 2220, 61306, 61305, 53894, 5389..."
3,118,[97392],"[103794, 3092, 104073, 56695, 67523, 81554, 94..."
4,180,"[102398, 98410, 74, 95785, 103797, 105104, 103...","[105181, 95217, 104987, 104073, 103794, 95218,..."


In [44]:
# Validation metric at k=12 (presumably mean average precision — see src.data.metrics),
# comparing each customer's purchased articles with their ranked predictions.
map_at_k(label['article_id'], label['prediction'], k=12)
# Scores logged from earlier runs:
# 0.028500554033301987
# 0.029904528760153

# 0.031648009478868075
# 0.031309369857160076

# 0.03178218657727387

0.031658461969565656