In [None]:
# %pip install -U lightgbm==3.3.2

In [1]:
# %pip install implicit

In [2]:
# Colab only: mount Google Drive at /content/drive so the project files stored
# under /content/drive/MyDrive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np

import pickle
from tqdm import tqdm
import gc
from pathlib import Path

from gensim.models import Word2Vec

import warnings
import sys
from IPython.core.interactiveshell import InteractiveShell

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")
sys.path.append("/content/drive/MyDrive/HM-new") # project root that contains the `src` package
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Register tqdm's progress-bar variants of the pandas apply methods.
tqdm.pandas()

In [4]:
from src.data import DataHelper
import pickle

In [5]:
from collections import defaultdict

In [6]:
# Locations on the mounted Drive: `data_dir` is read by DataHelper and receives
# the exported embeddings (under `external/`); `model_dir` holds the Word2Vec
# model files.
data_dir = Path("/content/drive/MyDrive/HM-new/data/")
model_dir = Path("/content/drive/MyDrive/HM-new/models/")

In [7]:
# Load the previously preprocessed dataset saved under the name "encoded_full".
# Later cells read data['inter'], data['user'] and data['item'] from it.
dh = DataHelper(data_dir)
# data = dh.preprocess_data(save=True, name="encoded_full") # * run only once, processed data will be saved
data = dh.load_data(name="encoded_full")

### Train


In [10]:
# Interaction table; the cells below use its customer_id, article_id and t_dat columns.
# NOTE: this is a reference to data['inter'], not a copy.
inter = data['inter']

In [None]:
# Keep only the transactions dated strictly before the cutoff stored in
# `last_week_start` (2020-08-19); everything from that date on is excluded
# from embedding training.
last_week_start = pd.to_datetime("2020-08-19")

# The dtype conversion is written back into the existing frame (the same
# object that data['inter'] refers to) before the filtered copy is taken.
inter['t_dat'] = pd.to_datetime(inter['t_dat'])
inter = inter[inter['t_dat'] < last_week_start]

In [None]:
# One "sentence" per customer for Word2Vec: that customer's article ids, cast
# to strings, in the row order of `inter`. `.values` yields an object array of
# Python lists.
feedid_seq_list = (
    inter.groupby(['customer_id'])['article_id']
    .apply(lambda articles: [str(article) for article in articles])
    .values
)

In [None]:
# Train a 128-dimensional skip-gram (sg=1) Word2Vec model on the per-customer
# article sequences and persist it to `model_dir`.
#
# NOTE: `size=` and `iter=` are the gensim 3.x keyword names (gensim 4 renamed
# them to `vector_size=` and `epochs=`), so this cell requires gensim 3.x.
# NOTE: `seed=1` alone does not make training reproducible while workers > 1;
# gensim documents that a fully deterministic run also needs workers=1.
#
# CBOW alternative (sg=0), kept for reference:
# model_sg = Word2Vec(feedid_seq_list,  size=128, window=32, min_count=1, sg=0, sample=1e-3, negative=15, workers=32, seed=1, iter=10)
# model_sg.save(open(model_dir/'articleid_model_cbow.model','wb'))
model_sg = Word2Vec(feedid_seq_list,  size=128, window=32, min_count=1, sg=1, sample=1e-3, negative=15, workers=32, seed=1, iter=10)

# Use a context manager so the handle is flushed and closed deterministically;
# the original passed a bare open(...) that was never explicitly closed.
with open(model_dir/'articleid_model_skipgram.model', 'wb') as model_file:
    model_sg.save(model_file)

In [None]:
# Reload the trained Word2Vec model from `model_dir`. It is read back with
# pickle to match how it was written (saved through an open file handle).
#
# NOTE(security): pickle.load can execute arbitrary code; only load model
# files that were produced by this notebook.
#
# CBOW alternative, kept for reference:
# model_sg = pickle.load(open(model_dir/'articleid_model_cbow.model','rb'))

# Context manager closes the handle deterministically; the original leaked the
# object returned by open(...).
with open(model_dir/'articleid_model_skipgram.model', 'rb') as model_file:
    model_sg = pickle.load(model_file)

In [None]:
# * Customer Embedding
# A customer's vector is the mean of the Word2Vec vectors of the articles in
# their sequence, scaled to unit L2 norm and stored at row `customer_id`.
feedid_seq_list = (
    inter.groupby(['customer_id'])['article_id']
    .apply(lambda articles: [str(article) for article in articles])
    .reset_index()
)

# Rows that are never overwritten below (customers absent from `inter`) keep
# the constant 1/128 initial value. The matrix has one more row than there are
# customers -- presumably because ids are 1-based; TODO confirm.
full_users = data['user']['customer_id'].values
customer_embedding = np.ones((len(full_users) + 1, 128)) / 128

for uid, items in tqdm(feedid_seq_list.values):
    vec = (
        np.mean([model_sg[token] for token in items], axis=0)
        if len(items) > 1
        else model_sg[items[0]]
    )
    # L2-normalise before storing.
    customer_embedding[uid] = vec / np.sqrt((vec ** 2).sum())

100%|██████████| 1334713/1334713 [04:02<00:00, 5512.67it/s]


In [None]:
# Drop the large intermediates that are no longer needed and force a
# collection pass; `data` and `customer_embedding` are kept.
del full_users, feedid_seq_list, inter
gc.collect()

53

In [None]:
# Persist the customer embedding matrix.
# NOTE: ndarray.dump writes a *pickle* of the array, not the NPY format, even
# though the file is named `.npy`; read it back with pickle.load or
# np.load(..., allow_pickle=True).
# customer_embedding.dump(data_dir/'external'/'w2v_user_embd.npy')
customer_embedding.dump(data_dir/'external'/'w2v_skipgram_user_embd.npy')

In [None]:
# * Article Embedding
# Row `article_id` holds that article's Word2Vec vector scaled to unit L2 norm.
# The matrix has one more row than there are articles -- presumably because
# ids are 1-based; TODO confirm.
full_items = data['item']['article_id'].values
article_embedding = np.ones((len(full_items)+1, 128))/128
for item in tqdm(full_items):
    try:
        vec = model_sg[str(item)]
    except KeyError:
        # Article is not in the model vocabulary (it never occurs in the
        # training sequences): fall back to the constant initial row, which is
        # then L2-normalised like every other row.
        # Only KeyError is caught. The original bare `except:` also swallowed
        # unrelated failures (e.g. a model object that does not support
        # indexing), which would silently turn every row into the fallback.
        vec = article_embedding[item]
    article_embedding[item,:] = vec/np.sqrt(np.sum(vec**2))

100%|██████████| 105542/105542 [00:03<00:00, 34165.45it/s]


In [None]:
# article_embedding.dump(data_dir/'external'/'w2v_item_embd.npy')
article_embedding.dump(data_dir/'external'/'w2v_skipgram_item_embd.npy')

In [None]:
# * Product_code Embedding
# Distinct product codes in the item table; only the count is used below, to
# size the product embedding matrix.
full_products = list(data['item']['product_code'].unique())

In [None]:
# Product vector = sum of the (already unit-norm) article embeddings of every
# article sharing that product_code, rescaled to unit L2 norm.
product_embd_dict = {}
for product_code, group in tqdm(data['item'].groupby('product_code')):
    summed = article_embedding[group['article_id'].values].sum(axis=0)
    norm = np.sqrt(np.sum(summed ** 2))
    product_embd_dict[product_code] = summed / norm

100%|██████████| 47224/47224 [00:07<00:00, 5918.84it/s]


In [None]:
# Scatter the per-product vectors into a dense matrix indexed directly by
# product_code. Rows without an entry keep the constant 1/128 initial value;
# the extra row presumably accounts for 1-based codes -- TODO confirm.
n_product_rows = len(full_products) + 1
product_embedding = np.ones((n_product_rows, 128)) / 128
for product_code, vector in tqdm(product_embd_dict.items()):
    product_embedding[product_code] = vector

100%|██████████| 47224/47224 [00:00<00:00, 667652.54it/s]


In [None]:
# Persist the Word2Vec-based product embedding matrix.
# NOTE: ndarray.dump writes a pickle, not the NPY format, despite the `.npy`
# suffix; load with pickle.load or np.load(..., allow_pickle=True).
# product_embedding.dump(data_dir/'external'/'w2v_product_embd.npy')
product_embedding.dump(data_dir/'external'/'w2v_skipgram_product_embd.npy')

---

In [None]:
# * Product_code Embedding
# Distinct product codes in the item table (same statement as in the Word2Vec
# section above); the cells below use its length to size `product_embedding`.
full_products = list(data['item']['product_code'].unique())

In [9]:
# Product embeddings derived from the pre-computed DSSM article embeddings.
# NOTE: allow_pickle=True unpickles the file; only load files you trust.
dssm_item_embd = np.load(data_dir/'external'/'dssm_item_embd.npy', allow_pickle=True)

# Sum the article rows belonging to each product_code and rescale to unit L2
# norm. Rows are read at `article_id - 1`, i.e. this matrix is indexed 0-based.
product_embd_dict = {}
for product_code, group in tqdm(data['item'].groupby('product_code')):
    article_rows = group['article_id'].values - 1
    summed = np.sum(dssm_item_embd[article_rows], axis=0)
    norm = np.sqrt(np.sum(summed ** 2))
    product_embd_dict[product_code] = summed / norm

# Dense matrix in which row `product_code - 1` belongs to that product.
n_products = len(full_products)
product_embedding = np.ones((n_products, 128)) / 128
for product_code, vector in tqdm(product_embd_dict.items()):
    product_embedding[product_code - 1] = vector

In [11]:
# Persist the DSSM-based product embedding matrix.
# NOTE: ndarray.dump writes a pickle, not the NPY format, despite the `.npy`
# suffix; load with pickle.load or np.load(..., allow_pickle=True).
product_embedding.dump(data_dir/'external'/'dssm_product_embd.npy')

In [16]:
# Product embeddings derived from the pre-computed `yt_item_embd` article
# embeddings.
# NOTE: allow_pickle=True unpickles the file; only load files you trust.
yt_item_embd = np.load(data_dir/'external'/'yt_item_embd.npy', allow_pickle=True)

# Sum the article rows belonging to each product_code and rescale to unit L2
# norm. Rows are read at `article_id - 1`, i.e. this matrix is indexed 0-based.
product_embd_dict = {}
for product_code, group in tqdm(data['item'].groupby('product_code')):
    article_rows = group['article_id'].values - 1
    summed = np.sum(yt_item_embd[article_rows], axis=0)
    norm = np.sqrt(np.sum(summed ** 2))
    product_embd_dict[product_code] = summed / norm

# Dense matrix in which row `product_code - 1` belongs to that product.
n_products = len(full_products)
product_embedding = np.ones((n_products, 128)) / 128
for product_code, vector in tqdm(product_embd_dict.items()):
    product_embedding[product_code - 1] = vector

100%|██████████| 47224/47224 [00:05<00:00, 8468.67it/s]
100%|██████████| 47224/47224 [00:00<00:00, 859955.42it/s]


In [17]:
# Persist the product embedding matrix built from `yt_item_embd`.
# NOTE: ndarray.dump writes a pickle, not the NPY format, despite the `.npy`
# suffix; load with pickle.load or np.load(..., allow_pickle=True).
product_embedding.dump(data_dir/'external'/'yt_product_embd.npy')