In [59]:
# %pip install -U lightgbm==3.3.2

In [60]:
# NOTE(review): unpinned install, and `implicit` is not referenced by any cell visible here —
# confirm it is still needed and pin a version so re-runs are reproducible.
%pip install implicit



In [61]:
# Mount Google Drive (Colab only); every path used below lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [62]:
import pandas as pd
import numpy as np

import pickle
from tqdm import tqdm
import gc
from pathlib import Path

import warnings
import sys
from IPython.core.interactiveshell import InteractiveShell

# Notebook-wide configuration.
warnings.filterwarnings("ignore")  # hides every warning, including library deprecation warnings
sys.path.append("/content/drive/MyDrive/HM-new") # project root that contains the `src` package imported below
InteractiveShell.ast_node_interactivity = "all"  # display the value of every top-level expression, not only the last one
tqdm.pandas()  # register tqdm's progress_* methods on pandas objects

In [63]:
from src.data import DataHelper
from src.utils import calc_valid_date
import pickle

In [64]:
from collections import defaultdict

In [65]:
# Input data and trained models both live under the same project folder on Drive.
project_root = Path("/content/drive/MyDrive/HM-new")
data_dir = project_root / "data"
model_dir = project_root / "models"

In [66]:
# Project data helper, constructed with the data directory.
dh = DataHelper(data_dir)
# data = dh.preprocess_data(save=True, name="encoded_full") # * run only once, processed data will be saved
# Load the dataset saved under the name "encoded_full"; later cells index the result
# with the keys 'user', 'item' and 'inter'.
data = dh.load_data(name="encoded_full")

In [67]:

# Load the item image embeddings. allow_pickle=True is required because this file is
# rewritten further down with ndarray.dump, which stores the array as a pickle.
image_item_embd = np.load(data_dir/'external'/'image_embd.npy', allow_pickle=True)

In [68]:
# Prepend a constant padding row so that row 0 is a placeholder and the existing rows
# shift down by one. The padded array is later dumped back over the file it was loaded
# from, so without this guard every re-run would prepend one more row and shift every
# item index again. The row is only added when it is not already present.
image_pad_row = np.ones((1, 512)) / 512
if not np.array_equal(image_item_embd[:1], image_pad_row):
    image_item_embd = np.concatenate([image_pad_row, image_item_embd], axis=0)

In [None]:
# NOTE(review): this overwrites the very file the array was loaded from, now including the
# extra leading row — a later load of this file returns an array that already contains it.
image_item_embd.dump(data_dir/'external'/'image_embd.npy')

### Train


In [None]:
# Read the processed interaction table and register it in the shared data dict.
inter_path = data_dir / "processed" / "processed_inter.pqt"
inter = pd.read_parquet(inter_path)
data["inter"] = inter

In [None]:
# Parse the transaction dates in place, then keep only rows dated strictly before
# last_week_start.
inter['t_dat'] = pd.to_datetime(inter['t_dat'])
last_week_start = pd.to_datetime("2020-08-19")
is_before_cutoff = inter['t_dat'] < last_week_start
inter = inter[is_before_cutoff]

In [None]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

# Build one "sentence" per customer: that customer's article ids, as strings, in the
# row order of `inter`.
feedid_seq_list = inter.groupby(['customer_id']).article_id.apply(lambda x: [str(article) for article in x] ).values
# CBOW variant (kept for reference): the same call with sg=0, saved to 'articleid_model_cbow.model'.
# NOTE: `size=` and `iter=` are the gensim 3.x keyword names (gensim >= 4 renamed them to
# `vector_size=` and `epochs=`).
model_sg = Word2Vec(feedid_seq_list,  size=64, window=32, min_count=1, sg=1, sample=1e-3, negative=15, workers=32, seed=1, iter=10)
# Passing an open handle makes gensim pickle the model into it, which is what the
# pickle.load in the following cell expects. The with-block guarantees the handle is
# flushed and closed before the file is read back.
with open(model_dir/'articleid_model_skipgram.model','wb') as model_file:
    model_sg.save(model_file)

In [None]:
# Reload the trained model from disk (swap the file name for 'articleid_model_cbow.model'
# to use the CBOW variant). The with-block closes the handle once loading is done.
# NOTE(review): pickle.load can execute arbitrary code — only load model files produced
# by this notebook.
with open(model_dir/'articleid_model_skipgram.model','rb') as model_file:
    model_sg = pickle.load(model_file)

In [None]:
# * Customer Embedding
# One row per customer: (customer_id, [article ids as strings, in the row order of `inter`]).
feedid_seq_list = inter.groupby(['customer_id']).article_id.apply(lambda x: [str(article) for article in x] ).reset_index()

full_users = data['user']['customer_id'].values
# Row `uid` holds that customer's vector; customers without any remaining interaction keep
# the constant 1/64 default. The table has one more row than there are customers —
# presumably because ids are 1-based with row 0 as padding; TODO confirm against the encoder.
customer_embedding = np.ones((len(full_users)+1, 64))/64

# Look vectors up through `.wv`: indexing the Word2Vec model directly is deprecated in
# gensim 3.x and removed in gensim 4.
word_vectors = model_sg.wv
for uid, items in tqdm(feedid_seq_list.values):
    # Mean of the customer's article vectors (a single article is its own mean),
    # scaled to unit L2 norm.
    vec = np.mean([word_vectors[x] for x in items], axis=0)
    customer_embedding[uid] = vec/np.sqrt(np.sum(vec**2))

100%|██████████| 1334713/1334713 [04:56<00:00, 4496.21it/s]


In [None]:
# Drop the large intermediates and force a collection before building the next tables.
# NOTE(review): data["inter"] still references the unfiltered interaction frame, so that
# frame is not released here.
del full_users, feedid_seq_list, inter
gc.collect()

53

In [None]:
# customer_embedding.dump(data_dir/'external'/'w2v_user_embd.npy')
# Persist the customer vectors. ndarray.dump writes a pickle, so the file has to be read
# back with np.load(..., allow_pickle=True) or pickle.load.
customer_embedding.dump(data_dir/'external'/'w2v_skipgram_user_embd.npy')

In [None]:
# * Article Embedding
full_items = data['item']['article_id'].values
# Row `item` holds that article's unit-length vector; every row starts at the constant
# 1/64 default. The table has one more row than there are articles — presumably row 0 is
# a padding slot; TODO confirm against the encoder.
article_embedding = np.ones((len(full_items)+1, 64))/64
# Look vectors up through `.wv`: indexing the Word2Vec model directly is deprecated in
# gensim 3.x and removed in gensim 4.
word_vectors = model_sg.wv
for item in tqdm(full_items):
    try:
        vec = word_vectors[str(item)]
    except KeyError:
        # The article is not in the model vocabulary: fall back to its default row.
        # Only KeyError is caught so that any other failure surfaces instead of
        # silently leaving default vectors behind.
        vec = article_embedding[item]
    # Scale to unit L2 norm (the default row becomes a constant 1/8 vector).
    article_embedding[item,:] = vec/np.sqrt(np.sum(vec**2))

100%|██████████| 105542/105542 [00:03<00:00, 26965.72it/s]


In [None]:
# article_embedding.dump(data_dir/'external'/'w2v_item_embd.npy')
# Persist the article vectors. ndarray.dump writes a pickle, so the file has to be read
# back with np.load(..., allow_pickle=True) or pickle.load.
article_embedding.dump(data_dir/'external'/'w2v_skipgram_item_embd.npy')

In [None]:
# * Product_code Embedding
# Distinct product codes of the item table; the length of this list sizes the product
# embedding table built below.
full_products = list(data['item']['product_code'].unique())

In [None]:
# Product vector = sum of the vectors of all articles sharing the product code,
# rescaled to unit L2 norm.
product_embd_dict = {}
for pid, product_items in tqdm(data['item'].groupby('product_code')):
    article_rows = product_items['article_id'].values
    summed = article_embedding[article_rows].sum(axis=0)
    norm = np.sqrt((summed**2).sum())
    product_embd_dict[pid] = summed/norm

100%|██████████| 47224/47224 [00:09<00:00, 4993.40it/s]


In [None]:
# Dense lookup table indexed by product code; rows that receive no computed vector keep
# the constant 1/64 default.
n_product_rows = len(full_products) + 1
product_embedding = np.full((n_product_rows, 64), 1/64)
for pid, embd in tqdm(product_embd_dict.items()):
    product_embedding[pid] = embd

100%|██████████| 47224/47224 [00:00<00:00, 559178.96it/s]


In [None]:
# product_embedding.dump(data_dir/'external'/'w2v_product_embd.npy')
# Persist the product vectors. ndarray.dump writes a pickle, so the file has to be read
# back with np.load(..., allow_pickle=True) or pickle.load.
product_embedding.dump(data_dir/'external'/'w2v_skipgram_product_embd.npy')