In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb

import sys

sys.path.insert(0, './code')
import dataloader
import trainer
import numpy as np

In [2]:
# Set up the project helpers used by the rest of the notebook.
import importlib

# Data-access helper; used below for load_result(...) and save_data(...).
loader = dataloader.DataLoader()
# Re-execute the `trainer` module so that edits made to it since the kernel
# first imported it are picked up. This must run BEFORE the handlers below are
# constructed, otherwise they would be built from the stale class definitions.
importlib.reload(trainer)
# NOTE(review): of these handlers, only `amt_handler` (its ylabel_cols_23 /
# ylabel_cols_24 attributes) is referenced by the cells visible in this
# notebook; the others may be leftovers — confirm before removing.
amt_handler = trainer.AmtTrainHandler()
profile_handler = trainer.AmtProfileHandler()
cnt_handler = trainer.CntTrainHandler()
rank_handler = trainer.RankTopHandler()
stack_handler = trainer.StackTrainHandler()

In [3]:
# Load two previously saved result objects through the project loader.
# Judging by the file names these are the "amt" and "cnt" training results
# dated 2021-12-20; each one is unpacked into three parts in the next cell.
amt_results = loader.load_result('2021_12_20_amt_train_results.joblib')
cnt_results = loader.load_result('2021_12_20_cnt_train_results.joblib')

In [4]:
# Each result object must unpack into exactly three parts (a ValueError is
# raised otherwise). The names suggest test-set results, train-set results and
# fold indices — presumably (train_idx, val_idx) pairs, since the third element
# of the same amt file is iterated that way further down; verify for cnt.
test_amt_results, train_amt_results, amt_idx_results = amt_results
test_cnt_results, train_cnt_results, cnt_idx_results = cnt_results

In [5]:
# Load the saved "amt" feature table. Only the `chid` column and the label
# columns are kept from it; the frame itself is deleted right after that.
amt_feats = loader.load_result('2021_12_06_amt_feats.joblib')

In [6]:
# Keep just the customer id plus the two sets of label columns published by the
# amt handler, in that order, as an independent copy.
label_columns = ['chid'] + amt_handler.ylabel_cols_23 + amt_handler.ylabel_cols_24
amt_labels = amt_feats.loc[:, label_columns].copy()

# The full feature table is not needed any more; drop the reference so the
# kernel can reclaim its memory.
del amt_feats

In [7]:
import re


def _shop_tag(col, month):
    """Return the shop tag embedded in a label column name.

    Label columns are named like ``shop_<tag>_amt_<month>``; the digits of
    ``<tag>`` are returned as a string.

    Raises:
        ValueError: if ``col`` does not contain the expected pattern for
            ``month`` (instead of an opaque IndexError on an empty match list).
    """
    found = re.search(rf"shop_(\d+)_amt_{month}", col)
    if found is None:
        raise ValueError(f"label column {col!r} does not match shop_<tag>_amt_{month}")
    return found.group(1)


# Build a "long by month" layout: every per-shop list holds the month-23 labels
# of all customers followed by their month-24 labels, so `chid` is repeated twice.
stack_labels = {'chid': amt_labels['chid'].to_list() * 2}

# Iterate the handler's own label lists (the same lists `amt_labels` was built
# from) rather than slicing `amt_labels.columns` at hard-coded offsets, so the
# cell does not silently depend on there being exactly 16 shop tags.
for col in amt_handler.ylabel_cols_23:
    stack_labels[_shop_tag(col, 23)] = amt_labels[col].to_list()

# Append month 24 to the matching shop tag. A tag without a month-23 column
# raises KeyError, which keeps every list at the same (doubled) length.
for col in amt_handler.ylabel_cols_24:
    stack_labels[_shop_tag(col, 24)] += amt_labels[col].to_list()


In [8]:
# Turn the dict of equal-length lists into a DataFrame: one `chid` column plus
# one column per shop tag. Note that this rebinds `stack_labels` from a dict
# to a DataFrame.
stack_labels = pd.DataFrame(stack_labels)

In [9]:
# `stack_labels` holds the month-23 rows of every customer followed by the
# month-24 rows, i.e. two consecutive blocks of len(amt_labels) rows each.
# Derive that block size from the data instead of hard-coding the row count.
rows_per_month = len(amt_labels)
stack_labels['dt'] = [23] * rows_per_month + [24] * rows_per_month

# query_id = "<chid><dt>" (plain string concatenation, no separator); it
# identifies one customer-month group across all shop tags.
stack_labels['query_id'] = stack_labels['chid'].astype(str) + stack_labels['dt'].astype(str)

In [10]:
# The fold indices were already read from '2021_12_20_amt_train_results.joblib'
# and unpacked as `amt_idx_results` near the top of the notebook. Reuse them
# rather than loading the same file from disk a second time (which also
# rebound `_`, IPython's last-output variable).
idx_results = amt_idx_results


In [11]:
# Sanity check: show the rows selected by the second element of the first
# index pair (the validation indices, going by how the pairs are unpacked
# as (trn_ind, val_ind) in the loop below).
stack_labels.loc[idx_results[0][1]]

Unnamed: 0,chid,2,6,10,12,13,15,18,19,21,22,25,26,36,37,39,48,dt,query_id
0,10000000,0.0,0.0,9131.600003,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.000000,0.0,0.0,23,1000000023
6,10000006,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.000000,0.0,0.0,23,1000000623
8,10000008,0.0,0.0,15867.687212,0.0,0.0,9669.978547,6980.867235,0.0,0.0,0.000000,4219.34189,0.0,0.000000,19524.323715,0.0,0.0,23,1000000823
11,10000011,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,8496.599701,0.00000,0.0,0.000000,0.000000,0.0,0.0,23,1000001123
16,10000016,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.000000,0.0,0.0,23,1000001623
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999990,10499990,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.000000,0.0,0.0,24,1049999024
999991,10499991,0.0,0.0,10142.115730,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.000000,0.0,0.0,24,1049999124
999992,10499992,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.000000,15904.310826,0.0,0.0,24,1049999224
999995,10499995,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.000000,0.0,0.0,24,1049999524


In [12]:
# For every fold collect, from its validation rows:
#   * query_ids[i]    - list of "<chid><dt>" ids
#   * train_labels[i] - DataFrame with the `chid` and `dt` columns
query_ids = []
train_labels = []
for (trn_ind, val_ind) in idx_results:
    # Select the validation rows once, and only the columns that are needed,
    # instead of materialising the full-width slice twice via chained indexing.
    fold_rows = stack_labels.loc[val_ind, ['chid', 'dt', 'query_id']]
    query_ids.append(fold_rows['query_id'].to_list())
    train_labels.append(fold_rows[['chid', 'dt']])

In [13]:
# Reshape from wide (one column per shop tag) to long: one row per
# (chid, dt, query_id, shop_tag) carrying that shop's amount in `txn_amt`.
# NOTE: this rebinds `stack_labels`, so the cell must not be run twice in a row.
id_columns = ["chid", 'dt', 'query_id']
stack_labels = pd.melt(
    stack_labels,
    id_vars=id_columns,
    var_name="shop_tag",
    value_name="txn_amt",
)

In [14]:
# One long-format label frame per fold, holding the rows whose query_id belongs
# to that fold's validation set. Built from `query_ids` itself so the list
# always has one entry per fold and stays aligned with `train_labels`, instead
# of hard-coding fold indices 0..2.
rank_labels = [
    stack_labels[stack_labels['query_id'].isin(fold_query_ids)]
    for fold_query_ids in query_ids
]

In [15]:
# Row count of the melted frame (the recorded output below is 16000000).
len(stack_labels)

16000000

In [16]:
# Build the label frame for the test period: take the month-24 rows of the
# long-format labels, relabel them as month 25 and regenerate the query ids to
# match. NOTE(review): `txn_amt` therefore still carries month-24 amounts —
# presumably only a placeholder for the unknown month-25 targets; confirm.
test_labels = stack_labels[stack_labels['dt'] == 24].reset_index(drop=True)
test_labels['dt'] = 25
# Vectorised string conversion instead of a per-row Python lambda.
test_labels['query_id'] = test_labels['chid'].astype(str) + test_labels['dt'].astype(str)

In [17]:
# Persist the three label collections as a single list; whoever loads the file
# must unpack them in this order: rank_labels, test_labels, train_labels.
# NOTE(review): the third argument looks like a free-text description/log
# message for the project loader — confirm against DataLoader.save_data.
loader.save_data([rank_labels, test_labels, train_labels], '2021_12_20_stack_labels.joblib', 'generate labels')