In [2]:
import os
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from PIL import Image, ImageFile
from dateutil.relativedelta import relativedelta

from torch.utils.data import DataLoader, TensorDataset
from torchvision.transforms import Resize, ToTensor, Normalize, Compose
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from transformers import pipeline

import sys
sys.path.append('../')
from utils.timefeatures import time_features

from scipy.stats import boxcox
ImageFile.LOAD_TRUNCATED_IMAGES = True
torch.set_num_threads(1)


class ZeroShotDataset():
    """Assemble per-item sales, trend, image and text inputs into a ``TensorDataset``.

    NOTE(review): this notebook defines ``ZeroShotDataset`` again in a later
    cell; whichever definition is executed last is the one in effect, so keep
    the two in sync (or delete one of them).

    Parameters
    ----------
    sales_total_len, seq_len, output_dim : int
        Window sizes. ``output_dim`` is the number of weekly sales steps kept
        per item; ``sales_total_len`` is subtracted from ``trend_len`` to get
        the number of leading trend points the trend scalers are fitted on.
    data_df : pandas.DataFrame
        One row per item. Column labels are parsed with ``pd.to_datetime``;
        the index labels are used to look items up in ``meta_df``,
        ``qcut_df``, the trend mappings and the image folder.
    img_root : str
        Directory containing ``<item label>.png`` images.
    cat_trend, fab_trend, col_trend : mapping
        Item label -> trend series. A ``col_trend`` value may be the string
        ``"multi_color"``, in which case the colour trend is all zeros.
    trend_len : int
        Expected length of every trend series; series of any other length are
        replaced by zeros.
    scaler : str
        ``"standard"`` selects ``StandardScaler``; anything else selects
        ``MinMaxScaler``.
    no_scaling : bool
        When true the raw weekly sales are returned in place of the scaled
        ones (the scaler is still fitted and its statistics still returned).
    meta_df : pandas.DataFrame
        Indexed by item label. Columns starting with ``img`` / ``text`` and
        the positional columns ``3:26`` are used.
    qcut_df : pandas.DataFrame
        Indexed by item label, with ``qcut_label`` and ``sales_mean`` columns.
    opt_lambda : float, optional
        Box-Cox lambda. When ``None`` it is estimated from the ``sales_mean``
        of the items in ``data_df``.
    train : bool
        Stored on the instance; it does not change any behaviour here.
    """

    def __init__(self, sales_total_len, seq_len, output_dim, data_df, img_root,
                 cat_trend, fab_trend, col_trend, trend_len,
                 scaler, no_scaling, meta_df, qcut_df, opt_lambda=None, train=True):
        self.sales_total_len = sales_total_len
        self.seq_len = seq_len
        self.output_dim = output_dim
        self.data_range = self.sales_total_len - self.seq_len - self.output_dim + 1

        self.data_df = data_df
        self.img_root = img_root
        self.cat_trend = cat_trend
        self.fab_trend = fab_trend
        self.col_trend = col_trend

        self.img_df = meta_df.loc[:, meta_df.columns.str.startswith('img')]
        self.text_df = meta_df.loc[:, meta_df.columns.str.startswith('text')]
        self.meta_df = meta_df.iloc[:, 3:26]

        self.trend_len = trend_len
        self.scaler = StandardScaler() if scaler == "standard" else MinMaxScaler()
        self.no_scaling = no_scaling

        self.qcut_df = qcut_df
        self.qcut_label_mean = qcut_df.groupby('qcut_label')['sales_mean'].mean().values
        self.qcut_label_median = qcut_df.groupby('qcut_label')['sales_mean'].median().values

        # Identity test: `== None` would be evaluated element-wise if an array were passed.
        if opt_lambda is None:
            _, self.opt_lambda = boxcox(qcut_df.loc[self.data_df.index, 'sales_mean'])
        else:
            self.opt_lambda = opt_lambda

        self.train = train
        self.past_trend_len = trend_len - sales_total_len
        self.col_imputation = self.color_imputation()

    def __getitem__(self, idx):
        """Return the ``idx``-th row (by position) of ``data_df``."""
        return self.data_df.iloc[idx, :]

    def color_imputation(self):
        """Return the mean of the per-item mean colour trend, skipping ``"multi_color"`` items."""
        col_trend_df = pd.DataFrame(self.col_trend)
        # Positional access so the check does not depend on the frame's row labels.
        multi_color_items = [item for item in col_trend_df.columns
                             if col_trend_df[item].iloc[0] == "multi_color"]
        normal_item_df = col_trend_df.drop(columns=multi_color_items)
        return np.mean([np.mean(normal_item_df[item]) for item in normal_item_df.columns])

    def _standardize_trend(self, series):
        """Return ``series`` as a ``(trend_len, 1)`` float tensor, standardised on its leading part.

        The scaler is fitted on the first ``past_trend_len`` points only and
        then applied to the whole series. A series whose length differs from
        ``trend_len`` yields zeros.
        """
        values = np.array(series).reshape(-1, 1)
        if values.shape[0] != self.trend_len:
            return torch.zeros(self.trend_len, 1, dtype=torch.float32)
        fitted = StandardScaler().fit(values[:self.past_trend_len])
        return torch.FloatTensor(fitted.transform(values))

    def preprocess_data(self):
        """Build the full dataset in memory.

        Returns
        -------
        TensorDataset
            Tensors in this order: scaled sales, temporal features, trends
            (category, fabric, colour), images, text features, scaler
            statistics reshaped to ``(-1, 2)``, raw sales, release dates
            ``[year, month, day]``, positional item indices, meta features,
            ``qcut_label`` and the Box-Cox transformed ``sales_mean``.
        """
        data = self.data_df.reset_index(drop=True)

        sales, release_dates, ntrends = [], [], []
        image_features, text_features, sales_stamps = [], [], []
        scalers, real_value_sales, item_numbers_idx = [], [], []
        metas, qcut_labels, target_regs = [], [], []

        img_transforms = Compose([Resize((256, 256)), ToTensor(),
                                  Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])

        for (pos, row) in tqdm(data.iterrows(), total=len(data), ascii=True):
            # `pos` is the row position; `item_id` is the original index label.
            item_numbers_idx.append(pos)
            item_id = self.data_df.index[pos]

            metas.append(self.meta_df.loc[item_id].values)
            qcut_labels.append(int(self.qcut_df.loc[item_id]['qcut_label']))
            target_regs.append(boxcox(self.qcut_df.loc[item_id]['sales_mean'], self.opt_lambda))

            row.index = pd.to_datetime(row.index)

            # The earliest date with a non-null value is taken as the release date.
            release_date = row.dropna().sort_index().index[0]
            release_dates.append([release_date.year, release_date.month, release_date.day])

            last_week = release_date + relativedelta(weeks=self.output_dim - 1)
            row = row[release_date:last_week].resample('7d').sum().fillna(0)

            # Weekly stamps from 52 weeks before release up to the last sales week.
            # NOTE(review): the 52 is hard-coded; it equals past_trend_len only
            # when trend_len - sales_total_len == 52 -- confirm that is intended.
            time_feature_range = pd.date_range(release_date - relativedelta(weeks=52), last_week, freq='7d')
            sales_stamp = [time_features(time_feature_range, freq=unit)[0].tolist()
                           for unit in ('w', 'm', 'y')]

            weekly_values = np.array(row)
            real_value_sales.append(torch.FloatTensor(weekly_values))

            # The shared scaler is refitted on every item; its statistics are
            # recorded below even when no_scaling is set.
            scaled = self.scaler.fit_transform(weekly_values.reshape(-1, 1)).flatten()
            sales.append(torch.FloatTensor(weekly_values if self.no_scaling else scaled))

            if isinstance(self.scaler, StandardScaler):
                scalers.append([self.scaler.mean_, self.scaler.scale_])
            else:
                scalers.append([self.scaler.data_min_, self.scaler.data_range_])

            if item_id in self.cat_trend:
                cat_ntrend = self._standardize_trend(self.cat_trend[item_id])
                fab_ntrend = self._standardize_trend(self.fab_trend[item_id])
                raw_col = self.col_trend[item_id]
                if isinstance(raw_col, str) and raw_col == "multi_color":
                    col_ntrend = torch.zeros(self.trend_len, 1, dtype=torch.float32)
                else:
                    col_ntrend = self._standardize_trend(raw_col)
            else:
                # Items without a category trend get all-zero trends.
                cat_ntrend = torch.zeros(self.trend_len, 1, dtype=torch.float32)
                fab_ntrend = torch.zeros(self.trend_len, 1, dtype=torch.float32)
                col_ntrend = torch.zeros(self.trend_len, 1, dtype=torch.float32)

            ntrends.append(torch.stack([cat_ntrend, fab_ntrend, col_ntrend]).squeeze())

            # Close the image file as soon as the RGB copy has been made.
            with Image.open(os.path.join(self.img_root, item_id + '.png')) as img_file:
                image_features.append(img_transforms(img_file.convert('RGB')))

            # Text features are read as-is from the `text*` columns of meta_df.
            text_features.append(torch.FloatTensor(self.text_df.loc[item_id].values))

            sales_stamps.append(torch.FloatTensor(sales_stamp))

        # Create tensors for each part of the input/output
        item_sales = torch.stack(sales, dim=0)
        temporal_features = torch.stack(sales_stamps, dim=0)
        ntrends = torch.stack(ntrends, dim=0)
        images = torch.stack(image_features, dim=0)
        texts = torch.stack(text_features, dim=0)
        scalers = torch.FloatTensor(np.array(scalers)).view(-1, 2)
        real_value_sales = torch.stack(real_value_sales, dim=0)
        release_dates = torch.tensor(release_dates)
        item_numbers_idx = torch.tensor(item_numbers_idx)
        # Convert to one ndarray first: building a tensor from a list of
        # ndarrays is slow and makes torch emit a UserWarning.
        meta_data = torch.FloatTensor(np.asarray(metas, dtype=np.float32))
        qcut_labels = torch.LongTensor(qcut_labels)
        target_reg = torch.FloatTensor(target_regs)

        return TensorDataset(item_sales, temporal_features, ntrends, images, texts, scalers,
                             real_value_sales, release_dates, item_numbers_idx, meta_data,
                             qcut_labels, target_reg)

    def get_loader(self, batch_size, train=True):
        """Preprocess the data and wrap it in an unshuffled ``DataLoader``.

        ``train`` is kept for interface compatibility; both modes build the
        same loader (``shuffle=False``, ``num_workers=4``).
        """
        print('Starting dataset creation process...')
        data_with_gtrends = self.preprocess_data()
        data_loader = DataLoader(data_with_gtrends, batch_size=batch_size, shuffle=False, num_workers=4)
        print('Done.')

        return data_loader

    def __len__(self):
        """Return the number of items (rows of ``data_df``)."""
        return len(self.data_df)

In [3]:
class EmptyArgs():
    """Plain attribute container that holds the notebook's settings."""
    pass


args = EmptyArgs()

# General arguments
vars(args).update(log_dir='log', seed=21)

# Model specific arguments
vars(args).update(use_trends=1, use_img=1, use_text=1, num_trends=3)

# wandb arguments
vars(args).update(wandb_entity='bonbak', wandb_proj='sflab-gtm', wandb_run='Run1')

# Data locations and window sizes
vars(args).update(
    use_encoder_mask=False,
    trend_len=64,
    prepo_data_folder="/home/smart01/SFLAB/sanguk/mind_br_data_prepro/",
    data_folder="/home/smart01/SFLAB/sanguk/mind_br_data/",
    text_embedder='klue/bert-base',
    sales_total_len=12,
)

# Input and output windows both span the full sales length.
vars(args).update(
    seq_len=args.sales_total_len,
    output_dim=args.sales_total_len,
    autoregressive=1,
    scaler="standard",  # any other value selects MinMaxScaler in ZeroShotDataset
    learning_rate=0.0001,
    lead_time=2,
    no_scaling=False,
    ahead_step=6,
    val_output_week=12,
    val_output_month=3,
    num_attn_heads=8,
    hidden_dim=512,
    embedding_dim=256,
    only_4weeks_loss=False,
    num_hidden_layers=2,
)

# Training set-up
vars(args).update(
    model_type="GTM-Classification",
    autoregressive_train=False,
    teacher_forcing=True,
    epochs=100,
    batch_size=16,
    before_meta=False,
)

In [4]:
class ZeroShotDataset():
    """Assemble per-item sales, trend, image and text inputs into a ``TensorDataset``.

    NOTE(review): this notebook also defines ``ZeroShotDataset`` in an earlier
    cell; this later definition is the one in effect after a top-to-bottom
    run, so keep the two in sync (or delete one of them).

    Parameters
    ----------
    sales_total_len, seq_len, output_dim : int
        Window sizes. ``output_dim`` is the number of weekly sales steps kept
        per item; ``sales_total_len`` is subtracted from ``trend_len`` to get
        the number of leading trend points the trend scalers are fitted on.
    data_df : pandas.DataFrame
        One row per item. Column labels are parsed with ``pd.to_datetime``;
        the index labels are used to look items up in ``meta_df``,
        ``qcut_df``, the trend mappings and the image folder.
    img_root : str
        Directory containing ``<item label>.png`` images.
    cat_trend, fab_trend, col_trend : mapping
        Item label -> trend series. A ``col_trend`` value may be the string
        ``"multi_color"``, in which case the colour trend is all zeros.
    trend_len : int
        Expected length of every trend series; series of any other length are
        replaced by zeros.
    scaler : str
        ``"standard"`` selects ``StandardScaler``; anything else selects
        ``MinMaxScaler``.
    no_scaling : bool
        When true the raw weekly sales are returned in place of the scaled
        ones (the scaler is still fitted and its statistics still returned).
    meta_df : pandas.DataFrame
        Indexed by item label. Columns starting with ``img`` / ``text`` and
        the positional columns ``3:26`` are used.
    qcut_df : pandas.DataFrame
        Indexed by item label, with ``qcut_label`` and ``sales_mean`` columns.
    opt_lambda : float, optional
        Box-Cox lambda. When ``None`` it is estimated from the ``sales_mean``
        of the items in ``data_df``.
    train : bool
        Stored on the instance; it does not change any behaviour here.
    """

    def __init__(self, sales_total_len, seq_len, output_dim, data_df, img_root,
                 cat_trend, fab_trend, col_trend, trend_len,
                 scaler, no_scaling, meta_df, qcut_df, opt_lambda=None, train=True):
        self.sales_total_len = sales_total_len
        self.seq_len = seq_len
        self.output_dim = output_dim
        self.data_range = self.sales_total_len - self.seq_len - self.output_dim + 1

        self.data_df = data_df
        self.img_root = img_root
        self.cat_trend = cat_trend
        self.fab_trend = fab_trend
        self.col_trend = col_trend

        self.img_df = meta_df.loc[:, meta_df.columns.str.startswith('img')]
        self.text_df = meta_df.loc[:, meta_df.columns.str.startswith('text')]
        self.meta_df = meta_df.iloc[:, 3:26]

        self.trend_len = trend_len
        self.scaler = StandardScaler() if scaler == "standard" else MinMaxScaler()
        self.no_scaling = no_scaling

        self.qcut_df = qcut_df
        self.qcut_label_mean = qcut_df.groupby('qcut_label')['sales_mean'].mean().values
        self.qcut_label_median = qcut_df.groupby('qcut_label')['sales_mean'].median().values

        # Identity test: `== None` would be evaluated element-wise if an array were passed.
        if opt_lambda is None:
            _, self.opt_lambda = boxcox(qcut_df.loc[self.data_df.index, 'sales_mean'])
        else:
            self.opt_lambda = opt_lambda

        self.train = train
        self.past_trend_len = trend_len - sales_total_len
        self.col_imputation = self.color_imputation()

    def __getitem__(self, idx):
        """Return the ``idx``-th row (by position) of ``data_df``."""
        return self.data_df.iloc[idx, :]

    def color_imputation(self):
        """Return the mean of the per-item mean colour trend, skipping ``"multi_color"`` items."""
        col_trend_df = pd.DataFrame(self.col_trend)
        # Positional access so the check does not depend on the frame's row labels.
        multi_color_items = [item for item in col_trend_df.columns
                             if col_trend_df[item].iloc[0] == "multi_color"]
        normal_item_df = col_trend_df.drop(columns=multi_color_items)
        return np.mean([np.mean(normal_item_df[item]) for item in normal_item_df.columns])

    def _standardize_trend(self, series):
        """Return ``series`` as a ``(trend_len, 1)`` float tensor, standardised on its leading part.

        The scaler is fitted on the first ``past_trend_len`` points only and
        then applied to the whole series. A series whose length differs from
        ``trend_len`` yields zeros.
        """
        values = np.array(series).reshape(-1, 1)
        if values.shape[0] != self.trend_len:
            return torch.zeros(self.trend_len, 1, dtype=torch.float32)
        fitted = StandardScaler().fit(values[:self.past_trend_len])
        return torch.FloatTensor(fitted.transform(values))

    def preprocess_data(self):
        """Build the full dataset in memory.

        Returns
        -------
        TensorDataset
            Tensors in this order: scaled sales, temporal features, trends
            (category, fabric, colour), images, text features, scaler
            statistics reshaped to ``(-1, 2)``, raw sales, release dates
            ``[year, month, day]``, positional item indices, meta features,
            ``qcut_label`` and the Box-Cox transformed ``sales_mean``.
        """
        data = self.data_df.reset_index(drop=True)

        sales, release_dates, ntrends = [], [], []
        image_features, text_features, sales_stamps = [], [], []
        scalers, real_value_sales, item_numbers_idx = [], [], []
        metas, qcut_labels, target_regs = [], [], []

        img_transforms = Compose([Resize((256, 256)), ToTensor(),
                                  Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])

        for (pos, row) in tqdm(data.iterrows(), total=len(data), ascii=True):
            # `pos` is the row position; `item_id` is the original index label.
            item_numbers_idx.append(pos)
            item_id = self.data_df.index[pos]

            metas.append(self.meta_df.loc[item_id].values)
            qcut_labels.append(int(self.qcut_df.loc[item_id]['qcut_label']))
            target_regs.append(boxcox(self.qcut_df.loc[item_id]['sales_mean'], self.opt_lambda))

            row.index = pd.to_datetime(row.index)

            # The earliest date with a non-null value is taken as the release date.
            release_date = row.dropna().sort_index().index[0]
            release_dates.append([release_date.year, release_date.month, release_date.day])

            last_week = release_date + relativedelta(weeks=self.output_dim - 1)
            row = row[release_date:last_week].resample('7d').sum().fillna(0)

            # Weekly stamps from 52 weeks before release up to the last sales week.
            # NOTE(review): the 52 is hard-coded; it equals past_trend_len only
            # when trend_len - sales_total_len == 52 -- confirm that is intended.
            time_feature_range = pd.date_range(release_date - relativedelta(weeks=52), last_week, freq='7d')
            sales_stamp = [time_features(time_feature_range, freq=unit)[0].tolist()
                           for unit in ('w', 'm', 'y')]

            weekly_values = np.array(row)
            real_value_sales.append(torch.FloatTensor(weekly_values))

            # The shared scaler is refitted on every item; its statistics are
            # recorded below even when no_scaling is set.
            scaled = self.scaler.fit_transform(weekly_values.reshape(-1, 1)).flatten()
            sales.append(torch.FloatTensor(weekly_values if self.no_scaling else scaled))

            if isinstance(self.scaler, StandardScaler):
                scalers.append([self.scaler.mean_, self.scaler.scale_])
            else:
                scalers.append([self.scaler.data_min_, self.scaler.data_range_])

            if item_id in self.cat_trend:
                cat_ntrend = self._standardize_trend(self.cat_trend[item_id])
                fab_ntrend = self._standardize_trend(self.fab_trend[item_id])
                raw_col = self.col_trend[item_id]
                if isinstance(raw_col, str) and raw_col == "multi_color":
                    col_ntrend = torch.zeros(self.trend_len, 1, dtype=torch.float32)
                else:
                    col_ntrend = self._standardize_trend(raw_col)
            else:
                # Items without a category trend get all-zero trends.
                cat_ntrend = torch.zeros(self.trend_len, 1, dtype=torch.float32)
                fab_ntrend = torch.zeros(self.trend_len, 1, dtype=torch.float32)
                col_ntrend = torch.zeros(self.trend_len, 1, dtype=torch.float32)

            ntrends.append(torch.stack([cat_ntrend, fab_ntrend, col_ntrend]).squeeze())

            # Close the image file as soon as the RGB copy has been made.
            with Image.open(os.path.join(self.img_root, item_id + '.png')) as img_file:
                image_features.append(img_transforms(img_file.convert('RGB')))

            # Text features are read as-is from the `text*` columns of meta_df.
            text_features.append(torch.FloatTensor(self.text_df.loc[item_id].values))

            sales_stamps.append(torch.FloatTensor(sales_stamp))

        # Create tensors for each part of the input/output
        item_sales = torch.stack(sales, dim=0)
        temporal_features = torch.stack(sales_stamps, dim=0)
        ntrends = torch.stack(ntrends, dim=0)
        images = torch.stack(image_features, dim=0)
        texts = torch.stack(text_features, dim=0)
        scalers = torch.FloatTensor(np.array(scalers)).view(-1, 2)
        real_value_sales = torch.stack(real_value_sales, dim=0)
        release_dates = torch.tensor(release_dates)
        item_numbers_idx = torch.tensor(item_numbers_idx)
        # Convert to one ndarray first: building a tensor from a list of
        # ndarrays is slow and makes torch emit a UserWarning.
        meta_data = torch.FloatTensor(np.asarray(metas, dtype=np.float32))
        qcut_labels = torch.LongTensor(qcut_labels)
        target_reg = torch.FloatTensor(target_regs)

        return TensorDataset(item_sales, temporal_features, ntrends, images, texts, scalers,
                             real_value_sales, release_dates, item_numbers_idx, meta_data,
                             qcut_labels, target_reg)

    def get_loader(self, batch_size, train=True):
        """Preprocess the data and wrap it in an unshuffled ``DataLoader``.

        ``train`` is kept for interface compatibility; both modes build the
        same loader (``shuffle=False``, ``num_workers=4``).
        """
        print('Starting dataset creation process...')
        data_with_gtrends = self.preprocess_data()
        data_loader = DataLoader(data_with_gtrends, batch_size=batch_size, shuffle=False, num_workers=4)
        print('Done.')

        return data_loader

    def __len__(self):
        """Return the number of items (rows of ``data_df``)."""
        return len(self.data_df)

In [5]:
import pickle


def _load_pickle(path):
    """Unpickle the file at ``path`` and close the handle afterwards."""
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Load sales data
df = pd.read_csv(os.path.join(args.prepo_data_folder, f"item_sale_per_week_{args.sales_total_len}.csv"), index_col="품번")

# Load Google trends
cat_trend_per_item = _load_pickle(os.path.join(args.prepo_data_folder, "cat_trend_per_item_v3.pkl"))
fab_trend_per_item = _load_pickle(os.path.join(args.prepo_data_folder, "fab_trend_per_item_v3.pkl"))
col_trend_per_item = _load_pickle(os.path.join(args.prepo_data_folder, "col_trend_per_item_v4.pkl"))

# Keep only the items that have a category trend. A boolean mask preserves the
# CSV row order; going through a set would give a row order that can change
# between kernel sessions because string hashing is randomised per process.
df = df.loc[df.index.isin(list(cat_trend_per_item.keys()))]
df = df.drop(index=['MTPT6102', 'MUPT6102'])

data_path = '/home/smart01/SFLAB/su_GTM_t/GTM_T_sanguk/'
meta_df = pd.read_csv(os.path.join(data_path, '240109_all_meta_sales_total.csv'), index_col='item_number')

# Train/test split: everything not listed in the stored test list is training data.
test_list = _load_pickle(os.path.join(data_path, "12salesweek_test_item_number296.pkl")).drop("MTPT6102")
train_list = df.index[~df.index.isin(test_list)]
train_df = df.loc[train_list]
test_df = df.loc[test_list]

# Passing the path lets pandas open and close the file itself.
qcut_df = pd.read_csv(os.path.join(data_path, "qcut_df_bin10.csv"), index_col='품번')
train_qdf = qcut_df.loc[train_list]

# Binary label from training statistics only: 0 when sales_mean is at most
# one training standard deviation above the training mean, 1 otherwise.
zero_idx = qcut_df['sales_mean'] - train_qdf['sales_mean'].mean() <= train_qdf['sales_mean'].std()
qcut_df.loc[zero_idx.values, 'qcut_label'] = 0
qcut_df.loc[~zero_idx.values, 'qcut_label'] = 1

train_dataset = ZeroShotDataset(args.sales_total_len, args.seq_len,
                                args.output_dim, train_df,
                                os.path.join(args.data_folder, "images"),
                                cat_trend_per_item, fab_trend_per_item, col_trend_per_item,
                                args.trend_len, args.scaler, args.no_scaling,
                                meta_df, qcut_df, train=True)

train_loader = train_dataset.get_loader(batch_size=args.batch_size, train=True)

Starting dataset creation process...


100%|##########| 1000/1000 [00:18<00:00, 54.98it/s]


Done.


  meta_data = torch.FloatTensor(metas)


In [11]:
# Number of training items per qcut_label.
qcut_df['qcut_label'].loc[train_list].value_counts()

qcut_label
0    929
1     71
Name: count, dtype: int64

In [25]:
# Mean of sales_mean over every item labelled 0 (not restricted to the training items).
qcut_df['sales_mean'][qcut_df['qcut_label'].eq(0)].mean()

43.2119588062211

In [12]:
# Number of test items per qcut_label.
qcut_df['qcut_label'].loc[test_list].value_counts()

qcut_label
0    268
1     27
Name: count, dtype: int64