In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn 
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence 
import pathlib
import pickle

from sklearn.preprocessing import (
    MinMaxScaler,
    StandardScaler,
    MaxAbsScaler,
    RobustScaler,
    QuantileTransformer,
    PowerTransformer,
)

from sklearn.metrics import mean_squared_log_error as msle
from tqdm import tqdm
#from torch.utils.tensorboard import SummaryWriter
#writer = SummaryWriter()


In [2]:
class Clamp(torch.autograd.Function):
    """Clamp values to ``[0, 5000]`` while letting gradients pass through unchanged."""

    @staticmethod
    def forward(ctx, input):
        # Values below 0 become 0, values above 5000 become 5000.
        return torch.clamp(input, min=0, max=5000)

    @staticmethod
    def backward(ctx, grad_output):
        # The incoming gradient is copied as-is, i.e. the clamp is ignored
        # in the backward pass.
        passthrough = grad_output.clone()
        return passthrough

In [3]:
class RMSLELoss(nn.Module):
    """Root mean squared logarithmic error: ``sqrt(MSE(log(pred + 1), log(actual + 1)))``."""

    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()

    def forward(self, pred, actual):
        log_pred = (pred + 1).log()
        log_actual = (actual + 1).log()
        squared_error = self.mse(log_pred, log_actual)
        return squared_error.sqrt()

In [6]:
class MSLELoss(nn.Module):
    """Mean squared logarithmic error: ``MSE(log(pred + 1), log(actual + 1))``."""

    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()

    def forward(self, pred, actual):
        log_pred = (pred + 1).log()
        log_actual = (actual + 1).log()
        return self.mse(log_pred, log_actual)

## Загружу данные и гляну на них

In [5]:
# Notebook-wide configuration: input locations and the columns that identify a group.
# NOTE(review): the three paths below are absolute paths on one developer's machine;
# the notebook will not run elsewhere without editing them.
DATA_DIR = pathlib.Path("/Users/ltorrick/DS/sibur_2021/data")
HASH2IND_PATH = pathlib.Path('/Users/ltorrick/DS/sibur_2021/code/rnn_/hash2ind.pickle')
ENCODERS_PATH = pathlib.Path('/Users/ltorrick/DS/sibur_2021/code/rnn_/encoders.pickle')
# Script-relative alternatives (only usable when the code runs as a .py file, where __file__ exists):
#HASH2IND_PATH = pathlib.Path(__file__).parent.joinpath("hash2ind.pickle")
#ENCODERS_PATH = pathlib.Path(__file__).parent.joinpath("encoders.pickle")
#DATA_DIR = pathlib.Path(".")
DATA_FILE = "sc2021_train_deals.csv"
# Columns whose combination defines one group (used for group ids and monthly aggregates).
AGG_COLS = ["material_code", "company_code", "country", "region", "manager_code"]
#RS = 82736
#PATH_TO_ENCODER_WEIGHTS = pathlib.Path(__file__).parent.joinpath('encoder_w.pt')
#PATH_TO_RNN_WEIGHTS = pathlib.Path(__file__).parent.joinpath('RNN_w.pt')
#PATH_TO_FF_WEIGHTS = pathlib.Path(__file__).parent.joinpath('FF_w.pt')

In [4]:
# Load the raw deals table; "month" and "date" are parsed into datetime columns.
data = pd.read_csv(DATA_DIR.joinpath(DATA_FILE), parse_dates=["month", "date"])

## Создам энкодер для групп товаров
Хочу в разрезе ИД группы товара затем логику get_item выстроить для torch Dataset

In [13]:
#keep this code
# Build a group-id mapping: one row per unique AGG_COLS combination, keyed by a string
# "hash" made by concatenating the five columns (country/region lower-cased).
# NOTE(review): the parts are concatenated without a separator, so different combinations
# could in principle produce the same string (e.g. "1"+"23" vs "12"+"3"); the same scheme
# is used wherever the mapping is looked up, so it cannot be changed in one place only.
ids = data[AGG_COLS].drop_duplicates()
ids['hash']=ids.material_code.astype(str)+ids.company_code.astype(str)+ids.country.astype(str).str.lower()+ids.region.astype(str).str.lower()+ids.manager_code.astype(str)
ids['ids'] = range(0, ids.shape[0])
# ind2hash: integer id -> hash string; hash2ind is its inverse.
ind2hash=ids.set_index('ids').hash.to_dict()
hash2ind = {v:k for k,v in ind2hash.items()}

In [14]:
# Persist the hash -> id mapping to the current working directory.
# NOTE(review): this writes a relative path, while HASH2IND_PATH (used for loading later)
# is an absolute path — confirm both refer to the same file.
with open('hash2ind.pickle', 'wb') as handle:
    pickle.dump(hash2ind, handle, protocol=pickle.HIGHEST_PROTOCOL)



In [8]:
# Read the hash -> id mapping back from the working directory.
# NOTE(review): the recorded output of this cell is "EOFError: Ran out of input", which
# pickle raises for an empty file — presumably the file was empty when this ran; confirm.
with open('hash2ind.pickle', 'rb') as handle:
    hash2ind = pickle.load(handle)

EOFError: Ran out of input

In [9]:
def prepare_data(df, hash2ind):
    """Return a copy of ``df`` with an ``ids`` column holding the group id of each row.

    The group key is the string concatenation of ``material_code``, ``company_code``,
    lower-cased ``country``, lower-cased ``region`` and ``manager_code``; it is looked
    up in ``hash2ind``. Rows whose key is missing from the mapping get ``NaN``.

    Args:
        df: frame containing the five group columns.
        hash2ind: mapping from group key string to integer id.

    Returns:
        A new DataFrame; the input frame is left untouched so the cell calling this
        function can be re-run safely.
    """
    group_key = (
        df.material_code.astype(str)
        + df.company_code.astype(str)
        + df.country.astype(str).str.lower()
        + df.region.astype(str).str.lower()
        + df.manager_code.astype(str)
    )
    # The temporary key never has to be stored as a column; a pre-existing "hash"
    # column is dropped to keep the output schema the same as before.
    return df.drop(columns='hash', errors='ignore').assign(ids=group_key.map(hash2ind))

In [80]:
# Attach the integer group id ("ids") to every deal row.
data_p = prepare_data(data, hash2ind)

## Создаю энкодеры для кат фичей
Буду в будущем их использовать для препроцессинга

In [153]:
# Build label encoders for the categorical features.
# For every column two dicts are stored: 'ind2<col>' (code -> value) and '<col>2ind' (value -> code).
# NOTE: this converts the listed columns of `data` to the pandas 'category' dtype in place.
CAT_FEATS = ['material_code', 'company_code', 'country', 'region', 'manager_code','material_lvl1_name', 'material_lvl2_name',
       'material_lvl3_name', 'contract_type']
encoders = {}
for col in CAT_FEATS:
    data[col] = data[col].astype('category')
    encoders['ind2'+col]=dict(enumerate(data[col].cat.categories))
    encoders[col+'2ind'] = {v:k for k, v in encoders['ind2'+col].items()}
# The feature list itself is stored so that consumers of the pickle know which columns to encode.
encoders['CAT_FEATS'] = CAT_FEATS

In [154]:
# Persist the encoders to the current working directory.
# NOTE(review): relative path here vs. the absolute ENCODERS_PATH used for loading — confirm
# they point to the same file.
with open('encoders.pickle', 'wb') as handle:
    pickle.dump(encoders, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Тут функция для объявления скалера таргета


In [6]:
def get_scaler(scaler):
    """Create a fresh scikit-learn scaler from its short name.

    Args:
        scaler: one of "minmax", "standard", "maxabs", "robust", "quantile",
            "power" (case-insensitive).

    Returns:
        A new, unfitted scaler instance built with default arguments.

    Raises:
        ValueError: if ``scaler`` is not one of the supported names.
    """
    scalers = {
        "minmax": MinMaxScaler,
        "standard": StandardScaler,
        "maxabs": MaxAbsScaler,
        "robust": RobustScaler,
        "quantile": QuantileTransformer,
        "power": PowerTransformer,
    }
    key = str(scaler).lower()
    if key not in scalers:
        # Previously an unknown name ended in "'NoneType' object is not callable".
        raise ValueError(
            f"Unknown scaler {scaler!r}; expected one of {sorted(scalers)}"
        )
    return scalers[key]()

In [7]:
def import_pickle(pickle_path):
    """Load and return the object stored in the pickle file at ``pickle_path``.

    Only use this on files you created yourself: unpickling untrusted data can
    execute arbitrary code.
    """
    with open(pickle_path, 'rb') as source:
        return pickle.load(source)

## Класс для базовой предобработки данных
Можно его же потом научить генерить данные для инференса

In [8]:
class DataPreparation:
    """Basic preprocessing of the deals table: group ids, categorical codes,
    date features and a monthly aggregate.

    Args:
        df: raw deals frame; it is sorted by ``date`` and stored.
        target_scaler: optional scaler name understood by ``get_scaler``.
        path_hash2ind: path to the pickled group-key -> id mapping.
        path_encoders: path to the pickled categorical encoders.
    """

    def __init__(self, df, target_scaler=None, path_hash2ind=None,
    path_encoders=None
    ) -> None:
        self.df = df.sort_values('date')
        # Always define the attribute so a missing scaler is reported explicitly
        # instead of surfacing as an AttributeError later.
        self.target_scaler = get_scaler(target_scaler) if target_scaler else None
        self.hash2ind = import_pickle(path_hash2ind)
        self.encoders = import_pickle(path_encoders)

    def base_preproccess(self, scale_target=False):
        """Run all preprocessing steps on a copy of the stored frame.

        Args:
            scale_target: when truthy, ``volume`` is replaced by the output of
                ``target_scaler.fit_transform``.

        Returns:
            The preprocessed DataFrame.

        Raises:
            ValueError: if scaling is requested but no scaler was configured.
        """
        if scale_target and getattr(self, 'target_scaler', None) is None:
            raise ValueError(
                "scale_target=True requires a target_scaler to be passed to DataPreparation"
            )
        df = self.df.copy()
        df = self.prepare_ids(df)
        df = self.encode_cat(df)
        df = self.add_dt_features(df)
        df = self.add_agg_feat(df)
        if scale_target:
            scaled = self.target_scaler.fit_transform(df['volume'].to_numpy().reshape(-1, 1))
            # fit_transform returns an (n, 1) array; flatten it before storing as a column.
            df['volume'] = scaled.ravel()

        return df

    def prepare_ids(self, df):
        """Add the ``ids`` column (group id looked up by the concatenated group key).

        Modifies and returns ``df``; keys missing from the mapping give ``NaN``.
        """
        group_key = (
            df.material_code.astype(str)
            + df.company_code.astype(str)
            + df.country.astype(str).str.lower()
            + df.region.astype(str).str.lower()
            + df.manager_code.astype(str)
        )
        df['ids'] = group_key.map(self.hash2ind)
        return df

    def encode_cat(self, df):
        """Replace categorical values by integer codes starting at 1.

        Code 0 is reserved: values absent from the stored encoder (or missing
        values) are mapped to 0 instead of raising during the integer cast.
        Modifies and returns ``df``.
        """
        for col in self.encoders['CAT_FEATS']:
            # Cast to object first: on a 'category' column fillna(-1) would fail
            # because -1 is not one of the categories.
            codes = df[col].astype(object).map(self.encoders[col + '2ind'])
            df[col] = codes.fillna(-1).astype('int') + 1
        return df

    def add_dt_features(self, df):
        """Add ``year``, ``month_int``, ``day`` and ``quarter`` derived from ``date``."""
        dates = df['date'].dt
        df['year'] = dates.year
        df['month_int'] = dates.month
        df['day'] = dates.day
        df['quarter'] = dates.quarter
        return df

    def add_agg_feat(self, df):
        """Add ``volume_per_month``: total volume of the row's group in the row's month.

        Groups are defined by the notebook-level ``AGG_COLS``. The merge returns
        a new frame with a fresh integer index.
        """
        keys = AGG_COLS + ['month']
        monthly = df.groupby(keys).agg({'volume': 'sum'}).reset_index()
        monthly = monthly.rename(columns={'volume': 'volume_per_month'})
        return pd.merge(df, monthly, on=keys, how='left')

    

In [9]:
# Build the preprocessed frame used by all later cells.
# A quantile scaler is instantiated, but scale_target=False means `volume` is left unscaled here.
date_prep = DataPreparation(df=data, target_scaler='quantile', path_hash2ind=HASH2IND_PATH, path_encoders=ENCODERS_PATH)
df_p = date_prep.base_preproccess(scale_target=False)

In [18]:
# Quick look at the first two preprocessed rows.
df_p.head(2)

Unnamed: 0,material_code,company_code,country,region,manager_code,month,material_lvl1_name,material_lvl2_name,material_lvl3_name,contract_type,date,volume,ids,year,month_int,day,quarter,volume_per_month
0,3,1,13,40,11,2018-01-01,1,4,3,3,2018-01-01,43.0,0,2018,1,1,1,192.0
1,11,1,12,34,34,2018-01-01,1,4,3,3,2018-01-02,95.0,1,2018,1,2,1,145.0


## Создаем DataSet
Он должен при инициализации:
* Фильтровать данные при инициализации по месяцу
* Принимать на вход длину последовательностей

Метод Len:
* длина уникальных ид клиентов если остановлюсь на таком варианте

Метод Get_item
* фильтрует по ИД группы
* берет крайние N последовательностей
* Если последовательностей меньше - паддит
* Возвращает для ИД группы словарь с тензорами: каждый тензор — последовательность значений одной фичи



Проблема в том, что у разных групп разные длины последовательностей.

Решение — паддить и маскировать: https://www.kdnuggets.com/2018/06/taming-lstms-variable-sized-mini-batches-pytorch.html


In [86]:
# Single configuration dict shared by the dataset, the collator and the three model parts.
params = {
    # Columns returned per group by the dataset and padded by the collator.
    'features' : ['material_code', 'company_code', 'country', 'region', 'manager_code',
       'material_lvl1_name', 'material_lvl2_name',
       'material_lvl3_name', 'contract_type', 'volume',
       'month_int', 'day', 'quarter', 'volume_per_month'],
    # Columns passed through embedding layers by the encoder; the encoder pairs them
    # by position with the entries of 'emb_params', so the order must match.
    'feature_to_emb' : ['material_code', 'company_code', 'country', 'region', 'manager_code',
       'material_lvl1_name', 'material_lvl2_name',
       'material_lvl3_name', 'contract_type',
       'month_int', 'day', 'quarter'],
    # NOTE(review): 'dynamic_feat' and 'static_feat' are not read by any code in this notebook.
    'dynamic_feat' : ['contract_type', 'volume',
       'month_int', 'day', 'quarter'],
    'static_feat' : ['material_code', 'company_code', 'country', 'region', 'manager_code',
       'material_lvl1_name', 'material_lvl2_name',
       'material_lvl3_name'],
    # Continuous columns: cast to float by the collator and concatenated with the embeddings.
    'feat_cont': ['volume', 'volume_per_month'],
    # Maximum number of most recent rows kept per group.
    'seq_len':512, 
    # Each tuple is unpacked positionally into nn.Embedding:
    # (num_embeddings, embedding_dim, padding_idx).
    'emb_params':{"material_code":(91,10,0),
                  'company_code':(230,10,0),
                  'country':(31,15,0),
                  'region':(103,10,0),
                  'manager_code':(60,20,0),
                  'material_lvl1_name':(4,2,0),
                  'material_lvl2_name':(6,3,0),
                  'material_lvl3_name':(6,3,0),
                  'contract_type':(4,2,0),
                  'month_int':(13,6,0),
                  'day':(32,10,0),
                  'quarter':(5,2,0)},
    'encoder_dropout':0.2,
    # 'GRU' or 'LSTM'.
    'type_of_autoregressor':'LSTM',
    'rnn_params':{'num_layers':2,
        'hidden_size':128,
        'bias':False
    },
    'predictor_params': {
    'predictor_output':1,
    'predictor_dropout':0.4
    },
    # Optimizer settings (AdamW in train_loop_fn).
    'lr': 0.03,
    'epochs':1,
    'weight_decay':0.01

}

In [22]:
class RnnDataset(Dataset):
    """One sample per group: the group's most recent deals before ``month``.

    Args:
        df: preprocessed deals frame with at least ``date``, ``month``, ``ids``,
            ``volume`` and every column listed in ``params['features']``.
        month: first day of the target month. Rows with ``date < month`` form the
            history; rows with ``month == month`` are summed into the target.
        params: config dict; ``seq_len`` and ``features`` are read.
        is_train: when True each item also carries the target volume.
    """

    def __init__(self, df, month, params, is_train=True) -> None:
        self.df = df.loc[df.date<month].copy()
        self.seq_len = params['seq_len']
        self.params = params
        self.is_train = is_train
        if is_train:
            # Targets come from the frame passed in, not from a notebook-level global.
            self.targets = df.loc[df.month == month].groupby('ids')['volume'].sum().to_dict()
        # Position in the dataset -> group id. Rows without a group id (NaN) are
        # skipped so that this mapping and __len__ always agree.
        self.ind2ind = dict(enumerate(self.df.ids.dropna().unique()))
        # Group id -> positional row numbers, in frame order. Computed once so that
        # __getitem__ does not have to scan the whole frame for every sample.
        self._rows = self.df.groupby('ids', sort=False).indices

    def __len__(self):
        return len(self.ind2ind)

    def __getitem__(self, index):
        """Return ``(group_id, features, length[, target])`` for dataset position ``index``.

        ``features`` maps each feature name to a 1-D tensor with the last
        ``seq_len`` values of the group; ``length`` is the number of rows kept.
        The target is the group's total volume in the target month, or 0 when
        the group has no deals in that month.
        """
        group_id = self.ind2ind[index]
        window = self.df.iloc[self._rows[group_id][-self.seq_len:]]

        res = {
            feat: torch.from_numpy(window[feat].values)
            for feat in self.params['features']
        }
        length = window.shape[0]

        if self.is_train:
            target = self.targets.get(group_id, 0)
            return group_id, res, length, target

        return group_id, res, length


In [23]:
class CollatorLearn(object):
    """Collate dataset items into a dict of batched, zero-padded tensors.

    Args:
        params: config dict; ``features`` (columns to batch) and ``feat_cont``
            (columns cast to float) are read.
        with_targets: whether every item carries a target as its 4th element.
    """

    def __init__(self, params, with_targets=False):
        self.params = params
        self.with_targets = with_targets

    def __call__(self, batch):
        """Build the batch dict.

        Keys, in order: ``labels``, ``lenghts``, ``targets`` (only with targets),
        then one ``(batch, max_len)`` tensor per feature padded with 0.
        """
        feature_names = self.params['features']
        group_ids, seq_lengths, target_values = [], [], []
        sequences = {name: [] for name in feature_names}

        for sample in batch:
            if self.with_targets:
                group_id, sample_feats, seq_length, target = sample
                target_values.append(target)
            else:
                group_id, sample_feats, seq_length = sample
            group_ids.append(group_id)
            seq_lengths.append(seq_length)
            for name in feature_names:
                sequences[name].append(sample_feats[name])

        collated = {
            'labels': torch.tensor(group_ids, dtype=torch.long),
            'lenghts': torch.tensor(seq_lengths, dtype=torch.long),
        }
        if self.with_targets:
            collated['targets'] = torch.tensor(target_values, dtype=torch.float)

        for name in feature_names:
            padded = pad_sequence(sequences[name], batch_first=True, padding_value=0)
            # Continuous features are cast to float32; the rest keep their dtype.
            collated[name] = padded.float() if name in self.params['feat_cont'] else padded

        return collated

In [24]:
class Encoder(nn.Module):
    """Embed categorical features, concatenate them with the continuous ones,
    then apply batch normalisation and dropout.

    Args:
        params: config dict; reads ``emb_params`` (per-feature tuples unpacked
            into ``nn.Embedding``), ``feature_to_emb``, ``feat_cont`` and
            ``encoder_dropout``.
        device: device the sub-modules are moved to.
    """

    def __init__(self, params, device):
        super(Encoder, self).__init__()
        self.params = params
        # Build one embedding per entry of `feature_to_emb`, in that order, because
        # forward() pairs embeddings with features by position. Iterating over
        # `emb_params` instead would silently mis-pair them whenever the two orders differ.
        emb_specs = [params['emb_params'][feat] for feat in params['feature_to_emb']]
        self.embeddings = nn.ModuleList([nn.Embedding(*spec).to(device) for spec in emb_specs])
        # Width of the concatenated vector: continuous features + all embedding dims.
        input_size = len(params['feat_cont']) + sum(spec[1] for spec in emb_specs)
        self.batch_norm = nn.BatchNorm1d(input_size).to(device)
        self.encoder_dropout = nn.Dropout(params['encoder_dropout']).to(device)

    def forward(self, x):
        """Encode a batch dict.

        Args:
            x: mapping from feature name to a ``(batch, max_len)`` tensor.

        Returns:
            Tensor of shape ``(batch, max_len, input_size)``.
        """
        emb_out = [self.embeddings[i](x[feat]) for i, feat in enumerate(self.params['feature_to_emb'])]
        cont_out = [x[feat].unsqueeze(-1) for feat in self.params['feat_cont']]
        conc = torch.cat(cont_out + emb_out, dim=-1)
        # BatchNorm1d normalises over dim 1, so channels are moved there and back.
        bnd = self.batch_norm(conc.permute(0, 2, 1)).permute(0, 2, 1)
        return self.encoder_dropout(bnd)

In [25]:
class Autoregressor(nn.Module):
    """Recurrent layer (GRU or LSTM) that summarises a padded sequence batch.

    Args:
        params: config dict; reads ``type_of_autoregressor`` ('GRU' or 'LSTM'),
            ``rnn_params`` (``hidden_size``, ``num_layers``, ``bias``),
            ``feat_cont`` and ``emb_params``.
        device: device the recurrent module is moved to.

    Raises:
        ValueError: if ``type_of_autoregressor`` is neither 'GRU' nor 'LSTM'.
    """

    _RNN_TYPES = {'GRU': nn.GRU, 'LSTM': nn.LSTM}

    def __init__(self, params, device):
        super(Autoregressor, self).__init__()
        self.params = params
        # Same width as the encoder output: continuous features plus embedding dims
        # of the embedded features (all of `emb_params` when no list is given).
        embedded = params.get('feature_to_emb', params['emb_params'])
        input_size = len(params['feat_cont']) + sum(params['emb_params'][feat][1] for feat in embedded)

        rnn_type = params['type_of_autoregressor']
        if rnn_type not in self._RNN_TYPES:
            # Without this check `self.autoreg` would be missing and forward()
            # would silently return None.
            raise ValueError(
                f"Unsupported type_of_autoregressor {rnn_type!r}; expected 'GRU' or 'LSTM'"
            )
        self.autoreg = self._RNN_TYPES[rnn_type](
            input_size,
            hidden_size=params['rnn_params']['hidden_size'],
            num_layers=params['rnn_params']['num_layers'],
            bias=params['rnn_params']['bias'],
            batch_first=True,
            bidirectional=False,
        ).to(device)

    def forward(self, x, lengths):
        """Return the last layer's final hidden state.

        Args:
            x: ``(batch, max_len, input_size)`` tensor.
            lengths: true length of every sequence in the batch.

        Returns:
            Tensor of shape ``(batch, hidden_size)``.
        """
        if torch.is_tensor(lengths):
            # pack_padded_sequence requires the lengths tensor to live on the CPU.
            lengths = lengths.cpu()
        packed = pack_padded_sequence(x, lengths=lengths, batch_first=True, enforce_sorted=False)
        _, state = self.autoreg(packed)
        # LSTM returns (h_n, c_n); GRU returns h_n only.
        hidden = state[0] if isinstance(state, tuple) else state
        return hidden[-1]

In [26]:
class Predictor(nn.Module):
    """Feed-forward head: Linear -> BatchNorm -> Dropout -> ReLU -> Linear.

    Args:
        params: config dict; reads ``rnn_params['hidden_size']`` and
            ``predictor_params`` (``predictor_output``, ``predictor_dropout``).
        device: device the layers are moved to.
    """

    def __init__(self, params, device):
        super().__init__()
        hidden = params['rnn_params']['hidden_size']
        head_cfg = params['predictor_params']
        self.FF1 = nn.Linear(hidden, hidden).to(device)
        self.batch_norm = nn.BatchNorm1d(hidden).to(device)
        self.predictor_dropout = nn.Dropout(head_cfg['predictor_dropout']).to(device)
        self.activation = nn.ReLU()
        self.FF2 = nn.Linear(hidden, head_cfg['predictor_output']).to(device)

    def forward(self, x):
        """Map ``(batch, hidden_size)`` inputs to a flattened 1-D prediction tensor."""
        normed = self.batch_norm(self.FF1(x))
        activated = self.activation(self.predictor_dropout(normed))
        # The output is flattened to 1-D.
        return self.FF2(activated).view(-1)

In [27]:
def loss_fn(criterion, preds, target):
    """Apply ``criterion`` to predictions and the log1p-transformed target.

    Args:
        criterion: callable loss, e.g. ``nn.MSELoss()``.
        preds: prediction tensor (compared with ``log1p(target)``).
        target: raw target values; tensor, NumPy array or sequence.

    Returns:
        The loss tensor returned by ``criterion``.
    """
    # torch.log1p keeps the computation in torch on the predictions' device and
    # dtype; applying NumPy's log1p to a tensor only works for CPU tensors.
    log_target = torch.log1p(torch.as_tensor(target, dtype=preds.dtype, device=preds.device))
    return criterion(preds, log_target)

In [28]:
def val_fn(model1, model2, model3, dataloader, device, criterion, params, writer=None):
    """Evaluate the encoder / autoregressor / predictor chain on ``dataloader``.

    Args:
        model1, model2, model3: encoder, autoregressor and predictor modules.
        dataloader: yields batch dicts containing the feature tensors,
            ``'lenghts'`` and ``'targets'``.
        device: device the feature tensors are moved to.
        criterion: loss passed to ``loss_fn``.
        params: config dict; ``feature_to_emb`` and ``feat_cont`` are read.
        writer: optional object with ``add_scalar`` (e.g. a SummaryWriter). When
            omitted, a notebook-level ``writer`` is used if one exists; otherwise
            nothing is logged.

    Returns:
        tuple: (mean batch loss, RMSLE between the targets and ``expm1`` of the
        predictions clipped at 0).
    """
    if writer is None:
        # The SummaryWriter setup is commented out in the imports cell, so the
        # global may not exist; logging is then skipped instead of raising NameError.
        writer = globals().get('writer')

    for module in (model1, model2, model3):
        module.eval()
    preds, targets, losses = [], [], []
    columns = list(set(params['feature_to_emb'] + params['feat_cont']))

    with torch.no_grad():
        for i, batch in tqdm(enumerate(dataloader), desc='batch', leave=False, position=1):
            for feat in columns:
                # Tensor.to returns a new tensor; the result has to be stored.
                batch[feat] = batch[feat].to(device, non_blocking=True)
            out = model1(batch)
            out2 = model2(out, batch['lenghts'])
            out3 = model3(out2)
            loss = loss_fn(criterion, out3, batch['targets'])
            losses.append(loss.item())
            targets.extend(batch['targets'].detach().cpu().tolist())
            preds.extend(out3.detach().cpu().tolist())
            if writer is not None:
                writer.add_scalar('Loss/Val', loss, i)

    # Predictions are compared with log1p(target) in the loss, so expm1 brings
    # them back to the target scale for the metric.
    rmsle = np.sqrt(msle(targets, np.expm1(np.array(preds)).clip(min=0)))
    return np.mean(losses), rmsle



In [44]:
def train_fn(model1, model2, model3, dataloader, optimizer, device, criterion, params, writer=None):
    """Run one training pass over ``dataloader`` and update the three models.

    Args:
        model1, model2, model3: encoder, autoregressor and predictor modules.
        dataloader: yields batch dicts containing the feature tensors,
            ``'lenghts'`` and ``'targets'``.
        optimizer: optimizer holding the parameters of all three models.
        device: device the feature tensors are moved to.
        criterion: loss passed to ``loss_fn``.
        params: config dict; ``feature_to_emb`` and ``feat_cont`` are read.
        writer: optional object with ``add_scalar`` (e.g. a SummaryWriter). When
            omitted, a notebook-level ``writer`` is used if one exists; otherwise
            nothing is logged.

    Returns:
        tuple: (mean batch loss, RMSLE between the targets and ``expm1`` of the
        clamped predictions clipped at 0).
    """
    if writer is None:
        # The SummaryWriter setup is commented out in the imports cell, so the
        # global may not exist; logging is then skipped instead of raising NameError.
        writer = globals().get('writer')

    for module in (model1, model2, model3):
        module.train()
    clamp = Clamp.apply
    preds, targets, losses = [], [], []
    columns = list(set(params['feature_to_emb'] + params['feat_cont']))

    with torch.autograd.set_detect_anomaly(False):
        for i, batch in tqdm(enumerate(dataloader), desc='batch', leave=False, position=1):
            optimizer.zero_grad()

            for feat in columns:
                # Tensor.to returns a new tensor; the result has to be stored.
                batch[feat] = batch[feat].to(device, non_blocking=True)
            out = model1(batch)
            out2 = model2(out, batch['lenghts'])
            out3 = model3(out2)
            # Predictions are limited to [0, 5000]; gradients pass through unchanged.
            f_out = clamp(out3)
            loss = loss_fn(criterion, f_out, batch['targets'])
            loss.backward()

            losses.append(loss.item())
            targets.extend(batch['targets'].detach().cpu().tolist())
            preds.extend(f_out.detach().cpu().tolist())
            if writer is not None:
                writer.add_scalar('Loss/Train', loss, i)

            # Gradient norms are clipped per model, before the optimizer step.
            torch.nn.utils.clip_grad_norm_(model2.parameters(), 1.5)
            torch.nn.utils.clip_grad_norm_(model1.parameters(), 1.5)
            torch.nn.utils.clip_grad_norm_(model3.parameters(), 1.5)
            optimizer.step()

    # Predictions are compared with log1p(target) in the loss, so expm1 brings
    # them back to the target scale for the metric.
    rmsle = np.sqrt(msle(targets, np.expm1(np.array(preds)).clip(min=0)))
    return np.mean(losses), rmsle

In [79]:
# Collate functions for DataLoader: without targets (inference) and with targets (training).
collate_test = CollatorLearn(params, with_targets=False)
collate_train = CollatorLearn(params, with_targets=True)

In [80]:
# Sample dataset for manual inspection: history before 2020-01-01, targets from January 2020.
train_dataset = RnnDataset(df_p, '2020-01-01', params, is_train=True)

In [81]:
# Tiny loader (2 groups per batch) used only to inspect what a collated batch looks like.
test_dl = DataLoader(train_dataset, batch_size=2, collate_fn=collate_train)

In [467]:
# Stand-alone CPU instances of the three model parts for experimenting.
# train_loop_fn builds its own instances and does not use these; the name `model`
# is later rebound to a CatBoostRegressor.
model = Encoder(params, device = torch.device('cpu'))
model2 = Autoregressor(params, device= torch.device('cpu'))
model3 = Predictor(params, device=torch.device('cpu'))

In [82]:
# Take the first collated batch for inspection.
dd = next(iter(test_dl))

In [51]:
def train_loop_fn(device, params):
    """Train the encoder / autoregressor / predictor chain month by month.

    Four passes are made over the training months (2020-03 .. 2020-06). For each
    month a dataset is built from the notebook-level ``df_p`` and trained for
    ``params['epochs']`` epochs (default 1). After every pass the learning-rate
    scheduler is stepped and the models are evaluated on 2020-07.

    Args:
        device: torch device used for the models.
        params: config dict (model sizes, ``lr``, ``weight_decay``, ``epochs``, ...).

    Returns:
        tuple: ``(encoder, context_encoder, predictor)`` — the trained modules.
    """
    criterion = nn.MSELoss()
    criterion.to(device)

    encoder = Encoder(params, device=device)
    context_encoder = Autoregressor(params, device=device)
    predictor = Predictor(params, device=device)
    optimizer = torch.optim.AdamW(
        [{'params': encoder.parameters()},
         {'params': context_encoder.parameters()},
         {'params': predictor.parameters()}],
        lr=params['lr'],
        weight_decay=params['weight_decay'],
    )
    # NOTE(review): the scheduler is stepped once per pass and there are 4 passes,
    # so with step_size=5 the learning rate never actually decays — confirm intent.
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.7)
    train_range = pd.date_range(start='2020-03-01', end='2020-06-01', freq='MS')
    val_range = pd.date_range(start='2020-07-01', end='2020-07-01', freq='MS')
    collate_train = CollatorLearn(params, with_targets=True)
    # Honour the configured number of epochs instead of a hard-coded single epoch.
    n_epochs = max(1, int(params.get('epochs', 1)))
    print('Train it')
    for j in tqdm(range(4), total=4, leave=True, position=0):
        prokhod_loss = []
        prokhod_rmsle = []
        for next_month in tqdm(train_range, total=len(train_range), leave=False):
            train_dataset = RnnDataset(df_p, next_month, params, is_train=True)
            # drop_last avoids a final tiny batch (BatchNorm needs more than one sample in train mode).
            dataloader = DataLoader(train_dataset, batch_size=128, collate_fn=collate_train, drop_last=True)

            for epoch in range(n_epochs):
                losses, msle_mean = train_fn(encoder, context_encoder, predictor, dataloader, optimizer, device, criterion, params)
            # Only the last epoch of each month contributes to the pass summary.
            prokhod_loss.append(losses)
            prokhod_rmsle.append(msle_mean)
        print(f'Prokhod №{j+1}: loss = {np.mean(prokhod_loss)}, rmsle = {np.mean(prokhod_rmsle)}')
        scheduler.step()
        print('<<<<<<<<<<<--------------Validate it-------------->>>>>>>>>>>>>>')
        for next_month in val_range:
            val_dataset = RnnDataset(df_p, next_month, params, is_train=True)
            val_dataloader = DataLoader(val_dataset, batch_size=128, collate_fn=collate_train, drop_last=False)

            losses, msle_mean = val_fn(encoder, context_encoder, predictor, val_dataloader, device, criterion, params)
            print(f'Month {next_month}: loss = {losses}, rmsle = {msle_mean}')

    return encoder, context_encoder, predictor
            

In [87]:
# Train on CPU and keep the three trained modules.
# NOTE: the recorded run below was stopped manually (KeyboardInterrupt) after about 4 minutes.
encoder, context_encoder, predictor = train_loop_fn(torch.device('cpu'), params)
#writer.flush()

  0%|          | 0/4 [00:00<?, ?it/s]

Train it


  0%|          | 0/4 [04:07<?, ?it/s]


KeyboardInterrupt: 

In [508]:
# NOTE(review): stale cell — train_loop_fn returns three objects, so unpacking into two
# names raises ValueError with the current definition; presumably left over from a
# debugging version that returned (targets, preds).
t, p = train_loop_fn(torch.device('cpu'), params)

  0%|          | 0/12 [00:00<?, ?it/s]

Train for next_month = 2019-05-01 00:00:00


  0%|          | 0/12 [00:03<?, ?it/s]


In [559]:

# Weight file names (relative to the working directory). They are defined here, right
# before use, so that this cell works in a top-to-bottom run of the notebook.
PATH_TO_ENCODER_WEIGHTS = 'encoder_w.pt'
PATH_TO_RNN_WEIGHTS = 'RNN_w.pt'
PATH_TO_FF_WEIGHTS = 'FF_w.pt'

# Save the trained parameters of each model part to its own file.
torch.save(encoder.state_dict(), PATH_TO_ENCODER_WEIGHTS)
torch.save(context_encoder.state_dict(), PATH_TO_RNN_WEIGHTS)
torch.save(predictor.state_dict(), PATH_TO_FF_WEIGHTS)

In [557]:
# File names for the saved model weights (relative to the working directory).
# NOTE(review): in file order this cell comes after the torch.save cell that reads these
# names, although it was executed earlier (In [557] vs In [559]).
PATH_TO_ENCODER_WEIGHTS = 'encoder_w.pt'
PATH_TO_RNN_WEIGHTS = 'RNN_w.pt'
PATH_TO_FF_WEIGHTS = 'FF_w.pt'

In [53]:
# Display the preprocessed frame (pandas truncates the rendering to the first and last rows).
df_p

Unnamed: 0,material_code,company_code,country,region,manager_code,month,material_lvl1_name,material_lvl2_name,material_lvl3_name,contract_type,date,volume,ids,year,month_int,day,quarter,volume_per_month
0,3,1,13,40,11,2018-01-01,1,4,3,3,2018-01-01,43.0,0,2018,1,1,1,192.0
1,11,1,12,34,34,2018-01-01,1,4,3,3,2018-01-02,95.0,1,2018,1,2,1,145.0
2,74,41,10,4,5,2018-01-01,1,2,2,2,2018-01-02,57.0,2,2018,1,2,1,394.0
3,3,1,13,40,11,2018-01-01,1,4,3,3,2018-01-02,21.0,0,2018,1,2,1,192.0
4,2,1,12,34,45,2018-01-01,1,4,3,3,2018-01-02,150.0,3,2018,1,2,1,1500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92301,14,46,17,102,39,2020-07-01,3,5,5,1,2020-07-31,12.0,636,2020,7,31,3,206.0
92302,42,181,17,81,13,2020-07-01,1,2,2,2,2020-07-31,62.0,59,2020,7,31,3,723.0
92303,75,225,17,45,19,2020-07-01,1,2,2,2,2020-07-31,20.0,875,2020,7,31,3,60.0
92304,80,143,29,95,31,2020-07-01,1,2,2,2,2020-07-31,25.0,926,2020,7,31,3,328.0


## Тут Catboost в пайплайн пытаюсь вкорячить

In [54]:
from catboost import CatBoostRegressor

In [15]:
# Wide monthly series: one row per group (AGG_COLS + ids), one column per month,
# values are total volume; months without deals are filled with 0.
group_ts = df_p.groupby(AGG_COLS +['ids']+["month"])["volume"].sum().unstack(fill_value=0)
#group_ts = ids2group(group_ts.reset_index())
#group_ts = group_ts.set_index(AGG_COLS)
#group_ts = group_ts[['ids']+list(group_ts.columns[:-1])]

In [14]:
def get_features(df: pd.DataFrame, month: pd.Timestamp) -> pd.DataFrame:
    """Calculate features for `month`.

    Args:
        df: wide frame with one row per group and one column per month
            (month-start timestamps in ascending order), values are volumes.
        month: first day of the month to build features for. Only columns
            strictly before this month are used.

    Returns:
        Frame indexed like ``df`` with columns ``month`` (calendar month number),
        ``vol_tm6`` .. ``vol_tm1`` (volumes 6 .. 1 months before ``month``) and
        ``last_year_avg`` / ``last_year_min`` / ``last_year_max`` /
        ``last_year_median`` over the last (up to) 12 months before ``month``.

    Raises:
        ValueError: if ``df`` does not contain all six months preceding ``month``.
    """
    start_period = month - pd.offsets.MonthBegin(6)
    end_period = month - pd.offsets.MonthBegin(1)

    history = df.loc[:, :end_period]

    features = pd.DataFrame(index=history.index)
    features["month"] = month.month

    lag_names = [f"vol_tm{i}" for i in range(6, 0, -1)]
    lags = history.loc[:, start_period:end_period]
    if lags.shape[1] != len(lag_names):
        raise ValueError(
            f"Expected {len(lag_names)} monthly columns between {start_period} and "
            f"{end_period}, found {lags.shape[1]}"
        )
    # Oldest month first: vol_tm6 is six months back, vol_tm1 the previous month.
    for name, column in zip(lag_names, lags.columns):
        features[name] = lags[column]

    # Statistics over the trailing 12 months (fewer when less history exists).
    # Slicing the last columns directly avoids a full row-wise rolling pass
    # (and the deprecated `rolling(axis=1)`) just to read its final value.
    last_year = history.iloc[:, -12:]
    features["last_year_avg"] = last_year.mean(axis=1)
    features["last_year_min"] = last_year.min(axis=1)
    features["last_year_max"] = last_year.max(axis=1)
    features["last_year_median"] = last_year.median(axis=1)
    return features

In [16]:
# Target months (month starts) for the train / validation / test feature sets.
# NOTE(review): ts_range (2020-01 .. 2020-07) overlaps both tr_range and val_range —
# confirm this is intended.
tr_range = pd.date_range("2020-03-01", "2020-06-01", freq="MS")
#val_range = pd.date_range("2019-07-01", "2019-12-01", freq="MS")
#ts_range = pd.date_range("2020-01-01", "2020-07-01", freq="MS")
val_range = pd.date_range("2020-07-01", "2020-07-01", freq="MS")
ts_range = pd.date_range("2020-01-01", "2020-07-01", freq="MS")

In [17]:
# Build one feature table per split ("tr", "val", "ts"): features are computed for every
# target month of the split, the month's volume is attached as "target", and the
# per-month tables are stacked.
full_features = {}

for dataset, dataset_range in zip(["tr", "val", "ts"], [tr_range, val_range, ts_range]):
    dataset_features = []
    for target_month in dataset_range:
        features = get_features(group_ts, target_month)
        features["target"] = group_ts[target_month]
        # reset_index turns the group keys (AGG_COLS + ids) into regular columns.
        dataset_features.append(features.reset_index())
    full_features[dataset] = pd.concat(dataset_features, ignore_index=True)

In [18]:
# Column roles for the gradient-boosting models.
# CAT_COLS: columns treated as categorical; FTS_COLS: all model inputs; TARGET: label column.
CAT_COLS = ["material_code", "company_code", "country", "region", "manager_code", "month"]
FTS_COLS = ["material_code", "company_code", "country", "region", "manager_code", "month", 
            "vol_tm6", "vol_tm5", "vol_tm4", 
            "vol_tm3", "vol_tm2", "vol_tm1", 
            "last_year_avg", 
            "last_year_min", 
            "last_year_max",
            # the median feature is computed by get_features but excluded from the inputs
            #"last_year_median"
            ]
TARGET = "target"

In [76]:
# CatBoost baseline trained on log1p(target), with the validation split as eval set.
# NOTE: this rebinds `model`, which earlier in the notebook referred to the Encoder instance.
model = CatBoostRegressor(iterations=600,
                          early_stopping_rounds=150,
                          learning_rate=0.009,
                          #loss_function ='Tweedie:variance_power=1.4',
                          #MSE, MultiRMSE, SurvivalAft, MAE, Quantile, LogLinQuantile, Poisson, MAPE, Lq 
                          #loss_function ='SurvivalAft',
                          depth=4,
                          cat_features=CAT_COLS,
                          random_state=333,
                          verbose=50)

model.fit(full_features["tr"][FTS_COLS], np.log1p(full_features["tr"][TARGET]),
          eval_set=(full_features["val"][FTS_COLS], np.log1p(full_features["val"][TARGET]))
          )

0:	learn: 2.2718969	test: 2.3302482	best: 2.3302482 (0)	total: 54.8ms	remaining: 32.8s
50:	learn: 2.0142869	test: 2.0234176	best: 2.0234176 (50)	total: 154ms	remaining: 1.66s
100:	learn: 1.8760824	test: 1.8425487	best: 1.8425487 (100)	total: 242ms	remaining: 1.2s
150:	learn: 1.8039681	test: 1.7419432	best: 1.7419432 (150)	total: 326ms	remaining: 968ms
200:	learn: 1.7643247	test: 1.6826129	best: 1.6826129 (200)	total: 406ms	remaining: 805ms
250:	learn: 1.7381793	test: 1.6457485	best: 1.6457485 (250)	total: 486ms	remaining: 675ms
300:	learn: 1.7190786	test: 1.6212338	best: 1.6212338 (300)	total: 576ms	remaining: 572ms
350:	learn: 1.7051307	test: 1.6049976	best: 1.6049976 (350)	total: 675ms	remaining: 479ms
400:	learn: 1.6943600	test: 1.5926992	best: 1.5926447 (399)	total: 764ms	remaining: 379ms
450:	learn: 1.6858805	test: 1.5857326	best: 1.5857326 (450)	total: 899ms	remaining: 297ms
500:	learn: 1.6786129	test: 1.5801611	best: 1.5799187 (499)	total: 1.04s	remaining: 206ms
550:	learn: 1.67

<catboost.core.CatBoostRegressor at 0x7ff159558f10>

In [78]:
# Display the training feature table.
full_features['tr']

Unnamed: 0,material_code,company_code,country,region,manager_code,ids,month,vol_tm6,vol_tm5,vol_tm4,vol_tm3,vol_tm2,vol_tm1,last_year_avg,last_year_min,last_year_max,last_year_median,target
0,1,161,17,64,42,72,3,0.0,0.0,0.0,0.0,0.0,0.0,45.833333,0.0,200.0,0.0,0.0
1,2,1,2,42,5,607,3,124.0,181.0,208.0,207.0,17.0,72.0,102.666667,0.0,208.0,90.0,250.0
2,2,1,2,43,5,648,3,0.0,0.0,0.0,0.0,0.0,0.0,32.500000,0.0,145.0,0.0,0.0
3,2,1,2,99,5,526,3,83.0,82.0,42.0,0.0,0.0,0.0,20.666667,0.0,83.0,0.0,0.0
4,2,1,10,101,17,548,3,0.0,45.0,50.0,45.0,0.0,50.0,29.166667,0.0,50.0,40.0,40.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3759,89,228,17,75,42,933,6,0.0,0.0,21.0,63.0,125.0,84.0,24.416667,0.0,125.0,0.0,84.0
3760,90,1,17,38,48,706,6,6.0,5.0,5.0,5.0,0.0,3.0,5.333333,0.0,10.0,5.0,3.0
3761,90,70,17,38,25,820,6,29.0,73.0,74.0,122.0,100.0,15.0,93.166667,15.0,129.0,108.5,30.0
3762,90,109,17,102,25,43,6,100.0,100.0,180.0,180.0,100.0,140.0,123.333333,80.0,180.0,120.0,40.0


In [10]:
from lightgbm import LGBMRegressor

In [None]:
# Keyword arguments passed to LGBMRegressor below.
# NOTE(review): the recorded output of this cell is "Error: Kernel is dead"; the cause is
# not visible here.
hyper_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['l1','l2'],
    'learning_rate': 0.005,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.7,
    'bagging_freq': 10,
    #'verbose': 0,
    "max_depth": 8,
    "num_leaves": 128,  
    "max_bin": 512,
    # very large cap; the fit call below relies on early stopping to end training
    "num_iterations": 100000
}

Error: Kernel is dead

In [12]:
# LightGBM regressor configured from hyper_params.
gbm = LGBMRegressor(**hyper_params)

In [19]:
# Fit on log1p(target) with the validation split as eval set; stop after 1000 rounds
# without improvement of the l2 metric.
# NOTE(review): presumably requires a LightGBM version whose fit() still accepts
# `early_stopping_rounds` — confirm against the installed version.
gbm.fit(full_features["tr"][FTS_COLS], np.log1p(full_features["tr"][TARGET]),
        eval_set=[(full_features["val"][FTS_COLS], np.log1p(full_features["val"][TARGET]))],
        eval_metric='l2',
        early_stopping_rounds=1000)

In [589]:
class SAM(torch.optim.Optimizer):
    """Two-step optimizer wrapper ("sharpness-aware" update, per the inline comments).

    ``first_step`` moves every parameter along its (optionally scaled) gradient by a
    step whose size is controlled by ``rho``; ``second_step`` restores the original
    parameters and lets the wrapped ``base_optimizer`` apply the update using the
    gradients computed at the moved point.

    Args:
        params: parameters or parameter groups to optimize.
        base_optimizer: optimizer class; it is instantiated on this optimizer's
            parameter groups with ``**kwargs``.
        rho: non-negative size of the first-step perturbation.
        adaptive: when True the perturbation is scaled element-wise by the
            parameter magnitude.
        **kwargs: forwarded to ``base_optimizer`` and stored in the defaults.
    """

    def __init__(self, params, base_optimizer, rho=0.05, adaptive=False, **kwargs):
        assert rho >= 0.0, f"Invalid rho, should be non-negative: {rho}"

        defaults = dict(rho=rho, adaptive=adaptive, **kwargs)
        super(SAM, self).__init__(params, defaults)

        # Both optimizers share the same param_groups list object.
        self.base_optimizer = base_optimizer(self.param_groups, **kwargs)
        self.param_groups = self.base_optimizer.param_groups

    @torch.no_grad()
    def first_step(self, zero_grad=False):
        """Save the current parameters and move them along the gradient direction."""
        grad_norm = self._grad_norm()
        for group in self.param_groups:
            # 1e-12 guards against division by zero when the gradient norm is 0.
            scale = group["rho"] / (grad_norm + 1e-12)

            for p in group["params"]:
                if p.grad is None: continue
                # Keep a copy of the original values so second_step can restore them.
                self.state[p]["old_p"] = p.data.clone()
                e_w = (torch.pow(p, 2) if group["adaptive"] else 1.0) * p.grad * scale.to(p)
                p.add_(e_w)  # climb to the local maximum "w + e(w)"

        if zero_grad: self.zero_grad()

    @torch.no_grad()
    def second_step(self, zero_grad=False):
        """Restore the saved parameters and apply the base optimizer's update."""
        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None: continue
                p.data = self.state[p]["old_p"]  # get back to "w" from "w + e(w)"

        self.base_optimizer.step()  # do the actual "sharpness-aware" update

        if zero_grad: self.zero_grad()

    @torch.no_grad()
    def step(self, closure=None):
        """Perform both steps; ``closure`` must recompute the loss and call backward."""
        assert closure is not None, "Sharpness Aware Minimization requires closure, but it was not provided"
        closure = torch.enable_grad()(closure)  # the closure should do a full forward-backward pass

        # Gradients from the caller's backward pass are consumed by first_step and
        # then cleared, so the closure produces fresh gradients at the moved point.
        self.first_step(zero_grad=True)
        closure()
        self.second_step()

    def _grad_norm(self):
        """Return the 2-norm over all per-parameter gradient norms (scaled by |p| when adaptive)."""
        shared_device = self.param_groups[0]["params"][0].device  # put everything on the same device, in case of model parallelism
        # NOTE(review): torch.stack receives an empty list when no parameter has a
        # gradient — confirm callers always run backward first.
        norm = torch.norm(
                    torch.stack([
                        ((torch.abs(p) if group["adaptive"] else 1.0) * p.grad).norm(p=2).to(shared_device)
                        for group in self.param_groups for p in group["params"]
                        if p.grad is not None
                    ]),
                    p=2
               )
        return norm

    def load_state_dict(self, state_dict):
        """Load state and re-link the base optimizer to the restored param groups."""
        super().load_state_dict(state_dict)
        self.base_optimizer.param_groups = self.param_groups

In [84]:
# Keys present in a collated training batch.
dd.keys()

dict_keys(['labels', 'lenghts', 'targets', 'material_code', 'company_code', 'country', 'region', 'manager_code', 'material_lvl1_name', 'material_lvl2_name', 'material_lvl3_name', 'contract_type', 'volume', 'month_int', 'day', 'quarter', 'volume_per_month'])

In [85]:
# Example of a padded feature: the shorter sequence is right-padded with zeros.
dd['month_int']

tensor([[ 1,  1,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,
          3,  3,  4,  4,  4,  4,  5,  5,  5,  5,  5, 11, 11, 11, 11, 12, 12, 12,
         12,  1,  1,  1,  1,  1,  2, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11,
         11, 11, 11, 11, 12, 12, 12, 12, 12, 12],
        [ 1,  1,  2,  3,  4,  7,  9, 10, 11,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])