# Regression Baseline

In [None]:
import os
# Install the third-party packages this notebook needs, one shell call per package.
for install_cmd in (
    'pip install iterative-stratification==0.1.7',
    'pip install transformers',
    'pip install sentencepiece',
):
    os.system(install_cmd)

In [1]:
import os
import pandas as pd
import numpy as np
import random
import torch
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from torch.utils.checkpoint import checkpoint
from torch import nn
from transformers import AdamW


from configs.regressor0_cfg import SimpleRegressorConfig
from src.models.regressors import BaselineRegressor
from src.models.layers import MeanPooling

from src.utils.trainer import Trainer

In [2]:
# get the config
# Build the project's regressor configuration with its default field values.
cfg = SimpleRegressorConfig()
# Bare expression so the notebook prints the config's repr as the cell output.
cfg

SimpleRegressorConfig(seed=1997, n_fold=5, target_columns=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'], data_dir='./data/fb3', save_dir='./outputs', load_dir='./outputs', model='microsoft/deberta-v3-base', criterion='l2', plm_size=768, gradient_checkpointing=False, apex=True, num_workers=4, epoch=10, batch_size=8, max_len=512, encoder_lr=1e-05, decoder_lr=1e-05, weight_decay=0.01, eps=1e-06, betas=(0.9, 0.999), scheduler='cosine', warmup_ratio=0.1, num_cycles=5, print_freq=20, device='cpu', max_grad_norm=1000)

In [3]:
# change cfg
# Experiment-specific overrides, applied to the default config in the order listed.
cfg_overrides = {
    # runtime and paths
    'device': 'cuda',
    'data_dir': './data/fb3',
    'save_dir': './outputs',
    # sequence length, schedule and loss
    'max_len': 768,
    'gradient_checkpointing': True,
    'epoch': 5,
    'num_cycles': 1.0,
    'encoder_lr': 2e-5,
    'decoder_lr': 5e-5,
    'warmup_ratio': 0.0,
    'criterion': 'l1',
    'seed': 42,
    'n_fold': 4,
    # layer-wise learning-rate decay settings read by get_optimizer
    'layerwise_lr': 2e-5,
    'layerwise_lr_decay': 0.9,
    'layerwise_weight_decay': 0.01,
    'layerwise_adam_epsilon': 1e-6,
    'layerwise_use_bertadam': False,
    # re-initialisation of the top encoder layers
    'reinit': True,
    'reinit_n': 1,
    # adversarial (FGM) training switch
    'fgm': False,
}
for field_name, field_value in cfg_overrides.items():
    setattr(cfg, field_name, field_value)

cfg

SimpleRegressorConfig(seed=42, n_fold=4, target_columns=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'], data_dir='./data/fb3', save_dir='./outputs', load_dir='./outputs', model='microsoft/deberta-v3-base', criterion='l1', plm_size=768, gradient_checkpointing=True, apex=True, num_workers=4, epoch=5, batch_size=8, max_len=768, encoder_lr=2e-05, decoder_lr=2e-05, weight_decay=0.01, eps=1e-06, betas=(0.9, 0.999), scheduler='cosine', warmup_ratio=0.0, num_cycles=1.0, print_freq=20, device='cpu', max_grad_norm=1000)

In [4]:
def seed_everything(seed = 42):
    """Seed the Python, NumPy and PyTorch generators and pin cuDNN settings.

    Args:
        seed: Integer seed handed to every generator (defaults to 42).
    """
    # Stored as a string because environment variables only hold text.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same seed for each random number generator used by the notebook.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    # Ask cuDNN for deterministic kernels and turn off its auto-tuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed every generator from the configured seed before the data split and model creation below.
seed_everything(cfg.seed)

In [5]:
# split the data
# Load the training table from the configured data directory.
train_csv_path = os.path.join(cfg.data_dir, 'train.csv')
train_df = pd.read_csv(train_csv_path)

# Stratify on all target columns at once so each fold sees a similar label mix.
Fold = MultilabelStratifiedKFold(n_splits=cfg.n_fold, shuffle=True, random_state=cfg.seed)
fold_targets = train_df[cfg.target_columns]
for n, (train_index, val_index) in enumerate(Fold.split(train_df, fold_targets)):
    # Tag each row with the index of the fold in which it is used for validation.
    train_df.loc[val_index, 'fold'] = int(n)

# The column is created through partial assignment, so cast it to int once it is complete.
train_df['fold'] = train_df['fold'].astype(int)

train_df.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,fold
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,2
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5,0
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5,1
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0,3
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5,3


In [6]:
class WeightedLayerPooling(nn.Module):
    """Collapse a sequence of per-layer hidden states into one weighted average.

    Only layers from index ``layer_start`` onward take part. Each gets one
    scalar weight, and the result is divided by the sum of the weights.

    Args:
        num_hidden_layers: Layer count used to size the default weights
            (``num_hidden_layers + 1 - layer_start`` entries).
        layer_start: Index of the first layer included in the average.
        layer_weights: Optional 1-D tensor of per-layer weights. When omitted,
            a learnable parameter initialised to ones is created.
    """

    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights = None):
        super().__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        if layer_weights is None:
            n_pooled = num_hidden_layers + 1 - layer_start
            layer_weights = nn.Parameter(torch.tensor([1] * n_pooled, dtype=torch.float))
        self.layer_weights = layer_weights

    def forward(self, ft_all_layers):
        """Return the weighted average of the selected layers.

        Args:
            ft_all_layers: Sequence of equally shaped 3-D tensors, one per layer.

        Returns:
            A tensor with the shape of a single layer's hidden states.
        """
        # Stack to 4-D (layer first), then drop the layers below layer_start.
        stacked = torch.stack(ft_all_layers)[self.layer_start:, :, :, :]
        # Give the weights three trailing singleton axes and match the stack's shape.
        per_layer = self.layer_weights[..., None, None, None].expand_as(stacked)
        numerator = (per_layer * stacked).sum(dim=0)
        return numerator / self.layer_weights.sum()

In [None]:
# attention layer
class AttentionPooling(nn.Module):
    """Pool token states with learned attention weights over the sequence axis.

    Args:
        in_dim: Size of the last dimension of the hidden states.
    """

    def __init__(self, in_dim):
        super().__init__()
        # Scoring network: one scalar score per token.
        scorer_layers = [
            nn.Linear(in_dim, in_dim),
            nn.LayerNorm(in_dim),
            nn.GELU(),
            nn.Linear(in_dim, 1),
        ]
        self.attention = nn.Sequential(*scorer_layers)

    def forward(self, last_hidden_state, attention_mask):
        """Return the attention-weighted sum of ``last_hidden_state`` along dim 1.

        Args:
            last_hidden_state: Token states; dim 1 is the axis pooled over.
            attention_mask: Mask whose zero entries mark tokens to ignore.

        Returns:
            The pooled tensor with dim 1 reduced away.
        """
        scores = self.attention(last_hidden_state).float()
        # Masked tokens get -inf so softmax assigns them zero weight.
        ignored = (attention_mask == 0).unsqueeze(-1)
        scores = scores.masked_fill(ignored, float('-inf'))
        weights = torch.softmax(scores, 1)
        return torch.sum(weights * last_hidden_state, dim=1)

In [7]:
class BaselineRegressor_LayerwiseLRD_WeightedLayer(BaselineRegressor):
    """Baseline regressor with weighted layer pooling and layer-wise LR decay.

    The forward pass averages encoder layers with ``WeightedLayerPooling``
    before the inherited pooling and regression head. ``get_optimizer``
    assigns a smaller learning rate to each encoder layer further from the output.
    """

    def __init__(
        self,
        cfg,
        cfg_path: str = None,
        use_pretrained: bool = None
    ):
        super().__init__(cfg, cfg_path, use_pretrained)
        # Average encoder layers from index 4 upward with learnable weights.
        self.weighted_pool = WeightedLayerPooling(self.model_config.num_hidden_layers, 4, None)

    def forward(self, inputs: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Run the encoder, mix its layers, pool over tokens and apply the head."""
        encoder_outputs = self.transformer(input_ids=inputs, attention_mask=attention_mask)
        # NOTE(review): index 1 is assumed to hold the per-layer hidden states - confirm
        # against how BaselineRegressor configures the transformer.
        mixed_layers = self.weighted_pool(encoder_outputs[1])
        pooled = self.pool(mixed_layers, attention_mask)
        return self.fc(pooled)

    def get_optimizer(self):
        """Build an AdamW optimizer with one pair of parameter groups per encoder layer.

        Group order: non-transformer parameters (decoder LR, no weight decay),
        then transformer parameters outside the embeddings and encoder layers,
        then each encoder layer from last to first followed by the embeddings.
        The learning rate is multiplied by ``cfg.layerwise_lr_decay`` after each layer.
        """
        cfg = self.cfg
        no_decay = ("bias", "LayerNorm.weight")

        def is_decay_exempt(param_name):
            # Biases and LayerNorm weights are excluded from weight decay.
            return any(marker in param_name for marker in no_decay)

        def decay_pair(named_params, group_lr):
            # Split parameters into a decayed group and a decay-free group sharing one LR.
            named_params = list(named_params)
            return [
                {
                    "params": [p for n, p in named_params if not is_decay_exempt(n)],
                    "weight_decay": cfg.layerwise_weight_decay,
                    "lr": group_lr,
                },
                {
                    "params": [p for n, p in named_params if is_decay_exempt(n)],
                    "weight_decay": 0.0,
                    "lr": group_lr,
                },
            ]

        # Task-specific parameters: everything whose name does not mention the transformer.
        head_params = [p for n, p in self.named_parameters() if "transformer" not in n]
        param_groups = [{"params": head_params, "weight_decay": 0.0, "lr": cfg.decoder_lr}]

        # Transformer parameters that live outside the embeddings and the encoder layers.
        leftover = [
            (n, p)
            for n, p in self.named_parameters()
            if "transformer" in n
            and "transformer.embeddings" not in n
            and "transformer.encoder.layer" not in n
        ]
        param_groups.extend(decay_pair(leftover, cfg.layerwise_lr))

        # Walk from the last encoder layer down to the embeddings, shrinking the LR each step.
        current_lr = cfg.layerwise_lr
        stack = [self.transformer.embeddings, *self.transformer.encoder.layer]
        for block in reversed(stack):
            param_groups.extend(decay_pair(block.named_parameters(), current_lr))
            current_lr *= cfg.layerwise_lr_decay

        return AdamW(
            param_groups,
            lr=cfg.layerwise_lr,
            eps=cfg.layerwise_adam_epsilon,
            correct_bias=not cfg.layerwise_use_bertadam,
        )

In [8]:
def re_initializing_layer(model, config, layer_num):
    """Re-initialise the last ``layer_num`` encoder layers of ``model`` in place.

    Linear and embedding weights are redrawn from a normal distribution with
    standard deviation ``config.initializer_range``. Biases are zeroed and
    LayerNorm modules are reset to weight 1 and bias 0.

    Args:
        model: Object exposing ``transformer.encoder.layer`` as a sliceable
            container of modules.
        config: Object exposing ``initializer_range``.
        layer_num: Number of layers to reset, counted from the end. Values of
            zero or less leave the model untouched.

    Returns:
        The same ``model`` object, for call chaining.
    """
    # ``layers[-0:]`` is the whole list, so without this guard a request for
    # zero layers would reset every encoder layer.
    if layer_num <= 0:
        return model

    for module in model.transformer.encoder.layer[-layer_num:].modules():
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()

        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=config.initializer_range)
            if module.padding_idx is not None:
                # Keep the padding row at zero, as at construction time.
                module.weight.data[module.padding_idx].zero_()

        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
    return model
# train for folds

In [9]:
class FGM:
    """Fast Gradient Method helper that perturbs embedding weights in place.

    ``attack`` shifts every trainable parameter whose name contains
    ``emb_name`` by ``epsilon * grad / ||grad||`` after saving a copy of its
    values. ``restore`` writes the saved values back and clears the backup.
    """

    def __init__(self, model):
        self.model = model
        # Parameter name -> copy of its values taken just before the perturbation.
        self.backup = {}

    def attack(self, epsilon = 1., emb_name = 'word_embeddings'):
        """Perturb matching parameters along their normalised gradient.

        Parameters without a gradient, or whose gradient norm is zero or
        non-finite, are left untouched.

        Args:
            epsilon: Length of the perturbation step.
            emb_name: Substring that selects parameters by name.
        """
        for name, param in self.model.named_parameters():
            if not (param.requires_grad and emb_name in name):
                continue
            if param.grad is None:
                # No backward pass has reached this parameter yet.
                continue
            norm = torch.norm(param.grad)
            if norm == 0 or not torch.isfinite(norm):
                # A zero norm gives no direction; inf/nan would corrupt the weights.
                continue
            self.backup[name] = param.data.clone()
            param.data.add_(epsilon * param.grad / norm)

    def restore(self, emb_name = 'word_embeddings'):
        """Write back the values saved by ``attack`` and clear the backup.

        Args:
            emb_name: Substring that selects parameters by name.
        """
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name and name in self.backup:
                param.data = self.backup[name]
        # Clear only after every parameter has been visited; clearing inside
        # the loop would drop backups that are still needed.
        self.backup = {}

In [10]:
import os
import time 
import logging
import gc
import torch
from torch import nn
import pandas as pd
from typing import Callable, Dict, List, Tuple, Union

from src.utils.train_utils import AverageMeter, KeepAll, to_cuda, count_parameters


class FGM_Trainer(Trainer):
    """Trainer whose optimisation step can add an FGM adversarial pass.

    When ``cfg.fgm`` is truthy, every training step runs a second
    forward/backward pass with perturbed embedding weights and accumulates
    its gradients before the optimizer update.
    """

    def __init__(self,
        cfg: object,
        model: nn.Module,
        fold: int,
        train_samples: Union[List, pd.DataFrame] = None,
        val_samples: Union[List, pd.DataFrame] = None,
        test_samples: Union[List, pd.DataFrame] = None,
        device: str = 'cpu',
        checkpoint_path: str = None,
        ):
        super().__init__(
            cfg,
            model,
            fold,
            train_samples,
            val_samples,
            test_samples,
            device,
            checkpoint_path,
        )

    @staticmethod
    def get_logger(log_file: str):
        """Return the 'trainer' logger writing plain messages to the stream and ``log_file``.

        Handlers left on the logger by an earlier call are removed and closed
        first, so messages are not duplicated and old log files are released.
        """
        logger = logging.getLogger('trainer')
        for stale_handler in list(logger.handlers):
            logger.removeHandler(stale_handler)
            stale_handler.close()
        logger.setLevel(logging.INFO)
        handler1 = logging.StreamHandler()
        handler1.setFormatter(logging.Formatter("%(message)s"))
        handler2 = logging.FileHandler(filename=f"{log_file}")
        handler2.setFormatter(logging.Formatter("%(message)s"))
        logger.addHandler(handler1)
        logger.addHandler(handler2)
        return logger

    def _optimize(
        self,
        batch: Dict,
        model: nn.Module,
        optimizer: torch.optim.Optimizer,
        scaler: object,
        criterion: nn.Module,
        scheduler: Union[torch.optim.lr_scheduler._LRScheduler, List],
        fgm: object = None
    ) -> Tuple[Dict, Dict]:
        """Run one optimisation step, optionally with an adversarial pass.

        Args:
            batch: Inputs forwarded to ``_model_train_step``.
            model: Model being trained.
            optimizer: Optimizer stepped through ``scaler``.
            scaler: Gradient scaler used for mixed precision.
            criterion: Loss module forwarded to ``_model_train_step``.
            scheduler: Learning-rate scheduler stepped once per call, or None.
            fgm: Object with ``attack()``/``restore()``; None skips the
                adversarial pass.

        Returns:
            The outputs dict and the detached losses dict. When ``fgm`` is
            given, both come from the adversarial pass.
        """
        with torch.cuda.amp.autocast(enabled=self.cfg.apex):
            outputs_dict, losses_dict = self._model_train_step(model, batch, criterion)
        scaler.scale(losses_dict["loss"]).backward()

        if fgm is not None:
            fgm.attack()
            try:
                with torch.cuda.amp.autocast(enabled=self.cfg.apex):
                    outputs_dict, losses_dict = self._model_train_step(model, batch, criterion)
                # Scale this loss too, so both passes accumulate gradients
                # with the same factor; backward runs outside autocast.
                scaler.scale(losses_dict["loss"]).backward()
            finally:
                # Always undo the perturbation, even if the adversarial pass fails.
                fgm.restore()

        # Unscale the gradients before clipping so the threshold applies to their true norm.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), self.cfg.max_grad_norm)

        scaler.step(optimizer)
        scaler.update()
        losses_dict["amp_scaler"] = scaler.get_scale()

        if scheduler is not None:
            scheduler.step()

        losses_dict_detached = self._detach_loss_dict(losses_dict)

        optimizer.zero_grad(set_to_none=True)
        return outputs_dict, losses_dict_detached

    def train_step(self, batch: Dict, fgm = None) -> Tuple[Dict, Dict, float]:
        """Optimise on one batch and return (outputs, losses, elapsed seconds).

        ``fgm`` defaults to None so the method can still be called with the
        batch alone.
        """
        start = time.time()
        outputs, losses = self._optimize(batch, self.model, self.optimizer, self.scaler, \
                                    self.criterion, self.scheduler, fgm)
        elapsed = time.time() - start
        # just in case
        self.model.zero_grad(set_to_none=True)
        self.steps_done += 1
        return outputs, losses, elapsed

    def train_epoch(self) -> None:
        """Train for one pass over ``self.train_loader`` and record the mean loss."""
        self.model.train()
        fgm = None
        # cfg.fgm is set ad hoc on the config, so tolerate configs that lack it.
        if getattr(self.cfg, 'fgm', False):
            # Attack the trainer's own model rather than a notebook global.
            fgm = FGM(self.model)
        train_losses_epoch = AverageMeter()
        start = time.time()
        for cur_step, batch in enumerate(self.train_loader):
            for k, v in batch.items():
                batch[k] = to_cuda(v, self.cfg.device)
            _, losses, elapsed = self.train_step(batch, fgm)

            train_losses_epoch.update(losses['loss'])
            if cur_step % self.cfg.print_freq == 0 or cur_step == (len(self.train_loader)-1):
                self.logger.info(f"Epoch: {self.epochs_done+1}[{cur_step}/{len(self.train_loader)}] Elapsed: {elapsed} Loss: {losses['loss']:.4f}")

        self.train_losses.append(train_losses_epoch.avg)
        epoch_time = time.time() - start
        self.logger.info(f'Epoch{self.epochs_done+1} overall info: avg_train_loss={train_losses_epoch.avg}; {epoch_time} seconds')
        # Release cached GPU memory and collect garbage between epochs.
        torch.cuda.empty_cache()
        gc.collect()

In [None]:
# train for folds
# Best score reported by the trainer for each fold that was run.
best_scores = []
for fold_id in range(cfg.n_fold):
    # Rows tagged with fold_id form the validation set; all other rows are used for training.
    fold_train = train_df[train_df['fold'] != fold_id].reset_index(drop=True)
    fold_valid = train_df[train_df['fold'] == fold_id].reset_index(drop=True)

    # Build a fresh model for each fold.
    model = BaselineRegressor_LayerwiseLRD_WeightedLayer(cfg, cfg_path=None, use_pretrained=True)
    if cfg.reinit:
        # Reset the weights of the last cfg.reinit_n encoder layers before training.
        model = re_initializing_layer(model, model.model_config, cfg.reinit_n)
        
    trainer = FGM_Trainer(
        cfg=cfg,
        model=model,
        fold=fold_id,
        train_samples=fold_train,
        val_samples=fold_valid,
        test_samples=None,
        device=cfg.device
    )
    trainer.logger.info(f"========== fold: {fold_id} training ==========")
    trainer.fit()
    best_scores.append(trainer.best_score)
    # NOTE(review): this unconditional break stops after the first fold, so
    # best_scores holds one entry - confirm that a single-fold run is intended.
    break

In [None]:
# Uses the logger of the last `trainer` created by the training cell, so that cell must run first.
trainer.logger.info(f"best scores={best_scores}")
# The CV score is the plain mean of the per-fold best scores collected above.
trainer.logger.info(f"cv score={np.mean(best_scores)}")