In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# !pip install torch==1.9.0

In [None]:
# TPU

# !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# !python pytorch-xla-env-setup.py --version 20210331 --apt-packages libomp5 libopenblas-dev
# !rm -rf /kaggle/working/*.whl
# !rm -rf /kaggle/working/pytorch-xla-env-setup.py
# !pip install accelerate

In [None]:
import os
import gc
import sys
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from time import time

import os
from pathlib import Path

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

import torch
# import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.nn import init
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler, Sampler
from torch.nn.functional import mse_loss

import transformers
from transformers import AutoModel, AutoTokenizer, get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup, AutoConfig, AdamW

# Probe for stochastic weight averaging helpers; SWA_AVAILABLE records whether
# the import succeeded.
# NOTE(review): only ImportError is handled. If the import fails with any other
# exception, the `finally` print references an unbound flag and raises NameError.
try:
    from torch.optim.swa_utils import AveragedModel, update_bn, SWALR
    SWA_AVAILABLE = True
except ImportError:
    SWA_AVAILABLE = False
finally:
    print(f"SWA Available :: {SWA_AVAILABLE}")
    
# Probe for the TPU stack (torch_xla + accelerate); XLA_AVAILABLE records whether
# every one of these imports succeeded. The names `xm`, `pl`, `xmp` and
# `Accelerator` only exist when XLA_AVAILABLE is True.
try: 
    import torch_xla.core.xla_model as xm
    import torch_xla.distributed.parallel_loader as pl
    import torch_xla.distributed.xla_multiprocessing as xmp
    from accelerate import Accelerator
    XLA_AVAILABLE = True
except ImportError:
    XLA_AVAILABLE = False
finally:
    print(f"XLA AVAILABLE :: {XLA_AVAILABLE}")
    

import matplotlib.pyplot as plt
import seaborn as sns
# NOTE(review): newer matplotlib releases renamed the bundled seaborn styles
# (e.g. 'seaborn-v0_8-talk'); confirm this name exists in the target environment.
plt.style.use('seaborn-talk')
# print(plt.style.available)

# Short aliases for colorama escape codes, used to colour console log lines.
from colorama import Fore, Back, Style
r_ = Fore.RED
b_ = Fore.BLUE
g_ = Fore.GREEN
y_ = Fore.YELLOW
w_ = Fore.WHITE
bb_ = Back.BLACK
by_ = Back.YELLOW
sr_ = Style.RESET_ALL

# Record the library versions in the notebook output.
print("torch: ", torch.__version__)
print("transformers: ", transformers.__version__)

In [None]:
# Path('./scripts').mkdir(exist_ok=True)
# Directory that receives model checkpoints and the exported config
# (see Trainer.save); created if it does not exist yet.
models_dir = Path('./model')
models_dir.mkdir(exist_ok=True)


#############################################################################
# Training data. RoBERTaDataset reads the 'excerpt', 'target' and
# 'standard_error' columns of this frame.
kfold_df = pd.read_csv('../input/d/racleray/cmlit-fold/train_data.csv')

# Remove incomplete entries if any.
# kfold_df.drop(kfold_df[(kfold_df.target == 0) & (kfold_df.standard_error == 0)].index, inplace=True)
# kfold_df.reset_index(drop=True, inplace=True)


# Alternative data set (disabled)
# kfold_df = pd.read_csv('/kaggle/input/cmlit-fold/kfold_parsed.csv')
# kfold_df["standard_error"] = kfold_df["fold"]


print("Trainset shape: ", kfold_df.shape)

## Common

In [None]:
class Config:
    """Notebook-wide hyper-parameters and switches, read as class attributes (e.g. ``Config.lr``)."""

    # Alternative backbone (disabled):
#     model_name = 'roberta-base'
#     pretrained_model_path = '/kaggle/input/comlitrobertabasescript/'
    
    # Active backbone; every model class loads its config and weights from
    # `pretrained_model_path`.
    model_name = 'roberta-large'
    pretrained_model_path = '../input/comlitrobertalargescript'
    
    output_hidden_states = True
    epochs = 3          # number of training epochs; also used to size the LR schedule
    num_labels = 1      # output width of the regression heads
    
    
    # NOTE(review): `use_tpu = True` together with `device = 'cuda'` looks
    # contradictory - confirm which one the training entry point actually honours.
    device = 'cuda'
    use_tpu = True
    
    
    seed = 42
    max_len = 256       # tokenizer max_length; sequences are truncated/padded to this size
    lr = 2e-5           # base learning rate, scaled per parameter group in create_optimizer
    weight_decay = 0.01
    # Hidden width of the attention pooling head - not the transformer's hidden size.
    head_hidden = 512
    
    
    # Warm-up steps for the cosine schedule; a negative value makes
    # create_optimizer fall back to 5% of the total training steps.
    warmup_steps = 50
    
    
    #batch_size = 24
    #eval_schedule = [(float('inf'), 10), (0.5, 4), (0.49, 3), (0.48, 2), (0.47, 1), (0, 0)]
    
    #For RAM saving
#     batch_size = 16
#     eval_schedule = [(float('inf'), 16), (0.5, 8), (0.49, 4), (0.48, 2), (0.47, 1), (0, 0)]
    
    #For RAM saving
    batch_size = 16
    # (loss threshold, evaluation interval in steps) pairs consumed by
    # EvaluationScheduler: the lower the validation loss, the more often we evaluate.
    eval_schedule = [(float('inf'), 16), (0.5, 8), (0.49, 5), (0.48, 2), (0.47, 1), (0, 0)]
    
    tolerance = 10
    
    
    # When True, the Weighted/Mean models average the regressor over 5 dropout samples.
    use_multi_sample_dropout = False
    
    # Weighted-layer-pooling settings; the values match roberta-base (12 layers).
    # NOTE(review): the active backbone above is roberta-large - confirm these are
    # still the intended values.
    num_hidden_layers = 12
    layer_start = 9
    
    
    # stochastic weight averaging
    swa = False
    swa_start = 2
    swa_learning_rate = 2e-4
    anneal_epochs = 1
    anneal_strategy='cos'

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices) and
    switches cuDNN to deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # The interpreter reads PYTHONHASHSEED (the original "PYTHONASSEED" is not a
    # recognised variable). Setting it here only affects processes started afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # cover multi-GPU runs as well
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels between runs,
    # which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=Config.seed)

### Dataset

In [None]:
class RoBERTaDataset(Dataset):
    """Map-style dataset that tokenizes excerpts on the fly.

    Args:
        df: Frame with an ``excerpt`` column; unless ``for_test`` is set it must
            also provide ``standard_error`` and ``target``.
        tokenizer: Object exposing ``encode_plus`` (a HuggingFace tokenizer).
        for_test: When True, items carry only the model inputs (no labels).
    """

    def __init__(self, df, tokenizer, for_test=False):
        super().__init__()
        self.tokenizer = tokenizer
        self.for_test = for_test
        self.text = df['excerpt'].values
        if not self.for_test:
            self.std_err = df['standard_error'].values
            self.target = df['target'].values

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        # Collapse every run of whitespace (including newlines) into one space.
        cleaned = ' '.join(self.text[index].split())
        encoded = self.tokenizer.encode_plus(
            cleaned,
            None,
            truncation=True,
            add_special_tokens=True,
            max_length=Config.max_len,
            padding='max_length',
            return_token_type_ids=True,
        )

        sample = {
            'input_ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
        }
        if self.for_test:
            return sample

        # Training / validation items additionally carry the regression targets.
        sample['std_err'] = torch.tensor(self.std_err[index], dtype=torch.float)
        sample['label'] = torch.tensor(self.target[index], dtype=torch.float)
        return sample

### Model 1

In [None]:
# Bad
# class AttentionAggregation(nn.Module):
#     def __init__(self, d_model):
#         super().__init__()
#         self.query = nn.Linear(d_model, 1, bias=False)

#     def forward(self, x):  # (b, s, m)
#         attns = self.query(x).softmax(dim=1)  # (b, s, 1)
#         enc = torch.bmm(attns.transpose(1, 2), x)  # (b, 1, m)
#         return enc.squeeze(1)

两个都是 attn 模型，只是实现格式稍微不同

In [None]:
import torch
import torch.nn as nn


class AttentionHead_Ori(nn.Module):
    """Additive attention pooling over dimension 1 of the input.

    Each position is scored with ``V(tanh(W(x)))``; the scores are softmax-ed
    along dimension 1 and used to take a weighted sum of the input features.
    No padding mask is applied.
    """

    def __init__(self, h_size, hidden_dim=512):
        super().__init__()
        self.W = nn.Linear(h_size, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features):
        # One scalar score per position, normalised across dimension 1.
        scores = self.V(torch.tanh(self.W(features)))
        weights = torch.softmax(scores, dim=1)
        # Weighted sum over dimension 1 (the weights broadcast over the last axis).
        return (weights * features).sum(dim=1)

class CLRPModel(nn.Module):
    """Transformer backbone + attention pooling + linear regression head.

    The backbone is loaded from ``Config.pretrained_model_path`` with hidden
    dropout disabled. ``forward`` returns one value per input sequence.
    """

    def __init__(self):
        super(CLRPModel,self).__init__()
        config = AutoConfig.from_pretrained(Config.pretrained_model_path)
        config.update({
                "output_hidden_states":True,
                "hidden_dropout_prob": 0.0,
                "layer_norm_eps": 1e-7
                }) 
        # Trainer.save() exports `model.config`; keep a reference so this model
        # can be checkpointed like the other model classes.
        self.config = config
        self.h_size = config.hidden_size
        self.transformer = AutoModel.from_pretrained(Config.pretrained_model_path, config=config)  
        self.head = AttentionHead_Ori(self.h_size)
        self.linear = nn.Linear(self.h_size, 1)

    def forward(self, input_ids, attention_mask):
        transformer_out = self.transformer(input_ids, attention_mask)
        # Pool the last layer's token states into one vector per sequence.
        x = self.head(transformer_out.last_hidden_state)
        x = self.linear(x)
        return x

In [None]:
class LitModel(nn.Module):
    """Model code used for the LB 0.467 submission.

    Backbone loaded from ``Config.pretrained_model_path`` followed by additive
    attention pooling over the last hidden layer and a linear regressor.
    """
    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(Config.pretrained_model_path)
        config.update({"output_hidden_states":True, 
                       "hidden_dropout_prob": 0.0,
                       "layer_norm_eps": 1e-7})                       
        # Trainer.save() exports `model.config`, so keep a reference to it.
        self.config = config

        self.roberta = AutoModel.from_pretrained(Config.pretrained_model_path, config=config)  

        # Size the head from the backbone instead of hard-coding 768: the
        # configured backbone is roberta-large, whose hidden size is 1024.
        hidden_size = config.hidden_size

        self.attention = nn.Sequential(            
            nn.Linear(hidden_size, 512),            
            nn.Tanh(),                       
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )        

        self.regressor = nn.Sequential(                        
            nn.Linear(hidden_size, 1)                        
        )
        

    def forward(self, input_ids, attention_mask):
        roberta_output = self.roberta(input_ids=input_ids,
                                      attention_mask=attention_mask)        

        # Attention weights are computed per position and normalised over dim 1.
        last_layer_hidden_states = roberta_output.hidden_states[-1]
        weights = self.attention(last_layer_hidden_states)
        context_vector = torch.sum(weights * last_layer_hidden_states, dim=1)        
        
        return self.regressor(context_vector)

### Weighted

In [None]:
class WeightedLayerPooling(nn.Module):
    """Weighted average of the hidden states from ``layer_start`` onwards.

    Args:
        num_hidden_layers: Number of transformer layers; the pooled range has
            ``num_hidden_layers + 1 - layer_start`` entries.
        layer_start: Index of the first hidden state included in the average.
        layer_weights: Optional pre-built weights, one per pooled layer. When
            omitted, a trainable vector of ones is created.
    """

    def __init__(self, num_hidden_layers, layer_start, layer_weights=None):
        super().__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        if layer_weights is None:
            n_pooled = num_hidden_layers + 1 - layer_start
            layer_weights = nn.Parameter(torch.tensor([1] * n_pooled, dtype=torch.float))
        self.layer_weights = layer_weights

    def forward(self, all_hidden_states, attention_mask):
        # `attention_mask` is accepted for interface symmetry but not used here.
        stacked = torch.stack(all_hidden_states[self.layer_start:], dim=0)
        # Broadcast the per-layer weights over the three trailing dimensions.
        per_layer = self.layer_weights[:, None, None, None].expand(stacked.size())
        return (per_layer * stacked).sum(dim=0) / self.layer_weights.sum()
    
    
    
class WeightedModel(nn.Module):
    """Backbone + weighted layer pooling + linear regression head.

    The hidden states from ``Config.layer_start`` onwards are averaged with
    learnable weights; position 0 of the pooled sequence is layer-normalised
    and fed to the regressor.
    """

    def __init__(self):
        super().__init__()
        self.config = AutoConfig.from_pretrained(Config.pretrained_model_path)
        self.config.update({
                "output_hidden_states": True,
                "hidden_dropout_prob": 0.1,
                "attention_probs_dropout_prob": 0.1,
                "layer_norm_eps": 1e-7
                }) 
        self.roberta = AutoModel.from_pretrained(Config.pretrained_model_path,
                                                 config=self.config)
        self.layer_norm = nn.LayerNorm(self.config.hidden_size, eps=1e-5)

        # Take the layer count from the loaded backbone rather than the
        # hand-maintained Config value: the number of pooling weights must equal
        # the number of hidden states actually returned, otherwise `expand` in
        # WeightedLayerPooling fails (e.g. a 12-layer setting with roberta-large).
        self.pooler = WeightedLayerPooling(self.config.num_hidden_layers, Config.layer_start)

        self.low_dropout = nn.Dropout(0.1)
        self.dropout = nn.Dropout(p=0.5)
        self.regressor = nn.Linear(self.config.hidden_size, Config.num_labels)

        self.std = 0.02
        self._init_weights(self.layer_norm)
        self._init_weights(self.regressor)

    def _init_weights(self, module):
        """Kaiming-initialise a Linear layer or reset a LayerNorm to identity."""
        if isinstance(module, nn.Linear):
            # module.weight.data.normal_(mean=0.0, std=self.std)
            # Fixed: the original referenced the undefined name `m` here, which
            # raised NameError as soon as the regressor was initialised.
            init.kaiming_normal_(module.weight, mode='fan_in')
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        
        ## WeightedLayerPooling
        # outputs[2] is read as the tuple of per-layer hidden states.
        hidden_states = outputs[2]
        # Keep only position 0 of the pooled sequence.
        pool_out = self.pooler(hidden_states, attention_mask)[:, 0]
        # pool_out = self.low_dropout(pool_out)
        pool_out = self.layer_norm(pool_out)
            
        if Config.use_multi_sample_dropout:
            # Average the regressor output over 5 independent dropout masks.
            logits = torch.mean(torch.stack([self.regressor(self.dropout(pool_out)) for _ in range(5)], dim=0), dim=0)
        else:
            logits = self.regressor(pool_out)

        return logits

### Mean

##### V1: This is weaker

In [None]:
# V1 large :  inference model

class MeanPooling(nn.Module):
    """Mask-aware mean over dimension 1 of the hidden states.

    ``hidden_dim`` is accepted for interface compatibility but is not used.
    """

    def __init__(self, hidden_dim):
        super().__init__()

    def forward(self, hidden_state, attention_mask):
        # Broadcast the mask to the hidden-state shape so masked positions
        # contribute nothing to the sum.
        mask = attention_mask.unsqueeze(-1).expand(hidden_state.size()).float()
        masked_sum = (hidden_state * mask).sum(1)
        # Clamp the count so a fully masked row divides by 1e-9 instead of zero.
        valid_count = mask.sum(1).clamp(min=1e-9)
        return masked_sum / valid_count

    
class MeanModel(nn.Module):
    """Backbone + masked mean pooling + single linear regressor (V1).

    ``layer_norm`` and ``low_dropout`` are created but not applied in
    ``forward``.
    """

    def __init__(self):
        super().__init__()
        self.config = AutoConfig.from_pretrained(Config.pretrained_model_path)
        self.config.update({
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "attention_probs_dropout_prob": 0.1,
            "layer_norm_eps": 1e-7,
        })
        self.roberta = AutoModel.from_pretrained(
            Config.pretrained_model_path, config=self.config
        )

        hidden_size = self.config.hidden_size
        self.layer_norm = nn.LayerNorm(hidden_size, eps=1e-7)
        self.pooler = MeanPooling(hidden_size)

        self.low_dropout = nn.Dropout(0.1)
        self.dropout = nn.Dropout(p=0.5)
        self.regressor = nn.Linear(hidden_size, Config.num_labels)

        self.std = 0.02
        self._init_weights(self.regressor)

    def _init_weights(self, module):
        """Kaiming-initialise a Linear layer or reset a LayerNorm to identity."""
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            return
        if isinstance(module, nn.Linear):
            init.kaiming_normal_(module.weight, mode='fan_in')
            if module.bias is not None:
                module.bias.data.zero_()

    def forward(self, input_ids, attention_mask):
        encoder_out = self.roberta(input_ids, attention_mask=attention_mask)

        # Mean-pool the first backbone output over the unmasked positions.
        pooled = self.pooler(encoder_out[0], attention_mask)

        if not Config.use_multi_sample_dropout:
            return self.regressor(pooled)

        # Multi-sample dropout: average 5 regressor passes with fresh masks.
        samples = [self.regressor(self.dropout(pooled)) for _ in range(5)]
        return torch.stack(samples, dim=0).mean(dim=0)

In [None]:
class MeanModelEmbedding(nn.Module):
    """Same construction as ``MeanModel`` but ``forward`` returns the pooled embedding.

    The regressor, dropout and layer-norm modules are still created (so the
    module layout matches ``MeanModel``) but are not applied in ``forward``.
    """

    def __init__(self):
        super().__init__()
        self.config = AutoConfig.from_pretrained(Config.pretrained_model_path)
        self.config.update({
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "attention_probs_dropout_prob": 0.1,
            "layer_norm_eps": 1e-7,
        })
        self.roberta = AutoModel.from_pretrained(
            Config.pretrained_model_path, config=self.config
        )

        hidden_size = self.config.hidden_size
        self.layer_norm = nn.LayerNorm(hidden_size, eps=1e-7)
        self.pooler = MeanPooling(hidden_size)

        self.low_dropout = nn.Dropout(0.1)
        self.dropout = nn.Dropout(p=0.5)
        self.regressor = nn.Linear(hidden_size, Config.num_labels)

        self.std = 0.02
        self._init_weights(self.regressor)

    def _init_weights(self, module):
        """Kaiming-initialise a Linear layer or reset a LayerNorm to identity."""
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            return
        if isinstance(module, nn.Linear):
            init.kaiming_normal_(module.weight, mode='fan_in')
            if module.bias is not None:
                module.bias.data.zero_()

    def forward(self, input_ids, attention_mask):
        encoder_out = self.roberta(input_ids, attention_mask=attention_mask)
        # Return the masked mean of the first backbone output as the embedding.
        return self.pooler(encoder_out[0], attention_mask)

###### V2

In [None]:
# V2 large :  inference model

class MeanPooling(nn.Module):
    """Average the hidden states over dimension 1, ignoring masked positions.

    ``hidden_dim`` is part of the constructor signature but is not used.
    """

    def __init__(self, hidden_dim):
        super().__init__()

    def forward(self, hidden_state, attention_mask):
        # Expand the mask to the hidden-state shape and zero out masked positions.
        expanded_mask = attention_mask.unsqueeze(-1).expand(hidden_state.size()).float()
        summed = (hidden_state * expanded_mask).sum(1)
        # Lower-bound the denominator so fully masked rows do not divide by zero.
        denom = expanded_mask.sum(1).clamp(min=1e-9)
        return summed / denom




#Mish - "Mish: A Self Regularized Non-Monotonic Neural Activation Function"
#https://arxiv.org/abs/1908.08681v1
class Mish(nn.Module):
    """Mish activation: ``x * tanh(softplus(x))``."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        gate = torch.tanh(F.softplus(x))
        return x * gate



# def gelu(x):
#     """Implementation of the gelu activation function.
#             For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
#             0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
#     """
#     return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2)))


# class GeLU(nn.Module):
#     def __init__(self):
#         super().__init__()

#     def forward(self, x):
#         return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2)))
    
    
    
class MeanModel_v2(nn.Module):
    """Backbone + masked mean pooling + LayerNorm + two-layer Mish regressor (V2).

    ``dropout`` is created but not applied in ``forward``.
    """

    def __init__(self):
        super().__init__()
        self.config = AutoConfig.from_pretrained(Config.pretrained_model_path)
        self.config.update({
                "output_hidden_states": True,
                "hidden_dropout_prob": 0.0,
                "attention_probs_dropout_prob": 0.1,
                "layer_norm_eps": 1e-7
                }) 
        self.roberta = AutoModel.from_pretrained(Config.pretrained_model_path,
                                                 config=self.config)
        
        self.pooler = MeanPooling(self.config.hidden_size)
        self.layer_norm = nn.LayerNorm(self.config.hidden_size, eps=1e-5)
        
        self.dropout = nn.Dropout(0.5)
        self.regressor = nn.Sequential(
                nn.Linear(self.config.hidden_size, 512),
                Mish(),
                nn.Linear(512, 1)
        )
        
        self.std = 0.02
        # The regressor is an nn.Sequential, which matches neither isinstance
        # branch of _init_weights; calling it directly was a silent no-op.
        # `apply` visits every sub-module so both Linear layers are initialised.
        self.regressor.apply(self._init_weights)

    def _init_weights(self, module):
        """Kaiming-initialise a Linear layer or reset a LayerNorm to identity."""
        if isinstance(module, nn.Linear):
            # module.weight.data.normal_(mean=0.0, std=self.std)
            init.kaiming_normal_(module.weight, mode='fan_in')
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        
        ## MeanPooling
        hidden_states = outputs[0]
        pool_out = self.pooler(hidden_states, attention_mask)
        # pool_out = self.dropout(pool_out)
        pool_out = self.layer_norm(pool_out)
        logits = self.regressor(pool_out)

        return logits

### AttModel

In [None]:
class AttentionHead(nn.Module):
    """Additive attention pooling that ignores masked (padding) positions.

    Args:
        in_features: Size of the last dimension of the incoming hidden states.
        hidden_dim: Width of the intermediate tanh projection.
    """

    def __init__(self, in_features, hidden_dim):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim

        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        # NOTE(review): forward() returns vectors of size `in_features`, not
        # `hidden_dim`; the attribute is kept unchanged for compatibility.
        self.out_features = hidden_dim

        self.std = 0.02
#         self.init_weights()

    def init_weights(self):
        """Optional re-initialisation of the head's layers (not called by default)."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                init.normal_(m.weight, std=self.std)
                # init.kaiming_normal_(m.weight, mode='fan_in')
                if m.bias is not None:
                    init.constant_(m.bias, 0)

    def forward(self, hidden_state, attention_mask):
        """Return the attention-weighted sum of ``hidden_state`` over dimension 1.

        ``attention_mask`` may be boolean or integer (non-zero = keep).
        """
        att = torch.tanh(self.W(hidden_state))
        score = self.V(att)

        # Convert to bool before negating. The dataset yields an integer mask,
        # and `~` on integers is a bitwise NOT (1 -> -2, 0 -> -1); indexing with
        # that result overwrote whole batch rows instead of padding positions.
        keep = attention_mask.unsqueeze(-1).bool()
        # Use the dtype's own minimum so the fill value is also valid in fp16.
        score = score.masked_fill(~keep, torch.finfo(score.dtype).min)

        attention_weights = torch.softmax(score, dim=1)
        context_vector = attention_weights * hidden_state
        context_vector = torch.sum(context_vector, dim=1)

        return context_vector
    
    
class AttModel(nn.Module):
    """Backbone + masked attention pooling + single linear regressor.

    ``attn_type`` is accepted but not used. ``dropout``, ``layer_norm`` and
    ``m_dropout`` are created but not applied in ``forward``, and
    ``_init_weights`` is defined but not invoked by the constructor.
    """

    def __init__(self, attn_type='tradition'):
        super().__init__()
        self.config = AutoConfig.from_pretrained(Config.pretrained_model_path)
        self.config.update({
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "attention_probs_dropout_prob": 0.1,
            "layer_norm_eps": 1e-7,
        })
        self.roberta = AutoModel.from_pretrained(
            Config.pretrained_model_path, config=self.config
        )

        hidden_size = self.config.hidden_size
        self.head = AttentionHead(hidden_size, Config.head_hidden)

        self.dropout = nn.Dropout(p=0.1)
        self.layer_norm = nn.LayerNorm(hidden_size, eps=1e-5)

        self.regressor = nn.Linear(hidden_size, Config.num_labels)

        self.m_dropout = nn.Dropout(p=0.5)

        self.std = 0.02

    def _init_weights(self, module):
        """Normal(0, self.std) init for Linear layers; identity reset for LayerNorm."""
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            return
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.std)
            if module.bias is not None:
                module.bias.data.zero_()

    def forward(self, input_ids, attention_mask):
        encoder_out = self.roberta(input_ids, attention_mask=attention_mask)
        # Last entry of the per-layer hidden states returned at index 2.
        last_layer = encoder_out[2][-1]

        pooled = self.head(last_layer, attention_mask)
        return self.regressor(pooled)

### Optim

In [None]:
# def create_optimizer(model, train_loader_len):
#     named_parameters = list(model.named_parameters())    
    
#     roberta_parameters = named_parameters[:197]    
#     attention_parameters = named_parameters[199:203]
#     regressor_parameters = named_parameters[203:]
        
#     attention_group = [params for (name, params) in attention_parameters]
#     regressor_group = [params for (name, params) in regressor_parameters]

#     parameters = []
#     parameters.append({"params": attention_group})
#     parameters.append({"params": regressor_group})

#     for layer_num, (name, params) in enumerate(roberta_parameters):
#         weight_decay = 0.0 if "bias" in name else 0.01

#         lr = Config.lr

#         if layer_num >= 69:        
#             lr = Config.lr * 2.5

#         if layer_num >= 133:
#             lr = Config.lr * 5

#         parameters.append({"params": params,
#                            "weight_decay": weight_decay,
#                            "lr": lr})
        
#     max_train_steps = Config.epochs * train_loader_len
#     warmup_steps = Config.warmup_steps if Config.warmup_steps >= 0 else math.ceil(max_train_steps * 0.05)
#     # print(">>Config max_train_steps: ", max_train_steps)
#     # print(">>Config warmup_steps: ", warmup_steps)
    
#     optimizer = optim.AdamW(parameters)
#     # Defining LR Scheduler
#     scheduler = get_cosine_schedule_with_warmup(
#         optimizer,
#         num_warmup_steps=warmup_steps,
#         num_training_steps=max_train_steps
#     )

#     return optimizer, scheduler

In [None]:
def create_optimizer(model, train_loader_len, custom_multiples=50, group_diff=1.5):
    """Build an AdamW optimizer with per-depth learning rates and a cosine schedule.

    Parameters are split by name into three backbone depth bands (embeddings and
    layers 0-7, layers 8-15, layers 16-23) plus everything else (the heads), and
    each of those is split again into decayed / non-decayed parameters.

    Args:
        model: Module whose ``named_parameters()`` are optimised.
        train_loader_len: Number of optimisation steps per epoch.
        custom_multiples: LR multiplier for parameters outside the backbone bands.
        group_diff: LR multiplier between depth bands (the original notes
            suggest 2.5 for base and 1.5 for large backbones).

    Returns:
        ``(optimizer, scheduler)``.
    """
    named_params = list(model.named_parameters())

    no_decay = ["bias", "gamma", "beta", "LayerNorm.weight"]

    # Name fragments identifying each depth band of a 24-layer backbone.
    lower_band = ['embeddings', 'layer.0.','layer.1.','layer.2.','layer.3.','layer.4.','layer.5.','layer.6.','layer.7.']
    middle_band = ['layer.8.','layer.9.','layer.10.','layer.11.','layer.12.','layer.13.','layer.14.','layer.15.']
    upper_band = ['layer.16.','layer.17.','layer.18.','layer.19.','layer.20.','layer.21.','layer.22.','layer.23.']
    all_bands = lower_band + middle_band + upper_band

    def _matches(name, fragments):
        return any(fragment in name for fragment in fragments)

    def _select(decayed, fragments, inside):
        # Parameters whose decay status and band membership both match.
        return [
            param for name, param in named_params
            if _matches(name, no_decay) != decayed and _matches(name, fragments) == inside
        ]

    # (name fragments, must-match flag, learning rate) for each band, in group order.
    # NOTE(review): the lower band gets lr * group_diff, i.e. a higher rate than
    # the middle band - confirm this is intended rather than lr / group_diff.
    lr_plan = [
        (all_bands, False, Config.lr * custom_multiples),
        (lower_band, True, Config.lr * group_diff),
        (middle_band, True, Config.lr),
        (upper_band, True, Config.lr * group_diff * group_diff),
    ]

    # Decayed groups first, then the weight-decay-free groups, same band order.
    optimizer_grouped_parameters = [
        {'params': _select(decayed, fragments, inside),
         'weight_decay': Config.weight_decay if decayed else 0.0,
         'lr': band_lr}
        for decayed in (True, False)
        for fragments, inside, band_lr in lr_plan
    ]

    optimizer = optim.AdamW(optimizer_grouped_parameters, lr=Config.lr)

    max_train_steps = Config.epochs * train_loader_len

    # A negative Config.warmup_steps means "use 5% of the total steps".
    if Config.warmup_steps >= 0:
        warmup_steps = Config.warmup_steps
    else:
        warmup_steps = math.ceil(max_train_steps * 0.05)

    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=max_train_steps
    )

    return optimizer, scheduler

### EvaluationScheduler

In [None]:
import math

class AvgCounter:
    """Running sample-weighted loss accumulator whose average is reported as an RMSE."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the accumulated loss and sample count."""
        self.loss = 0
        self.n_samples = 0

    def update(self, loss, n_samples):
        """Add a batch's mean loss, weighted by the number of samples in it."""
        self.n_samples += n_samples
        self.loss += loss * n_samples

    def avg(self):
        """Return the square root of the sample-weighted mean loss."""
        mean_loss = self.loss / self.n_samples
        return math.sqrt(mean_loss) # rmse


class EvaluationScheduler:
    """Decide when to run validation, evaluating more often as the loss drops.

    Args:
        evaluation_schedule: List of ``(loss_threshold, interval)`` pairs sorted
            by decreasing threshold. A loss in ``[next_threshold, threshold)``
            selects ``interval`` (in steps). The last pair only provides the
            lower bound of the final bucket.
        penalize_factor: Stored for the optional penalty logic (currently disabled).
        max_penalty: Stored for the optional penalty logic (currently disabled).
    """

    def __init__(self, evaluation_schedule, penalize_factor=1, max_penalty=8):
        self.evaluation_schedule = evaluation_schedule
        self.evaluation_interval = self.evaluation_schedule[0][1]
        self.last_evaluation_step = 0
        self.prev_loss = float('inf')
        self.penalize_factor = penalize_factor
        self.penalty = 0
        self.prev_interval = -1
        self.max_penalty = max_penalty

    def step(self, step):
        """Return True (and remember ``step``) when an evaluation is due."""
        if step >= self.last_evaluation_step + self.evaluation_interval:
            self.last_evaluation_step = step
            return True
        return False

    def update_evaluation_interval(self, last_loss):
        """Pick the evaluation interval of the bucket that contains ``last_loss``.

        If no bucket matches (e.g. the loss is NaN) the interval is left as is
        and ``prev_interval`` is set to -1.
        """
        cur_interval = -1
        buckets = zip(self.evaluation_schedule[:-1], self.evaluation_schedule[1:])
        for i, ((upper, interval), (lower, _)) in enumerate(buckets):
            # The lower bound is inclusive so a loss exactly equal to a threshold
            # still lands in a bucket; with strict bounds on both sides such a
            # loss matched nothing and the interval was silently not updated.
            if lower <= last_loss < upper:
                self.evaluation_interval = interval
                cur_interval = i
                break

        self.prev_loss = last_loss
        self.prev_interval = cur_interval

### Trainer

In [None]:
class FGM():
    """Fast Gradient Method: perturb embedding weights along their gradient.

    Call ``attack`` after a backward pass to add an epsilon-scaled,
    norm-normalised gradient step to the matching parameters, run the
    adversarial forward/backward, then call ``restore`` to put the original
    weights back.
    """

    def __init__(self, model):
        self.model = model
        self.backup = {}

    def attack(self, epsilon=0.001, emb_name='word_embeddings.'):
        """Perturb every trainable parameter whose name contains ``emb_name``."""
        for name, param in self.model.named_parameters():
            if not (param.requires_grad and emb_name in name):
                continue
            # No backward pass has populated this parameter yet: nothing to
            # perturb (the original crashed in torch.norm(None) here).
            if param.grad is None:
                continue
            norm = torch.norm(param.grad)
            # Skip zero gradients and non-finite norms (e.g. overflowed grads),
            # which would otherwise write NaN/Inf into the embedding weights.
            if norm == 0 or not torch.isfinite(norm):
                continue
            self.backup[name] = param.data.clone()
            r_at = epsilon * param.grad / norm
            param.data.add_(r_at)

    def restore(self, emb_name='word_embeddings.'):
        """Undo the perturbation applied by the last ``attack`` call."""
        for name, param in self.model.named_parameters():
            # Only parameters that were actually perturbed have a backup.
            if param.requires_grad and emb_name in name and name in self.backup:
                param.data = self.backup[name]
        self.backup = {}

In [None]:
def mse_loss(y_true, y_pred, standard_error=None):
    """Mean squared error between targets and predictions.

    Both tensors are flattened first (as ``gll_loss`` does), so a ``(B, 1)``
    prediction compared with a ``(B,)`` target is not silently broadcast to a
    ``(B, B)`` matrix. ``standard_error`` is accepted only so the signature
    matches ``gll_loss``; it is ignored.
    """
    return nn.functional.mse_loss(y_true.reshape(-1), y_pred.reshape(-1))


def gll_loss(y_true, y_pred, standard_error):
    """Gaussian negative log-likelihood of the targets under the predictions.

    All three tensors are flattened; the squared ``standard_error`` is used as
    the per-sample variance.
    """
    criterion = torch.nn.GaussianNLLLoss()
    flat_pred = y_pred.view(-1)
    flat_true = y_true.view(-1)
    variance = standard_error.view(-1) ** 2
    return criterion(input=flat_pred, target=flat_true, var=variance)


def metrics(y_true, y_pred):
    """Evaluation metric: mean squared error between targets and predictions.

    Inputs are flattened first so that ``(B, 1)`` predictions and ``(B,)``
    targets are compared element-wise instead of being broadcast to ``(B, B)``.
    """
    return nn.functional.mse_loss(y_true.reshape(-1), y_pred.reshape(-1))

In [None]:
class Trainer:
    """Training loop for one model: optimisation steps, scheduled validation and checkpointing.

    Batches are expected to be dicts with the keys ``input_ids``,
    ``attention_mask``, ``label`` and (for training) ``std_err``.
    ``criterion`` is called as ``criterion(labels, preds, std_err)`` and
    ``metrics`` as ``metrics(labels, preds)``.
    """

    # The best validation loss has to drop below this value before a checkpoint is written.
    SAVE_THRESHOLD = 0.495

    def __init__(self, train_dl, val_dl, model, optimizer, scheduler, criterion, metrics, model_num,
                 use_gll=False, use_fgm=False, 
                 swa_model=None, swa_scheduler=None,
                 use_tpu=False, device="cuda"):
        self.train_dl = train_dl
        self.val_dl = val_dl
        self.model = model
        
        self.use_fgm = use_fgm
        if use_fgm:
            self.fgm = FGM(self.model)
        # SWA is enabled simply by handing in an averaged model.
        self.use_swa = False
        if swa_model:
            self.use_swa = True
            self.swa_model = swa_model
            self.swa_scheduler = swa_scheduler
        
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.device = device
        self.batches_per_epoch = len(self.train_dl)
        self.criterion = criterion
        # When True, the reported train/validation numbers come from `metrics`
        # rather than from `criterion`.
        self.use_gll = use_gll
        self.metrics = metrics
        self.model_num = model_num
        
        self.use_tpu = use_tpu
        if use_tpu and XLA_AVAILABLE:
            xm.master_print(f">>> {device} is used")

    def log(self, string):
        """Print ``string`` (through ``xm.master_print`` when running on TPU)."""
        if self.use_tpu:
            xm.master_print(string)
        else:
            print(string)

    def save(self):
        """Write the model weights and its config into ``models_dir``."""
        weights_path = models_dir / f'model_{self.model_num}.bin'
        serialize = xm.save if self.use_tpu else torch.save
        serialize(self.model.state_dict(), weights_path)
        self.model.config.to_json_file(models_dir / 'config.json', use_diff=True)

    def run(self, begin_val_epoch=3):
        """Train for ``Config.epochs`` epochs.

        Validation only starts at epoch ``begin_val_epoch`` (1-based) and then
        runs whenever the evaluation scheduler fires.

        Returns:
            ``(record_info, best_val_loss)`` where ``record_info`` maps
            ``'train_loss'``, ``'val_loss'`` and ``'swa_loss'`` to lists of
            ``(step, loss)`` pairs.
        """
        record_info = {'train_loss': [], 'val_loss': [], 'swa_loss': []}
        
        best_val_loss = float('inf')
        evaluation_scheduler = EvaluationScheduler(Config.eval_schedule)
        train_loss_counter = AvgCounter()
        step = 0            
        
        for epoch in range(Config.epochs):
            self.log(f'{r_}Epoch: {epoch+1}/{Config.epochs}{sr_}')
    
            start_epoch_time = time()
            # Number of consecutive evaluations (within this epoch) without improvement.
            tolerance = 0
            
            for batch_num, batch in enumerate(self.train_dl):
                train_loss = self.train_step(batch, epoch)

                # `batch` is a dict, so len(batch) would be the number of keys;
                # weight the running average by the number of samples instead.
                train_loss_counter.update(train_loss, len(batch['label']))
                record_info['train_loss'].append((step, train_loss))
                
                # Validate only when the evaluation schedule says so.
                if (epoch + 1) >= begin_val_epoch and evaluation_scheduler.step(step):  
                    tolerance += 1
                    val_loss = self.evaluate()
                    record_info['val_loss'].append((step, val_loss))
                    self.log(f'\t\t{epoch+1}#[{batch_num+1}/{self.batches_per_epoch}]: train loss - {train_loss_counter.avg():.4f} | val loss - {val_loss:.4f}')
                    
                    train_loss_counter.reset()
                    if val_loss < best_val_loss:
                        tolerance = 0
                        self.log(f"\t\t{g_}Val loss decreased from {best_val_loss:.4f} to {val_loss:.4f}{sr_}")
                        best_val_loss = val_loss
                        if best_val_loss < self.SAVE_THRESHOLD:
                            self.save()
                        
                    evaluation_scheduler.update_evaluation_interval(val_loss)
                
                # Early stop, but only during the final epoch.
                if (epoch + 1) == Config.epochs and tolerance > Config.tolerance:
                    self.log(f"\t\t{r_}Val loss stable, break. Cur best loss {best_val_loss:.4f} {sr_}")
                    break

                step += 1
                
            gc.collect()
            
            if self.use_swa and (epoch+1) >= Config.swa_start:
                self.swa_model.update_parameters(self.model)
                self.swa_scheduler.step()
                val_loss_s = self.swa_evaluate()
                record_info['swa_loss'].append((step, val_loss_s))
                
            end_epoch_time = time()
            self.log(f'{bb_}{y_}The epoch took {end_epoch_time - start_epoch_time:.4f} sec {sr_}')
            
        if self.use_swa:
            # Use the trainer's own device rather than a hard-coded 'cuda'.
            update_bn(self.train_dl, self.swa_model, device=self.device)
        
        return record_info, best_val_loss

    def train_step(self, batch, epoch):
        """Run one optimisation step on ``batch`` and return the scalar training loss.

        With ``use_fgm`` a second, adversarial forward/backward pass is done on
        perturbed embeddings; its gradients are added to the clean ones before
        the optimizer step.
        """
        self.model.train()
        sent_id = batch['input_ids'].to(self.device)
        mask = batch['attention_mask'].to(self.device)
        std_err = batch['std_err'].to(self.device)
        labels = batch['label'].to(self.device)
        targets = labels.unsqueeze(1)
        
        self.model.zero_grad() 
        preds = self.model(sent_id, mask)
        train_loss = self.criterion(targets, preds, std_err)
        train_loss.backward()
        
        if self.use_fgm:
            self.fgm.attack()
            # The adversarial loss must be computed from the forward pass on the
            # perturbed weights; reusing `preds` would only repeat the clean gradient.
            preds_adv = self.model(sent_id, mask)
            adv_loss = self.criterion(targets, preds_adv, std_err)
            adv_loss.backward()
            self.fgm.restore()
        
        self.optimizer.step()
        # Once SWA has started its own scheduler takes over (see run()).
        if not self.use_swa or (epoch + 1) < Config.swa_start:
            self.scheduler.step()
        
        if self.use_gll:
            return self.metrics(targets, preds).item()
        return train_loss.item()

    def _evaluate_model(self, model):
        """Return the sample-weighted average validation loss of ``model`` on ``val_dl``."""
        model.eval()
        val_loss_counter = AvgCounter()

        with torch.no_grad():
            for batch in self.val_dl:
                sent_id = batch['input_ids'].to(self.device)
                mask = batch['attention_mask'].to(self.device)
                labels = batch['label'].to(self.device)
                preds = model(sent_id, mask)

                if self.use_gll:
                    loss = self.metrics(labels.unsqueeze(1), preds)
                else:
                    # No standard error is available at validation time.
                    loss = self.criterion(labels.unsqueeze(1), preds, None)
                val_loss_counter.update(loss.item(), len(labels))

        return val_loss_counter.avg()

    def evaluate(self):
        """Validation loss of the model being trained."""
        return self._evaluate_model(self.model)

    def swa_evaluate(self):
        """Validation loss of the SWA-averaged model."""
        return self._evaluate_model(self.swa_model)

In [None]:
def get_train_data(fold, df, tokenizer):
    """Build the (train, validation) DataLoaders for one cross-validation fold.

    Rows whose ``kfold`` equals ``fold`` form the validation split, all other
    rows the training split. Training batches are drawn with a
    ``RandomSampler``; validation batches keep the frame order.
    """
    # Note kept from the original author (translated): using pl on TPU lowers
    # efficiency, because pl uses sampler_ddp to optimise TPU training. A
    # SmartBatchingDataset-based loader was tried here and is not used.
    shared_options = dict(batch_size=Config.batch_size, num_workers=4, pin_memory=True)

    train_rows = df.loc[df.kfold != fold].reset_index(drop=True)
    valid_rows = df.loc[df.kfold == fold].reset_index(drop=True)

    train_dataset = RoBERTaDataset(train_rows, tokenizer)
    valid_dataset = RoBERTaDataset(valid_rows, tokenizer)

    train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), **shared_options)
    valid_loader = DataLoader(valid_dataset, shuffle=False, **shared_options)

    return train_loader, valid_loader

### Predict

In [None]:
def predict(model, data_loader):
    """Returns an np.array with predictions of the |model| on |data_loader|.

    Each batch may be either an ``(input_ids, attention_mask)`` pair or a
    mapping with the keys ``'input_ids'`` and ``'attention_mask'`` (the batch
    format used by the other inference loops in this notebook).

    The result has one entry per item of ``data_loader.dataset``; predictions
    are written in iteration order, so the loader should not shuffle.
    """
    from collections.abc import Mapping

    model.eval()

    result = np.zeros(len(data_loader.dataset))
    index = 0

    with torch.no_grad():
        for batch in data_loader:
            if isinstance(batch, Mapping):
                input_ids, attention_mask = batch['input_ids'], batch['attention_mask']
            else:
                input_ids, attention_mask = batch

            pred = model(input_ids.to(DEVICE), attention_mask.to(DEVICE))

            batch_size = pred.shape[0]
            result[index : index + batch_size] = pred.flatten().cpu().numpy()
            index += batch_size

    return result

In [None]:
def reparams(model, reinit_layers=4):
    """Re-initialise the last ``reinit_layers`` encoder layers of ``model.roberta`` in place.

    Linear and Embedding weights are redrawn from N(0, 0.02), Linear biases and
    the embedding padding row are zeroed, and LayerNorm is reset to
    weight = 1 / bias = 0. Nothing happens when ``reinit_layers`` is not positive.
    """
    # The TF version uses truncated_normal for initialisation; this is the PyTorch variant.
    if not reinit_layers > 0:
        return

    print(f'Reinitializing Last {reinit_layers} Layers ...')
    top_layers = model.roberta.encoder.layer[-reinit_layers:]
    for encoder_layer in top_layers:
        for sub_module in encoder_layer.modules():
            if isinstance(sub_module, nn.Linear):
                sub_module.weight.data.normal_(mean=0.0, std=0.02)
                if sub_module.bias is not None:
                    sub_module.bias.data.zero_()
                continue
            if isinstance(sub_module, nn.Embedding):
                sub_module.weight.data.normal_(mean=0.0, std=0.02)
                pad_row = sub_module.padding_idx
                if pad_row is not None:
                    sub_module.weight.data[pad_row].zero_()
                continue
            if isinstance(sub_module, nn.LayerNorm):
                sub_module.bias.data.zero_()
                sub_module.weight.data.fill_(1.0)
    print('Reinitializing Done.!')

### Run

In [None]:
def run(fold_init=0, fold_end=5):
    """Train one model per fold in ``range(fold_init, fold_end)``.

    For every fold the data loaders, model, optimizer and scheduler are built
    from scratch, the last two encoder layers are re-initialised, and a
    ``Trainer`` is run. When ``Config.swa`` is set, the averaged model and its
    scheduler are handed to the trainer.

    Returns:
        ``(best_scores, records)``: the best validation loss of each fold and
        the ``record_info`` dict returned by each ``Trainer.run``.
    """
    best_scores = []
    records = []
    tokenizer = AutoTokenizer.from_pretrained(Config.pretrained_model_path)
    tokenizer.save_pretrained(models_dir)

    on_tpu = Config.use_tpu and XLA_AVAILABLE
    # `xm` is only touched when XLA is actually in use.
    emit = xm.master_print if on_tpu else print

    for fold_num in range(fold_init, fold_end): 
        if on_tpu:
            accelerator = Accelerator()
            device = accelerator.device
        else:
            device = Config.device
        emit(f'{by_}{r_}  Model#{fold_num+1}  {sr_}')

        # seed_everything(Config.seed + fold) 

        train_dl, val_dl = get_train_data(fold_num, kfold_df, tokenizer)

        #########################################################
        ###### Change Model
        #########################################################
        # Alternatives tried: CLRPModel, MeanModel, WeightedModel, MeanModel_v2.
        model = AttModel()
        model = model.to(device)

        reparams(model, reinit_layers=2)

        optimizer, scheduler = create_optimizer(model, len(train_dl))

        # Stochastic weight averaging: the averaged model and its scheduler must be
        # passed on to the Trainer, otherwise Config.swa has no effect at all.
        swa_kwargs = {}
        if Config.swa:
            swa_kwargs = dict(
                swa_model=AveragedModel(model),
                swa_scheduler=SWALR(
                    optimizer, swa_lr=Config.swa_learning_rate, 
                    anneal_epochs=Config.anneal_epochs, 
                    anneal_strategy=Config.anneal_strategy
                ),
            )

        if on_tpu:
            model, train_dl, val_dl, optimizer, scheduler = accelerator.prepare(
                              model, train_dl, val_dl, optimizer, scheduler)

            # Original author's notes: gll_loss does not work on TPU; FGM can be
            # enabled by passing use_fgm=True.
            trainer = Trainer(train_dl, val_dl, model, optimizer, scheduler, 
                              mse_loss, metrics, fold_num, use_gll=False, 
                              use_tpu=True, device=device, **swa_kwargs)
        else:
            # gll_loss (use_gll=True) needs pytorch >= 1.9 according to the original note.
            # Pass the device explicitly so the batches land where the model lives.
            trainer = Trainer(train_dl, val_dl, model, optimizer, scheduler, 
                              mse_loss, metrics, fold_num, use_gll=False,
                              device=device, **swa_kwargs)

        # "For memory saving" (original note): the first three folds start
        # validating one epoch later than the remaining folds.
        # NOTE(review): if begin_val_epoch exceeds Config.epochs no validation runs
        # for that fold and its best loss stays at inf — confirm this is intended.
        begin_val_epoch = 4 if fold_num < 3 else 3
        record_info, best_val_loss = trainer.run(begin_val_epoch=begin_val_epoch)

        best_scores.append(best_val_loss)  
        records.append(record_info)

        if not Config.use_tpu:
            for series_name in ('train_loss', 'val_loss'):
                points = record_info[series_name]
                if not points:
                    # zip(*[]) cannot be unpacked into (steps, losses).
                    continue
                steps, losses = zip(*points)
                plt.plot(steps, losses, label=series_name)
            plt.legend()
            plt.show()

        # Drop every object that still references the model so it can be collected.
        del model, trainer, optimizer, scheduler, swa_kwargs
        gc.collect()
        # torch.cuda.empty_cache()

    emit(f'Best val losses:, ',  best_scores)
    emit(f'Avg val loss: {np.array(best_scores).mean():.4f}')

    return best_scores, records

# !date '+%A %W %Y %X' > execution_time

In [None]:
class Config:
    """Hyper-parameters and paths shared by the training and inference cells."""

    # --- backbone ---------------------------------------------------------
    # (roberta-base alternative: '/kaggle/input/comlitrobertabasescript/')
    model_name = 'roberta-large'
    pretrained_model_path = '../input/comlitrobertalargescript'
    output_hidden_states = True
    num_labels = 1
    # Hidden size of the regression head, not the hidden size of BERT itself.
    head_hidden = 512
    use_multi_sample_dropout = False

    # Settings of the roberta-base weighted-layer model.
    num_hidden_layers = 12
    layer_start = 9

    # --- runtime ----------------------------------------------------------
    device = 'cuda'
    use_tpu = True
    seed = 42

    # --- optimisation -----------------------------------------------------
    epochs = 3
    max_len = 256
    lr = 2e-5
    weight_decay = 0.01
    warmup_steps = 50

    # --- batching / evaluation schedule (RAM-saving variant) ----------------
    # Each entry is a (loss, interval) pair consumed by EvaluationScheduler.
    # Earlier variant: batch_size = 24 with
    # [(float('inf'), 10), (0.5, 4), (0.49, 3), (0.48, 2), (0.47, 1), (0, 0)]
    batch_size = 16
    eval_schedule = [
        (float('inf'), 16),
        (0.5, 8),
        (0.49, 5),
        (0.48, 2),
        (0.47, 1),
        (0, 0),
    ]
    tolerance = 10

    # --- stochastic weight averaging ----------------------------------------
    swa = False
    swa_start = 2
    swa_learning_rate = 2e-4
    anneal_epochs = 1
    anneal_strategy = 'cos'

In [None]:
# Force a garbage-collection pass; the return value shown by the cell is what gc.collect() returns.
gc.collect()

In [None]:
# Entry point of the training section: train folds 0..4 and keep the per-fold
# best validation losses and loss histories for the cells below.
if __name__ == "__main__":
    # del model
    # torch.cuda.synchronize()
    gc.collect()
    # torch.cuda.empty_cache()
    # torch.cuda.synchronize()
    best_scores, records = run(0, 5)

In [None]:
def show_records(records, index):
    """Plot the training and validation loss curves stored in ``records[index]``.

    ``records[index]`` must map ``'train_loss'`` and ``'val_loss'`` to lists of
    ``(step, loss)`` pairs. A series with no points (for example when no
    validation was run) is skipped instead of raising.
    """
    record_info = records[index]

    for series_name in ('train_loss', 'val_loss'):
        points = record_info[series_name]
        if not points:
            # zip(*[]) yields nothing, so it cannot be unpacked into (steps, losses).
            continue
        steps, losses = zip(*points)
        plt.plot(steps, losses, label=series_name)
    plt.legend()
    plt.show()

In [None]:
# show_records(records[1])

### SVR Stack

In [None]:
def get_test_data(df):
    """Return a non-shuffling DataLoader over ``df`` for inference.

    The tokenizer is loaded from ``Config.pretrained_model_path`` and the
    dataset is built in test mode (``for_test=True``).
    """
    inference_tokenizer = AutoTokenizer.from_pretrained(Config.pretrained_model_path)
    inference_dataset = RoBERTaDataset(df, inference_tokenizer, for_test=True)
    loader_options = {
        'batch_size': 24,
        'num_workers': 4,
        'shuffle': False,
        'pin_memory': True,
        'drop_last': False,
    }
    return DataLoader(inference_dataset, **loader_options)

In [None]:
# rmse_score SVR
from joblib import dump, load
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold


# Alias (not a copy) of the fold-annotated training frame.
# NOTE(review): `kfold_df` is presumably created in an earlier cell — confirm.
train_data = kfold_df

## Reset bins
# num_bins = int(np.floor(1 + np.log2(len(train_data))))
# train_data.loc[:, 'bins'] = pd.cut(train_data['target'], bins=num_bins, labels=False)

# Regression target and the `bins` column as numpy arrays; `bins` is later used
# as the stratification labels of the SVR StratifiedKFold.
target = train_data['target'].to_numpy()
bins = train_data.bins.to_numpy()

In [None]:
# Display the stratification bins (shows the whole array).
bins

In [None]:
def rmse_score(y_true,y_pred):
    """Root mean squared error between ``y_true`` and ``y_pred``."""
    squared_error_mean = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error_mean)


# input whole train_data not shuffled
def embedding_svr_train(df, bins, save_dir, bert_path, bert_nums=5, svr_nfolds=10, C=8, kernel='rbf'):
    """Fit SVR regressors on sentence embeddings produced by the fine-tuned transformers.

    For each of the ``bert_nums`` checkpoints ``model_{i}.bin`` found in
    ``bert_path`` the whole of ``df`` (in its given order, not shuffled) is
    embedded with ``MeanModelEmbedding``. On every embedding matrix an
    ``svr_nfolds``-fold stratified CV (stratified on ``bins``) is run; each fold's
    SVR is saved to ``save_dir`` as ``svr_{model_index}_{fold}.bin``.

    Args:
        df: frame to embed; must contain a ``target`` column.
        bins: stratification labels, one per row of ``df``.
        save_dir: output directory (``str`` or path-like).
        bert_path: directory holding the transformer checkpoints (``str`` or path-like).
        bert_nums: number of transformer checkpoints.
        svr_nfolds: number of CV folds per SVR.
        C, kernel: passed to ``sklearn.svm.SVR``.

    Returns:
        A list with one entry per checkpoint, each a list of per-fold RMSE scores.
    """
    mean_scores = []
    records = []
    device = Config.device

    # Take the regression target from the frame that is being embedded instead of
    # relying on a notebook-level global being in sync with `df`.
    target = df['target'].to_numpy()

    # The loader does not depend on the checkpoint, so build it once.
    test_dataloader = get_test_data(df)

    # get embeddings
    models_embedding = []
    for fold_num in range(bert_nums):
        print(f'{by_}{r_}  Model#{fold_num+1} inferencing {sr_}')

        model = MeanModelEmbedding()
        # os.path.join accepts both str and path-like directories.
        checkpoint = os.path.join(bert_path, f'model_{fold_num}.bin')
        model.load_state_dict(torch.load(checkpoint, map_location=device))
        model.to(device)
        model.eval()

        embeddings = []
        with torch.no_grad():
            for batch in tqdm(test_dataloader):
                sent_id, mask = batch['input_ids'].to(device), batch['attention_mask'].to(device)
                outputs = model(sent_id, mask)
                embeddings.extend(outputs.detach().cpu().numpy())
        models_embedding.append(np.array(embeddings))

        del model
        gc.collect()
        #torch.cuda.empty_cache()

    print("Embedding got.")    

    os.makedirs(save_dir, exist_ok=True)

    # SVM training: one group of `svr_nfolds` SVR models per embedding matrix
    for index, X in enumerate(models_embedding):
        print(f'{by_}{r_}  SVR#{index+1} training {sr_}')
        scores = []
        # new kfold
        kfold = StratifiedKFold(n_splits=svr_nfolds, shuffle=True, random_state=42)
        for i, (train_idx, valid_idx) in enumerate(tqdm(kfold.split(X, bins), total=svr_nfolds)):
            model = SVR(C=C, kernel=kernel, gamma='auto')
            X_train, y_train = X[train_idx], target[train_idx]
            X_valid, y_valid = X[valid_idx], target[valid_idx]
            
            model.fit(X_train, y_train)
            
            prediction = model.predict(X_valid)
            score = rmse_score(y_valid, prediction)
            scores.append(score)
            print(f'\t\t{y_}SVR {index} Fold {i} , rmse score: {score:.4f} {sr_}')

            # `save_dir + '...'` would raise for a Path and silently glue the file
            # name onto a directory string that lacks a trailing separator.
            dump(model, os.path.join(save_dir, f'svr_{index}_{i}.bin'))

        mean_score = np.mean(scores)
        print(f'\t{r_}SVR {index} mean rmse score: {mean_score:.4f} {sr_}')
        mean_scores.append(mean_score)
        records.append(scores)

    print(f'Avg rmse score of 5 SVR: {np.mean(mean_scores):.4f}')

    return records

In [None]:
# Fit the SVR stack on the full training frame. Note that this rebinds `records`,
# which the training cell above also assigns.
records = embedding_svr_train(train_data, bins, save_dir=models_dir, bert_path="../input/clrobertalarger/", bert_nums=5, svr_nfolds=10)

过拟合严重，需要bagging

In [None]:
def embedding_svr_test(df, save_dir, bert_path, bert_nums=5, svr_nfolds=10):
    """Predict with the SVR stack saved by ``embedding_svr_train``.

    ``df`` is embedded with each of the ``bert_nums`` checkpoints
    ``model_{i}.bin`` from ``bert_path``; for every embedding matrix the
    ``svr_nfolds`` regressors ``svr_{i}_{fold}.bin`` are loaded from ``save_dir``
    and their predictions are summed.

    Args:
        df: frame to predict on.
        save_dir: directory holding the saved SVR models (``str`` or path-like).
        bert_path: directory holding the transformer checkpoints (``str`` or path-like).
        bert_nums: number of transformer checkpoints.
        svr_nfolds: number of SVR models per checkpoint.

    Returns:
        1-D array with one prediction per row of ``df``: the mean over all
        ``bert_nums * svr_nfolds`` regressors.
    """
    device = Config.device

    # The loader does not depend on the checkpoint, so build it once.
    test_dataloader = get_test_data(df)

    # get embeddings
    models_embedding = []
    for fold_num in range(bert_nums):
        print(f'{by_}{r_}  Model#{fold_num+1} inferencing {sr_}')

        model = MeanModelEmbedding()
        # os.path.join accepts both str and path-like directories.
        checkpoint = os.path.join(bert_path, f'model_{fold_num}.bin')
        model.load_state_dict(torch.load(checkpoint, map_location=device))
        model.to(device)
        model.eval()

        embeddings = []
        with torch.no_grad():
            for batch in tqdm(test_dataloader):
                sent_id, mask = batch['input_ids'].to(device), batch['attention_mask'].to(device)
                outputs = model(sent_id, mask)
                embeddings.extend(outputs.detach().cpu().numpy())
        models_embedding.append(np.array(embeddings))

        del model
        gc.collect()
        torch.cuda.empty_cache()
        
    print(f'Embedding got.')

    # SVM predict: `svr_nfolds` SVR models per embedding matrix
    results = np.zeros((df.shape[0]))
    for index, X_test in enumerate(models_embedding):
        print(f'{by_}{r_}  SVR#{index+1} predicting {sr_}')
        for i in range(svr_nfolds):
            # `save_dir + '...'` would raise for a Path and silently glue the file
            # name onto a directory string that lacks a trailing separator.
            svr = load(os.path.join(save_dir, f'svr_{index}_{i}.bin'))
            results += svr.predict(X_test)
            
    print(f'Complete.')

    return results / bert_nums / svr_nfolds

In [None]:
# Load the competition test set and replace newlines in the excerpts with spaces.
test_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")
test_df['excerpt'] = test_df['excerpt'].apply(lambda x: x.replace('\n',' '))
# The predictions are not assigned to a name; they are only shown as the cell output.
embedding_svr_test(df=test_df, save_dir=models_dir, bert_path="../input/clrobertalarger/")

### CV result check on the whole training set

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

def rmse_score(y_true,y_pred):
    """Root mean squared error, i.e. the square root of sklearn's ``mean_squared_error``.

    Same definition as the ``rmse_score`` in the SVR section of this notebook.
    """
    mse_value = mean_squared_error(y_true, y_pred)
    rmse_value = np.sqrt(mse_value)
    return rmse_value

In [None]:
def get_test_data(df):
    """Wrap ``df`` in an inference DataLoader (batch size 24, original row order, no dropped rows).

    Same definition as the ``get_test_data`` in the SVR section of this notebook.
    """
    tok = AutoTokenizer.from_pretrained(Config.pretrained_model_path)
    return DataLoader(
        RoBERTaDataset(df, tok, for_test=True),
        batch_size=24,
        shuffle=False,
        drop_last=False,
        num_workers=4,
        pin_memory=True,
    )

In [None]:
def check_in_origin_data(test_df, models_folder_path='/kaggle/input/commonlit-roberta-0467/', n_models=5):
    """Run every saved ``LitModel`` checkpoint on ``test_df`` and collect its predictions.

    Args:
        test_df: frame to predict on.
        models_folder_path: directory holding ``model_1.pth`` ... ``model_{n_models}.pth``.
        n_models: number of checkpoints to evaluate.

    Returns:
        A list of ``n_models`` lists, each holding one float prediction per row
        of ``test_df`` (in row order).
    """
    models_folder_path = Path(models_folder_path)
    models_preds = []

    # The loader is the same for every checkpoint, so build it only once.
    test_dataloader = get_test_data(test_df)

    for model_num in range(n_models):
        print(f'Inference # {model_num+1}/{n_models} ...', end=' ')

        # My save style used
        # model = torch.load(models_folder_path / f'best_model_{model_num}.pt').to(Config.device)

        # Litmodel
        model_path = models_folder_path / f'model_{model_num+1}.pth'
        model = LitModel()
        model.load_state_dict(torch.load(model_path, map_location=Config.device))    
        model.to(Config.device)
        model.eval()

        all_preds = []
        with torch.no_grad():
            for batch in test_dataloader:
                sent_id, mask = batch['input_ids'].to(Config.device), batch['attention_mask'].to(Config.device)
                preds = model(sent_id, mask)
                all_preds += preds.flatten().cpu().tolist()

        models_preds.append(all_preds)

        # Release the checkpoint before the next one is loaded.
        del model
        gc.collect()

        print(' Completed')
        
    return models_preds

In [None]:
# test_df = pd.read_csv('../input/cmlit-fold/train_data.csv')

# models_preds = check_in_origin_data(test_df)

# models_preds = np.array(models_preds)
# all_preds = models_preds.mean(axis=0)

# rmse_score(test_df.target.values, all_preds)

- commonlit-roberta-0467： 0.2924174434645643

### Upload model to cloud

In [None]:
!ls model

In [None]:
import os

# !pip install kaggle

os.environ["KAGGLE_USERNAME"] = ""
os.environ["KAGGLE_KEY"] = ""

!kaggle datasets metadata racleray/attlargereinit
!mv dataset-metadata.json model/

# Preferably the upload directory should not contain sub-folders
!kaggle datasets version -p ./model -m "Updated data base fine"