In [9]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/meanlargereinit/config.json
/kaggle/input/meanlargereinit/merges.txt
/kaggle/input/meanlargereinit/model_0.bin
/kaggle/input/meanlargereinit/model_1.bin
/kaggle/input/meanlargereinit/tokenizer.json
/kaggle/input/meanlargereinit/vocab.json
/kaggle/input/meanlargereinit/tokenizer_config.json
/kaggle/input/meanlargereinit/model_3.bin
/kaggle/input/meanlargereinit/model_2.bin
/kaggle/input/meanlargereinit/model_4.bin
/kaggle/input/meanlargereinit/special_tokens_map.json
/kaggle/input/largeattnlit/config.json
/kaggle/input/largeattnlit/merges.txt
/kaggle/input/largeattnlit/model_0.bin
/kaggle/input/largeattnlit/model_1.bin
/kaggle/input/largeattnlit/tokenizer.json
/kaggle/input/largeattnlit/vocab.json
/kaggle/input/largeattnlit/tokenizer_config.json
/kaggle/input/largeattnlit/model_3.bin
/kaggle/input/largeattnlit/model_2.bin
/kaggle/input/largeattnlit/model_4.bin
/kaggle/input/largeattnlit/special_tokens_map.json
/kaggle/input/pretrained/roberta_tk.pt
/kaggle/input/pretrained

In [10]:
import os
import gc
import sys
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from joblib import dump, load

import warnings
warnings.filterwarnings('ignore')

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.nn import init
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler, Sampler
from transformers import AutoModel, AutoTokenizer, AutoConfig

from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

from time import time
from colorama import Fore, Back, Style

r_ = Fore.RED
b_ = Fore.BLUE
g_ = Fore.GREEN
y_ = Fore.YELLOW
w_ = Fore.WHITE
bb_ = Back.BLACK
by_ = Back.YELLOW
sr_ = Style.RESET_ALL

## Global config

In [11]:
class Config:
    epochs = 3
    batch_size = 16
    test_batch = 32
    
    device = 'cuda'
    seed = 42
    max_len = 256
    lr = 2e-5
    weight_decay = 0.01
    
    num_labels = 1

## Dataset

In [12]:
class RoBERTaDataset(Dataset):
    def __init__(self, df, tokenizer, for_test=False):
        super().__init__()
        self.text = df['excerpt'].values
        self.for_test = for_test
        if not for_test:
            self.target = df['target'].values
        self.max_len = Config.max_len
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = self.text[index]
        text = ' '.join(text.split())
        inputs = self.tokenizer.encode_plus(text,
                                            None,
                                            truncation=True,
                                            add_special_tokens=True,
                                            max_length=self.max_len,
                                            padding='max_length')

        if not self.for_test:
            return {
                'input_ids':
                    torch.tensor(inputs['input_ids'], dtype=torch.long),
                'attention_mask':
                    torch.tensor(inputs['attention_mask'], dtype=torch.long),
                'label':
                    torch.tensor(self.target[index], dtype=torch.float)
            }
        else:
            return {
                'input_ids':
                    torch.tensor(inputs['input_ids'], dtype=torch.long),
                'attention_mask':
                    torch.tensor(inputs['attention_mask'], dtype=torch.long)
            }

### Model Attn 1

In [13]:
# CLRPModel: 467    BaseOne: 474  Base 2: 475     
class AttnOneConifg:
    model_name = 'roberta-base'
    pretrained_model_path = '../input/pretrained/base_s'
    
    epochs = 3
    batch_size = 16
    test_batch = 32
    
    seed = 42
    max_len = 256
    lr = 2e-5
    weight_decay = 0.01

In [14]:
import torch
import torch.nn as nn

# Using Model
# Base Model One         Base Model Two
class AttentionHead_Ori(nn.Module):
    def __init__(self, h_size, hidden_dim=512):
        super().__init__()
        self.W = nn.Linear(h_size, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features):
        att = torch.tanh(self.W(features))
        score = self.V(att)
        attention_weights = torch.softmax(score, dim=1)
        context_vector = attention_weights * features
        context_vector = torch.sum(context_vector, dim=1)

        return context_vector

class CLRPModel(nn.Module):
    def __init__(self):
        super(CLRPModel, self).__init__()
        config = AutoConfig.from_pretrained(AttnOneConifg.pretrained_model_path)
        config.update({"output_hidden_states":True,
                        "hidden_dropout_prob": 0.0,
                        "layer_norm_eps": 1e-7})
        self.h_size = config.hidden_size
        self.transformer = AutoModel.from_pretrained(AttnOneConifg.pretrained_model_path, config=config)
        self.head = AttentionHead_Ori(self.h_size)
        self.linear = nn.Linear(self.h_size, 1)

    def forward(self, input_ids, attention_mask):
        transformer_out = self.transformer(input_ids, attention_mask)
        x = self.head(transformer_out.last_hidden_state)
        x = self.linear(x)
        return x

In [15]:
# 467
class LitModel(nn.Module):
    def __init__(self):
        super().__init__()
        config = AutoConfig.from_pretrained(AttnOneConifg.pretrained_model_path)
        config.update({"output_hidden_states":True, 
                       "hidden_dropout_prob": 0.0,
                       "layer_norm_eps": 1e-7})                       
        
        self.roberta = AutoModel.from_pretrained(AttnOneConifg.pretrained_model_path, config=config)  
        self.attention = nn.Sequential(            
            nn.Linear(768, 512),            
            nn.Tanh(),                       
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )        
        self.regressor = nn.Sequential(nn.Linear(768, 1))
        
    def forward(self, input_ids, attention_mask):
        roberta_output = self.roberta(input_ids, attention_mask)        
        last_hidden_states = roberta_output.hidden_states[-1]
        
        weights = self.attention(last_hidden_states)
        context_vector = torch.sum(weights * last_hidden_states, dim=1)        
        
        return self.regressor(context_vector)

### Model MeanPooling Large and MeanEmbedding

MeanEmbedding完全可以和Model MeanPooling Large合在一起

- MeanEmbedding V1 + SVM 473

In [16]:
# Mean Model
class MeanLargeConfig:
    model_name = 'roberta-large'
    pretrained_model_path = '../input/pretrained/large_s'
    
    epochs = 3
    batch_size = 16
    test_batch = 32
    
    seed = 42
    max_len = 256
    lr = 2e-5
    weight_decay = 0.01
    
    head_hidden = 512
    
    use_multi_sample_dropout = True # this model didn`t help

In [17]:
# V1 large :  inference model  同时输出 logits 和 pool_out embedding，可以选择 如何取舍两个输出。
#             MeanEmbedding V1 + SVM 473
#                    logits          472
class MeanPooling(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        
    def forward(self, hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_state.size()).float()
        sum_embeddings = torch.sum(hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

    
class MeanModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.config = AutoConfig.from_pretrained(MeanLargeConfig.pretrained_model_path)
        self.config.update({"output_hidden_states": True,
                            "hidden_dropout_prob": 0.0,
                            "attention_probs_dropout_prob": 0.1,
                            "layer_norm_eps": 1e-7}) 
        self.roberta = AutoModel.from_pretrained(MeanLargeConfig.pretrained_model_path,
                                                 config=self.config)
        self.layer_norm = nn.LayerNorm(self.config.hidden_size, eps=1e-7)
        self.pooler = MeanPooling(self.config.hidden_size)

        self.low_dropout = nn.Dropout(0.1)
        self.dropout = nn.Dropout(p=0.5)
        self.regressor = nn.Linear(self.config.hidden_size, 1)


    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        
        ## MeanPooling
        hidden_states = outputs[0]
        pool_out = self.pooler(hidden_states, attention_mask)
        # pool_out = self.low_dropout(pool_out)
        
        # didn`t help
        logits = torch.mean(torch.stack([self.regressor(self.dropout(pool_out)) for _ in range(5)], dim=0), dim=0)

        return (logits, pool_out)

### Attn v2

In [18]:
class AttConfig1:
    model_name = 'roberta-large'
    pretrained_model_path = '../input/pretrained/large_s'
    
    output_hidden_states = True
    epochs = 3
    num_labels = 1
    
    device = 'cuda'
    
    seed = 42
    max_len = 256
    lr = 2e-5
    weight_decay = 0.01
    head_hidden = 512
    
    warmup_steps = 50

In [19]:
class AttentionHead(nn.Module):
    def __init__(self, in_features, hidden_dim):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim

        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        self.out_features = hidden_dim

    def forward(self, hidden_state, attention_mask):
        att = torch.tanh(self.W(hidden_state))
        score = self.V(att)

        mask_expanded = attention_mask.unsqueeze(-1)
        score[~mask_expanded] = -1e9

        attention_weights = torch.softmax(score, dim=1)
        context_vector = attention_weights * hidden_state
        context_vector = torch.sum(context_vector, dim=1)

        return context_vector
    
    
class AttModel(nn.Module):
    def __init__(self, config, attn_type='tradition'):
        super().__init__()
        self.config = AutoConfig.from_pretrained(config.pretrained_model_path)
        self.config.update({
                "output_hidden_states": True,
                "hidden_dropout_prob": 0.0,
                "attention_probs_dropout_prob": 0.1,
                "layer_norm_eps": 1e-7
                }) 
        self.roberta = AutoModel.from_pretrained(config.pretrained_model_path,
                                                 config=self.config)
        self.head = AttentionHead(self.config.hidden_size, config.head_hidden)
        
        self.layer_norm = nn.LayerNorm(self.config.hidden_size, eps=1e-5)
        self.regressor = nn.Linear(self.config.hidden_size, config.num_labels)
        
        self.dropout = nn.Dropout(p=0.1)
        self.m_dropout = nn.Dropout(p=0.5)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        hidden_states = outputs[2][-1]

        x = self.head(hidden_states, attention_mask)
        logits = self.regressor(x)

        return logits, x

### Attn 3

In [20]:
class AttConfig2:
    model_name = 'roberta-base'
    pretrained_model_path = '../input/pretrained/base_s'
    
    output_hidden_states = True
    epochs = 3
    num_labels = 1
    
    device = 'cuda'
    
    seed = 42
    max_len = 256
    lr = 2e-5
    weight_decay = 0.01
    head_hidden = 512
    
    warmup_steps = 50

In [21]:
class AttentionHead(nn.Module):
    def __init__(self, in_features, hidden_dim):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim

        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        self.out_features = hidden_dim

    def forward(self, hidden_state, attention_mask):

        att = torch.tanh(self.W(hidden_state))
        score = self.V(att)

        mask_expanded = attention_mask.unsqueeze(-1)
        score[~mask_expanded] = -1e9

        attention_weights = torch.softmax(score, dim=1)
        context_vector = attention_weights * hidden_state
        context_vector = torch.sum(context_vector, dim=1)

        return context_vector
    
    
class Mish(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return x *( torch.tanh(F.softplus(x)))
    
    
class AttModel2(nn.Module):
    def __init__(self, config, attn_type='tradition'):
        super().__init__()
        self.config = AutoConfig.from_pretrained(config.pretrained_model_path)
        self.config.update({
                "output_hidden_states": True,
                "hidden_dropout_prob": 0.0,
                "attention_probs_dropout_prob": 0.1,
                "layer_norm_eps": 1e-7
                }) 
        self.roberta = AutoModel.from_pretrained(config.pretrained_model_path,
                                                 config=self.config)
        self.head = AttentionHead(self.config.hidden_size, config.head_hidden)

        self.layer_norm = nn.LayerNorm(self.config.hidden_size, eps=1e-5)
        self.regressor = nn.Sequential(
                nn.Linear(self.config.hidden_size, 512),
                Mish(),
                nn.Linear(512, 1)
        )
        
        self.dropout = nn.Dropout(p=0.1)
        self.m_dropout = nn.Dropout(p=0.5)
        

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        hidden_states = outputs[2][-1]

        x = self.head(hidden_states, attention_mask)

        x = self.layer_norm(x)
        logits = self.regressor(x)
        
        return logits, x

### Mean v2

In [22]:
# Mean Model
class MeanV2BaseConfig:
    model_name = 'roberta-base'
    pretrained_model_path = '../input/pretrained/base_s'
    
    epochs = 3
    batch_size = 16
    test_batch = 32
    
    seed = 42
    max_len = 256
    lr = 2e-5
    weight_decay = 0.01
    
    head_hidden = 512

In [23]:
# Mean Large Model
class MeanV2LargeConfig:
    model_name = 'roberta-large'
    pretrained_model_path = '../input/pretrained/large_s'
    
    epochs = 3
    batch_size = 16
    test_batch = 32
    
    seed = 42
    max_len = 256
    lr = 2e-5
    weight_decay = 0.01
    
    head_hidden = 512

In [24]:
# V2 base :  inference model

class MeanPooling(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        
    def forward(self, hidden_state, attention_mask):
        # last_hidden_state = outputs[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_state.size()).float()
        sum_embeddings = torch.sum(hidden_state * input_mask_expanded, 1)

        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        
        return mean_embeddings


#Mish - "Mish: A Self Regularized Non-Monotonic Neural Activation Function"
#https://arxiv.org/abs/1908.08681v1
class Mish(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return x *( torch.tanh(F.softplus(x)))

    
class MeanModel_v2(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = AutoConfig.from_pretrained(config.pretrained_model_path)
        self.config.update({
                "output_hidden_states": True,
                "hidden_dropout_prob": 0.0,
                "attention_probs_dropout_prob": 0.1,
                "layer_norm_eps": 1e-7
                }) 
        self.roberta = AutoModel.from_pretrained(config.pretrained_model_path,
                                                 config=self.config)
        
        self.pooler = MeanPooling(self.config.hidden_size)
        self.layer_norm = nn.LayerNorm(self.config.hidden_size, eps=1e-5)
        
        self.dropout = nn.Dropout(0.5)
        self.regressor = nn.Sequential(
                nn.Linear(self.config.hidden_size, 512),
                Mish(),
                nn.Linear(512, 1)
        )
        
        self.std = 0.02
        self._init_weights(self.regressor)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            # module.weight.data.normal_(mean=0.0, std=self.std)
            init.kaiming_normal_(module.weight, mode='fan_in')
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        
        ## MeanPooling
        hidden_states = outputs[0]
        pool_out = self.pooler(hidden_states, attention_mask)
        # pool_out = self.dropout(pool_out)
        pool_out = self.layer_norm(pool_out)
        logits = self.regressor(pool_out)

        return logits, pool_out

### Infer func

In [25]:
def get_test_data(df):
    tokenizer = torch.load('../input/pretrained/roberta_tk.pt') 
    test_dataset = RoBERTaDataset(df, tokenizer, for_test=True)
    test_loader = DataLoader(test_dataset, batch_size=Config.test_batch,
                             num_workers=4, shuffle=False, pin_memory=True,
                             drop_last=False)
    return test_loader

In [26]:
def reset_memory():
    gc.collect()
    torch.cuda.synchronize()
    torch.cuda.empty_cache()

In [27]:
def inference(test_dataloader, model_dirs, model=None, n_models=5, ckpt_bias=0, with_embedding=False):
    models_preds = []
    models_embedding = []
    for model_num in range(n_models):
        print(f'{by_}{r_}  >>> Inference # {model_num+1}/{n_models}  {sr_}')
        torch.cuda.synchronize()

        # load
        model_path = model_dirs[model_num]
        print(f" ### Using {model_path}")
        if model:
            model.load_state_dict(torch.load(model_path, map_location=Config.device))
        else:
            model = torch.load(model_path)
        model.to(Config.device)

        # predict
        fold_preds = []
        embeddings = []
        model.eval()
        with torch.no_grad():
            for step, batch in enumerate(test_dataloader):
                sent_id, mask = batch['input_ids'].to(Config.device), batch['attention_mask'].to(Config.device)
                preds = model(sent_id, mask)

                if with_embedding and len(preds) == 2:
                    preds, embed = preds[0], preds[1]
                    embed = embed.detach().cpu().numpy()
                    embeddings.extend(embed)
                if len(preds) == 2:
                    preds = preds[0]
                fold_preds += preds.flatten().cpu().tolist()

        # records
        models_preds.append(fold_preds)
        if with_embedding:
            models_embedding.append(np.array(embeddings))

        if not model:  # load_state_dict 方式，不能在这里删除
            del model
            gc.collect()
            torch.cuda.synchronize()
            torch.cuda.empty_cache()

        print(f'! Model Complete. ++++++++++')
    print()

    # output
    models_preds = np.array(models_preds).mean(axis=0)
    if not with_embedding:
        return models_preds
    else:
        return models_preds, models_embedding

In [28]:
def embedding_svr_test(embeddings, num_pred, bert_nums=5, svr_nfolds=10):
    # SVM predict: 5 SVR model
    results = np.zeros(num_pred)
    for index, X_test in enumerate(embeddings):
        print(f'{by_}{r_}  SVR#{index+1} predicting {sr_}')
        for i in range(svr_nfolds):
            svr = load(save_dir + f'svr_{index}_{i}.bin')
            preds = svr.predict(X_test)
            results += preds
    print(f'SVR Complete.')

    return results / bert_nums / svr_nfolds

## MAIN

In [29]:
test_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")
test_df['excerpt'] = test_df['excerpt'].apply(lambda x: x.replace('\n',' '))

submission_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/sample_submission.csv")

In [30]:
def run_m1():
    #### LitModel: 467
    test_dataloader = get_test_data(test_df)
    
    Lit = [f'../input/comlitothers/model_{i + 1}.bin' for i in range(5)]
    litmodel = LitModel()  # if use load_state_dict, init model
    pred_lit = inference(test_dataloader, Lit, model=litmodel, ckpt_bias=1)
    del litmodel, test_dataloader
    gc.collect()
    torch.cuda.synchronize()
    torch.cuda.empty_cache()
    return pred_lit

In [31]:
def run_m2():
    #### mean large v2 reinit: 0.466
    test_dataloader = get_test_data(test_df)
    
    MeanL_4 = [ f'/kaggle/input/meanlargereinit/model_{i}.bin' for i in range(5)]
    config = MeanV2LargeConfig()
    meanmodel4 = MeanModel_v2(config)
    pred_mean_v4 = inference(test_dataloader, MeanL_4, model=meanmodel4, with_embedding=False)
    # svr_preds = embedding_svr_test(embeddings, len(test_df))
    del meanmodel4, test_dataloader
    gc.collect()
    torch.cuda.synchronize()
    torch.cuda.empty_cache()

    return pred_mean_v4

In [32]:
def run_m3():
    #### attn large 1:  0.464
    test_dataloader = get_test_data(test_df)
    
    AttL_1 = [ f'/kaggle/input/largeattnlit/model_{i}.bin' for i in range(5)]
    config = AttConfig1()
    attmodel1 = AttModel(config)
    pred_attL_v1 = inference(test_dataloader, AttL_1, model=attmodel1, with_embedding=False)
    # svr_preds = embedding_svr_test(embeddings, len(test_df))
    del attmodel1, test_dataloader
    gc.collect()
    torch.cuda.synchronize()
    torch.cuda.empty_cache()
    return pred_attL_v1

In [33]:
def run_m4():
    #### attn large reinit 2:  0.467
    test_dataloader = get_test_data(test_df)
    
    AttL_2 = [ f'/kaggle/input/attlargereinit/model_{i}.bin' for i in range(5)]
    config = AttConfig1()
    attmodel2 = AttModel(config)
    pred_attL_v2 = inference(test_dataloader, AttL_2, model=attmodel2, with_embedding=False)
    # svr_preds = embedding_svr_test(embeddings, len(test_df))
    del attmodel2, test_dataloader
    gc.collect()
    torch.cuda.synchronize()
    torch.cuda.empty_cache()
    return pred_attL_v2

In [34]:
################### Post Process ########################
pred_attL_v2 = run_m4()
pred_attL_v1 = run_m3()
pred_mean_v4 = run_m2()
pred_lit = run_m1()


# predictions = pred_attL_v1 * 0.5 + pred_mean_v4 * 0.3 + pred_lit * 0.2
predictions = pred_attL_v2 * 0.15 + pred_attL_v1 * 0.4 + pred_mean_v4 * 0.3 + pred_lit * 0.15

# predictions = svr_preds * 0.5 + pred_lit * 0.5  # or whatever


################### Submisson ########################
submission_df.target = predictions
submission_df.to_csv('submission.csv')
print(submission_df.head(10))

Some weights of the model checkpoint at ../input/pretrained/large_s were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ../input/pretrained/large_s and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[43m[31m  >>> Inference # 1/5  [0m
 ### Using /kaggle/input/largeattnlit/model_0.bin
! Model Complete. ++++++++++
[43m[31m  >>> Inference # 2/5  [0m
 ### Using /kaggle/input/largeattnlit/model_1.bin
! Model Complete. ++++++++++
[43m[31m  >>> Inference # 3/5  [0m
 ### Using /kaggle/input/largeattnlit/model_2.bin
! Model Complete. ++++++++++
[43m[31m  >>> Inference # 4/5  [0m
 ### Using /kaggle/input/largeattnlit/model_3.bin
! Model Complete. ++++++++++
[43m[31m  >>> Inference # 5/5  [0m
 ### Using /kaggle/input/largeattnlit/model_4.bin
! Model Complete. ++++++++++



Some weights of the model checkpoint at ../input/pretrained/large_s were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ../input/pretrained/large_s and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[43m[31m  >>> Inference # 1/5  [0m
 ### Using /kaggle/input/meanlargereinit/model_0.bin
! Model Complete. ++++++++++
[43m[31m  >>> Inference # 2/5  [0m
 ### Using /kaggle/input/meanlargereinit/model_1.bin
! Model Complete. ++++++++++
[43m[31m  >>> Inference # 3/5  [0m
 ### Using /kaggle/input/meanlargereinit/model_2.bin
! Model Complete. ++++++++++
[43m[31m  >>> Inference # 4/5  [0m
 ### Using /kaggle/input/meanlargereinit/model_3.bin
! Model Complete. ++++++++++
[43m[31m  >>> Inference # 5/5  [0m
 ### Using /kaggle/input/meanlargereinit/model_4.bin
! Model Complete. ++++++++++



Some weights of the model checkpoint at ../input/pretrained/base_s were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ../input/pretrained/base_s and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[43m[31m  >>> Inference # 1/5  [0m
 ### Using ../input/comlitothers/model_1.bin
! Model Complete. ++++++++++
[43m[31m  >>> Inference # 2/5  [0m
 ### Using ../input/comlitothers/model_2.bin
! Model Complete. ++++++++++
[43m[31m  >>> Inference # 3/5  [0m
 ### Using ../input/comlitothers/model_3.bin
! Model Complete. ++++++++++
[43m[31m  >>> Inference # 4/5  [0m
 ### Using ../input/comlitothers/model_4.bin
! Model Complete. ++++++++++
[43m[31m  >>> Inference # 5/5  [0m
 ### Using ../input/comlitothers/model_5.bin
! Model Complete. ++++++++++

          id    target
0  c0f722661 -0.406300
1  f0953f0a5 -0.482320
2  0df072751 -0.514308
3  04caf4e0c -2.333567
4  0e63f8bea -1.933935
5  12537fe78 -1.253635
6  965e592c0  0.236844


In [35]:
# if __name__ == "__main__":
#     test_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")
#     test_df['excerpt'] = test_df['excerpt'].apply(lambda x: x.replace('\n',' '))
#     test_dataloader = get_test_data(test_df)


#     ################### Predict ########################
#     ################## load model
#     #### Base CLRPModel 474
# #     CLRP_1 = [f'../input/comlitmodelone/best_model_{i}.pt' for i in range(5)]
# #     preds_c1 = inference(test_dataloader, CLRP_1, with_embedding=False)
# #     reset_memory()


#     #### Base CLRPModel 475
# #     CLRP_2 = [f'../input/comlitbase2/best_model_{i}.pt' for i in range(5)]
# #     preds_c2 = inference(test_dataloader, CLRP_2, with_embedding=False)
# #     reset_memory()



#     ################## load_state_dict
#     #### LitModel: 467
#     Lit = [f'../input/comlitothers/model_{i + 1}.bin' for i in range(5)]
#     litmodel = LitModel()  # if use load_state_dict, init model
#     pred_lit = inference(test_dataloader, Lit, model=litmodel, ckpt_bias=1)
#     del litmodel
#     reset_memory()


#     #### mean large v1: 472  ;    MeanEmbedding V1 + SVM 473
# #     MeanL_1 = [ f'../input/clrobertalarger/model_{i}.bin' for i in range(5)]
# #     meanmodel = MeanModel()
# #     pred_mean_v1, embeddings = inference(test_dataloader, MeanL_1, model=meanmodel, with_embedding=True)
# #     svr_preds = embedding_svr_test(embeddings, len(test_df))
# #     del meanmodel
# #     reset_memory()


#     #### mean base v2: 478 bad
# #     MeanB_2 = [ f'../input/newmeanbase/model_{i}.bin' for i in range(5)]
# #     config = MeanV2BaseConfig()
# #     meanmodel2 = MeanModel_v2(config)
# #     pred_mean_v2 = inference(test_dataloader, MeanB_2, model=meanmodel2, with_embedding=False)
# #     # svr_preds = embedding_svr_test(embeddings, len(test_df))
# #     del meanmodel2
# #     reset_memory()


#     #### mean base fgm v2: 481 bad     fgm not help
# #     MeanBF_2 = [ f'../input/newmeanbasefgm/model_{i}.bin' for i in range(5)]
# #     config = MeanV2BaseConfig()
# #     meanmodel2f = MeanModel_v2(config)
# #     pred_mean_v2f = inference(test_dataloader, MeanBF_2, model=meanmodel2f, with_embedding=False)
# #     # svr_preds = embedding_svr_test(embeddings, len(test_df))
# #     del meanmodel2f
# #     reset_memory()
    
    
#     #### mean large v2: 0.469  Nice
# #     MeanL_3 = [ f'/kaggle/input/newmeanlarge/model_{i}.bin' for i in range(5)]
# #     config = MeanV2LargeConfig()
# #     meanmodel3 = MeanModel_v2(config)
# #     pred_mean_v3 = inference(test_dataloader, MeanL_3, model=meanmodel3, with_embedding=False)
# #     # svr_preds = embedding_svr_test(embeddings, len(test_df))
# #     del meanmodel3
# #     reset_memory()


#     #### mean large v2 reinit: 0.466
#     MeanL_4 = [ f'/kaggle/input/meanlargereinit/model_{i}.bin' for i in range(5)]
#     config = MeanV2LargeConfig()
#     meanmodel4 = MeanModel_v2(config)
#     pred_mean_v4 = inference(test_dataloader, MeanL_4, model=meanmodel4, with_embedding=False)
#     # svr_preds = embedding_svr_test(embeddings, len(test_df))
#     del meanmodel4
#     reset_memory()
    

#     #### attn large 1:  0.464
#     AttL_1 = [ f'/kaggle/input/largeattnlit/model_{i}.bin' for i in range(5)]
#     config = AttConfig1()
#     attmodel1 = AttModel(config)
#     pred_attL_v1 = inference(test_dataloader, AttL_1, model=attmodel1, with_embedding=False)
#     # svr_preds = embedding_svr_test(embeddings, len(test_df))
#     del attmodel1
#     reset_memory()


#     #### attn large reinit 2:  0.467
#     AttL_2 = [ f'/kaggle/input/attlargereinit/model_{i}.bin' for i in range(5)]
#     config = AttConfig1()
#     attmodel2 = AttModel(config)
#     pred_attL_v2 = inference(test_dataloader, AttL_2, model=attmodel2, with_embedding=False)
#     # svr_preds = embedding_svr_test(embeddings, len(test_df))
#     del attmodel2
#     reset_memory()
    
    
#     #### attn base 2: 
# #     AttB_2 = [ f'/kaggle/input/meanattnreinit/model_{i}.bin' for i in range(5)]
# #     config = AttConfig2()
# #     attmodel2 = AttModel2(config)
# #     pred_attL_v2 = inference(test_dataloader, AttL_2, model=attmodel2, with_embedding=False)
# #     # svr_preds = embedding_svr_test(embeddings, len(test_df))
# #     del attmodel2
# #     reset_memory()
    
#     # ................................................

    
#     ################### Post Process ########################
#     predictions = pred_attL_v2 * 0.15 + pred_attL_v1 * 0.4 + pred_mean_v4 * 0.3 + pred_lit * 0.15
# #     predictions = svr_preds * 0.5 + pred_lit * 0.5  # or whatever


#     ################### Submisson ########################
#     result_df = pd.DataFrame({'id': test_df.id, 'target': predictions})
#     result_df.to_csv('submission.csv', index=False)
#     print(result_df.head(10))

#### Only SVR inference[Test using]

-   id    target
- 0  c0f722661 -0.367501
- 1  f0953f0a5 -0.398820
- 2  0df072751 -0.537474
- 3  04caf4e0c -2.299636
- 4  0e63f8bea -1.962926
- 5  12537fe78 -1.093182
- 6  965e592c0  0.077126