# Model 4

In [None]:
import os
import math
import random
import time

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoConfig

from sklearn.model_selection import KFold
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import gc
gc.enable()

In [None]:
# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------
BATCH_SIZE = 32
# Every excerpt is padded / truncated to this many tokens by LitDataset.
MAX_LEN = 248
EVAL_SCHEDULE = [(0.50, 16), (0.49, 8), (0.48, 4), (0.47, 2), (-1., 1)]

# Alternative checkpoint location, kept for reference:
# ROBERTA_PATH = "/kaggle/input/roberta-base"
# TOKENIZER_PATH = "/kaggle/input/roberta-base"
ROBERTA_PATH = "../input/clrp-roberta-base/clrp_roberta_base"
TOKENIZER_PATH = "../input/clrp-roberta-base/clrp_roberta_base"

# Run on the GPU when one is visible, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

# ---------------------------------------------------------------------------
# Data
# ---------------------------------------------------------------------------
train_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/train.csv")

# Rows whose target AND standard_error are both exactly 0 are treated as
# incomplete entries: report their index labels, then remove them and
# renumber the remaining rows from 0.
incomplete_rows = train_df[(train_df.target == 0) & (train_df.standard_error == 0)].index
print(incomplete_rows)
train_df.drop(incomplete_rows, inplace=True)
train_df.reset_index(drop=True, inplace=True)

test_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")
submission_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/sample_submission.csv")

In [None]:
# DATASET
class LitDataset(Dataset):
    """Map-style dataset that tokenizes all excerpts of a DataFrame up front.

    Each item is ``(input_ids, attention_mask)`` when ``inference_only`` is
    true, and ``(input_ids, attention_mask, target)`` otherwise.
    """

    def __init__(self, df, inference_only=False):
        """Tokenize ``df.excerpt`` and, unless inference-only, cache targets.

        Args:
            df: DataFrame with an ``excerpt`` column, plus a ``target``
                column when ``inference_only`` is false.
            inference_only: When true, no targets are read or returned.
        """
        super().__init__()

        self.df = df
        self.inference_only = inference_only
        # Excerpts are tokenized verbatim (newlines are not replaced).
        self.text = df.excerpt.tolist()

        if not inference_only:
            self.target = torch.tensor(df.target.values, dtype=torch.float32)

        # Pad or truncate every excerpt to exactly MAX_LEN tokens and keep the
        # attention mask that marks which positions are real tokens.
        tokenizer_options = {
            "padding": 'max_length',
            "max_length": MAX_LEN,
            "truncation": True,
            "return_attention_mask": True,
        }
        self.encoded = tokenizer.batch_encode_plus(self.text, **tokenizer_options)

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the tensors (and target, if available) for row ``index``."""
        features = tuple(
            torch.tensor(self.encoded[field][index])
            for field in ('input_ids', 'attention_mask')
        )

        if self.inference_only:
            return features
        return (*features, self.target[index])
# MODEL
class LitModel(nn.Module):
    """Transformer encoder + attention pooling + linear regression head.

    The sub-module names (``roberta``, ``attention``, ``regressor``) and the
    layer order inside each ``nn.Sequential`` determine the state_dict keys,
    so they must stay as they are for saved checkpoints to load.
    """

    def __init__(self):
        """Build the encoder from ROBERTA_PATH and create the two heads."""
        super().__init__()

        # Start from the checkpoint's own config and override three fields:
        # expose all hidden states, set the hidden dropout probability to 0,
        # and use a layer-norm epsilon of 1e-7.
        config = AutoConfig.from_pretrained(ROBERTA_PATH)
        overrides = {
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "layer_norm_eps": 1e-7,
        }
        config.update(overrides)

        self.roberta = AutoModel.from_pretrained(ROBERTA_PATH, config=config)

        # Scores each 768-dim token state with a small MLP, then normalizes
        # the scores with a softmax over dim 1 (the token axis of the
        # batch x tokens x 1 score tensor).
        self.attention = nn.Sequential(
            nn.Linear(768, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1),
        )

        # Maps the pooled 768-dim vector to a single score.
        self.regressor = nn.Sequential(
            nn.Linear(768, 1),
        )

    def forward(self, input_ids, attention_mask):
        """Return one prediction per sequence (shape: batch x 1).

        Args:
            input_ids: Token ids, forwarded to the encoder.
            attention_mask: Mask forwarded to the encoder. It is not applied
                again when pooling, so padded positions also get a weight.
        """
        encoder_output = self.roberta(input_ids=input_ids,
                                      attention_mask=attention_mask)

        # hidden_states holds the embedding output followed by one entry per
        # encoder layer (13 entries for roberta-base); use the final layer.
        token_states = encoder_output.hidden_states[-1]

        # token_states:  batch x tokens x hidden
        # token_weights: batch x tokens x 1, summing to 1 along the token axis
        token_weights = self.attention(token_states)

        # The weighted sum over tokens collapses each sequence to one
        # context vector of shape batch x hidden.
        context_vector = (token_weights * token_states).sum(dim=1)

        return self.regressor(context_vector)

In [None]:
def rmse(targets, preds):
    """Return the root-mean-squared error of |preds| vs |targets|, rounded to 4 decimals."""
    mean_sq_err = mean_squared_error(targets, preds)
    root = np.sqrt(mean_sq_err)
    return round(root, 4)

def set_random_seed(random_seed):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) with |random_seed|.

    Also exports the seed as PYTHONHASHSEED and asks cuDNN to use
    deterministic algorithms.
    """
    os.environ["PYTHONHASHSEED"] = str(random_seed)

    # All of these take the seed as their only argument; order is irrelevant.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(random_seed)

    torch.backends.cudnn.deterministic = True
        
def predict(model, data_loader, is_test=False):
    """Returns an np.array with predictions of the |model| on |data_loader|.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; it is
            switched to eval mode and left that way.
        data_loader: Loader whose batches start with ``input_ids`` and
            ``attention_mask``. Any further fields (e.g. targets) are ignored.
        is_test: Kept for backward compatibility only. Labelled and
            unlabelled batches are handled by the same code path, so the
            flag no longer has to match the batch layout.

    Returns:
        1-D float array with one prediction per item of
        ``data_loader.dataset``, in loader order.
    """
    model.eval()

    # Send inputs to wherever the model actually lives rather than trusting
    # the global; fall back to DEVICE only for a parameter-less model.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device(DEVICE)

    result = np.zeros(len(data_loader.dataset))
    index = 0

    with torch.no_grad():
        for batch in data_loader:
            # Only the first two fields are model inputs.
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)

            pred = model(input_ids, attention_mask)

            # The last batch may be smaller than the others, so advance by
            # the actual number of rows predicted.
            batch_rows = pred.shape[0]
            result[index:index + batch_rows] = pred.flatten().cpu().numpy()
            index += batch_rows

    return result

In [None]:
gc.collect()

NUM_FOLDS = 5
SEED = 1000

# One out-of-fold prediction per row of train_df; every slot starts at 0 and
# is overwritten when the fold that holds that row out is evaluated.
valid_prediction = np.zeros(len(train_df))
kfold = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED)

for fold, (train_indices, val_indices) in enumerate(kfold.split(train_df)):
    model_path = f"../input/commonlit-roberta-0467/model_{fold+1}.pth"
    print(f"\nUsing {model_path}")

    # Rebuild the architecture, then restore this fold's saved weights.
    model = LitModel()
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.to(DEVICE)

    set_random_seed(SEED + fold)

    # Only the held-out rows are needed: the checkpoint is evaluated here,
    # not trained, so no training dataset / loader is built.
    val_dataset = LitDataset(train_df.loc[val_indices])
    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        drop_last=False,
        num_workers=2,
    )

    set_random_seed(SEED + fold)

    pred = predict(model, val_loader)
    valid_prediction[val_indices] = pred

    # Release the model before the next checkpoint is loaded.
    del model
    gc.collect()

    # Running score over ALL rows; rows of folds not evaluated yet still
    # hold their initial 0 prediction.
    print(rmse(train_df.target.values, valid_prediction))


print(f"CV’s RMSE:{rmse(train_df.target.values, valid_prediction)}")

In [None]:
import copy

# valid_prediction has one entry per row that survived the "incomplete entry"
# filter (target == 0 and standard_error == 0), while the meta-features below
# are aligned with the FULL training file. Instead of hard-coding the position
# of the dropped row, recompute the filter on the raw file and scatter the
# predictions into the kept positions; dropped rows keep a 0 placeholder.
# A length mismatch raises a ValueError instead of silently misaligning rows.
raw_train_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/train.csv")
dropped_rows = ((raw_train_df.target == 0) & (raw_train_df.standard_error == 0)).values
for_meta = np.zeros(len(raw_train_df))
for_meta[~dropped_rows] = valid_prediction

In [None]:
import os
from pathlib import Path
import pandas as pd
from sklearn import model_selection
# Reload both competition files from disk. Note that train_df is read again
# WITHOUT the incomplete-entry filter, so it contains every row of the file.
test_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")
train_df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')

def create_folds(data, num_splits):
    """Add a ``kfold`` column assigning every row of ``data`` to one fold.

    The frame is modified in place and also returned. Rows are split with a
    shuffled ``KFold`` seeded with 2021, and each row receives the number
    (0 .. num_splits - 1) of the fold in which it is a validation row.

    Args:
        data: DataFrame to annotate; any index is supported.
        num_splits: Number of folds.

    Returns:
        The same DataFrame object, now carrying the ``kfold`` column.
    """
    data["kfold"] = -1
    kfold_column = data.columns.get_loc("kfold")

    kf = model_selection.KFold(n_splits=num_splits, shuffle=True, random_state=2021)
    for fold, (_, valid_positions) in enumerate(kf.split(X=data)):
        # KFold.split yields POSITIONS, not index labels, so write through
        # .iloc; .loc only happens to work for a default 0..n-1 index.
        data.iloc[valid_positions, kfold_column] = fold

    return data
# Annotate the training frame with 5 fold ids.
train = create_folds(data=train_df, num_splits=5)

In [None]:
# Meta-feature table: this model's out-of-fold prediction next to the true
# target, one row per training excerpt, plus a fold id per row.
oof_train = pd.DataFrame({
    'model4': for_meta,
    'target': train.target.values,
})
oof_train = create_folds(oof_train, num_splits=5)

display(oof_train.shape)
oof_train.head()

In [None]:
# Inspection cell: show the raw target array (the same values that were
# stored in oof_train['target']).
train.target.values

In [None]:
# Inspection cell: show the out-of-fold predictions (one entry per row of the
# filtered training frame they were computed on).
valid_prediction

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Overlay the distribution of the true targets and of the out-of-fold
# predictions on a single axis.
preds_df = pd.DataFrame({
    'Label': oof_train['target'],
    'Prediction': oof_train['model4'],
})

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 6))
for column_name in ('Label', 'Prediction'):
    sns.distplot(preds_df[column_name], ax=ax, label=column_name)
ax.legend()
plt.show()

In [None]:
# Tokenize the test excerpts; the test file has no labels to read, hence
# inference_only=True.
# NOTE(review): the inference cell below builds this same dataset again, so
# this cell looks redundant — confirm before removing.
test_dataset = LitDataset(test_df, inference_only=True)

In [None]:
NUM_MODELS = 5

# Tokenize the test excerpts once; every checkpoint below reuses this loader.
test_dataset = LitDataset(test_df, inference_only=True)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    drop_last=False,
    num_workers=2,
)

# Row i holds the predictions of checkpoint model_{i + 1} for all test rows.
all_predictions = np.zeros((NUM_MODELS, len(test_df)))

for model_index in range(NUM_MODELS):
    model_path = f"../input/commonlit-roberta-0467/model_{model_index + 1}.pth"
    print(f"\nUsing {model_path}")

    # Rebuild the architecture, then restore the saved weights.
    model = LitModel()
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.to(DEVICE)

    all_predictions[model_index] = predict(model, test_loader, is_test=True)

    # Release the model before the next checkpoint is loaded.
    del model
    gc.collect()

In [None]:
# Start from the sample submission so the file keeps its expected layout.
submission = pd.read_csv("/kaggle/input/commonlitreadabilityprize/sample_submission.csv")

# Final prediction = mean over the checkpoints (axis 0 of all_predictions).
# Item assignment is used on purpose: attribute-style assignment
# (`submission.target = ...`) only updates a column that already exists and
# would otherwise silently set a plain Python attribute instead.
submission["target"] = all_predictions.mean(axis=0)

print(submission)
submission.to_csv("submission.csv", index=False)