In [None]:
import torch 
import numpy as np 
import pandas as pd 
import warnings 
warnings.filterwarnings("ignore")
import torch.nn as nn
from transformers import AutoModel , AutoTokenizer
from colorama import Fore,Style
from torch.utils.data import DataLoader,Dataset
import random 
import os 
import re 
import lightgbm as lgbm
from sklearn.model_selection import StratifiedKFold
from collections import defaultdict
# Short aliases for colorama colour codes, for decorating console output.
g_, r_, b_, y_ = Fore.GREEN, Fore.RED, Fore.BLUE, Fore.YELLOW
# Style reset code, used to end a coloured span.
st_ = Style.RESET_ALL

In [None]:
# Read the training and test tables. Later cells read the "excerpt" column of
# both frames and the "target" column of `train`.
train = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
test = pd.read_csv("../input/commonlitreadabilityprize/test.csv")

In [None]:
def clean_text(excerpt):
    """Pad punctuation with spaces and expand a fixed set of contractions.

    Processing happens in two ordered passes:

    1. Every character in ``punctuations`` is replaced by itself surrounded
       by single spaces (so ``"a,b"`` becomes ``"a , b"``).
    2. Each ``(old, new)`` pair in ``replacements`` is applied in order with
       plain, case-sensitive substring replacement. Note that ``"'s"`` is
       always expanded to ``" is "``, including for possessives, and that
       capitalised forms such as ``"Don't"`` are left unchanged.

    Args:
        excerpt: The raw text to normalise.

    Returns:
        The normalised string. Runs of spaces introduced by the padding are
        not collapsed.
    """
    # The backslash is spelled "\\" explicitly. The previous literal used
    # "\(", an invalid escape sequence that only produced a backslash by
    # accident and triggers a warning on current Python versions.
    punctuations = ".,?!;\\(\":-)‘"

    # Order matters: the pairs are applied sequentially to the running text.
    replacements = (
        ("'s", " is "),
        ("i'm", "I'm"),
        ("don't", "do not"),
        ("didn't", "did not"),
        ("can't", "cannot"),
        ("i'll", "I will"),
        ("wouldn't", "would not"),
        ("i've", "I have"),
        ("won't", "will not"),
        ("couldn't", "could not"),
        ("wasn't", "was not"),
        ("you'll", "you will"),
        ("isn't", "is not"),
        ("you're", "you are"),
        ("hadn't", "had not"),
        ("you've", "you have"),
        ("doesn't", "does not"),
        ("haven't", "have not"),
        ("they're", "they are"),
        ("we're", "we are"),
    )

    extrait = excerpt
    for p in punctuations:
        extrait = extrait.replace(p, f" {p} ")
    for old, new in replacements:
        extrait = extrait.replace(old, new)
    return extrait

In [None]:
def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to every random number generator.
    """
    random.seed(seed)
    # Correct variable name is PYTHONHASHSEED. Setting it here only affects
    # child processes; the running interpreter fixed its hash seed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every CUDA device; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes kernels per run and can pick different
    # algorithms each time, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False
# Notebook-wide settings, looked up by key in the cells below.
config = {
    "lr" : 5e-5,  # presumably the fine-tuning learning rate; not used by the cells shown here
    "wd" : 0.01,  # presumably the fine-tuning weight decay; not used by the cells shown here
    "batch_size" : 16,  # DataLoader batch size used when extracting embeddings
    "seed" : 42 ,  # seed for seed_everything and the StratifiedKFold splitter
    "max_len": 256,  # default tokenizer max_length (pad/truncate) in GenerateData
    "folds":5,  # number of cross-validation folds (expected to equal 5 — confirm)
    "model" : "../input/roberta-base"  # path passed to AutoModel / AutoTokenizer.from_pretrained
    }
# Seed all RNGs once, before any data splitting or model construction.
seed_everything(config["seed"])

In [None]:
# Store a normalised copy of every excerpt next to the raw text in both frames.
for _frame in (train, test):
    _frame["cleaned_excerpt"] = _frame["excerpt"].map(clean_text)

In [None]:
# Bucket the continuous target into four quantile-based bins; the resulting
# labels are what StratifiedKFold stratifies on later.
_target = train["target"]
_bin_edges = [
    _target.min() - 1,  # pd.cut intervals are right-closed, so start below the minimum
    _target.quantile(0.25),
    _target.quantile(0.5),
    _target.quantile(0.75),
    _target.quantile(1),
]
train["bins_target"] = pd.cut(_target, bins=_bin_edges, labels=["Q1", "Q2", "Q3", "Q4"])

In [None]:
class AttentionHead(nn.Module):
    """Attention pooling that collapses dimension 1 of its input.

    A score is computed per position as ``V(tanh(W(x)))``, normalised with a
    softmax over dimension 1, and used to form a weighted sum of the input
    along that dimension.

    Args:
        input_dim: Size of the last dimension of the input features.
        hidden_dim: Width of the intermediate projection.
        out_dim: Width of the score. The scores are multiplied element-wise
            with the input, so this must broadcast against ``input_dim``
            (for example ``1``).
    """

    def __init__(self, input_dim, hidden_dim, out_dim):
        super().__init__()
        self.input_dim, self.hidden_dim, self.out_dim = input_dim, hidden_dim, out_dim
        self.W = nn.Linear(input_dim, hidden_dim)
        self.V = nn.Linear(hidden_dim, out_dim)

    def forward(self, input_features):
        """Return the attention-weighted sum of ``input_features`` over dim 1.

        Assumes ``input_features`` is (batch, sequence, input_dim) — TODO confirm.
        """
        scores = self.V(torch.tanh(self.W(input_features)))
        weights = scores.softmax(dim=1)
        return (weights * input_features).sum(dim=1)

In [None]:
class Roberta(nn.Module):
    """Pretrained encoder followed by attention pooling.

    ``forward`` returns the pooled vector itself, not a scalar prediction,
    so the module acts as a feature extractor.
    """

    def __init__(self):
        super().__init__()
        self.layer = AutoModel.from_pretrained(config["model"], output_hidden_states=True)
        # 768 presumably matches the encoder's hidden size — confirm if
        # config["model"] is changed to another architecture.
        self.attention = AttentionHead(768, 768, 1)
        # `dropout` and `dense` are not applied in forward(); they are kept
        # as submodules so the module's parameter set stays unchanged.
        self.dropout = nn.Dropout(p=0.1)
        self.dense = nn.Linear(768, 1)

    def forward(self, **xd):
        """Encode the tokenizer keyword inputs and pool the first encoder output."""
        token_states = self.layer(**xd)[0]
        return self.attention(token_states)

In [None]:
# Path to a pretrained checkpoint directory.
# NOTE(review): no other visible cell reads this name — confirm it is still needed.
pretrained_model = "../input/roberta-training/clrp_roberta_base_chk/"

In [None]:
class GenerateData(Dataset):
    """Dataset that tokenizes one text per item; no labels are returned.

    Args:
        texts: Indexable collection of strings.
        tokenizer: Callable tokenizer applied to a single text.
        max_len: Length every item is padded or truncated to.
    """

    def __init__(self, texts, tokenizer, max_len=config["max_len"]):
        self.texts, self.tokenizer, self.max_len = texts, tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # return_tensors="pt" asks the tokenizer for PyTorch tensors.
        return self.tokenizer(
            self.texts[idx],
            return_tensors="pt",
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )

In [None]:
def create_dataloader(texts, tokenizer):
    """Build an order-preserving DataLoader over ``texts``.

    Shuffling is disabled so that outputs line up with the input order.
    The batch size comes from ``config["batch_size"]``.
    """
    return DataLoader(
        GenerateData(texts, tokenizer),
        batch_size=config["batch_size"],
        shuffle=False,
        num_workers=4,
    )

In [None]:
def get_embedding(texts, tokenizer, md, device):
    """Run ``md`` over ``texts`` in batches and return the stacked outputs.

    Args:
        texts: Indexable collection of strings, processed in order.
        tokenizer: Tokenizer handed to the dataset.
        md: Model called with the tokenizer fields as keyword arguments;
            it is switched to eval mode.
        device: Device the input tensors are moved to.

    Returns:
        A NumPy array of the per-batch outputs concatenated along axis 0.
    """
    md.eval()
    chunks = []
    with torch.no_grad():
        for batch in create_dataloader(texts, tokenizer):
            # Flatten every dimension after the batch one. Presumably each
            # item arrives as (1, max_len), giving (batch, 1, max_len) here —
            # TODO confirm.
            inputs = {
                name: tensor.reshape(tensor.shape[0], -1).to(device)
                for name, tensor in batch.items()
            }
            chunks.append(md(**inputs).detach().cpu().numpy())
    return np.concatenate(chunks, axis=0)

In [None]:
# LightGBM training parameters, passed to lgbm.train.
# NOTE(review): bagging_fraction presumably has no effect unless bagging_freq
# is also set to a non-zero value — verify against the LightGBM parameter docs.
params = dict(
    num_leaves=300,
    max_bin=450,
    feature_fraction=0.52,
    bagging_fraction=0.52,
    objective="regression",
    learning_rate=0.05,
    boosting_type="gbdt",
    metric="rmse",
)

In [None]:
def train_lgbm_model(embedding_tr,labels_tr,embedding_ts,labels_ts):
    """Fit a LightGBM regressor on embeddings with early stopping.

    Args:
        embedding_tr: Training feature matrix.
        labels_tr: Training targets.
        embedding_ts: Validation feature matrix, used for early stopping.
        labels_ts: Validation targets.

    Returns:
        The trained booster returned by ``lgbm.train``.
    """
    d_train = lgbm.Dataset(embedding_tr,label=labels_tr)
    d_eval = lgbm.Dataset(embedding_ts,label=labels_ts,reference=d_train)
    common = dict(valid_sets=[d_train,d_eval],num_boost_round=1500)

    if hasattr(lgbm,"log_evaluation"):
        # Newer LightGBM: early stopping and periodic logging are callbacks.
        # The keyword forms used below were removed in LightGBM 4.
        callbacks = [
            lgbm.early_stopping(stopping_rounds=50),
            lgbm.log_evaluation(period=50),
        ]
        return lgbm.train(params,d_train,callbacks=callbacks,**common)

    # Older LightGBM without the log_evaluation callback: legacy keywords.
    return lgbm.train(params,d_train,early_stopping_rounds=50,verbose_eval=50,**common)

In [None]:
# One fine-tuned checkpoint per index.
paths=[f"../input/roberta-fine-tunning/model{i}/model{i}.bin" for i in range(5)]

# models["model_<i>"] collects the LightGBM boosters trained on embeddings
# from checkpoint i, one booster per cross-validation fold.
models = defaultdict(list)
tokenizer = AutoTokenizer.from_pretrained(config["model"])
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Positional arrays, so the integer indices from the splitter can be applied
# directly regardless of the frame's index.
all_texts = train["cleaned_excerpt"].values
all_targets = train["target"].values

# The splitter is seeded, so every checkpoint sees the same folds; build the
# fold indices once instead of once per checkpoint.
st = StratifiedKFold(n_splits=config["folds"],shuffle=True,random_state=config["seed"])
fold_indices = list(st.split(train,train["bins_target"]))

for i in range(len(paths)):
    md = Roberta()
    # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
    md.load_state_dict(torch.load(paths[i],map_location=device),strict=False)
    md.to(device)

    # The embeddings depend only on the checkpoint, not on the fold, so
    # compute them once per checkpoint and slice per fold. Re-encoding the
    # train and validation subsets inside the fold loop repeated the same
    # forward passes for every fold.
    all_embedding = get_embedding(all_texts,tokenizer,md,device)

    for tr_ind,val_ind in fold_indices:
        lgb_model = train_lgbm_model(
            all_embedding[tr_ind],all_targets[tr_ind],
            all_embedding[val_ind],all_targets[val_ind],
        )
        models[f"model_{i}"].append(lgb_model)
        
        

In [None]:
# Collect one test-set prediction array per (checkpoint, fold booster) pair.
prediction = []
test_texts = test["cleaned_excerpt"].values
for i in range(len(paths)):
    md = Roberta()
    # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
    md.load_state_dict(torch.load(paths[i],map_location=device),strict=False)
    md.to(device)
    # Embed the test set once per checkpoint and reuse it for every booster
    # trained on that checkpoint's embeddings.
    test_embedding = get_embedding(test_texts,tokenizer,md,device)
    prediction.extend(model.predict(test_embedding) for model in models[f"model_{i}"])

In [None]:
# Stack the collected per-model prediction arrays row-wise into a single array.
predict = np.vstack(prediction)

In [None]:
# Average over models to get one value per test row. atleast_2d makes this
# cell safe to re-run: once `prediction` is already the 1-D mean, it is
# treated as a single row and averaged to itself instead of collapsing to a
# scalar.
prediction = np.mean(np.atleast_2d(np.asarray(prediction)),axis=0)

In [None]:
# Load the sample submission; it is used as the template for the output file.
submission = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")

In [None]:
# Fill the template's "target" column with the averaged predictions.
# Assumes sample_submission.csv lists rows in the same order as test.csv — TODO confirm.
submission["target"] = prediction 

In [None]:
# Write the submission file, leaving out the DataFrame index column.
submission.to_csv("submission.csv",index=False)