In [None]:
import torch 
import pandas as pd 
import numpy as np 
import warnings
warnings.filterwarnings("ignore")
import re 
import random
import torch.nn as nn 
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import Dataset,DataLoader
from transformers import AutoModel,AutoTokenizer,AdamW
from tqdm import tqdm
import os 
from transformers import get_cosine_schedule_with_warmup
from colorama import Fore,Back,Style
# Short aliases for the colorama codes embedded in the training log messages.
r_, g_, y_ = Fore.RED, Fore.GREEN, Fore.YELLOW
c_, b_, bl_ = Fore.CYAN, Fore.BLUE, Fore.BLACK
# Reset code appended after a coloured message.
sr_ = Style.RESET_ALL

In [None]:
# Load the training CSV from a hard-coded path relative to the working directory.
train_data = pd.read_csv("../input/commonlitreadabilityprize/train.csv")

In [None]:
def clean_text(excerpt):
    """Pad punctuation with spaces and expand a fixed set of contractions.

    Steps, applied in this order:
      1. every character listed in ``punctuations`` is surrounded by one
         space on each side;
      2. every ``'s`` is replaced by `` is `` (this also hits possessives);
      3. a fixed list of lower-case contractions is expanded.

    All matching is case-sensitive plain substring replacement; no word
    boundaries are checked.

    Args:
        excerpt: the raw text (str).

    Returns:
        The cleaned text (str).
    """
    # The backslash is spelled out explicitly: the former "\(" was an invalid
    # escape sequence (a SyntaxWarning on recent Python versions) that merely
    # happened to produce the same two characters, backslash and "(".
    punctuations = ".,?!;\\(\":-)‘"
    # (contraction, expansion) pairs, in the order they are applied.
    contractions = (
        ("i'm", "I'm"),
        ("don't", "do not"),
        ("didn't", "did not"),
        ("can't", "cannot"),
        ("i'll", "I will"),
        ("wouldn't", "would not"),
        ("i've", "I have"),
        ("won't", "will not"),
        ("couldn't", "could not"),
        ("wasn't", "was not"),
        ("you'll", "you will"),
        ("isn't", "is not"),
        ("you're", "you are"),
        ("hadn't", "had not"),
        ("you've", "you have"),
        ("doesn't", "does not"),
        ("haven't", "have not"),
        ("they're", "they are"),
        ("we're", "we are"),
    )

    extrait = excerpt
    for p in punctuations:
        extrait = extrait.replace(p, f" {p} ")
    # Must run before the contraction table so that "'s" is handled first,
    # exactly as in the previous implementation.
    extrait = re.sub(r"'s", " is ", extrait)
    for contraction, expansion in contractions:
        extrait = extrait.replace(contraction, expansion)
    return extrait

In [None]:
# Store the output of clean_text for every row of "excerpt" in a new "cleaned_excerpt" column.
train_data["cleaned_excerpt"] = train_data["excerpt"].map(clean_text)

In [None]:
# Quartile edges of the target (0 %, 25 %, 50 %, 75 %, 100 %). Plain assignment
# replaces the former writes through vars(), which only create variables when
# executed at module scope; q0..q4 are still defined for later cells.
q0, q1, q2, q3, q4 = (train_data["target"].quantile(0.25 * i) for i in range(5))
# pd.cut intervals are open on the left by default, so the lowest edge is moved
# down by 1 to keep the minimum target inside the first bin.
q0 = q0 - 1
train_data["bins_target"] = pd.cut(
    train_data["target"],
    bins=[q0, q1, q2, q3, q4],
    labels=[f"Q{i}" for i in range(1, 5)],
)

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.

    Args:
        seed: integer seed shared by all generators.
    """
    random.seed(seed)
    # Fixed typo: the variable Python reads is PYTHONHASHSEED (was 'PYTHONASSEED').
    # It is only consulted at interpreter start-up, so this affects processes
    # launched after this call, not the current kernel.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run to
    # run, which defeats the deterministic flag above, so it must be disabled.
    torch.backends.cudnn.benchmark = False
    
# Hyper-parameters and paths shared by the cells below.
config = dict(
    seed=42,          # random_state of the K-fold splitter
    lr=5e-5,          # optimizer learning rate
    wd=0.01,          # optimizer weight decay
    epochs=5,         # epochs per fold
    folds=5,
    batch_size=16,    # DataLoader batch size
    max_len=256,      # tokenizer max_length (pad / truncate)
    valid_step=10,
    model="../input/roberta-training/clrp_roberta_base_chk/checkpoint-1050/config.json",
)

def loss_fn(out, true_v):
    """Root-mean-squared error between predictions and targets.

    Both tensors are flattened to 1-D first, so ``out`` of shape (batch, 1)
    and ``true_v`` of shape (batch,) are compared element-wise.
    """
    mean_squared_error = nn.MSELoss()
    return torch.sqrt(mean_squared_error(out.view(-1), true_v.view(-1)))

# Stratified K-fold splitter. The number of folds is read from config["folds"]
# (currently 5) instead of repeating the literal, so the two cannot drift apart.
st = StratifiedKFold(n_splits=config["folds"], shuffle=True, random_state=config["seed"])

In [None]:
class DataGenerator(Dataset):
    """Map-style dataset yielding ``(tokenizer output, float target)`` pairs."""

    def __init__(self, texts, labels, tokenizer, max_len):
        """Keep references to the raw texts, their labels and the tokenizer settings."""
        super().__init__()
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of samples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize sample ``item`` on the fly and pair it with its target tensor."""
        # Every sample is padded / truncated to max_len and returned as PyTorch tensors.
        tokenizer_options = dict(
            return_tensors="pt",
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )
        encode = self.tokenizer(self.texts[item], **tokenizer_options)
        target = torch.tensor(self.labels[item], dtype=torch.float)
        return encode, target
            
        

In [None]:
class AttentionHead(nn.Module):
    """Attention-weighted sum of the input features over dimension 1."""

    def __init__(self, in_features, hidden_dim, num_target):
        """Create the two scoring layers: ``W`` (in_features -> hidden_dim) and ``V`` (hidden_dim -> 1)."""
        super().__init__()
        self.in_features = in_features
        self.hidden_dim = hidden_dim
        self.num_target = num_target  # stored only; forward() does not read it
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features):
        """Score each position along dim 1, softmax the scores, and return the weighted sum."""
        score = self.V(torch.tanh(self.W(features)))
        attention_weights = score.softmax(dim=1)
        # Weights broadcast over the feature dimension; summing removes dim 1.
        return (attention_weights * features).sum(dim=1)

In [None]:
class roberta(nn.Module):
    """Pretrained encoder followed by attention pooling, dropout and a 1-unit linear head."""

    def __init__(self, path):
        """Load the pretrained encoder from ``path`` and build the regression head.

        The head is sized for 768-dimensional encoder features.
        """
        super().__init__()
        self.pret_layer = AutoModel.from_pretrained(path, output_hidden_states=True)
        self.dropout = nn.Dropout(p=0.1)
        self.linear = nn.Linear(768, 1)
        self.att = AttentionHead(768, 768, 1)

    def forward(self, **xd):
        """Forward the keyword inputs to the encoder and return the head's output.

        Only element 0 of the encoder output is used.
        """
        # presumably element 0 is the last hidden state, (batch, seq, 768) -- TODO confirm
        encoder_output = self.pret_layer(**xd)
        pooled = self.att(encoder_output[0])
        return self.linear(self.dropout(pooled))

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("We use cuda device" if device.type == "cuda" else "No cuda is available , we use CPU instead !")

In [None]:
def create_dataloader(texts, labels, tokenizer, shuffle=False):
    """Wrap texts and labels in a DataGenerator and return a batched DataLoader.

    Args:
        texts: indexable collection of input strings.
        labels: indexable collection of targets aligned with ``texts``.
        tokenizer: tokenizer callable handed to DataGenerator.
        shuffle: reshuffle the samples every epoch. Defaults to False (the
            previous, fixed behaviour); ``run`` turns it on for the training split.

    Returns:
        A DataLoader using config["batch_size"], config["max_len"] and 4 workers.
    """
    dataset = DataGenerator(texts, labels, tokenizer, config["max_len"])
    return DataLoader(dataset, batch_size=config["batch_size"], shuffle=shuffle, num_workers=4)


def _batch_to_device(batch, device):
    """Reshape every tensor of an encoded batch to (batch, -1) and move it to ``device``."""
    return {key: val.reshape(val.shape[0], -1).to(device) for key, val in batch.items()}


def _validation_loss(md, val_loader, device):
    """Return the mean of the per-batch ``loss_fn`` values over ``val_loader`` as a float."""
    # NOTE: this averages per-batch losses (as before); it is not the loss of the
    # whole validation set computed in one pass.
    md.eval()
    total = 0.0
    with torch.no_grad():
        for dt_inp, dt_target in val_loader:
            out = md(**_batch_to_device(dt_inp, device))
            # .item() keeps the running total a plain float instead of a device tensor.
            total += loss_fn(out.float(), dt_target.to(device).float()).item()
    return total / len(val_loader)


def train_and_evaluate_loop(tr_loader, val_loader, device, md, optimizer, best_loss, epoch, fold, tokenizer,
                            verbose=True, scheduler=None):
    """Train ``md`` for one epoch, validate it, and checkpoint on improvement.

    Fixes over the previous version:
      * the checkpoint is saved and ``best_loss`` updated even when
        ``verbose=False`` (both used to live inside the ``if verbose`` branch);
      * the validation loss is a Python float, not a tensor on ``device``;
      * an optional ``scheduler`` is stepped after every optimizer step.

    Args:
        tr_loader: training DataLoader yielding (encoded batch, targets).
        val_loader: validation DataLoader with the same item structure.
        device: torch device batches are moved to.
        md: the model; called as ``md(**encoded_batch)``.
        optimizer: optimizer over ``md``'s parameters.
        best_loss: best validation loss seen so far for this fold.
        epoch: epoch number, used in log messages only.
        fold: fold index, used for the checkpoint path ``./model{fold}/``.
        tokenizer: tokenizer saved next to the checkpoint.
        verbose: print the epoch losses and improvement message.
        scheduler: optional LR scheduler stepped once per batch.

    Returns:
        The updated best validation loss.
    """
    n_batches = len(tr_loader)
    if n_batches == 0:
        # Nothing to train on: as before, skip validation and leave best_loss untouched.
        return best_loss

    md.train()
    train_loss = 0.0
    for inp_data, inp_target in tr_loader:
        optimizer.zero_grad()
        output = md(**_batch_to_device(inp_data, device))
        loss = loss_fn(output.float(), inp_target.to(device).float())
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
        train_loss += loss.item()

    val_loss = _validation_loss(md, val_loader, device)

    if verbose:
        print(f"Epoch {epoch} : | Train Loss : {train_loss/n_batches}")
        print(f"Epoch {epoch} : | Val Loss : {val_loss}")

    if val_loss < best_loss:
        torch.save(md.state_dict(), f"./model{fold}/model{fold}.bin")
        tokenizer.save_pretrained(f"./model{fold}")
        if verbose:
            print(f"{g_}best validation loss decreased from {best_loss} to {val_loss}{sr_}")
        best_loss = val_loss

    return best_loss


def run(st, device, path="../input/roberta-training/clrp_roberta_base_chk/"):
    """Cross-validated fine-tuning: train one model per fold and keep its best checkpoint.

    Args:
        st: splitter exposing ``split(X, y)``; stratification uses the "bins_target" column.
        device: torch device the model and batches are moved to.
        path: location of the pretrained weights handed to ``roberta``.
    """
    # config["seed"] and seed_everything existed but were never applied to training.
    seed_everything(config["seed"])
    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    # Positional index so the integer positions from split() are valid .loc labels.
    frame = train_data.reset_index(drop=True)

    for fold, (tr_ind, val_ind) in enumerate(st.split(frame["cleaned_excerpt"], frame["bins_target"])):
        texts_tr = frame.loc[tr_ind, "cleaned_excerpt"].values
        texts_val = frame.loc[val_ind, "cleaned_excerpt"].values
        labels_tr = frame.loc[tr_ind, "target"].values
        labels_val = frame.loc[val_ind, "target"].values

        # Only the training split is shuffled; validation order stays fixed.
        tr_dataloader = create_dataloader(texts_tr, labels_tr, tokenizer, shuffle=True)
        val_dataloader = create_dataloader(texts_val, labels_val, tokenizer)

        md = roberta(path).to(device)
        # NOTE(review): transformers' AdamW is deprecated upstream in favour of
        # torch.optim.AdamW -- consider switching once the import cell is updated.
        opt = AdamW(md.parameters(), lr=config["lr"], weight_decay=config["wd"])
        # The schedule now spans the epochs actually trained (was a hard-coded 25)
        # and is handed to the training loop, which previously never stepped it.
        lr_scheduler = get_cosine_schedule_with_warmup(
            opt,
            num_warmup_steps=0,
            num_training_steps=config["epochs"] * len(tr_dataloader),
        )
        best_loss = float("inf")
        os.makedirs(f"model{fold}", exist_ok=True)
        print(f"{r_}Fold {fold} Starting ...{sr_}")
        for ep in tqdm(range(config["epochs"])):
            print("="*100)
            print(" "*35,f"Epoch {ep+1} : Train & Validation ")
            print("="*100)
            best_loss = train_and_evaluate_loop(tr_dataloader, val_dataloader, device, md, opt, best_loss,
                                                ep + 1, fold, tokenizer, scheduler=lr_scheduler)

        # Drop this fold's model before the next one is built so two models
        # never sit on the device at once.
        del md, opt, lr_scheduler
        torch.cuda.empty_cache()
                      

In [None]:
# Start training over all folds with the splitter and device defined above.
run(st,device)