In [None]:
import pandas as pd 
import numpy as np 
import warnings 
warnings.filterwarnings("ignore")
from colorama import Fore , Style,Back
import re 
import torch 
import os 
import torch.nn as nn 
from transformers import AutoModelForMaskedLM,AutoTokenizer,Trainer,LineByLineTextDataset,\
DataCollatorForLanguageModeling , TrainingArguments , AutoModel ,AdamW
from torch.utils.data import Dataset,DataLoader
from sklearn.model_selection import StratifiedKFold
import random
# Short aliases for the colorama escape codes used to colour console log messages.
r_, g_, y_ = Fore.RED, Fore.GREEN, Fore.YELLOW
# Resets colour/style back to the terminal default.
st_ = Style.RESET_ALL

In [None]:
# Hyper-parameters and run settings shared by the cells below.
# Each key is declared exactly once: a repeated key in a dict literal silently
# overrides the earlier value, which makes later edits error-prone.
config = {
    "batch_size": 16,  # samples per batch handed to the DataLoader
    "lr": 5e-5,        # optimizer learning rate
    "wb": 2e-5,        # passed to the optimizer as `weight_decay`
    "max_len": 256,    # tokenizer max_length (pad / truncate target)
    "fold": 5,         # number of StratifiedKFold splits
    "seed": 42,        # seed for seed_everything and the fold splitter
    "epochs": 5,       # epochs trained per fold
}

In [None]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices) and
    puts cuDNN into deterministic mode.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    # The variable is PYTHONHASHSEED. Setting it at runtime does not re-seed the
    # hashing of the already-running interpreter, but it is inherited by child
    # processes started afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes convolution algorithms per run and can pick
    # different kernels each time, so it must be off for reproducible results.
    torch.backends.cudnn.benchmark = False

In [None]:
# Seed all RNGs up front so the fold split and training below are repeatable.
seed_everything(config["seed"])

In [None]:
# Load the train / test CSV files from the competition input directory.
# The cells below read the "excerpt" column from both and "target" from `train`.
train = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
test = pd.read_csv("../input/commonlitreadabilityprize/test.csv")

In [None]:
def clean_text(excerpt):
    """Normalise an excerpt before tokenisation.

    Steps, in order:
      1. Surround each punctuation character with spaces.
      2. Replace every "'s" with " is " (this also rewrites possessives).
      3. Expand a fixed list of lower-case contractions.

    Matching is case-sensitive and substring-based; capitalised contractions
    such as "Don't" are left untouched.

    Args:
        excerpt: the raw text (str).

    Returns:
        The normalised text (str).
    """
    # The backslash is spelled as "\\" so the literal contains no invalid
    # escape sequence; the backslash itself is one of the padded characters.
    punctuations = ".,?!;\\(\":-)‘"
    # Applied in this order; each pair is (contraction, replacement).
    contractions = (
        ("i'm", "I'm"),
        ("don't", "do not"),
        ("didn't", "did not"),
        ("can't", "cannot"),
        ("i'll", "I will"),
        ("wouldn't", "would not"),
        ("i've", "I have"),
        ("won't", "will not"),
        ("couldn't", "could not"),
        ("wasn't", "was not"),
        ("you'll", "you will"),
        ("isn't", "is not"),
        ("you're", "you are"),
        ("hadn't", "had not"),
        ("you've", "you have"),
        ("doesn't", "does not"),
        ("haven't", "have not"),
        ("they're", "they are"),
        ("we're", "we are"),
    )

    extrait = excerpt
    for p in punctuations:
        extrait = extrait.replace(p, f" {p} ")

    # NOTE(review): this also matches an apostrophe that opens a quoted word
    # starting with "s" (e.g. "'so") — confirm that is acceptable.
    extrait = re.sub(r"'s", " is ", extrait)

    for short_form, long_form in contractions:
        extrait = extrait.replace(short_form, long_form)

    return extrait

In [None]:
# Store a normalised copy of every excerpt next to the raw text, for both splits.
for _frame in (train, test):
    _frame["cleaned_excerpt"] = _frame["excerpt"].map(clean_text)

In [None]:
# Tokenizer is resolved by model name; the same tokenizer object is shared by all folds.
# NOTE(review): the encoder weights are loaded later from the local `path`, not from
# `model_name` — confirm that checkpoint uses the same vocabulary as this tokenizer.
model_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
class Lisibility(nn.Module):
    """Pretrained encoder followed by dropout and a single-output linear head.

    Args:
        path: name or directory passed to `AutoModel.from_pretrained`.
    """

    def __init__(self, path):
        super(Lisibility, self).__init__()
        self.bert_layer = AutoModel.from_pretrained(path)
        self.dropout = nn.Dropout(p=0.1)
        # Size the head from the loaded encoder rather than hard-coding 768, so
        # checkpoints with another hidden width work too (768 for BERT-base).
        self.dense = nn.Linear(self.bert_layer.config.hidden_size, 1)

    def forward(self, **x):
        """Return one value per sample.

        Args:
            **x: keyword tensors forwarded unchanged to the encoder
                (the tokenizer output in this notebook).

        Returns:
            Tensor produced by the linear head, with a last dimension of size 1.
        """
        # Element 0 of the encoder output, then position 0 along dim 1
        # (for Hugging Face encoders: the last hidden state of the first token).
        x = self.bert_layer(**x)[0][:, 0, :]
        x = self.dropout(x)
        x = self.dense(x)
        return x

In [None]:
class DataGenerator(Dataset):
    """Dataset that tokenizes one text on access and pairs it with its label."""

    def __init__(self, textes, labels, tokenizer, max_len):
        """Keep references to the texts, labels, tokenizer and padded length."""
        self.textes, self.labels = textes, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of samples, taken from the texts container."""
        return len(self.textes)

    def __getitem__(self, idx):
        """Return `(encoding, target)` for the sample at position `idx`."""
        # Every sample is padded / truncated to exactly `max_len` tokens.
        tokenizer_options = dict(
            return_tensors="pt",
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )
        encode = self.tokenizer(self.textes[idx], **tokenizer_options)
        return encode, torch.tensor(self.labels[idx])

In [None]:
# Bucket the continuous target into four quartile bins, used only to stratify the folds.
_target = train["target"]
# Lowest edge sits below the minimum so the smallest value lands in "Q1"
# (pd.cut intervals are open on the left by default).
_bin_edges = [_target.min() - 1] + [_target.quantile(q) for q in (0.25, 0.5, 0.75, 1)]
train["bins_target"] = pd.cut(_target, bins=_bin_edges, labels=["Q1", "Q2", "Q3", "Q4"])

In [None]:
def create_dataloader(texts, labels, tokenizer, shuffle=False):
    """Wrap texts and labels in a `DataGenerator` and return a `DataLoader` over it.

    Args:
        texts: indexable collection of input strings.
        labels: indexable collection of targets aligned with `texts`.
        tokenizer: tokenizer handed to `DataGenerator`.
        shuffle: whether the loader reshuffles the samples every epoch.
            Defaults to False (sequential order), which suits validation and
            inference; pass True for training data.

    Returns:
        A `DataLoader` using `config["batch_size"]` and 4 worker processes.
    """
    dataset = DataGenerator(texts, labels, tokenizer, config["max_len"])
    return DataLoader(
        dataset,
        batch_size=config["batch_size"],
        num_workers=4,
        shuffle=shuffle,
    )

In [None]:
def train_and_validate(tr_dataloader, val_dataloader, model, device, loss_fn, optimizer, best_loss,
                       epoch, fold, tokenizer, verbose=True):
    """Train for one epoch, validate, and checkpoint when validation improves.

    Args:
        tr_dataloader: iterable of `(inputs, targets)` training batches, where
            `inputs` is a mapping of tensors; must support `len()`.
        val_dataloader: same structure, used for validation.
        model: module called as `model(**inputs)`.
        device: device the batches are moved to.
        loss_fn: callable `(predictions, targets) -> scalar tensor`.
        optimizer: optimizer stepping the parameters of `model`.
        best_loss: best validation loss seen so far for this fold.
        epoch: epoch number, used only in log messages.
        fold: fold number, used in log messages and in the checkpoint path.
        tokenizer: object saved next to the weights via `save_pretrained`.
        verbose: when True, print the epoch losses and improvement message.
            Checkpointing happens regardless of this flag.

    Returns:
        The updated best validation loss (float). Unchanged when this epoch
        did not improve on `best_loss`.

    Side effects:
        On improvement, writes `./model_{fold}/model{fold}.bin` and the
        tokenizer files into `./model_{fold}/` (the directory must exist).

    Note:
        Reported losses are the mean of the per-batch losses.
    """

    def _to_device(batch):
        # Flatten every tensor to 2-D (batch, -1) before moving it to `device`.
        return {key: val.reshape(val.shape[0], -1).to(device) for key, val in batch.items()}

    def _mean(total, n_batches):
        # An empty loader yields NaN, which never compares below `best_loss`.
        return total / n_batches if n_batches else float("nan")

    # ---- training pass ----
    model.train()
    train_loss = 0.0
    for tr_datas, tr_lab in tr_dataloader:
        optimizer.zero_grad()
        out = model(**_to_device(tr_datas))
        loss = loss_fn(out.float(), tr_lab.to(device).float())
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    train_loss = _mean(train_loss, len(tr_dataloader))

    # ---- validation pass ----
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for val_datas, val_lab in val_dataloader:
            output = model(**_to_device(val_datas))
            val_loss += loss_fn(output.float(), val_lab.to(device).float()).item()
    val_loss = _mean(val_loss, len(val_dataloader))

    if verbose:
        print(f"Epoch: {epoch} | Train Loss : {train_loss}")
        print(f"Epoch: {epoch} | Validation Loss :{val_loss}")

    # Checkpointing and the return value must not depend on `verbose`.
    if val_loss < best_loss:
        if verbose:
            print(f"{g_}Validation loss decrease from {best_loss} to {val_loss}{st_}")
        best_loss = val_loss
        torch.save(model.state_dict(), f"./model_{fold}/model{fold}.bin")
        tokenizer.save_pretrained(f"./model_{fold}/")

    return best_loss

In [None]:
# Shuffled, seeded stratified splitter; the training loop stratifies on `train["bins_target"]`.
sp = StratifiedKFold(n_splits= config["fold"],shuffle=True,random_state=config["seed"])

In [None]:
def loss_fn(out, y_t):
    """Root-mean-squared error between predictions and targets, both flattened to 1-D."""
    predictions = out.view(-1)
    targets = y_t.view(-1)
    mse = nn.MSELoss()(predictions, targets)
    return torch.sqrt(mse)

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Checkpoint directory handed to `Lisibility`, which loads the encoder from it
# with `AutoModel.from_pretrained`.
path = "../input/training-bert-models/bert_base_chk/"

In [None]:
# K-fold fine-tuning: one freshly initialised model per stratified fold.
# Reset the index once so the positional indices produced by `sp.split`
# line up with the `.loc` labels used below.
train_df = train.reset_index(drop=True)

for fold, (tr_ind, val_ind) in enumerate(sp.split(train_df["cleaned_excerpt"], train_df["bins_target"])):

    tr_texts = train_df.loc[tr_ind, "cleaned_excerpt"].values
    tr_labels = train_df.loc[tr_ind, "target"].values

    val_texts = train_df.loc[val_ind, "cleaned_excerpt"].values
    val_labels = train_df.loc[val_ind, "target"].values

    # Training batches are reshuffled every epoch; feeding the samples in the
    # same fixed order each epoch hurts SGD-style optimisation.
    tr_dataloader = DataLoader(
        DataGenerator(tr_texts, tr_labels, tokenizer, config["max_len"]),
        batch_size=config["batch_size"],
        num_workers=4,
        shuffle=True,
    )
    # Validation keeps the sequential order.
    val_dataloader = create_dataloader(val_texts, val_labels, tokenizer)

    # Checkpoints for this fold are written into ./model_{fold}/.
    os.makedirs(f"model_{fold}", exist_ok=True)
    md = Lisibility(path)
    md.to(device)
    optimizer = AdamW(md.parameters(), lr=config["lr"], weight_decay=config["wb"])
    best_loss = float("inf")
    print(f"{r_}Training model {fold} startings ...\n{st_}")
    for ep in range(config["epochs"]):
        print("=" * 100)
        print(" " * 40, f"Epoch{ep} : Train & Validation")
        best_loss = train_and_validate(tr_dataloader, val_dataloader, md, device, loss_fn, optimizer,
                                       best_loss, ep, fold, tokenizer, verbose=True)

    # Drop this fold's model and optimizer state before the next fold allocates
    # its own, so two sets of weights never sit on the device at once.
    del md, optimizer
    if torch.cuda.is_available():
        torch.cuda.empty_cache()