In [1]:
import pandas as pd 
import numpy as np 
import warnings 
warnings.filterwarnings("ignore")
from colorama import Fore , Style,Back
import re 
import torch 
import os 
import torch.nn as nn 
from transformers import AutoModelForMaskedLM,AutoTokenizer,Trainer,LineByLineTextDataset,\
DataCollatorForLanguageModeling , TrainingArguments , AutoModel ,AdamW
from torch.utils.data import Dataset,DataLoader
from sklearn.model_selection import StratifiedKFold
import random
r_ = Fore.RED
g_ = Fore.GREEN
y_ = Fore.YELLOW
st_ = Style.RESET_ALL

In [2]:
config = {
    "batch_size" : 16 ,
    "lr" : 5e-5,
    "wb" : 2e-5,
    "batch_size" : 16,
    "max_len" : 256,
    "fold" : 5,
    "seed" : 42,
    "epochs" : 5
}

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONASSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

In [4]:
seed_everything(config["seed"])

In [5]:
train = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
test = pd.read_csv("../input/commonlitreadabilityprize/test.csv")

In [6]:
def clean_text(excerpt):
    punctuations = ".,?!;\(\":-)‘"
    extrait = excerpt
    for p in punctuations : 
      extrait = extrait.replace(p,f" {p} ")
    extrait = re.sub(r"'s"," is ",extrait)
    extrait = extrait.replace("i'm","I'm")
    extrait = extrait.replace("don't","do not")
    extrait = extrait.replace("didn't","did not")
    extrait = extrait.replace("can't","cannot")
    extrait = extrait.replace("i'll","I will")
    extrait = extrait.replace("wouldn't","would not")
    extrait = extrait.replace("i've","I have")
    extrait = re.sub(r"i've","I have",extrait)
    extrait = extrait.replace("won't","will not")
    extrait = extrait.replace("couldn't","could not")
    extrait = extrait.replace("wasn't","was not")
    extrait = extrait.replace("you'll","you will")
    extrait = extrait.replace("isn't","is not")
    extrait = extrait.replace("you're","you are")
    extrait = extrait.replace("hadn't","had not")
    extrait = extrait.replace("you've","you have")
    extrait = extrait.replace("doesn't","does not")
    extrait = extrait.replace("haven't","have not")
    extrait = extrait.replace("they're","they are")
    extrait = extrait.replace("we're","we are")
    #extrait = re.sub(r"(/s+)i(/s+)","I",excerpt)
    #extrait = re.sub(r"don't","do not",extrait)
    #extrait = re.sub(r"i'm","I'm",extrait)
    #extrait = re.sub(r"man's","man is",extrait)
    #extrait = re.sub(r"it's","it is",extrait)
    #extrait = re.sub(r"didn't","did not",extrait)
    #extrait = re.sub(r"can't","cannot",extrait)
    #extrait = re.sub(r"earth's","earth is",extrait)
    #extrait = re.sub(r"father's","father is",extrait)
    #extrait = re.sub(r"i'll","I will",extrait)
    #extrait = re.sub(r"i've","I have",extrait)
    #extrait = re.sub(r"i\'",r"I'",extrait)
    #extrait = re.sub(r"children\'s","children is",extrait)
    
    return extrait 

In [7]:
train["cleaned_excerpt"] = train["excerpt"].map(clean_text)
test["cleaned_excerpt"] = test["excerpt"].map(clean_text)

In [8]:
model_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

In [9]:
class Lisibility(nn.Module) :
    
    def __init__(self,path):
        super(Lisibility,self).__init__()
        self.bert_layer = AutoModel.from_pretrained(path)
        self.dropout = nn.Dropout(p=0.1)
        self.dense = nn.Linear(768,1)
    def forward(self,**x) :
        x = self.bert_layer(**x)[0][:,0,:]
        x = self.dropout(x)
        x = self.dense(x)
        return x

In [10]:
class DataGenerator (Dataset) :
    
    def __init__(self,textes,labels,tokenizer,max_len) :
        
        self.textes = textes 
        self.labels = labels 
        self.tokenizer = tokenizer 
        self.max_len = max_len
    def __getitem__(self,idx) :
        
        encode = self.tokenizer(self.textes[idx],return_tensors="pt",max_length=self.max_len,\
                               padding="max_length",truncation = True)
        target = torch.tensor(self.labels[idx])
        
        return encode , target
    
    def __len__(self):
        return len(self.textes)

In [11]:
train["bins_target"] = pd.cut(train["target"],bins=[train["target"].min()-1,train["target"].\
                                                   quantile(0.25),train["target"].quantile(0.5),train["target"].quantile(0.75),train["target"].quantile(1)],labels=["Q1","Q2","Q3","Q4"])

In [12]:
def create_dataloader(texts,labels,tokenizer):
    dataset = DataGenerator(texts,labels,tokenizer,config["max_len"])
    dataloader = DataLoader(dataset,batch_size = config["batch_size"],num_workers=4,\
                            shuffle=False)
    return dataloader

In [13]:
def train_and_validate(tr_dataloader,val_dataloader,model,device,loss_fn,optimizer,best_loss,\
                       epoch,fold,tokenizer,verbose=True):
    train_loss = 0
    for i, (tr_datas,tr_lab) in enumerate(tr_dataloader) :
        model.train()
        optimizer.zero_grad()
        X = {key:val.reshape(val.shape[0],-1).to(device) for (key,val) \
                    in tr_datas.items()}
        Y = tr_lab.to(device)
        
        # Compute output 
        out = model(**X)
        loss = loss_fn(out.float(),Y.float())
        
        # Backpropagation
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
        
        if (i+1) == len(tr_dataloader) : 
            val_loss = 0
            for j,(val_datas,val_lab) in enumerate(val_dataloader):
                model.eval()
                with torch.no_grad() :
                    X_val = {key:val.reshape(val.shape[0],-1).to(device) for (key,val)\
                            in val_datas.items()}
                    Y_val = val_lab.to(device)
                    
                    output  = model(**X_val)
                    
                    loss = loss_fn(output.float(),Y_val.float())
                    val_loss += loss.item()
            val_loss /= len(val_dataloader)
            train_loss /= len(tr_dataloader)
            if verbose :
                print(f"Epoch: {epoch} | Train Loss : {train_loss}")
                print(f"Epoch: {epoch} | Validation Loss :{val_loss}")
            
                if val_loss < best_loss :
                    print(f"{g_}Validation loss decrease from {best_loss} to {val_loss}{st_}")
                    
                    best_loss = val_loss
                    torch.save(model.state_dict(),f"./model_{fold}/model{fold}.bin")
                    tokenizer.save_pretrained(f"./model_{fold}/")
                return best_loss

In [14]:
sp = StratifiedKFold(n_splits= config["fold"],shuffle=True,random_state=config["seed"])

In [15]:
loss_fn = lambda out,y_t : torch.sqrt(nn.MSELoss()(out.view(-1),y_t.view(-1)))

In [16]:
if torch.cuda.is_available() :
    device = torch.device("cuda")
else :
    device = torch.device("cpu")

In [17]:
path = "../input/training-bert-models/bert_base_chk/"

In [18]:
for fold,(tr_ind,val_ind) in enumerate(sp.split(train["cleaned_excerpt"],train["bins_target"])):
    
    tr_texts = train.reset_index(drop=True).loc[tr_ind,"cleaned_excerpt"].values
    tr_labels = train.reset_index(drop=True).loc[tr_ind,"target"].values
    
    val_texts = train.reset_index(drop=True).loc[val_ind,"cleaned_excerpt"].values
    val_labels = train.reset_index(drop=True).loc[val_ind,"target"].values
    
    tr_dataloader = create_dataloader(tr_texts,tr_labels,tokenizer)
    val_dataloader = create_dataloader(val_texts,val_labels,tokenizer)
   
    os.makedirs(f"model_{fold}",exist_ok = True)
    md = Lisibility(path)
    md.to(device) 
    optimizer = AdamW(md.parameters(),lr=config["lr"],weight_decay=config["wb"])
    best_loss = float("inf")
    print(f"{r_}Training model {fold} startings ...\n{st_}")
    for ep in range(config["epochs"]) : 
        print("="*100)
        print(" " * 40 ,f"Epoch{ep} : Train & Validation")
        best_loss = train_and_validate(tr_dataloader,val_dataloader,md,device,loss_fn,optimizer,best_loss,\
                       ep,fold,tokenizer,verbose=True)

Some weights of BertModel were not initialized from the model checkpoint at ../input/training-bert-models/bert_base_chk/ and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[31mTraining model 0 startings ...
[0m
                                         Epoch0 : Train & Validation
Epoch: 0 | Train Loss : 0.7084508624295114
Epoch: 0 | Validation Loss :1.0064800249205694
[32mValidation loss decrease from inf to 1.0064800249205694[0m
                                         Epoch1 : Train & Validation
Epoch: 1 | Train Loss : 0.5903417713625331
Epoch: 1 | Validation Loss :0.6289059445261955
[32mValidation loss decrease from 1.0064800249205694 to 0.6289059445261955[0m
                                         Epoch2 : Train & Validation
Epoch: 2 | Train Loss : 0.5037101784222563
Epoch: 2 | Validation Loss :0.5906699887580342
[32mValidation loss decrease from 0.6289059445261955 to 0.5906699887580342[0m
                                         Epoch3 : Train & Validation
Epoch: 3 | Train Loss : 0.42940970867032735
Epoch: 3 | Validation Loss :0.5978154788414637
                                         Epoch4 : Train & Validation
Epoch: 4 | Train Loss : 0.44

Some weights of BertModel were not initialized from the model checkpoint at ../input/training-bert-models/bert_base_chk/ and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[31mTraining model 1 startings ...
[0m
                                         Epoch0 : Train & Validation
Epoch: 0 | Train Loss : 0.6885350705452369
Epoch: 0 | Validation Loss :0.9909275786744224
[32mValidation loss decrease from inf to 0.9909275786744224[0m
                                         Epoch1 : Train & Validation
Epoch: 1 | Train Loss : 0.5966432738891789
Epoch: 1 | Validation Loss :0.6037937907709016
[32mValidation loss decrease from 0.9909275786744224 to 0.6037937907709016[0m
                                         Epoch2 : Train & Validation
Epoch: 2 | Train Loss : 0.4831899251316635
Epoch: 2 | Validation Loss :0.6448499866657786
                                         Epoch3 : Train & Validation
Epoch: 3 | Train Loss : 0.4693288139893975
Epoch: 3 | Validation Loss :0.7567248915632566
                                         Epoch4 : Train & Validation
Epoch: 4 | Train Loss : 0.3705501294052097
Epoch: 4 | Validation Loss :0.812422347565492


Some weights of BertModel were not initialized from the model checkpoint at ../input/training-bert-models/bert_base_chk/ and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[31mTraining model 2 startings ...
[0m
                                         Epoch0 : Train & Validation
Epoch: 0 | Train Loss : 0.68234156386953
Epoch: 0 | Validation Loss :0.9444734305143356
[32mValidation loss decrease from inf to 0.9444734305143356[0m
                                         Epoch1 : Train & Validation
Epoch: 1 | Train Loss : 0.5974738757375261
Epoch: 1 | Validation Loss :0.5999991587466664
[32mValidation loss decrease from 0.9444734305143356 to 0.5999991587466664[0m
                                         Epoch2 : Train & Validation
Epoch: 2 | Train Loss : 0.5213810856073675
Epoch: 2 | Validation Loss :0.5713058511416117
[32mValidation loss decrease from 0.5999991587466664 to 0.5713058511416117[0m
                                         Epoch3 : Train & Validation
Epoch: 3 | Train Loss : 0.4489798807132412
Epoch: 3 | Validation Loss :0.5670556583338313
[32mValidation loss decrease from 0.5713058511416117 to 0.5670556583338313[0m
                    

Some weights of BertModel were not initialized from the model checkpoint at ../input/training-bert-models/bert_base_chk/ and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[31mTraining model 3 startings ...
[0m
                                         Epoch0 : Train & Validation
Epoch: 0 | Train Loss : 0.772502312567872
Epoch: 0 | Validation Loss :0.6903757353623708
[32mValidation loss decrease from inf to 0.6903757353623708[0m
                                         Epoch1 : Train & Validation
Epoch: 1 | Train Loss : 0.5380668719889412
Epoch: 1 | Validation Loss :0.6599937122729089
[32mValidation loss decrease from 0.6903757353623708 to 0.6599937122729089[0m
                                         Epoch2 : Train & Validation
Epoch: 2 | Train Loss : 0.46763013261304776
Epoch: 2 | Validation Loss :0.687760166823864
                                         Epoch3 : Train & Validation
Epoch: 3 | Train Loss : 0.47308274725793115
Epoch: 3 | Validation Loss :0.6610564763347307
                                         Epoch4 : Train & Validation
Epoch: 4 | Train Loss : 0.4164438178421746
Epoch: 4 | Validation Loss :0.6840336248278618


Some weights of BertModel were not initialized from the model checkpoint at ../input/training-bert-models/bert_base_chk/ and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[31mTraining model 4 startings ...
[0m
                                         Epoch0 : Train & Validation
Epoch: 0 | Train Loss : 0.8564985740772435
Epoch: 0 | Validation Loss :0.6683869577116437
[32mValidation loss decrease from inf to 0.6683869577116437[0m
                                         Epoch1 : Train & Validation
Epoch: 1 | Train Loss : 0.5652521718555773
Epoch: 1 | Validation Loss :0.9539757089482414
                                         Epoch2 : Train & Validation
Epoch: 2 | Train Loss : 0.4907858585807639
Epoch: 2 | Validation Loss :0.5271309498283598
[32mValidation loss decrease from 0.6683869577116437 to 0.5271309498283598[0m
                                         Epoch3 : Train & Validation
Epoch: 3 | Train Loss : 0.5037170452341228
Epoch: 3 | Validation Loss :0.528467116256555
                                         Epoch4 : Train & Validation
Epoch: 4 | Train Loss : 0.3796147782827767
Epoch: 4 | Validation Loss :0.6150040634804301
