In [1]:
import torch 
import pandas as pd 
import numpy as np 
import warnings
warnings.filterwarnings("ignore")
import re 
import random
import torch.nn as nn 
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import Dataset,DataLoader
from transformers import AutoModel,AutoTokenizer,AdamW
from tqdm import tqdm
import os 
from transformers import get_cosine_schedule_with_warmup
from colorama import Fore,Back,Style
r_ = Fore.RED
g_ = Fore.GREEN
y_ = Fore.YELLOW
c_ = Fore.CYAN
b_ = Fore.BLUE
bl_ = Fore.BLACK
sr_ = Style.RESET_ALL

In [2]:
train_data = pd.read_csv("../input/commonlitreadabilityprize/train.csv")

In [3]:
def clean_text(excerpt):
    punctuations = ".,?!;\(\":-)‘"
    extrait = excerpt
    for p in punctuations : 
      extrait = extrait.replace(p,f" {p} ")
    extrait = re.sub(r"'s"," is ",extrait)
    extrait = extrait.replace("i'm","I'm")
    extrait = extrait.replace("don't","do not")
    extrait = extrait.replace("didn't","did not")
    extrait = extrait.replace("can't","cannot")
    extrait = extrait.replace("i'll","I will")
    extrait = extrait.replace("wouldn't","would not")
    extrait = extrait.replace("i've","I have")
    extrait = re.sub(r"i've","I have",extrait)
    extrait = extrait.replace("won't","will not")
    extrait = extrait.replace("couldn't","could not")
    extrait = extrait.replace("wasn't","was not")
    extrait = extrait.replace("you'll","you will")
    extrait = extrait.replace("isn't","is not")
    extrait = extrait.replace("you're","you are")
    extrait = extrait.replace("hadn't","had not")
    extrait = extrait.replace("you've","you have")
    extrait = extrait.replace("doesn't","does not")
    extrait = extrait.replace("haven't","have not")
    extrait = extrait.replace("they're","they are")
    extrait = extrait.replace("we're","we are")
    #extrait = re.sub(r"(/s+)i(/s+)","I",excerpt)
    #extrait = re.sub(r"don't","do not",extrait)
    #extrait = re.sub(r"i'm","I'm",extrait)
    #extrait = re.sub(r"man's","man is",extrait)
    #extrait = re.sub(r"it's","it is",extrait)
    #extrait = re.sub(r"didn't","did not",extrait)
    #extrait = re.sub(r"can't","cannot",extrait)
    #extrait = re.sub(r"earth's","earth is",extrait)
    #extrait = re.sub(r"father's","father is",extrait)
    #extrait = re.sub(r"i'll","I will",extrait)
    #extrait = re.sub(r"i've","I have",extrait)
    #extrait = re.sub(r"i\'",r"I'",extrait)
    #extrait = re.sub(r"children\'s","children is",extrait)
    
    return extrait 

In [4]:
train_data["cleaned_excerpt"] = train_data["excerpt"].map(clean_text)

In [5]:
for i in range(5):
    vars() [f"q{i}"] = train_data["target"].quantile(0.25 * i)
    if i == 0 :
        vars() [f"q{i}"] = vars() [f"q{i}"] - 1
train_data["bins_target"] = pd.cut(train_data["target"],bins=[q0,q1,q2,q3,q4],\
                                  labels=[f"Q{i}" for i in range(1,5)])

In [6]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONASSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    
config = {"seed":42,
         "lr":5e-5,
         "wd":0.01,
         "epochs":5,
         "folds":5,
         "batch_size":16,
         "max_len":256,
         "valid_step":10,
         "model":"../input/roberta-training/clrp_roberta_base_chk/checkpoint-1050/config.json"}

loss_fn = lambda out,true_v : torch.sqrt(nn.MSELoss()(out.view(-1),true_v.view(-1)))

st = StratifiedKFold(n_splits=5,shuffle=True,random_state=config["seed"])

In [7]:
class DataGenerator(Dataset) :
    
    def __init__(self,texts,labels,tokenizer,max_len):
        super(DataGenerator,self).__init__()
        self.tokenizer = tokenizer 
        self.max_len = max_len 
        self.texts = texts
        self.labels = labels
    def __len__(self):
        return len(self.texts)
    def __getitem__(self,item):
        encode = self.tokenizer(self.texts[item],return_tensors="pt",max_length=self.max_len,\
                               padding ="max_length",truncation = True)
        target = torch.tensor(self.labels[item],dtype=torch.float)
        
        return encode,target 
            
        

In [8]:
class AttentionHead(nn.Module) :
    def __init__(self,in_features,hidden_dim,num_target):
        super(AttentionHead,self).__init__()
        self.in_features = in_features 
        self.hidden_dim = hidden_dim 
        self.num_target = num_target 
        self.W = nn.Linear(in_features,hidden_dim)
        self.V = nn.Linear(hidden_dim,1)
    def forward(self,features) :
        att = torch.tanh(self.W(features))
        score = self.V(att)
        attention_weights = torch.softmax(score,dim=1)
        context_vector = attention_weights * features 
        context_vector = torch.sum(context_vector,axis=1)
        
        return context_vector

In [9]:
class roberta(nn.Module) :
    
    def __init__(self,path):
        super(roberta,self).__init__()
        self.pret_layer = AutoModel.from_pretrained(path,output_hidden_states = True)
        self.dropout = nn.Dropout(p=0.1)
        #self.batch_n = nn.BatchNorm2d()
        self.linear = nn.Linear(768,1)
        self.att = AttentionHead(768,768,1)
    def forward(self,**xd):
        x = self.pret_layer(**xd)[0]
        x = self.att(x)
        x = self.dropout(x)
        x = self.linear(x)
        return x 

In [10]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("We use cuda device")
else :
    device = torch.device("cpu")
    print("No cuda is available , we use CPU instead !")

We use cuda device


In [11]:
def create_dataloader(texts,labels,tokenizer) :
    dataset = DataGenerator(texts,labels,tokenizer,config["max_len"])
    dataloader = DataLoader(dataset,batch_size = config["batch_size"],shuffle=False,\
                           num_workers = 4)
    return dataloader
def train_and_evaluate_loop(tr_loader,val_loader,device,md,optimizer,best_loss,epoch,fold,tokenizer,\
                            verbose=True) :
    train_loss = 0
    for i,(inp_data ,inp_target) in enumerate(tr_loader) :
        md.train()
        optimizer.zero_grad()
        #X_id,X_mask,Y = (t.to(device) for t in data)
        inp_data = {key : vl.reshape(vl.shape[0],-1).to(device) for (key,vl) in inp_data.items()}
        Y = inp_target.to(device)
        # compute_prediction 
        output = md(**inp_data)
        loss = loss_fn(output.float(),Y.float())
        #backpropagation
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
        
        if (i+1) == len(tr_loader) :
            
            md.eval()
            val_loss = 0
            
            for i , (dt_inp,dt_target) in enumerate(val_loader) :
                #X_val,Y_val = (t.to(device) for t in dt)
                Y_val = dt_target.to(device)
                dt_inp = {key:val.reshape(val.shape[0],-1).to(device) for (key,val) in dt_inp.items()}
                with torch.no_grad() :
                    out = md(**dt_inp)
                    loss_v = loss_fn(out.float(),Y_val.float())
                    val_loss += loss_v 
            
            val_loss /= len(val_loader)
            
            if verbose :
                print(f"Epoch {epoch} : | Train Loss : {train_loss/len(tr_loader)}")
                print(f"Epoch {epoch} : | Val Loss : {val_loss}")
                if val_loss < best_loss :
                    torch.save(md.state_dict(),f"./model{fold}/model{fold}.bin")
                    tokenizer.save_pretrained(f"./model{fold}")
                    print(f"{g_}best validation loss decreased from {best_loss} to {val_loss}{sr_}")
                    best_loss = val_loss
                      
    return best_loss    
    
def run(st,device,path="../input/roberta-training/clrp_roberta_base_chk/") :
    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    for fold , (tr_ind,val_ind) in enumerate(st.split(train_data["cleaned_excerpt"],\
                                                      train_data["bins_target"])) :
        
        texts_tr = train_data.reset_index(drop=True).loc[tr_ind,"cleaned_excerpt"].values
        texts_val = train_data.reset_index(drop=True).loc[val_ind,"cleaned_excerpt"].values
    
        labels_tr = train_data.reset_index(drop=True).loc[tr_ind,"target"].values
        labels_val = train_data.reset_index(drop=True).loc[val_ind,"target"].values
        
        tr_dataloader = create_dataloader(texts_tr,labels_tr,tokenizer)
        val_dataloader = create_dataloader(texts_val,labels_val,tokenizer)
        
        md = roberta(path).to(device)
        opt = AdamW(md.parameters(),lr=config["lr"],weight_decay=config["wd"])
        lr_scheduler = get_cosine_schedule_with_warmup(opt,num_warmup_steps=0,num_training_steps= 25 * len(tr_dataloader))
        best_loss = float("inf")
        os.makedirs(f"model{fold}",exist_ok=True)
        print(f"{r_}Fold {fold} Starting ...{sr_}")
        for ep in tqdm(range(config["epochs"])) :
            print("="*100)
            print(" "*35,f"Epoch {ep+1} : Train & Validation ")
            print("="*100)
            best_loss = train_and_evaluate_loop(tr_dataloader,val_dataloader,device,md,opt,best_loss,ep+1,\
                                                fold,tokenizer)
                      

In [12]:
run(st,device)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at ../input/roberta-training/clrp_roberta_base_chk/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/5 [00:00<?, ?it/s]

[31mFold 0 Starting ...[0m
                                    Epoch 1 : Train & Validation 
Epoch 1 : | Train Loss : 0.7161814381538982
Epoch 1 : | Val Loss : 0.8608529567718506


 20%|██        | 1/5 [01:06<04:26, 66.74s/it]

[32mbest validation loss decreased from inf to 0.8608529567718506[0m
                                    Epoch 2 : Train & Validation 


 40%|████      | 2/5 [02:11<03:15, 65.31s/it]

Epoch 2 : | Train Loss : 0.5788266257920736
Epoch 2 : | Val Loss : 0.8712576031684875
                                    Epoch 3 : Train & Validation 
Epoch 3 : | Train Loss : 0.498326729300996
Epoch 3 : | Val Loss : 0.6953274607658386


 60%|██████    | 3/5 [03:17<02:11, 65.67s/it]

[32mbest validation loss decreased from 0.8608529567718506 to 0.6953274607658386[0m
                                    Epoch 4 : Train & Validation 
Epoch 4 : | Train Loss : 0.4962921383934961
Epoch 4 : | Val Loss : 0.638544499874115


 80%|████████  | 4/5 [04:23<01:05, 65.82s/it]

[32mbest validation loss decreased from 0.6953274607658386 to 0.638544499874115[0m
                                    Epoch 5 : Train & Validation 


100%|██████████| 5/5 [05:27<00:00, 65.48s/it]

Epoch 5 : | Train Loss : 0.4204038507501844
Epoch 5 : | Val Loss : 0.6402451395988464



Some weights of RobertaModel were not initialized from the model checkpoint at ../input/roberta-training/clrp_roberta_base_chk/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/5 [00:00<?, ?it/s]

[31mFold 1 Starting ...[0m
                                    Epoch 1 : Train & Validation 
Epoch 1 : | Train Loss : 0.6748804057567892
Epoch 1 : | Val Loss : 0.9498304724693298


 20%|██        | 1/5 [01:05<04:22, 65.55s/it]

[32mbest validation loss decreased from inf to 0.9498304724693298[0m
                                    Epoch 2 : Train & Validation 
Epoch 2 : | Train Loss : 0.5658780273417352
Epoch 2 : | Val Loss : 0.6140304803848267


 40%|████      | 2/5 [02:11<03:17, 65.75s/it]

[32mbest validation loss decreased from 0.9498304724693298 to 0.6140304803848267[0m
                                    Epoch 3 : Train & Validation 


 60%|██████    | 3/5 [03:15<02:10, 65.06s/it]

Epoch 3 : | Train Loss : 0.5033807826923652
Epoch 3 : | Val Loss : 0.741830587387085
                                    Epoch 4 : Train & Validation 


 80%|████████  | 4/5 [04:19<01:04, 64.76s/it]

Epoch 4 : | Train Loss : 0.41658109583905045
Epoch 4 : | Val Loss : 0.9114920496940613
                                    Epoch 5 : Train & Validation 


100%|██████████| 5/5 [05:24<00:00, 64.85s/it]

Epoch 5 : | Train Loss : 0.3798221123050636
Epoch 5 : | Val Loss : 0.7816985845565796



Some weights of RobertaModel were not initialized from the model checkpoint at ../input/roberta-training/clrp_roberta_base_chk/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/5 [00:00<?, ?it/s]

[31mFold 2 Starting ...[0m
                                    Epoch 1 : Train & Validation 
Epoch 1 : | Train Loss : 0.6854136374215005
Epoch 1 : | Val Loss : 0.9140220284461975


 20%|██        | 1/5 [01:05<04:22, 65.75s/it]

[32mbest validation loss decreased from inf to 0.9140220284461975[0m
                                    Epoch 2 : Train & Validation 
Epoch 2 : | Train Loss : 0.5267930845139732
Epoch 2 : | Val Loss : 0.8095553517341614


 40%|████      | 2/5 [02:12<03:18, 66.06s/it]

[32mbest validation loss decreased from 0.9140220284461975 to 0.8095553517341614[0m
                                    Epoch 3 : Train & Validation 
Epoch 3 : | Train Loss : 0.5238513809064744
Epoch 3 : | Val Loss : 0.5290573835372925


 60%|██████    | 3/5 [03:18<02:12, 66.10s/it]

[32mbest validation loss decreased from 0.8095553517341614 to 0.5290573835372925[0m
                                    Epoch 4 : Train & Validation 
Epoch 4 : | Train Loss : 0.4416852213547263
Epoch 4 : | Val Loss : 0.516528844833374


 80%|████████  | 4/5 [04:24<01:06, 66.12s/it]

[32mbest validation loss decreased from 0.5290573835372925 to 0.516528844833374[0m
                                    Epoch 5 : Train & Validation 


100%|██████████| 5/5 [05:28<00:00, 65.72s/it]

Epoch 5 : | Train Loss : 0.41788657009601593
Epoch 5 : | Val Loss : 0.5519751310348511



Some weights of RobertaModel were not initialized from the model checkpoint at ../input/roberta-training/clrp_roberta_base_chk/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/5 [00:00<?, ?it/s]

[31mFold 3 Starting ...[0m
                                    Epoch 1 : Train & Validation 
Epoch 1 : | Train Loss : 0.726289794058867
Epoch 1 : | Val Loss : 0.6710468530654907


 20%|██        | 1/5 [01:05<04:22, 65.68s/it]

[32mbest validation loss decreased from inf to 0.6710468530654907[0m
                                    Epoch 2 : Train & Validation 


 40%|████      | 2/5 [02:09<03:14, 64.79s/it]

Epoch 2 : | Train Loss : 0.5561068255296895
Epoch 2 : | Val Loss : 0.6864041090011597
                                    Epoch 3 : Train & Validation 


 60%|██████    | 3/5 [03:13<02:08, 64.49s/it]

Epoch 3 : | Train Loss : 0.46224618619176705
Epoch 3 : | Val Loss : 0.7284156084060669
                                    Epoch 4 : Train & Validation 
Epoch 4 : | Train Loss : 0.40700859051774924
Epoch 4 : | Val Loss : 0.5809049606323242


 80%|████████  | 4/5 [04:20<01:05, 65.18s/it]

[32mbest validation loss decreased from 0.6710468530654907 to 0.5809049606323242[0m
                                    Epoch 5 : Train & Validation 


100%|██████████| 5/5 [05:24<00:00, 64.92s/it]

Epoch 5 : | Train Loss : 0.3853402627815663
Epoch 5 : | Val Loss : 0.5891647934913635



Some weights of RobertaModel were not initialized from the model checkpoint at ../input/roberta-training/clrp_roberta_base_chk/ and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/5 [00:00<?, ?it/s]

[31mFold 4 Starting ...[0m
                                    Epoch 1 : Train & Validation 
Epoch 1 : | Train Loss : 0.7666724521509358
Epoch 1 : | Val Loss : 1.4729663133621216


 20%|██        | 1/5 [01:05<04:23, 65.91s/it]

[32mbest validation loss decreased from inf to 1.4729663133621216[0m
                                    Epoch 2 : Train & Validation 
Epoch 2 : | Train Loss : 0.607243581976689
Epoch 2 : | Val Loss : 0.8258786201477051


 40%|████      | 2/5 [02:12<03:18, 66.26s/it]

[32mbest validation loss decreased from 1.4729663133621216 to 0.8258786201477051[0m
                                    Epoch 3 : Train & Validation 
Epoch 3 : | Train Loss : 0.5412815459807154
Epoch 3 : | Val Loss : 0.8128378987312317


 60%|██████    | 3/5 [03:18<02:12, 66.23s/it]

[32mbest validation loss decreased from 0.8258786201477051 to 0.8128378987312317[0m
                                    Epoch 4 : Train & Validation 
Epoch 4 : | Train Loss : 0.4601004965288538
Epoch 4 : | Val Loss : 0.5476098656654358


 80%|████████  | 4/5 [04:24<01:06, 66.28s/it]

[32mbest validation loss decreased from 0.8128378987312317 to 0.5476098656654358[0m
                                    Epoch 5 : Train & Validation 
Epoch 5 : | Train Loss : 0.4020189489697067
Epoch 5 : | Val Loss : 0.5330884456634521


100%|██████████| 5/5 [05:31<00:00, 66.31s/it]

[32mbest validation loss decreased from 0.5476098656654358 to 0.5330884456634521[0m



