In [1]:
import pandas as pd 
import numpy as np 
import torch 
import torch.nn as nn 
import random 
import os 
import re 
from transformers import AutoModel , AutoTokenizer, BertTokenizer
from torch.utils.data import DataLoader , Dataset
import pickle
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error
import math
import warnings
warnings.filterwarnings("ignore")

In [2]:
def seed_everything(seed):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU plus all CUDA
    devices) and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Exported for child processes; the hash seed of the interpreter that is
    # already running is fixed at start-up and is not affected by this.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every GPU, not only the current one; ignored when CUDA is absent.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run, so it
    # has to be switched off for reproducible results.
    torch.backends.cudnn.benchmark = False

In [3]:
# Settings shared by the cells below.
config = dict(
    seed=42,        # passed to seed_everything
    batch_size=16,  # samples per DataLoader batch
    max_len=256,    # tokenizer padding / truncation length
)

In [4]:
# Seed all random number generators before any other work is done.
seed_everything(config["seed"])

In [5]:
# Test set; its "excerpt" column is cleaned and embedded further down.
test = pd.read_csv("../input/commonlitreadabilityprize/test.csv")

In [6]:
# Training set; the "excerpt" and "target" columns are used further down.
train = pd.read_csv("../input/commonlitreadabilityprize/train.csv")

In [7]:
def clean_text(excerpt):
    """Normalise an excerpt before it is tokenized.

    The steps run in this order:
      1. every character of ``punctuations`` is surrounded by spaces;
      2. every ``'s`` is replaced by `` is `` (possessives included);
      3. a fixed list of contractions is expanded by literal, case-sensitive
         substring replacement.

    NOTE(review): presumably the same cleaning was applied when the encoders
    were fine-tuned, so the output is kept identical to the previous
    implementation - confirm before changing any rule.

    Args:
        excerpt: Raw text.

    Returns:
        The cleaned text.
    """
    # The backslash is spelled "\\" explicitly: the former "\(" was an invalid
    # escape sequence that only worked because Python kept the backslash.
    punctuations = ".,?!;\\(\":-)‘"
    # (contraction, expansion) pairs, applied top to bottom.
    contractions = (
        ("i'm", "I'm"),
        ("don't", "do not"),
        ("didn't", "did not"),
        ("can't", "cannot"),
        ("i'll", "I will"),
        ("wouldn't", "would not"),
        ("i've", "I have"),
        ("won't", "will not"),
        ("couldn't", "could not"),
        ("wasn't", "was not"),
        ("you'll", "you will"),
        ("isn't", "is not"),
        ("you're", "you are"),
        ("hadn't", "had not"),
        ("you've", "you have"),
        ("doesn't", "does not"),
        ("haven't", "have not"),
        ("they're", "they are"),
        ("we're", "we are"),
    )

    extrait = excerpt
    for p in punctuations:
        extrait = extrait.replace(p, f" {p} ")
    extrait = re.sub(r"'s", " is ", extrait)
    for contraction, expansion in contractions:
        extrait = extrait.replace(contraction, expansion)
    return extrait

In [8]:
# Store a cleaned copy of every test excerpt in a new column (see clean_text).
test["cleaned_excerpt"] = test["excerpt"].map(clean_text)

In [9]:
class Lisibility (nn.Module) :
    """Pretrained encoder used as a fixed-size embedding extractor.

    ``dense`` and ``dropout`` are created but never applied in ``forward``.
    NOTE(review): presumably they are kept so that the parameter names line up
    with the fine-tuned checkpoints loaded later - confirm.
    """

    def __init__(self,path) :
        """Build the encoder from ``path`` plus the (unused) regression head."""
        super().__init__()
        self.layer = AutoModel.from_pretrained(path)
        self.dense = nn.Linear(768,1)
        self.dropout = nn.Dropout(p=0.1)

    def forward(self,**xp) :
        """Return position 0 of the encoder's first output for every sample.

        All keyword arguments are forwarded to the encoder unchanged. Neither
        dropout nor the dense head is applied to the result.
        """
        encoder_outputs = self.layer(**xp)
        first_output = encoder_outputs[0]
        return first_output[:,0,:]

In [10]:
class DataGenerator(Dataset) :
    """Map-style dataset that tokenizes one text per item, on demand."""

    def __init__(self,texts,tokenizer,max_len) :
        """Keep the texts, the tokenizer callable and the target length."""
        self.tokenizer = tokenizer
        self.texts = texts
        self.max_len = max_len

    def __len__(self) :
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self,idx) :
        """Tokenize ``texts[idx]``, padded/truncated to ``max_len``, as "pt" tensors."""
        tokenizer_options = dict(
            return_tensors="pt",
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )
        return self.tokenizer(self.texts[idx], **tokenizer_options)

In [11]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [12]:
def create_dataloader (texts,tokenizer) :
    """Wrap ``texts`` in a DataGenerator and return an unshuffled DataLoader.

    The sequence length and batch size are read from the global ``config``.
    Shuffling is off, so batches follow the order of ``texts``.
    """
    return DataLoader(
        DataGenerator(texts, tokenizer, config["max_len"]),
        batch_size=config["batch_size"],
        shuffle=False,
    )

In [13]:
# NOTE(review): the tokenizer unpickled here is overwritten by the
# BertTokenizer.from_pretrained call in the next cell, so this load looks
# redundant. pickle.load can execute arbitrary code from the file - keep this
# cell only if the dataset is trusted.
with open("../input/bert-model-training/tokenizer","rb") as f :
    tokenizer = pickle.load(f)

In [14]:
# The f-string placeholder {0} is the literal 0, so the path ends in "model_0";
# this single tokenizer is then used for all five models.
tokenizer = BertTokenizer.from_pretrained(f"../input/bert-fine-tunning/model_{0}")

In [15]:
# Directory handed to AutoModel.from_pretrained (through Lisibility) to build
# the encoder before the fine-tuned weights are loaded on top of it.
model_base = "../input/training-bert-models/bert_base_chk/"

In [16]:
# Weight files of the five fine-tuned models (model_0 ... model_4).
paths = [
    f"../input/bert-fine-tunning/model_{model_id}/model{model_id}.bin"
    for model_id in range(5)
]

In [17]:
def loss_fn(out, y_t):
    """Root-mean-squared error between two tensors, both flattened to 1-D.

    Args:
        out: Predicted values.
        y_t: Target values with the same number of elements as ``out``.

    Returns:
        Zero-dimensional tensor holding the RMSE.
    """
    mean_squared = nn.MSELoss()(out.view(-1), y_t.view(-1))
    return torch.sqrt(mean_squared)

In [18]:
# Label every training row with the quartile of its target (Q1..Q4); these
# labels are what StratifiedKFold stratifies on later.
target_col = train["target"]
# The lowest edge sits one unit below the minimum so the smallest target still
# falls inside the first (right-closed) interval.
bin_edges = [target_col.min() - 1] + [target_col.quantile(q) for q in (0.25, 0.5, 0.75, 1)]
train["bins-target"] = pd.cut(target_col.values, bins=bin_edges, labels=["Q1", "Q2", "Q3", "Q4"])

In [19]:
# Store a cleaned copy of every training excerpt in a new column (see clean_text).
train["cleaned_excerpt"] = train["excerpt"].map(clean_text)

In [20]:
def get_bert_embedding(tx_dataloader,model) :
    """Run ``model`` over every batch and collect its outputs as nested lists.

    Args:
        tx_dataloader: Iterable of batches. Each batch maps a keyword-argument
            name to a tensor; every tensor is reshaped to 2-D (first dimension
            kept, the rest flattened) before it is passed to the model.
        model: ``nn.Module`` called as ``model(**batch)`` that returns one
            tensor per batch whose first dimension indexes the samples.

    Returns:
        A list with one entry per sample, in iteration order. Each entry is
        the model output for that sample converted to plain Python lists.
        Empty when ``tx_dataloader`` yields nothing.
    """
    # Send the inputs to wherever the model's weights live instead of relying
    # on a module-level ``device`` variable that may not match the model.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    b_embedding = []
    model.eval()  # inference mode: disables dropout
    with torch.no_grad():
        for data in tx_dataloader:
            data = {key: val.reshape(val.shape[0], -1).to(target_device)
                    for key, val in data.items()}
            out = model(**data)
            b_embedding.extend(out.detach().cpu().numpy().tolist())

    return b_embedding
    

In [21]:
def rmse(y_t, y_pred):
    """Root-mean-squared error between targets ``y_t`` and predictions ``y_pred``."""
    mean_squared = mean_squared_error(y_t, y_pred)
    return math.sqrt(mean_squared)

In [22]:
# Built once here and read as a global by svr_prediction for every model.
test_dataloader = create_dataloader(test["cleaned_excerpt"].values,tokenizer)

In [23]:
def svr_prediction(model ,i,n_folds = 5) :
    """Fit a LinearSVR on the encoder's embeddings with stratified K-fold CV.

    For each fold an SVR is fitted on the training-fold embeddings, scored by
    RMSE on the validation fold, and used to predict the test set. The test
    predictions of all folds are averaged.

    Reads the globals ``train``, ``test``, ``tokenizer`` and
    ``test_dataloader``.

    Args:
        model: Encoder passed to ``get_bert_embedding``.
        i: Model index, used only in the printed messages.
        n_folds: Number of stratified folds.

    Returns:
        NumPy array of length ``len(test)`` with the fold-averaged predictions.
    """
    test_embedding = np.asarray(get_bert_embedding(test_dataloader,model))
    prediction = np.zeros((len(test)))
    scores = []
    st = StratifiedKFold(n_splits=n_folds,shuffle=False)
    print(f"Model {i} starting...")

    # The encoder is not updated here, so an excerpt's embedding does not
    # depend on the fold: embed the training set once and slice it per fold
    # instead of re-running the encoder on every split.
    train_frame = train.reset_index(drop=True)
    train_targets = train_frame["target"].values
    train_embedding = np.asarray(get_bert_embedding(
        create_dataloader(train_frame["cleaned_excerpt"].values,tokenizer),model))

    fold_splits = st.split(train_frame["cleaned_excerpt"].values,
                           train_frame["bins-target"].values)
    for fold ,(tr_ind , val_ind) in enumerate(fold_splits):
        svr = LinearSVR(C=10)
        svr.fit(train_embedding[tr_ind],train_targets[tr_ind])
        ypred = svr.predict(train_embedding[val_ind])
        score = rmse(train_targets[val_ind],ypred)
        scores.append(score)
        print(f"Score for the fold {fold} : {score}")

        prediction += svr.predict(test_embedding)

    # Despite the "mse_error" label, the value printed is the mean of the
    # per-fold RMSE scores.
    print(f"mse_error for the model {i} : {np.mean(scores)}")
    prediction /= n_folds

    return prediction
        
        

In [24]:
# One test-set prediction vector per fine-tuned model.
predictions = []
for i,path in enumerate(paths) :

    model = Lisibility(model_base)
    # map_location="cpu" lets checkpoints saved from a GPU session load on a
    # CPU-only machine; the weights are moved to `device` once, just below.
    state_dict = torch.load(path,map_location="cpu")
    # strict=False tolerates key mismatches. Report them instead of ignoring
    # them: if no key matched, the model would silently keep the base weights.
    load_report = model.load_state_dict(state_dict,strict=False)
    if load_report.missing_keys or load_report.unexpected_keys :
        print(f"Model {i}: {len(load_report.missing_keys)} missing / "
              f"{len(load_report.unexpected_keys)} unexpected checkpoint keys")
    model.to(device)

    pr = svr_prediction(model,i,n_folds=5)

    predictions.append(pr)

Some weights of BertModel were not initialized from the model checkpoint at ../input/training-bert-models/bert_base_chk/ and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model 0 starting...
Score for the fold 0 : 0.7192434693083672
Score for the fold 1 : 0.7904604990131309
Score for the fold 2 : 0.9187359508922068
Score for the fold 3 : 0.7106232875634056
Score for the fold 4 : 0.7732415588980055
mse_error for the model 0 : 0.7824609531350232


Some weights of BertModel were not initialized from the model checkpoint at ../input/training-bert-models/bert_base_chk/ and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model 1 starting...
Score for the fold 0 : 1.2755136025541722
Score for the fold 1 : 0.8232618162352218
Score for the fold 2 : 0.7080438385808601
Score for the fold 3 : 0.802204876246077
Score for the fold 4 : 0.8760734031654613
mse_error for the model 1 : 0.8970195073563584


Some weights of BertModel were not initialized from the model checkpoint at ../input/training-bert-models/bert_base_chk/ and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model 2 starting...
Score for the fold 0 : 0.734462437894346
Score for the fold 1 : 0.8891379718380851
Score for the fold 2 : 0.7383791431923975
Score for the fold 3 : 0.7167245812641363
Score for the fold 4 : 0.8499092223690656
mse_error for the model 2 : 0.7857226713116061


Some weights of BertModel were not initialized from the model checkpoint at ../input/training-bert-models/bert_base_chk/ and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model 3 starting...
Score for the fold 0 : 0.801820368103026
Score for the fold 1 : 1.0807555597320724
Score for the fold 2 : 0.7344142575320661
Score for the fold 3 : 0.7195790004024897
Score for the fold 4 : 0.7350918747248418
mse_error for the model 3 : 0.8143322120988993


Some weights of BertModel were not initialized from the model checkpoint at ../input/training-bert-models/bert_base_chk/ and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model 4 starting...
Score for the fold 0 : 0.9243942489361698
Score for the fold 1 : 0.7903875879184152
Score for the fold 2 : 0.816595709292896
Score for the fold 3 : 0.9487159264478735
Score for the fold 4 : 0.7940256031329643
mse_error for the model 4 : 0.8548238151456637


In [25]:
# Stack the per-model prediction vectors: one row per model, one column per test row.
pred = np.vstack(predictions)

In [26]:
# Ensemble: average the models' predictions for each test row.
prediction = np.mean(pred,axis=0)

In [27]:
# Submission template whose "target" column is filled in below.
submission = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")

In [28]:
# NOTE(review): assigned by position - assumes sample_submission.csv lists its
# rows in the same order as test.csv; confirm.
submission["target"] = prediction

In [29]:
# Write the submission file without the DataFrame index column.
submission.to_csv("submission.csv",index=False)