In [None]:
import pandas as pd 
import numpy as np 
import torch 
import torch.nn as nn 
import random 
import os 
import re 
from transformers import AutoModel , AutoTokenizer, BertTokenizer
from torch.utils.data import DataLoader , Dataset
import pickle
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error
import math
import warnings
warnings.filterwarnings("ignore")

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and every CUDA device), and switches cuDNN to deterministic mode.

    Parameters
    ----------
    seed : int
        Value used for all generators.
    """
    random.seed(seed)
    # The interpreter's hash-seed variable is PYTHONHASHSEED (the previous
    # spelling 'PYTHONASSEED' was never read by anything). Setting it here only
    # affects processes started after this call, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds all visible GPUs; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-select algorithms per run, which can make
    # results differ between runs; it must be off for reproducibility.
    torch.backends.cudnn.benchmark = False

In [None]:
# Notebook-wide settings: RNG seed, DataLoader batch size and the tokenizer's
# maximum sequence length.
config = dict(
    seed=42,
    batch_size=16,
    max_len=256,
)

In [None]:
# Seed all RNGs once, before any data loading or model work, using the seed from `config`.
seed_everything(config["seed"])

In [None]:
# Test split of the competition data; predictions are produced for these rows.
test = pd.read_csv("../input/commonlitreadabilityprize/test.csv")

In [None]:
# Training split; its "excerpt" and "target" columns are used later to fit the per-fold SVRs.
train = pd.read_csv("../input/commonlitreadabilityprize/train.csv")

In [None]:
def clean_text(excerpt):
    """Normalise an excerpt before tokenization.

    Three passes are applied, in this order:

    1. every character listed in ``punctuations`` is surrounded by spaces;
    2. every ``'s`` is rewritten to `` is ``;
    3. a fixed list of lowercase contractions is expanded
       (e.g. ``don't`` -> ``do not``).

    Whitespace is not collapsed afterwards, so the result may contain runs of
    spaces.

    Parameters
    ----------
    excerpt : str
        Raw text.

    Returns
    -------
    str
        The cleaned text.
    """
    # The backslash is spelled out as "\\": the earlier literal used the
    # invalid escape "\(", which Python keeps as backslash + "(" but reports
    # as a warning on current versions. The character set is unchanged, so the
    # output is identical.
    punctuations = ".,?!;\\(\":-)‘"

    # (contraction, expansion) pairs, applied sequentially in this order.
    contractions = (
        ("i'm", "I'm"),
        ("don't", "do not"),
        ("didn't", "did not"),
        ("can't", "cannot"),
        ("i'll", "I will"),
        ("wouldn't", "would not"),
        ("i've", "I have"),
        ("won't", "will not"),
        ("couldn't", "could not"),
        ("wasn't", "was not"),
        ("you'll", "you will"),
        ("isn't", "is not"),
        ("you're", "you are"),
        ("hadn't", "had not"),
        ("you've", "you have"),
        ("doesn't", "does not"),
        ("haven't", "have not"),
        ("they're", "they are"),
        ("we're", "we are"),
    )

    extrait = excerpt
    for p in punctuations:
        extrait = extrait.replace(p, f" {p} ")

    # Must run before the contraction table so the pass order stays the same.
    extrait = re.sub(r"'s", " is ", extrait)

    for short_form, long_form in contractions:
        extrait = extrait.replace(short_form, long_form)

    return extrait

In [None]:
# Apply clean_text to every test excerpt and keep the result in a new column.
test["cleaned_excerpt"] = test["excerpt"].map(clean_text)

In [None]:
class Lisibility(nn.Module):
    """Pretrained transformer wrapper used here as a feature extractor.

    The constructor also creates a ``dense`` (768 -> 1) layer and a ``dropout``
    layer, but ``forward`` does not apply them: it returns the backbone's
    output sliced at position 0 of the second axis.
    """

    def __init__(self, path):
        """Load the backbone from ``path`` via ``AutoModel.from_pretrained``."""
        super().__init__()

        self.layer = AutoModel.from_pretrained(path)
        self.dense = nn.Linear(768, 1)
        self.dropout = nn.Dropout(p=0.1)

    def forward(self, **xp):
        """Run the backbone on the keyword inputs and return ``output[0][:, 0, :]``.

        Presumably ``output[0]`` is the last hidden state and index 0 is the
        [CLS] token -- confirm against the backbone in use.
        """
        backbone_output = self.layer(**xp)
        first_element = backbone_output[0]
        return first_element[:, 0, :]

In [None]:
class DataGenerator(Dataset):
    """Dataset that tokenizes one text per index, on demand.

    Each item is whatever ``tokenizer`` returns when called on a single text
    with ``return_tensors="pt"``, padding to ``max_len`` and truncation on.
    """

    def __init__(self, texts, tokenizer, max_len):
        """Store the texts, the tokenizer callable and the target length."""
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize and return the text at position ``idx``."""
        tokenizer_options = dict(
            return_tensors="pt",
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )
        return self.tokenizer(self.texts[idx], **tokenizer_options)

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def create_dataloader(texts, tokenizer):
    """Return an unshuffled ``DataLoader`` over ``texts``.

    The texts are wrapped in a ``DataGenerator`` using ``config["max_len"]``;
    the batch size comes from ``config["batch_size"]``. Order is preserved
    (``shuffle=False``).
    """
    return DataLoader(
        DataGenerator(texts, tokenizer, config["max_len"]),
        batch_size=config["batch_size"],
        shuffle=False,
    )

In [None]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file, so
# this is only safe for a trusted artifact. The `tokenizer` bound here is
# reassigned by the following cell (BertTokenizer.from_pretrained) before any
# use visible in this notebook, so this load looks redundant -- confirm before
# removing.
with open("../input/bert-model-training/tokenizer","rb") as f :
    tokenizer = pickle.load(f)

In [None]:
# Tokenizer used for every dataloader below. `{0}` in the f-string is the literal 0,
# so this always reads from the "model_0" directory.
tokenizer = BertTokenizer.from_pretrained(f"../input/bert-fine-tunning/model_{0}")

In [None]:
# Location handed to Lisibility(...) (and from there to AutoModel.from_pretrained)
# to build the backbone before the per-fold weights are loaded.
model_base = "../input/training-bert-models/bert_base_chk/"

In [None]:
# One fine-tuned checkpoint file per fold index 0..4.
paths = [
    f"../input/bert-fine-tunning/model_{fold_idx}/model{fold_idx}.bin"
    for fold_idx in range(5)
]

In [None]:
def loss_fn(out, y_t):
    """Root-mean-squared error between ``out`` and ``y_t``, both flattened to 1-D."""
    flat_out = out.view(-1)
    flat_target = y_t.view(-1)
    mse = nn.MSELoss()(flat_out, flat_target)
    return torch.sqrt(mse)

In [None]:
# Label each row with the quartile (Q1..Q4) its target falls in; these labels are
# what StratifiedKFold stratifies on. The lowest edge is min - 1 so that the
# minimum target itself lands inside the first bin.
train["bins-target"] = pd.cut(
    train["target"].values,
    bins=[train["target"].min() - 1]
    + [train["target"].quantile(q) for q in (0.25, 0.5, 0.75, 1)],
    labels=["Q1", "Q2", "Q3", "Q4"],
)

In [None]:
# Apply the same clean_text preprocessing to the training excerpts.
train["cleaned_excerpt"] = train["excerpt"].map(clean_text)

In [None]:
def get_bert_embedding(tx_dataloader, model):
    """Run ``model`` over every batch of ``tx_dataloader`` and collect its outputs.

    The model is put in eval mode and called without gradient tracking. Each
    tensor in a batch is reshaped to ``(batch, -1)`` and moved to ``device``
    before the forward pass.

    Returns
    -------
    list
        One nested Python list per input row, in dataloader order.
    """
    model.eval()
    b_embedding = []
    with torch.no_grad():
        for batch in tx_dataloader:
            inputs = {}
            for key, val in batch.items():
                # Flatten everything after the batch axis before moving to the device.
                inputs[key] = val.reshape(val.shape[0], -1).to(device)
            batch_out = model(**inputs).detach().cpu().numpy()
            b_embedding.extend(batch_out.tolist())
    return b_embedding
    

In [None]:
def rmse(y_t, y_pred):
    """Square root of sklearn's ``mean_squared_error(y_t, y_pred)``."""
    mse = mean_squared_error(y_t, y_pred)
    return math.sqrt(mse)

In [None]:
# Built once at module level; svr_prediction reads this global to embed the test set.
test_dataloader = create_dataloader(test["cleaned_excerpt"].values,tokenizer)

In [None]:
def svr_prediction(model, i, n_folds=5):
    """Fit one LinearSVR per CV fold on model embeddings and average the test predictions.

    Reads the module-level ``train``, ``test``, ``test_dataloader`` and
    ``tokenizer``. Folds are stratified on ``train["bins-target"]``.

    Parameters
    ----------
    model : nn.Module
        Embedding model passed to ``get_bert_embedding``.
    i : int
        Model index, used only in the progress messages.
    n_folds : int, default 5
        Number of stratified folds (and of SVRs that are averaged).

    Returns
    -------
    numpy.ndarray
        Array of length ``len(test)`` holding the mean of the per-fold SVR
        predictions on the test embeddings.
    """
    test_embedding = get_bert_embedding(test_dataloader, model)

    # The embeddings depend only on the text and the model, not on the fold, so
    # embed the whole training set once and slice per fold. Previously every
    # fold pushed the full training set through the model again (n_folds times
    # the work). Values match the per-fold computation up to floating-point
    # effects of different batch composition.
    train_frame = train.reset_index(drop=True)
    train_texts = train_frame["cleaned_excerpt"].values
    train_targets = train_frame["target"].values
    train_embedding = np.asarray(
        get_bert_embedding(create_dataloader(train_texts, tokenizer), model)
    )

    prediction = np.zeros(len(test))
    scores = []
    st = StratifiedKFold(n_splits=n_folds, shuffle=False)
    print(f"Model {i} starting...")
    for fold, (tr_ind, val_ind) in enumerate(
        st.split(train_texts, train_frame["bins-target"].values)
    ):
        svr = LinearSVR(C=10)
        svr.fit(train_embedding[tr_ind], train_targets[tr_ind])

        ypred = svr.predict(train_embedding[val_ind])
        score = rmse(train_targets[val_ind], ypred)
        scores.append(score)
        print(f"Score for the fold {fold} : {score}")

        prediction += svr.predict(test_embedding)

    # The fold scores are RMSE values, so the summary is labelled accordingly
    # (it was previously printed as "mse_error").
    print(f"mean rmse for the model {i} : {np.mean(scores)}")
    prediction /= n_folds

    return prediction
        
        

In [None]:
# For each fine-tuned checkpoint: rebuild the network, load its weights, and
# collect the fold-averaged SVR predictions on the test set.
predictions = []
for i, path in enumerate(paths):

    model = Lisibility(model_base)
    # map_location makes checkpoints that were saved from a GPU loadable when
    # `device` is the CPU fallback; without it torch.load tries to restore the
    # tensors onto the device they were saved from.
    # NOTE(review): strict=False silently ignores missing/unexpected keys, so a
    # mismatched checkpoint would leave the backbone at its base weights.
    model.load_state_dict(torch.load(path, map_location=device), strict=False)
    model.to(device)

    pr = svr_prediction(model, i, n_folds=5)

    predictions.append(pr)

In [None]:
# Stack the per-model prediction vectors into one 2-D array, one row per model.
pred = np.vstack(predictions)

In [None]:
# Ensemble: average over the model axis to get one value per test row.
prediction = pred.mean(axis=0)

In [None]:
# Start from the competition's sample submission so the output keeps its columns.
submission = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")

In [None]:
# Assigned positionally -- assumes sample_submission.csv lists rows in the same order as test.csv; TODO confirm.
submission["target"] = prediction

In [None]:
# Write the final file to the working directory, without the DataFrame index column.
submission.to_csv("submission.csv",index=False)