# CommonLit Readability - RoBERTa + XGB Baseline

Thanks to [Abhishek](https://www.kaggle.com/abhishek) for the model: [modelf1](https://www.kaggle.com/abhishek/modelf1) discussed in [🚀 AutoNLP to the rescue](https://www.kaggle.com/c/commonlitreadabilityprize/discussion/237795).

For the embedding-loading code, thanks to [Abhishek](https://www.kaggle.com/abhishek) for the notebook [yum yum yum](https://www.kaggle.com/abhishek/yum-yum-yum) and to [Maunish](https://www.kaggle.com/maunish) for the notebook [CLRP: RoBerta + LGBM](https://www.kaggle.com/maunish/clrp-roberta-lgbm). Thanks also to [Maunish](https://www.kaggle.com/maunish) for the idea, taken from that same notebook, of using boosted trees on RoBERTa embeddings. This notebook is essentially the same as earlier versions of that notebook, except that it uses XGBoost and an ordinary KFold split, and it reports the OOF RMSE.

# Load Libraries

In [None]:
import os
import gc
import sys
import time
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold, StratifiedKFold
import xgboost as xgb

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from transformers import AutoModel, AutoTokenizer

# Load Data

In [None]:
# Competition input directory; the trailing slash lets file names be appended directly.
data_dir = '../input/commonlitreadabilityprize/'

# Read the three competition CSV files into DataFrames.
train = pd.read_csv(f'{data_dir}train.csv')
test = pd.read_csv(f'{data_dir}test.csv')
sample_submission = pd.read_csv(f'{data_dir}sample_submission.csv')

# Regression target as a NumPy array, in the same row order as `train`.
target = train['target'].to_numpy()

# Load Embeddings
source: https://www.kaggle.com/maunish/clrp-roberta-lgbm

In [None]:
# source: https://www.kaggle.com/maunish/clrp-roberta-lgbm

def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also exports ``PYTHONHASHSEED`` and configures cuDNN for deterministic
    behaviour.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # NOTE: the hash seed is read at interpreter start-up, so this setting
    # only affects child processes launched after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select different kernels from run to run, which
    # would undermine the deterministic flag above, so keep it disabled.
    torch.backends.cudnn.benchmark = False


class CLRPDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one excerpt per item.

    Args:
        df: DataFrame with an ``excerpt`` column holding the raw texts.
        tokenizer: Callable applied to a single excerpt; it is invoked with
            ``return_tensors='pt'``, ``max_length``, ``padding='max_length'``
            and ``truncation=True``.
        max_len: Length every excerpt is padded or truncated to.
    """

    def __init__(self, df, tokenizer, max_len=128):
        super().__init__()
        self.excerpt = df['excerpt'].to_numpy()
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Return the tokenizer output for the excerpt at position ``idx``."""
        # NOTE(review): with return_tensors='pt' each tensor presumably keeps
        # a leading dimension of size 1; the consumer is expected to flatten it.
        return self.tokenizer(self.excerpt[idx],
                              return_tensors='pt',
                              max_length=self.max_len,
                              padding='max_length',
                              truncation=True)

    def __len__(self):
        """Return the number of excerpts."""
        return len(self.excerpt)
    

def get_embeddings(df, path, plot_losses=True, verbose=True):
    """Embed every excerpt in ``df`` with the pretrained model stored at ``path``.

    The model runs in eval mode without gradients. For each excerpt, the
    vector at sequence position 0 of the model's first output is kept.

    Batch size and maximum sequence length are read from the module-level
    ``config`` dict (keys ``'batch_size'`` and ``'max_len'``), which must be
    defined before this function is called.

    Args:
        df: DataFrame with an ``excerpt`` column.
        path: Directory or identifier passed to ``from_pretrained`` for both
            the model and the tokenizer.
        plot_losses: Accepted for interface compatibility; currently unused.
        verbose: Accepted for interface compatibility; currently unused.

    Returns:
        NumPy array with one embedding row per excerpt, in ``df`` row order.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"{device} is used")

    model = AutoModel.from_pretrained(path)
    tokenizer = AutoTokenizer.from_pretrained(path)
    model.to(device)
    model.eval()

    ds = CLRPDataset(df, tokenizer, config['max_len'])
    dl = DataLoader(ds,
                    batch_size=config["batch_size"],
                    shuffle=False,  # keep embeddings aligned with df rows
                    num_workers=4,
                    pin_memory=True,
                    drop_last=False)

    embeddings = []
    with torch.no_grad():
        # Iterate the loader directly so tqdm can read its length and show
        # a progress total.
        for inputs in tqdm(dl):
            # Flatten everything after the batch axis, then move to the device.
            inputs = {key: val.reshape(val.shape[0], -1).to(device)
                      for key, val in inputs.items()}
            outputs = model(**inputs)
            # First output, sequence position 0, for every item in the batch.
            embeddings.extend(outputs[0][:, 0].detach().cpu().numpy())
    return np.array(embeddings)

In [None]:
# source: https://www.kaggle.com/maunish/clrp-roberta-lgbm

# Settings read by get_embeddings (batch_size, max_len) and by seeding/KFold (seed).
config = dict(batch_size=128, max_len=256, seed=42)
seed_everything(config['seed'])

# Embed the train and test excerpts with the same pretrained model directory.
train_embeddings = get_embeddings(df=train, path='../input/modelf1')
test_embeddings = get_embeddings(df=test, path='../input/modelf1')

# XGBoost

In [None]:
# Booster parameters handed to xgb.train for every fold.
params = dict(
    objective='reg:squarederror',
    eval_metric='rmse',
    eta=0.05,
    max_depth=3,
    gamma=1,
    subsample=0.8,
    nthread=2,
)

# Shuffled K-fold split, seeded from the shared config for reproducibility.
nfolds = 5
kf = KFold(n_splits=nfolds, shuffle=True, random_state=config['seed'])

In [None]:
# Per-fold bookkeeping, plus the running average of the test predictions.
best_iterations = []
oof_rmses = []
preds = np.zeros(test.shape[0])

# The test features are the same for every fold, so build the DMatrix once.
dtest = xgb.DMatrix(test_embeddings)

for k, (train_idx, valid_idx) in enumerate(kf.split(train)):

    dtrain = xgb.DMatrix(train_embeddings[train_idx], target[train_idx])
    dvalid = xgb.DMatrix(train_embeddings[valid_idx], target[valid_idx])
    evals_result = dict()
    # 'valid' is listed last in `evals`, so early stopping monitors it.
    booster = xgb.train(params,
                        dtrain,
                        evals=[(dtrain, 'train'), (dvalid, 'valid')],
                        num_boost_round=300,
                        early_stopping_rounds=20,
                        evals_result=evals_result,
                        verbose_eval=False)

    # Boosting round (0-based) with the lowest validation RMSE for this fold.
    valid_rmse = evals_result['valid']['rmse']
    best_iteration = int(np.argmin(valid_rmse))
    best_iterations.append(best_iteration)
    oof_rmse = valid_rmse[best_iteration]
    oof_rmses.append(oof_rmse)

    # Predict with the trees up to and including the best round.
    # `iteration_range` replaces `ntree_limit`, which XGBoost deprecated and
    # later removed from Booster.predict.
    preds += booster.predict(dtest, iteration_range=(0, best_iteration + 1)) / nfolds

# Per-fold summary table.
evals_df = pd.DataFrame()
evals_df['fold'] = range(1, nfolds+1)
evals_df['best_iteration'] = best_iterations
evals_df['oof_rmse'] = oof_rmses

display(evals_df)
# This is the mean of the per-fold validation RMSEs.
print('mean oof rmse = {}'.format(np.mean(oof_rmses)))

# Submission

In [None]:
# Attach the fold-averaged predictions to the test frame.
test['prediction'] = preds

# Build the two-column submission (id, target) and write it without the index.
submission = pd.DataFrame({
    'id': test['id'].copy(),
    'target': test['prediction'].copy(),
})
submission.to_csv('submission.csv', index=False)
submission.head()