## First NLP Competition 
This is my first NLP competition, and I had never explored transformers before it. So, first, I'd like to mention some resources that helped me learn and achieve this result.

* World's first 4xGM [Abhishek Thakur](https://www.kaggle.com/abhishek)'s book [Approaching (Almost) Any Machine Learning Problem](https://github.com/abhishekkrthakur/approachingalmost) and his [YouTube](https://www.youtube.com/user/abhisheksvnit) channel.
* [Notebooks](https://www.kaggle.com/maunish/clrp-pytorch-roberta-inference) by [Maunish dave](https://www.kaggle.com/maunish)
* Some YouTube videos:
    * [BERT Neural Network - EXPLAINED! by CodeEmporium](https://youtu.be/xI0HHN5XKDo)
    * [Grandmaster Series – Building World-Class NLP Models with Transformers and Hugging Face by NVIDIA Developer](https://youtu.be/PXc_SlnT2g0)

### Approach used:
* pretraining roberta 
* Tokenizer: RobertaTokenizer
* model: roberta for sequence classification
* 5 folds, 7 epochs

### scores upon submission:

| Model | RMSE on LB |
| --- | --- |
| RoBERTa for seq classif | 0.481 |
| RoBERTa for seq classif stacked with lgbm | 0.482 |
| RoBERTa for seq classif stacked with ridge | 0.483 |

In [None]:
# Basic configuration shared by the cells below.
import torch
import transformers

# Token budget per excerpt: the tokenizer call in BERTDataset truncates and
# pads every input to exactly this many tokens.
MAX_LEN = 256

# Batch sizes. Only TRAIN_BATCH_SIZE and TEST_BATCH_SIZE are referenced by
# the data loaders in the visible cells; VALID_BATCH_SIZE and EPOCHS are not
# used there (presumably left over from the training notebook).
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 32
TEST_BATCH_SIZE = 32

EPOCHS = 5

# Pretrained RoBERTa files used for both the tokenizer and the model.
BERT_PATH = "../input/roberta-base"

# Directory holding the per-fold checkpoints (model_0.pth ... model_4.pth).
MODEL_PATH = "../input/roberta-de-nero/"

# Prefer the GPU, but fall back to the CPU so that `.to(device)` calls do not
# raise on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

TOKENIZER = transformers.RobertaTokenizer.from_pretrained(BERT_PATH)

In [None]:
import torch
class BERTDataset:
    """Dataset that tokenizes one raw text per index, on access.

    Implements ``__len__`` and ``__getitem__`` so it can be handed to a
    ``torch.utils.data.DataLoader``. The tokenizer and the maximum length
    are taken from the module-level ``TOKENIZER`` and ``MAX_LEN``.
    """

    def __init__(self, text):
        # `text` is any indexable collection of strings.
        self.max_len = MAX_LEN
        self.tokenizer = TOKENIZER
        self.text = text

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.text)

    def __getitem__(self, item):
        """Tokenize and return the text stored at position ``item``.

        Every text is truncated/padded to ``self.max_len`` tokens and the
        tokenizer is asked for PyTorch tensors. The tokenizer's output is
        returned unchanged.
        """
        tokenizer_options = {
            'return_tensors': 'pt',
            'max_length': self.max_len,
            'padding': 'max_length',
            'truncation': True,
        }
        return self.tokenizer(self.text[item], **tokenizer_options)

In [None]:
# attention head
from torch import nn 

class AttentionHead(nn.Module):
    """Attention pooling: collapse dim 1 of the input into one vector.

    Each position is scored with ``V(tanh(W(x)))``, the scores are
    normalised with a softmax over dim 1, and the input features are
    summed over dim 1 using those weights.

    Args:
        in_features: Size of the last dimension of the input.
        hidden_dim: Width of the intermediate scoring layer.
        num_targets: Accepted for interface compatibility; not used.

    Attributes:
        out_features: Size of the last dimension of the output. Because the
            output is a weighted sum of the *input* features, this equals
            ``in_features`` (not ``hidden_dim``).
    """

    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        # forward() returns sum(weights * features), whose width is that of
        # the input, so report in_features here.
        self.out_features = in_features

    def forward(self, features):
        """Return the attention-weighted sum of ``features`` over dim 1.

        Args:
            features: Tensor whose last dimension is ``in_features``;
                presumably shaped (batch, seq_len, in_features).

        Returns:
            Tensor with dim 1 summed out (e.g. (batch, in_features)).
        """
        att = torch.tanh(self.W(features))
        score = self.V(att)
        # Normalise across dim 1 so the weights of each sample sum to 1.
        attention_weights = torch.softmax(score, dim=1)
        context_vector = torch.sum(attention_weights * features, dim=1)

        return context_vector

In [None]:
import transformers
from torch import nn

class ROBERTA_(nn.Module):
    """RoBERTa sequence-classification model with a single output per input.

    The backbone is loaded from ``BERT_PATH`` with ``num_labels=1`` and its
    logits are returned directly as the prediction.
    """

    def __init__(self):
        super().__init__()
        self.bert = transformers.AutoModelForSequenceClassification.from_pretrained(
            BERT_PATH, num_labels=1
        )
        # NOTE(review): the three modules below are never called in
        # forward(). They still contribute parameters to the state dict, so
        # presumably they must stay for saved checkpoints to load cleanly.
        # Confirm before removing.
        self.head = AttentionHead(768, 768, 1)
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(768, 1)

    def forward(self, **param_mehta):
        """Run the backbone on the tokenizer outputs and return its logits.

        All keyword arguments are forwarded unchanged to the backbone. The
        trailing dimension of the logits is squeezed away.
        """
        backbone_output = self.bert(**param_mehta)
        logits = backbone_output["logits"]
        return logits.squeeze(-1)

In [None]:
import pandas as pd
import numpy as np

# Load the competition data.
train = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test = pd.read_csv('../input/commonlitreadabilityprize/test.csv')

# Discretise the continuous target into equal-width bins so the stacking
# stage can stratify its folds on them. The bin count is 1 + log2(n),
# rounded down.
num_bins = int(np.floor(1 + np.log2(len(train))))
train['bins'] = pd.cut(train['target'], bins=num_bins, labels=False)

yyy = train['target'].values
bins = train['bins'].values

# Loaders are built without shuffling, so predictions come back in row order.
train_dataset = BERTDataset(text=train['excerpt'].values)
train_dataloader = torch.utils.data.DataLoader(
    train_dataset, batch_size=TRAIN_BATCH_SIZE
)

test_dataset = BERTDataset(text=test['excerpt'].values)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset, batch_size=TEST_BATCH_SIZE
)

In [None]:
# Build a single model object; predict() loads each fold's checkpoint into it
# before running inference.
model = ROBERTA_()

In [None]:
def predict(test_data, model, model_path):
    """Load a checkpoint into ``model`` and predict every batch of ``test_data``.

    Args:
        test_data: Iterable of batches, each a mapping from input name to a
            tensor whose first dimension is the batch size.
        model: Module whose weights are overwritten from the checkpoint and
            which is called as ``model(**inputs)``.
        model_path: Path to a checkpoint that holds the weights under the
            ``'state_dict'`` key.

    Returns:
        A flat list with one prediction per input row, in iteration order.
    """
    # Deserialize onto the CPU so a checkpoint saved from a GPU also loads
    # where that GPU is absent; load_state_dict copies the values into the
    # model's own parameters, which are moved to `device` just below.
    state = torch.load(model_path, map_location='cpu')
    model.load_state_dict(state['state_dict'])

    model.to(device)
    model.eval()

    preds = []
    with torch.no_grad():
        for batch in test_data:
            # Flatten everything after the batch dimension (the dataset
            # yields an extra singleton dimension per sample) and move the
            # tensors to the model's device.
            inputs = {
                key: val.reshape(val.shape[0], -1).to(device)
                for key, val in batch.items()
            }
            output = model(**inputs)

            preds.extend(output.cpu().numpy())

    return preds

In [None]:
# Run each of the five fold checkpoints over the train and test loaders.
# x_train[f] / x_test[f] hold the predictions of fold f's model.
x_train = []
x_test = []
print('Fold:', end=' ')
for fold in range(5):
    checkpoint_path = f'{MODEL_PATH}model_{fold}.pth'
    x_train.append(predict(train_dataloader, model, checkpoint_path))
    x_test.append(predict(test_dataloader, model, checkpoint_path))
    print(f'{fold}', end=' ')

### First, I create a submission dataframe with targets from the RoBERTa model alone.

In [None]:
# Average the per-fold test predictions into one RoBERTa-only prediction.
# Stacking the folds first means an empty or mismatched list raises
# immediately instead of silently resetting the running sum.
fold_test_preds = np.stack([np.asarray(fold_preds) for fold_preds in x_test])
preds_sub = fold_test_preds.sum(axis=0) / len(x_test)

sub_nostack = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')
sub_nostack['target'] = preds_sub
print(sub_nostack)

In [None]:
from sklearn.metrics import mean_squared_error
def rmse_score(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

### Stacking on RoBERTa

The following cell stacks a second-level model on top of the RoBERTa predictions, again using 5 folds.

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import Ridge
from lightgbm import LGBMRegressor

# add model of choice for stacking
stacking_models_dispatcher = {
    'ridge': Ridge(alpha=50),
    'lgbm': LGBMRegressor()
}

# Second-level stacking. For every RoBERTa fold model, its train predictions
# are the single input feature of a stacking regressor that is fitted with
# stratified K-fold on the binned target. The regressor's test predictions
# are averaged over the K folds, then over all RoBERTa fold models.
n_splits = 5
stacking_model = stacking_models_dispatcher['ridge']  # specify model of choice for stacking

target = None
n_base_models = 0
for train_data, test_data in zip(x_train, x_test):

    # One feature column per sample; done once per base model rather than
    # once per fold.
    train_data = np.array(train_data).reshape(-1, 1)
    test_data = np.array(test_data).reshape(-1, 1)

    kfold = StratifiedKFold(n_splits=n_splits)

    preds = None
    sum_scores = 0
    for k, (train_idx, valid_idx) in enumerate(kfold.split(train_data, bins)):

        X_train, y_train = train_data[train_idx], yyy[train_idx]
        X_valid, y_valid = train_data[valid_idx], yyy[valid_idx]

        stacking_model.fit(X_train, y_train)
        prediction = stacking_model.predict(X_valid)
        score = rmse_score(prediction, y_valid)
        print(f'Fold {k}, rmse score: {score}')

        sum_scores += score

        # Explicit first-iteration check instead of a bare `except`, so a
        # genuine error cannot silently reset the running sum.
        fold_test_preds = stacking_model.predict(test_data)
        preds = fold_test_preds if preds is None else preds + fold_test_preds

    print(f'MEAN RMSE: {sum_scores / n_splits}')

    preds = preds / n_splits
    target = preds if target is None else target + preds
    n_base_models += 1

if target is None:
    raise ValueError('No base-model predictions available for stacking.')
target = target / n_base_models

In [None]:
# Build the submission frame for the stacked predictions: start from the
# sample submission and replace its target column.
sample_submission = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')
sub_stack = sample_submission.assign(target=target)
print(sub_stack)

#### If you would like to use this notebook, make sure the next cell saves the data frame you intend to submit (`sub_nostack` or `sub_stack`).

In [None]:
# Write the RoBERTa-only predictions as the submission file; replace
# `sub_nostack` with `sub_stack` to submit the stacked predictions instead.
sub_nostack.to_csv('submission.csv', index=False)