In [None]:
# --- CSS STYLE ---
from IPython.core.display import HTML
def css_styling():
    '''Loads the alert stylesheet and wraps it in a <style> tag.
    Returns an IPython `HTML` object built from the content of alerts.css.'''
    # `with` closes the file handle as soon as the content is read,
    # instead of leaving it open until garbage collection
    with open("../input/2020-cost-of-living/alerts.css", "r") as css_file:
        styles = css_file.read()
    return HTML("<style>" + styles + "</style>")
css_styling()

<img src="https://i.imgur.com/GvOB7o2.png">

<center><h1>Training: BERT, RoBERTa and smart Experiment Tracking with W&B</h1></center>

# Introduction

Do you feel like you have **no idea what all these Tokenizers are, what BERT is, how to train the models and ... who Roberta is?**

You're not alone. This is my very first hands-on NLP competition and, let me tell you, it took some time to wrap my head around it. So, if you're a bit lost like I was when starting this, hopefully the detailed explanations will bring a little bit of light.

<div class="alert simple-alert">
📍 A VERY big part of the inspiration for this notebook comes from <b>Tanay Mehta's</b> notebook <a href="https://www.kaggle.com/heyytanay/training-kfolds-pytorch-bert-large-w-o-oom">[TRAINING & KFOLDS] PyTorch BERT-Large w/o OOM🎯</a> and <b>Abhishek Thakur's</b> notebook <a href="https://www.kaggle.com/abhishek/fork-of-fork-of-yum-yum-yum-93f968">Fork of Fork of yum yum yum 93f968</a>. I look to learn something new every competition, and this one was my first interaction with Hugging Face. 🤗
</div>

*OK! Let's get started!*

### ⬇️ Libraries

In [None]:
# Common Libraries
import wandb
import os
import random
import seaborn as sns
import pandas as pd
import numpy as np
from tqdm import tqdm
import string
import gc
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

# Text Manipulation
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import GradScaler, autocast

# Transformers
from transformers import BertTokenizer, RobertaTokenizer, AutoTokenizer
from transformers import BertModel, RobertaModel
from transformers import AdamW, get_linear_schedule_with_warmup


# Custom colors
# Hex codes of the notebook's palette (5 colors)
my_colors = ["#E4916C", "#E36A67", "#FFB2C8", "#BCE6EF", "#1E5656"]
# Draw the palette so the colors can be checked visually
sns.palplot(sns.color_palette(my_colors))

class color:
    '''ANSI escape sequences used to highlight printed text.
    S (Start) switches the highlight on, E (End) resets the style.'''
    E = '\033[0m'
    S = '\033[1m\033[93m'
    
# Environment check
# NOTE(review): presumably silences W&B console output - confirm against the wandb docs
os.environ["WANDB_SILENT"] = "true"
# Base W&B config; train_model() merges the experiment parameters into it
CONFIG = {'competition': 'common-lit', '_wandb_kernel': 'aot'}
# Turn off pandas' chained-assignment (SettingWithCopy) warnings
pd.set_option('mode.chained_assignment', None)
import warnings
# Hide every UserWarning for the rest of the notebook
warnings.filterwarnings("ignore", category=UserWarning)

# Secrets 🤫
# The W&B API key is read from the Kaggle secret named "wandb"
# (kept out of the notebook source on purpose)
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("wandb")


# Set seed
def set_seed(n=10):
    '''Seeds every random number generator used in this notebook.
    n: the seed given to PyTorch, Python's `random` and NumPy.'''
    torch.manual_seed(n)
    random.seed(n)
    # NumPy used to be seeded with a hard-coded 0, which silently ignored `n`
    np.random.seed(n)
    
set_seed(n=10)

### What is "transformers" package?

It's a [state-of-the-art Natural Language Processing](https://pypi.org/project/transformers/) library for Jax, PyTorch and TensorFlow - from [huggingface](https://huggingface.co/).

🤗 Transformers provides **thousands of pretrained models** to perform tasks on texts such as classification, information extraction, question answering, summarization, translation, text generation and more in over 100 languages.

🤗 Transformers provides APIs to **quickly download and use those pretrained models** on a given text, fine-tune them on your own datasets and then share them with the community on our model hub. At the same time, each python module defining an architecture is fully standalone and can be modified to enable quick research experiments.

🤗 Transformers is backed by the three most popular deep learning libraries — Jax, **PyTorch** and TensorFlow.

**Why would I use pretrained models?**

Because some very smart people trained these very intelligent algorithms on thousands of observations. 

> 🔥 You can use what the algorithm already knows and just adapt it on our personal dataset through **Transfer Learning**. It's like teaching a 3rd grade kid how to read a passage instead of a kid that has never read before. 😎

In [None]:
# Log in through the Python API: the API key is passed as an argument and is
# never expanded into a shell command line (as `! wandb login $key` did)
wandb.login(key=secret_value_0)

In [None]:
def train_to_device(ids, mask, metadata, target, device):
    '''Moves one training batch to `device`, casting each tensor on the way.
    ids, mask: cast to torch.long
    metadata, target: cast to torch.float
    Returns the tuple (ids, mask, metadata, target).'''
    moved_ids, moved_mask = (t.to(device, dtype=torch.long) for t in (ids, mask))
    moved_meta, moved_target = (t.to(device, dtype=torch.float) for t in (metadata, target))

    return moved_ids, moved_mask, moved_meta, moved_target


def test_to_device(ids, mask, metadata, device):
    '''Sends dataloader output to device.'''
    ids = ids.to(device, dtype=torch.long)
    metadata = metadata.to(device, dtype=torch.float)
    mask = mask.to(device, dtype=torch.long)
    
    return ids, mask, metadata

# 1. Text Preprocessing

Before we start, let's do the preprocessing. I am using the very same preprocessing I made [in my first notebook here](https://www.kaggle.com/andradaolteanu/i-commonlit-explore-xgbrf-repeatedfold-model).

In [None]:
def clean_paragraph(paragraph, verbose=False):
    '''Cleans paragraph before tokenization.
    This step might help or NOT - as the pretrained models we will use are accustomed
    with a certain type of input.
    paragraph: the raw text to clean
    verbose: if True, prints every token that the lemmatizer changed
    Returns the lower-cased, lemmatized tokens joined by single spaces.'''
    
    # Tokenize & convert to lower case
    tokens = [t.lower() for t in word_tokenize(paragraph)]

    # Remove punctuation & non alphabetic characters from each word
    table = str.maketrans('', '', string.punctuation)
    tokens = [t.translate(table) for t in tokens]
    tokens = [t for t in tokens if t.isalpha()]

    # Filter out stopwords
    ### a set gives constant-time lookups instead of scanning the list per token
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stop_words]

    # Lemmatizer
    lemmatizer = WordNetLemmatizer()
    tokens_lemm = [lemmatizer.lemmatize(t) for t in tokens]

    if verbose:
        # The `color` class only defines S (start) and E (end); the previous
        # color.BOLD / color.END attributes raised an AttributeError
        print(color.S + "Show difference between original and lemmatized token:" + color.E)
        for a, b in zip(tokens, tokens_lemm):
            if a != b: print(a, " | ", b)
                
    return " ".join(tokens_lemm)

In [None]:
def create_features(df):
    '''Creates features based on preprocessed text column.
    df: the training or testing dataframe; it needs the columns "text"
        (cleaned paragraph) and "excerpt" (raw paragraph).
    Adds "split_text", "freq_text" and 7 standardized numeric features to
    `df` in place and returns it.

    NOTE: a new StandardScaler is fitted on whatever `df` is passed in, so a
    test dataframe is scaled with its own statistics, not the training ones.'''
    
    # --- Add new features from text ---
    word_frequencies = pd.read_csv("../input/english-word-frequency/unigram_freq.csv")
    # Convert it into a dict (i.e. hashmap)
    word_frequencies = dict(zip(word_frequencies["word"], word_frequencies["count"]))
    # Tokenize full text
    df["split_text"] = df["text"].apply(lambda x: x.split(" "))
    # Get word count for each word (words missing from the table are skipped)
    df["freq_text"] = df["split_text"].apply(lambda x: [word_frequencies[word] for word in x
                                                        if word in word_frequencies])


    # --- Create more features ---
    def _safe(stat):
        '''Wraps `stat` so that an empty frequency list yields 0.0.'''
        # np.min / np.max raise and np.mean / np.std return NaN on an empty
        # list (a paragraph with no word in the frequency table)
        return lambda freqs: stat(freqs) if len(freqs) > 0 else 0.0

    # Get sum, mean, std etc. from the text frequencies
    df["freq_sum"] = df["freq_text"].apply(_safe(np.sum))
    df["freq_mean"] = df["freq_text"].apply(_safe(np.mean))
    df["freq_std"] = df["freq_text"].apply(_safe(np.std))
    df["freq_min"] = df["freq_text"].apply(_safe(np.min))
    df["freq_max"] = df["freq_text"].apply(_safe(np.max))

    # Get more info from text itself
    df["no_words"] = df["split_text"].apply(len)
    df["no_words_paragraph"] = df["excerpt"].apply(lambda x: len(x.split(" ")))


    # --- Scale the Features ---
    cols = ['freq_sum', 'freq_mean', 'freq_std', 'freq_min', 
            'freq_max', 'no_words', 'no_words_paragraph']
    # Keep df's own index and column names on the scaled values: a bare
    # DataFrame gets a fresh 0..n-1 index, and pandas aligns on the index when
    # assigning, which misplaces rows whenever df has any other index
    X_scaled = pd.DataFrame(StandardScaler().fit_transform(df[cols]),
                            index=df.index, columns=cols)
    df[cols] = X_scaled
    
    return df

In [None]:
# Read in training data and preprocess 'excerpt' feature
df = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
# "text" holds the cleaned version of every excerpt (see clean_paragraph)
df["text"] = df["excerpt"].apply(lambda x: clean_paragraph(x))

# Create features
# (adds the tokenized/frequency columns and the 7 scaled numeric features)
df = create_features(df)

# Quick look at the first rows
df.head(3)

# 2. Global Model Setup

### Before we start - BERT vs ROBERTA

🔥 **BERT** is a [bi-directional transformer for pre-training](https://towardsdatascience.com/bert-roberta-distilbert-xlnet-which-one-to-use-3d5ab82ba5f8) over a lot of unlabeled textual data to learn a language representation that can be used to fine-tune for specific machine learning tasks. While BERT outperformed the NLP state-of-the-art on several challenging tasks, its performance improvement could be attributed to the bidirectional transformer, novel pre-training tasks of Masked Language Model and Next Sentence Prediction along with a lot of data and Google’s compute power.

🔥 **RoBERTa** Introduced at Facebook, [robustly optimized BERT approach RoBERTa](https://towardsdatascience.com/bert-roberta-distilbert-xlnet-which-one-to-use-3d5ab82ba5f8), is a retraining of BERT with improved training methodology, 1000% more data and compute power *(BERT on steroids)*.

<center><img src="https://i.imgur.com/FdpaUw6.png" width=700></center>


### What is "GradScaler()"?

I saw Tanay Mehta using this and it got me wondering [what it is for](https://pytorch.org/docs/stable/notes/amp_examples.html).

If the forward pass for a particular op has `float16` inputs => the backward pass for that op will produce `float16` gradients. Gradient *values with small magnitudes may not be representable in `float16`* => these values will flush to zero, hence the update for the corresponding parameters will be lost.

🔥 To prevent **underflow**, **`GradScaler()`** multiplies the network’s loss(es) by a scale factor and invokes a backward pass on the scaled loss(es). Gradients flowing backward through the network are then scaled by the same factor. **In other words, gradient values have a larger magnitude, so they don’t flush to zero.**

In [None]:
# ===== MODEL PARAMETERS =====
# OR: "../input/huggingface-roberta/roberta-large"
MODEL_PATH = "../input/d/xhlulu/huggingface-bert/bert-large-uncased"
# OR: "roberta-large"
MODEL_NAME = "bert-large-uncased"
# OR: RobertaTokenizer
TOKENIZER = BertTokenizer.from_pretrained(MODEL_NAME, do_lower_case=True)
scaler = GradScaler()    ### to prevent underflow

# Check if we are using either train or test data
### (a dataframe with exactly 7 rows is treated as the test set)
### The previous `IS_TEST == True` was a comparison, not an assignment,
### so the flag could never become True
IS_TEST = len(df) == 7

# 3. The Dataset

The `Dataset` class is an [abstract class representing a dataset](https://pytorch.org/tutorials/beginner/data_loading_tutorial.html) and it should have the following methods:

🔥 `__init__` to initiate our parameters (like the `text` we want to use, the `tokenizer`, size of the `tokenizer` etc.)

🔥 `__len__` so that len(dataset) returns the size of the dataset

🔥 `__getitem__` to support the indexing such that dataset[i] can be used to get *i*th sample

In [None]:
class LiitDataset(Dataset):
    '''PyTorch Dataset that tokenizes one text per item and pairs it with its
    metadata features and - for train/validation data - its target.'''
    
    def __init__(self, texts, tokenizer, max_len, is_test, targets=None, metadata=None):
        '''Initiate the arguments of the object.
        texts: the raw excerpt or preprocessed text from df
        targets: corresponding targets (required when is_test is False)
        tokenizer: the tokenizer from transformers library
        max_len: the length every text is padded/truncated to
        is_test: whether the items come without a target (True) or with one (False)
        metadata: additional dataframe with features to use for the model;
                  when None, every item gets an empty metadata tensor
        Raises ValueError if targets are needed but missing.'''
        
        # Fail early with a clear message instead of a TypeError on first access
        if not is_test and targets is None:
            raise ValueError("`targets` must be provided when is_test=False")
        
        self.text = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_test = is_test
        self.metadata = metadata
        
    def __len__(self):
        '''Returns the size of the dataset.'''
        return len(self.text)
    
    def __getitem__(self, i):
        '''This will help during the data_loader operation.
        i: position of the item to select
        Returns a dict with the keys "input_ids", "attention_mask", "metadata"
        and - unless is_test - "target", always in this order (the training
        and inference loops unpack the values positionally).'''
        
        # Select the text & tokenize
        batch_text = str(self.text[i])
        inputs = self.tokenizer(batch_text,
                                max_length = self.max_len,
                                padding = "max_length",
                                truncation = True)
        
        # Select the metadata
        ### `metadata` defaults to None, so it cannot be indexed unconditionally
        if self.metadata is None:
            meta = np.zeros(0, dtype=np.float32)
        else:
            meta = np.array(self.metadata.iloc[i].values, dtype=np.float32)
        
        item = {"input_ids" : torch.tensor(inputs["input_ids"], dtype=torch.long),
                "attention_mask" : torch.tensor(inputs["attention_mask"], dtype=torch.long),
                "metadata" : torch.tensor(meta, dtype=torch.float)}
        
        # Only train/validation items carry a target
        if not self.is_test:
            item["target"] = torch.tensor(self.targets[i], dtype=torch.float)
            
        return item

### 🔎 What's the Output of the Dataset?

* `input_ids`: the token indices
* `attention_mask`: exactly what it says - a `0` or `1` array that tells the model which tokens should be attended to and which should not

<center><img src="https://i.imgur.com/XDnBLcE.png" width=1000></center>

In [None]:
# Array of texts & features & targets
sample_text = df["excerpt"][:6].values
# Columns from position 9 onwards - presumably the 7 scaled features added by
# create_features(); verify against df.columns if the dataframe layout changes
sample_meta = df.iloc[:6, 9:]
sample_target = df["target"][:6].values


# Instantiate Dataset object
# (max_len=5 keeps the printed tensors short; the experiments use a larger value)
dataset = LiitDataset(texts=sample_text, targets=sample_target,
                      tokenizer=TOKENIZER, max_len=5, is_test=IS_TEST, 
                      metadata = sample_meta)
# The Dataloader
# 6 samples with batch_size=3 => 2 batches
dataloader = DataLoader(dataset, batch_size=3, shuffle=False)

# Output of the Dataloader
# The 4-way unpacking relies on the dataset returning a target,
# i.e. on IS_TEST being False
for k, data in enumerate(dataloader):
    ids, mask, meta, target = data.values()
    print(color.S + f"Batch: {k}" + color.E, "\n" +
          color.S + "Ids:" + color.E, ids, "\n" +
          color.S + "Mask:" + color.E, mask, "\n" +
          color.S + "Metadata:" + color.E, meta, "\n" +
          color.S + "Target:" + color.E, target, "\n" +
          "="*50)

# 4. The Pretrained Model

🔥 OK! Now we want to create a **class** that takes the `input` (our paragraphs), processes them through some sort of pretrained model *(BERT, RoBERTa etc.)*, spills a **hidden embedding** that contains *valuable information* about the paragraphs and then finds out the `target`.

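🔥 To do that, the pooled `output` from the pretrained model is concatenated with the output of a small metadata network (`Linear()` → `BatchNorm1d()` → `ReLU()` → `Dropout()`) and then passed to a simple `Linear()` layer, which is the final step that **maps the information to the score** (a regression, not a classification).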

In [None]:
class TRANSFORMERS_MODEL(nn.Module):
    '''Regression network: a pretrained BERT/RoBERTa encoder for the text plus
    a small feed-forward branch for the metadata; both outputs are
    concatenated and mapped to a single score.'''
    
    def __init__(self, MODEL_PATH, drop=0.3, roberta=False, no_columns=7, meta_size=500):
        '''MODEL_PATH: path or name of the pretrained weights
        drop: dropout probability used in the metadata branch
        roberta: if True loads a RobertaModel, otherwise a BertModel
        no_columns: number of metadata features per sample
        meta_size: output size of the metadata branch'''
        # If super is not called, an AttributeError will appear
        super().__init__()
        
        # Text Model Layer
        if roberta:
            self.Model = RobertaModel.from_pretrained(MODEL_PATH)
        else:
            self.Model = BertModel.from_pretrained(MODEL_PATH)
        # Metadata Layer
        self.Metadata = nn.Sequential(nn.Linear(no_columns, meta_size),
                                      nn.BatchNorm1d(meta_size),
                                      nn.ReLU(),
                                      nn.Dropout(p=drop))
        # Aggregation Layer
        ### The text width is read from the encoder config (1024 for the
        ### "large" checkpoints) instead of being hard-coded, so "base"
        ### checkpoints work too
        text_size = self.Model.config.hidden_size
        self.Linear = nn.Linear(text_size + meta_size, 1)
        
    def forward(self, input_ids, attention_mask, metadata, prints=False):
        '''A forward pass of this network.
        Use `prints=True` if you want to see output shape at each pass.
        Returns a 1D tensor with one prediction per sample.'''
        
        if prints: print("===============")
        # With return_dict=False the encoder returns a tuple; its second
        # element (the pooled output) is used as the text embedding
        _, text = self.Model(input_ids, attention_mask=attention_mask, return_dict=False)
        if prints:
            print(color.S+"Text Out Shape:"+color.E, text.shape)
            
        meta = self.Metadata(metadata)
        if prints:
            print(color.S+"Metadata Out Shape:"+color.E, meta.shape)
            
        # Join text embedding and metadata embedding along the feature axis
        text_meta = torch.cat((text, meta), dim=1)
        out = self.Linear(text_meta)
        if prints:
            print(color.S+"After FNN Shape:"+color.E, out.shape)
        
        return out.view(-1)

### How does this class work?

For me at least it's a bit tricky to visualize what's behind Neural Nets. This is why I usually `print` a lot and create schemas out of them. The example below is for **BERT** output:

<center><img src="https://i.imgur.com/Kr8jP8X.png" width=1000></center>

In [None]:
# Initiate the model
# (the model stays on the CPU here: no .to(device) is called in this example)
model_example = TRANSFORMERS_MODEL(MODEL_PATH, drop=0.3, roberta=False, no_columns=7, meta_size=500)
model_example.train()  ### training mode: ON

# We'll use the dataset & dataloader from previous example
# Only the first batch is needed, hence the immediate `break`
for k, data in enumerate(dataloader):
    ids, mask, meta, target = data.values()
    break
    
print(color.S+"Input data shape:"+color.E, len(ids), "paragraphs.", "\n")

# prints=True shows the tensor shape after every stage of the forward pass
out = model_example(ids, mask, meta, prints=True)

# 5. The Optimizer

🔥 **Optimizers** are methods used to **change the attributes of the neural network such as weights and learning rate** to reduce the losses. Optimizers are used to *solve optimization problems* by minimizing the function.

🔥 **WEIGHT DECAY**:

`weight_decay` is a regularization technique that adds a small penalty, usually the L2 norm of the weights, to the loss function. It provides an approach to *reduce the overfitting* on the training data and improve the performance of the model on new data.

🤗 **ADAMW**:

The `AdamW()` optimizer **decouples the weight decay from the optimization step**. This means that the `weight_decay` and `learning_rate` can be optimized *separately*, i.e. changing the learning rate does not change the optimal weight decay.

In [None]:
def custom_optimizer(model, LR, prints=False):
    '''Builds an AdamW optimizer with two parameter groups.
    model: the initialized model class
    LR: learning rate used for both groups
    prints: if True, prints the number of parameters and the optimizer
    Parameters whose name contains "bias", "LayerNorm.bias" or
    "LayerNorm.weight" get weight_decay 0.0, all the others 0.003.'''
    
    named_params = list(model.named_parameters())
    if prints:
        print(color.S+"Number of Parameters to Optimize:"+color.E, len(named_params))
        
    skip_decay = ("bias", "LayerNorm.bias", "LayerNorm.weight")
    
    # Split the parameters in a single pass, keeping their original order
    decayed, undecayed = [], []
    for name, param in named_params:
        excluded = any(tag in name for tag in skip_decay)
        (undecayed if excluded else decayed).append(param)
    
    groups = [{"params" : decayed, "weight_decay" : 0.003},
              {"params" : undecayed, "weight_decay" : 0.0}]
    
    optimizer = AdamW(groups, lr=LR)
    if prints:
        print(color.S+"Optimizer:"+color.E, optimizer)
    
    return optimizer

In [None]:
# Optimizer Example
# prints=True shows the number of parameters and the two parameter groups
optimizer_example = custom_optimizer(model_example, LR=0.0005, prints=True)

### Loss

Now lets create a loss function that will be used to assess the **difference between the predicted values of the model vs the actuals**.

In [None]:
def MSE_loss(output, target):
    '''Root mean squared error (RMSE) between predictions and actuals.
    output: tensor with the predicted values
    target: tensor with the actual values
    Returns the square root of the mean squared error.'''
    mean_sq_err = nn.functional.mse_loss(output, target)
    
    return mean_sq_err.sqrt()

In [None]:
# Clean environment of all tests we've performed
# Drop the example objects (including the example model) before training,
# then ask the garbage collector to reclaim them
del optimizer_example, out, model_example, dataset, dataloader
gc.collect()

# 6. Training Function

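> 🔥 **Note**: with a 5-fold split, for *each fold*, TRAIN = 2,267 observations, VALID = 567 observations. The experiments below use `SPLITS = 2`, so there each fold trains on roughly half of the data and validates on the other half.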

🤗 **GET LINEAR SCHEDULE WITH WARMUP:**

Create a schedule with a `learning_rate` that **decreases linearly** from the initial `lr` set in the optimizer to 0, *after a warmup period* during which it **increases linearly** from 0 to the initial `lr` set in the optimizer.

🔥 **AUTOCAST**:

`torch.cuda.amp` provides **convenience methods** for mixed precision, where some operations use the `torch.float32` (float) datatype and other operations use `torch.float16` (half) [source here](https://pytorch.org/docs/stable/amp.html#autocasting). Instances of `autocast` serve as context managers or decorators that allow regions of your script to run in mixed precision.

To better visualize what's happening in the cell below, let's see a schema that **summarises the training process in `train_model()`**:
<center><img src="https://i.imgur.com/0vQ2dpk.png" width=1000></center>

In [None]:
def train_model(lit_data, FEATURE, SPLITS, TOKENIZER, MAX_LEN, IS_TEST, 
                TRAIN_BATCH, VALID_BATCH, DROP, LR, EPOCHS, ROBERTA,
                META_SIZE, FEATURE_NO, MODEL_NAME, MODEL_PATH, N, CONFIG):
    '''Trains one model per stratified fold, tracks the run in W&B and saves
    the weights of every epoch that improves the fold's validation RMSE.
    lit_data: dataframe with the FEATURE column, a "target" column and the
              metadata features from column position 9 onwards
    FEATURE: name of the text column given to the tokenizer
    SPLITS: number of K-Fold splits
    TOKENIZER, MAX_LEN: tokenizer and the length texts are padded/truncated to
    IS_TEST: forwarded to LiitDataset; has to be False here, because both
             loops unpack a target from every batch
    TRAIN_BATCH, VALID_BATCH: batch sizes of the two dataloaders
    DROP, ROBERTA, META_SIZE, FEATURE_NO, MODEL_PATH: forwarded to TRANSFORMERS_MODEL
    LR: learning rate of the optimizer
    EPOCHS: number of epochs per fold
    MODEL_NAME, N: used to name the W&B run and the saved weight files
    CONFIG: base dict for the W&B config (left unmodified)'''
    
    # Initialize W&B experiment
    params = dict(model=MODEL_NAME, feature=FEATURE, splits=SPLITS, max_len=MAX_LEN,
              is_test=IS_TEST, train_batch=TRAIN_BATCH, valid_batch=VALID_BATCH, 
              drop=DROP, lr=LR, epochs=EPOCHS)
    # Merge into a copy, so the caller's CONFIG dict is not mutated
    run_config = {**CONFIG, **params}
    wandb.init(project='commonlit', name=f"{MODEL_NAME}_exp_{N}", config=run_config, anonymous="allow")
    
    # Remember to turn GPU: ON (falls back to the CPU when no GPU is visible)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # Mixed precision only makes sense on the GPU
    use_amp = device.type == 'cuda'

    # We need to create a BIN, because the K-Fold receives only discrete values
    ### the bins are kept in a separate array, so lit_data itself is untouched
    bins = int(np.floor(1 + np.log2(len(lit_data))))
    target_bins = pd.cut(lit_data["target"], bins=bins, labels=False).values

    # K-Fold Validation
    cv = StratifiedKFold(n_splits=SPLITS)
    cv_splits = cv.split(X=lit_data, y=target_bins)


    # ~~~~~~~~~ Training ... ~~~~~~~~~
    for fold, (train_i, valid_i) in enumerate(cv_splits):

        print(color.S + f"========== Fold {fold} ==========" + color.E, "\n")
        # Set train + validation datas
        train_df = lit_data.iloc[train_i, :]
        valid_df = lit_data.iloc[valid_i, :]

        # Now set the PyTorch Dataset
        ### both datasets must yield targets, so IS_TEST has to be False
        train_data = LiitDataset(texts=train_df[FEATURE].values, targets=train_df["target"].values,
                                 tokenizer=TOKENIZER, max_len=MAX_LEN, is_test=IS_TEST, 
                                 metadata=train_df.iloc[:, 9:])
        valid_data = LiitDataset(texts=valid_df[FEATURE].values, targets=valid_df["target"].values,
                                 tokenizer=TOKENIZER, max_len=MAX_LEN, is_test=IS_TEST,
                                 metadata = valid_df.iloc[:, 9:])

        # Dataloaders
        ### be sure to set shuffle=False for valid_loader !
        train_loader = DataLoader(train_data, batch_size=TRAIN_BATCH, 
                                  shuffle=True, num_workers=8)
        valid_loader = DataLoader(valid_data, batch_size=VALID_BATCH, 
                                  shuffle=False, num_workers=8)


        # ~~~ Model Setup ~~~
        model = TRANSFORMERS_MODEL(MODEL_PATH, drop=DROP, roberta=ROBERTA, 
                                   no_columns=FEATURE_NO, meta_size=META_SIZE).to(device)
        optimizer = custom_optimizer(model, LR)
        # Compute total number of training steps
        ### = optimizer steps per epoch * epochs. The previous formula
        ### multiplied by VALID_BATCH, so the linear decay never matched the
        ### real length of the training
        no = len(train_loader) * EPOCHS
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                                                    num_training_steps=no)
        # A fresh gradient scaler per fold (no dependency on a notebook global,
        # no scale state carried over from the previous fold)
        scaler = GradScaler(enabled=use_amp)

        # ~~~ Epochs Loop ~~~
        ### start from infinity so the first epoch is always saved
        BEST_LOSS = np.inf
        for epoch in range(EPOCHS):
            print(color.S + f"--- Epoch {epoch} ---" + color.E)
            # W&B drops entries whose step is lower than the current one, so
            # the step has to keep increasing across folds
            global_step = fold * EPOCHS + epoch

            # -Train the Model-
            model.train()   ### Training Mode: ON
            train_losses = []
            for data in train_loader:
                ids, mask, meta, target = data.values()
                ids, mask, meta, target = train_to_device(ids, mask, meta, target, device)

                with autocast(enabled=use_amp): 
                    out = model(ids, mask, meta)
                    loss = MSE_loss(out, target)
                train_losses.append(loss.item())

                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()
                
            # Log Training Loss into the experiment
            ### plain float(): the np.float alias no longer exists in recent NumPy
            wandb.log({"fold":fold, "epoch":epoch,
                       "mean_train_loss": float(np.mean(train_losses))}, step=global_step)


            # -Validate the Model-
            all_targets, all_preds = [], []
            model.eval()   ### Evaluation Mode: ON
            # Disable gradients - we aren't optimizing anything now
            with torch.no_grad():
                for data in valid_loader:
                    ids, mask, meta, target = data.values()
                    ids, mask, meta, target = train_to_device(ids, mask, meta, target, device)

                    out = model(ids, mask, meta)

                    all_targets.extend(target.cpu().detach().numpy().tolist())
                    all_preds.extend(out.cpu().detach().numpy().tolist())

                    
            # -Let's see how the model did in this epoch-
            epoch_loss = np.sqrt(mean_squared_error(all_targets, all_preds))
            # Log Validation Loss into the experiment
            wandb.log({"fold":fold, "validation_loss": float(epoch_loss)}, step=global_step)
            print("Epoch RMSE Loss:", epoch_loss)
            


            # -Save Model-
            if epoch_loss < BEST_LOSS:
                # If loss decreased, then save the model
                print("saving model in fold {} | epoch {} ...".format(fold, epoch))
                torch.save(model.state_dict(), f"{MODEL_NAME}_fold_{fold}_loss{round(epoch_loss, 4)}.pt")
                ### TODO: Save model to W&B
                BEST_LOSS = epoch_loss


        print(color.S+"Best RMSE in this fold: "+color.E, BEST_LOSS, "\n"*2)
        wandb.log({"best_rmse": float(BEST_LOSS)})

        # Free the GPU memory before the next fold builds a new model
        del model, optimizer, scheduler, scaler, ids, mask, meta, target
        torch.cuda.empty_cache()
        gc.collect()
    
    wandb.finish()

# 7. Experiments

We are going to use **[W&B](https://wandb.ai/site)** for *experiment tracking*; this way, if we alter the code or erase it in any way and for some reason we **want to go back** to a hyperparameter setup OR in case we **forgot what model performed** the best, we can just **[check the dashboard and find out](https://wandb.ai/andrada/commonlit?workspace=user-)** 😎.

> 🔥 **Note**: you can tweak the parameters to get better results.

## I. Baseline Model using BERT

<center><video src="https://i.imgur.com/38fo1Jz.mp4" width=700 controls></center>

In [None]:
# Pretrained weights (local dataset) and matching tokenizer for BERT-large
MODEL_PATH = "../input/d/xhlulu/huggingface-bert/bert-large-uncased"
MODEL_NAME = "bert-large-uncased"
TOKENIZER = BertTokenizer.from_pretrained(MODEL_NAME, do_lower_case=True)

FEATURE = "text"        ### feature to use for training
SPLITS = 2              ### k-fold number of splits
MAX_LEN = 150           ### length every text is padded/truncated to
TRAIN_BATCH = 32        ### train batch size
VALID_BATCH = 32        ### validation batch size
DROP = 0.3              ### dropout probability
LR = 0.00005            ### learning rate
EPOCHS = 3              ### number of epochs per fold
ROBERTA=False           ### whether to use roberta (True) or bert (False)
N=0                     ### unique identifier for W&B experiment
META_SIZE = 500         ### metadata size of hidden layer
FEATURE_NO = 7          ### number of features to use


# IS_TEST and CONFIG come from the cells above
train_model(df, FEATURE, SPLITS, TOKENIZER, MAX_LEN, IS_TEST, 
            TRAIN_BATCH, VALID_BATCH, DROP, LR, EPOCHS, ROBERTA,
            META_SIZE, FEATURE_NO, MODEL_NAME, MODEL_PATH, N, CONFIG)

## II. Baseline Model using RoBERTa

<center><video src="https://i.imgur.com/IiNJuVN.mp4" width=700 controls></center>

In [None]:
# Pretrained weights (local dataset) and matching tokenizer for RoBERTa-large
MODEL_PATH = "../input/huggingface-roberta/roberta-large"
MODEL_NAME = "roberta-large"
# NOTE(review): do_lower_case was carried over from the BERT cell - confirm
# that RobertaTokenizer actually uses this option
TOKENIZER = RobertaTokenizer.from_pretrained(MODEL_NAME, do_lower_case=True)

In [None]:
FEATURE = "text"        ### feature to use for training
SPLITS = 2              ### k-fold number of splits
MAX_LEN = 150           ### length every text is padded/truncated to
TRAIN_BATCH = 20        ### train batch size (smaller than in the BERT experiment)
VALID_BATCH = 32        ### validation batch size
DROP = 0.3              ### dropout probability
LR = 0.00005            ### learning rate
EPOCHS = 3              ### number of epochs per fold
ROBERTA=True            ### whether to use roberta (True) or bert (False)
N=0                     ### unique identifier for W&B experiment
META_SIZE = 500         ### metadata size of hidden layer
FEATURE_NO = 7          ### number of features to use


# MODEL_PATH, MODEL_NAME and TOKENIZER were switched to RoBERTa in the previous cell
train_model(df, FEATURE, SPLITS, TOKENIZER, MAX_LEN, IS_TEST, 
            TRAIN_BATCH, VALID_BATCH, DROP, LR, EPOCHS, ROBERTA,
            META_SIZE, FEATURE_NO, MODEL_NAME, MODEL_PATH, N, CONFIG)

# 8. How do I do the inference and submit?

> **Note**: As a last step, I would like to leave here **a sample code that you can use to run inference for this competition**. It is quite simple, but you will **have to turn OFF the internet**, because the submission is made through a *notebook*, and the competition doesn't allow the Internet to be turned ON. W&B doesn't work without Internet, so I would recommend **copy-pasting the following code into a separate notebook dedicated to inference**.

### 🔥 Step 1

Copy into a new notebook (GPU: ON & Internet: OFF) the following classes: `LiitDataset()`, `TRANSFORMERS_MODEL()` and the preprocessing function (if any) for `excerpt`.

### 🔥 Step 2

Now just copy the following lines of code:

In [None]:
# Read in test data and preprocess 'excerpt' feature
# NOTE: this overwrites the training dataframe stored in `df`
df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
df["text"] = df["excerpt"].apply(lambda x: clean_paragraph(x))

# Create features
# (create_features fits its scaler on the dataframe it receives, i.e. on the test data here)
df = create_features(df)

In [None]:
def predict(df, FEATURE, TOKENIZER, MAX_LEN, IS_TEST, DROP, ROBERTA, 
            META_SIZE, FEATURE_NO, MODEL_PATH, TRAINED_MODEL_PATH):
    '''Function that makes the inference.
    df: dataframe with the FEATURE column and the metadata features from
        column position 7 onwards
    FEATURE: name of the text column given to the tokenizer
    TOKENIZER, MAX_LEN: tokenizer and the length texts are padded/truncated to
    IS_TEST: forwarded to LiitDataset; has to be True, as no target is unpacked
    DROP, ROBERTA, META_SIZE, FEATURE_NO, MODEL_PATH: forwarded to
        TRANSFORMERS_MODEL; use the same values as for training
    TRAINED_MODEL_PATH: path of the state_dict saved during training
    Returns a NumPy array with one prediction per row of df, in row order.'''
    
    # Remember to turn GPU: ON (falls back to the CPU when no GPU is visible)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Now set the PyTorch Dataset
    lit_dataset = LiitDataset(texts=df[FEATURE].values, tokenizer=TOKENIZER, 
                              max_len=MAX_LEN, is_test=IS_TEST, metadata=df.iloc[:, 7:])
    # Dataloaders
    ### shuffle has to be False: the predictions are matched to the rows of
    ### the submission by position, so shuffling would mix them up
    lit_loader = DataLoader(lit_dataset, batch_size=32, 
                              shuffle=False, num_workers=8)
    
    # Load pretrained model
    model = TRANSFORMERS_MODEL(MODEL_PATH, drop=DROP, roberta=ROBERTA,
                               no_columns=FEATURE_NO, meta_size=META_SIZE).to(device)
    # map_location puts the saved tensors on the device that is in use
    model.load_state_dict(torch.load(TRAINED_MODEL_PATH, map_location=device))
    model.eval()
    
    final_output = []
    
    # We are predicting, no gradients needed
    with torch.no_grad():
        for data in lit_loader:
            ids, mask, meta = data.values()
            ids, mask, meta = test_to_device(ids, mask, meta, device)
            out = model(ids, mask, meta)
            final_output.extend(out.detach().cpu().numpy().ravel().tolist())
    
    # Return predictions
    torch.cuda.empty_cache()
    return np.array(final_output)

In [None]:
# Be sure you're using the same numbers as for training!

MODEL_NAME = "bert-large-uncased"
MODEL_PATH = "../input/d/xhlulu/huggingface-bert/bert-large-uncased"
# Weights saved by train_model() (file name pattern: <model>_fold_<k>_loss<rmse>.pt)
TRAINED_MODEL_PATH = "../input/commonlit-dataset/bert-large-uncased_fold_1_loss0.6337.pt"
# The tokenizer is loaded from the local MODEL_PATH here
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_PATH)
FEATURE = "text"   
MAX_LEN = 150 
# True => the dataset returns no target (see LiitDataset)
IS_TEST = True
DROP = 0.3     
ROBERTA=False
META_SIZE = 500
FEATURE_NO = 7


prediction = predict(df, FEATURE, TOKENIZER, MAX_LEN, IS_TEST, DROP, ROBERTA, 
                     META_SIZE, FEATURE_NO, MODEL_PATH, TRAINED_MODEL_PATH)

### 🔥 Step 3

Append predictions to the `submission.csv` dataframe and you're done!

In [None]:
# Start from the sample submission, so ids and column layout are already in place
submission = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")
# Predictions are assigned by position, so they must be in the same row order
# as the sample submission
submission.target = prediction
submission.to_csv("submission.csv", index=False)

In [None]:
submission

<img src="https://i.imgur.com/cUQXtS7.png">

# 🎨 My Specs
* 🖥 **Z8 G4** Workstation
* 💾 2 CPUs & 96GB Memory 
* 🎮 NVIDIA **Quadro RTX 8000** 
* 🏃🏾‍♀️ **RAPIDS** version 0.17 
* 💻 **Zbook Studio G7** on the go