## RoBERTa

Paper - https://arxiv.org/pdf/1907.11692.pdf

### Differences from BERT

* Static Vs Dynamic Masking
* NSP (Next Sentence Prediction) objective is removed
* Larger dataset

In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"  
os.environ["CUDA_VISIBLE_DEVICES"]="1"

import time
import numpy as np
import pandas as pd
import random
from sklearn import model_selection

import torch
from torch import nn
from torch.utils import data
from transformers import (WEIGHTS_NAME, 
                          BertConfig, BertForSequenceClassification, BertTokenizer,
                          RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer,
                          XLMConfig, XLMForSequenceClassification, XLMTokenizer, 
                          XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer,
                          DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
from transformers import AdamW, WarmupLinearSchedule
from transformers import AutoModelForSequenceClassification, AutoTokenizer

MODEL_CLASSES = {
    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
    'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
}

df = pd.read_csv("../input/ClothingReviews.csv")
train_df, test_df = model_selection.train_test_split(df, test_size=0.3, random_state=2019)

I1106 05:16:54.155798 139803614635776 file_utils.py:39] PyTorch version 1.1.0 available.
I1106 05:16:54.188442 139803614635776 modeling_xlnet.py:194] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


### Custom Functions

In [2]:
def convert_text_to_features(examples, tokenizer,
                                      max_length=512,
                                      pad_on_left=False,
                                      pad_token=0,
                                      pad_token_segment_id=0,
                                      mask_padding_with_zero=True):
    """
    Loads a data file into a list of ``InputFeatures``
    Args:
        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        task: GLUE task
        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
        output_mode: String indicating the output mode. Either ``regression`` or ``classification``
        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
            actual values)
    Returns:
        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
        containing the task-specific features. If the input is a list of ``InputExamples``, will return
        a list of task-specific ``InputFeatures`` which can be fed to the model.
    """
    features = [[],[],[]]
    for (ex_index, example) in enumerate(examples):
#         if ex_index % 10000 == 0:
#             print("Writing example %d" % (ex_index))

        inputs = tokenizer.encode_plus(
            example,
            None,
            add_special_tokens=True,
            max_length=max_length,
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)

#         if ex_index < 1:
#             print("*** Example ***")
#             print("input_ids: %s" % " ".join([str(x) for x in input_ids]))
#             print("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
#             print("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))

        features[0].append(input_ids)
        features[1].append(attention_mask)
        features[2].append(token_type_ids)

    return features

def seed_everything(seed=123):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

device = 'cuda' if torch.cuda.is_available() else 'cpu'

### RoBERTa Model 

In [3]:
model_start_time = time.time()
# Model Config and initialize
model_name = "roberta"
pretrained_model_name = "roberta-base"
n_classes = 1
config_class, model_class, tokenizer_class = MODEL_CLASSES[model_name]
config = config_class.from_pretrained(pretrained_model_name)
tokenizer = tokenizer_class.from_pretrained(pretrained_model_name, do_lower_case=False)
model = model_class.from_pretrained(pretrained_model_name, num_labels=1)

# Dataset Preparation
max_length = 128
train_df["text"] = train_df["text"].astype(str).fillna("NA")
train_features = convert_text_to_features(train_df["text"], tokenizer, max_length=max_length)
X = torch.tensor(train_features[0], dtype=torch.long)
X_mask = torch.tensor(train_features[1], dtype=torch.long)
X_seg_ids = torch.tensor(train_features[2], dtype=torch.long)
y = train_df["recommended"].values
y = torch.tensor(y[:,np.newaxis], dtype=torch.float32)

batch_size = 8
train_dataset = data.TensorDataset(X, X_mask, X_seg_ids, y)
train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Model Building
n_epochs = 1
accumulation_steps = 1
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
num_train_optimization_steps = int(n_epochs*len(train_dataset)/batch_size/accumulation_steps)
num_warmup_steps = int(0.05*num_train_optimization_steps)
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, correct_bias=False)
scheduler = WarmupLinearSchedule(optimizer,
                                 warmup_steps=num_warmup_steps,
                                 t_total=num_train_optimization_steps)

seed_everything()
model.to(device)
loss_fn = nn.BCEWithLogitsLoss()

for epoch in range(n_epochs):
    model.train()
    for x_batch, x_mask, x_seg_ids, y_batch in train_loader:
        outputs = model(x_batch.to(device),
                        attention_mask=x_mask.to(device),
                        token_type_ids=x_seg_ids.to(device),
                        labels=None)
        y_pred = outputs[0]
        loss = loss_fn(y_pred, y_batch.to(device))
        loss.backward()
        scheduler.step()
        optimizer.step()
        optimizer.zero_grad()

model_end_time = time.time()

I1106 05:16:54.680823 139803614635776 file_utils.py:296] https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json not found in cache or force_download set to True, downloading to /tmp/tmp8pjgt5ip
100%|██████████| 473/473 [00:00<00:00, 121771.78B/s]
I1106 05:16:55.041465 139803614635776 file_utils.py:309] copying /tmp/tmp8pjgt5ip to cache at /home/srk/.cache/torch/transformers/e1a2a406b5a05063c31f4dfdee7608986ba7c6393f7f79db5e69dcd197208534.9dad9043216064080cf9dd3711c53c0f11fe2b09313eaa66931057b4bdcaf068
I1106 05:16:55.044054 139803614635776 file_utils.py:313] creating metadata file for /home/srk/.cache/torch/transformers/e1a2a406b5a05063c31f4dfdee7608986ba7c6393f7f79db5e69dcd197208534.9dad9043216064080cf9dd3711c53c0f11fe2b09313eaa66931057b4bdcaf068
I1106 05:16:55.046086 139803614635776 file_utils.py:322] removing temp file /tmp/tmp8pjgt5ip
I1106 05:16:55.047217 139803614635776 configuration_utils.py:151] loading configuration file https://s3.amazonaws.com/models.hug

### Preparation of test dataset

In [4]:
pred_start_time = time.time()
# Data Preparation
test_df["text"] = test_df["text"].astype(str).fillna("NA")
test_features = convert_text_to_features(test_df["text"], tokenizer, max_length=max_length)

test_X = torch.tensor(test_features[0], dtype=torch.long)
test_X_mask = torch.tensor(test_features[1], dtype=torch.long)
test_X_seg_ids = torch.tensor(test_features[2], dtype=torch.long)
test_y = test_df["recommended"].values

test_dataset = data.TensorDataset(test_X, test_X_mask, test_X_seg_ids)
test_loader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Prediction
preds = np.zeros([len(test_dataset), 1])
model.eval()
for i, (x_batch, x_mask, x_seg_ids) in enumerate(test_loader):
    outputs = model(x_batch.to(device),
                    attention_mask=x_mask.to(device),
                    token_type_ids=x_seg_ids.to(device),
                    labels=None)
    y_pred = sigmoid(outputs[0].detach().cpu().numpy())
    preds[i*batch_size:(i+1)*batch_size, :] = y_pred
    
from sklearn import metrics
print(metrics.roc_auc_score(test_y, preds))
pred_end_time = time.time()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


0.9635614176358467


In [5]:
preds

array([[0.9979741 ],
       [0.99806815],
       [0.97840422],
       ...,
       [0.84606194],
       [0.9881072 ],
       [0.96491498]])

In [6]:
print(model_end_time - model_start_time)
print(pred_end_time - pred_start_time)

323.97860860824585
36.13456892967224
