In [2]:
import random
import numpy as np
import os
import torch
from tqdm import tqdm
from transformers import RobertaForSequenceClassification, RobertaConfig, RobertaTokenizer
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from model_compression.training_utils.datasets import (processors, load_and_cache_examples)
from model_compression.training_utils.modules import RobertaForSpanClassification
from model_compression.training_utils.metrics import superglue_compute_metrics
from model_compression.training_utils.training_utils import train, evaluate
from model_compression.training_utils.utils import TrainConfig, output_modes

In [3]:
MODEL_CLASSES = {
    "roberta": (
        RobertaConfig,
        RobertaTokenizer,
        {"classification": RobertaForSequenceClassification, "span_classification": RobertaForSpanClassification},
    ),
}
tasks_num_spans = {
    "wic": 2,
    "wsc": 2,
}


In [4]:
from typing import Optional
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
@dataclass
class TrainConfig:
    weight_decay: float = 0.01
    max_steps: int = -1
    gradient_accumulation_steps: int = 1
    num_train_epochs: int = 30
    warmup_ratio: float = 0.06
    learning_rate: float = 0.00001
    adam_epsilon:float = 1e-8
    max_grad_norm:float = 1.0
    train_batch_size: int = 16 
    eval_batch_size:int = 32
    eval_and_save_steps:float = 500


In [5]:
task_name = "BoolQ"
model_type = "roberta"
model_checkpoint = "roberta-base"
tokenizer_name = model_checkpoint
do_lower_case = False
data_dir = "../../data/BoolQ/"
output_dir = ""

In [6]:
# Prepare task
task_name = task_name.lower()
assert task_name in processors, f"Task {task_name} not found!"
processor = processors[task_name]()
output_mode = output_modes[task_name]
label_list = processor.get_labels() 
num_labels = len(label_list)

In [7]:
model_type = model_type.lower()
config_class, tokenizer_class, model_classes = MODEL_CLASSES[model_type]
model_class = model_classes[output_mode]
config = config_class.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    finetuning_task=task_name,
)
if output_mode == "span_classification":
    config.num_spans = tasks_num_spans[task_name]
tokenizer = tokenizer_class.from_pretrained(
    tokenizer_name,
    do_lower_case=do_lower_case,
)
model = model_class.from_pretrained(
            model_checkpoint,
            config=config,
        )
model.cuda()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [8]:
train_dataset = load_and_cache_examples(task_name, tokenizer, data_dir, max_seq_length=512) 

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [9]:
train_config = TrainConfig(train_batch_size=32)

In [None]:
global_step, tr_loss = train(train_dataset, model, tokenizer, 
                             output_mode = output_mode,
                             model_type=model_type,
                             train_config=train_config, task_name=task_name, data_dir=data_dir)

Epoch 0 loss: 0.634: 100%|████████████████████| 295/295 [01:35<00:00,  3.07it/s]
Epoch 1 loss: 0.553:  69%|█████████████▊      | 204/295 [01:04<00:28,  3.17it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned 

In [None]:
result, preds, ex_ids = evaluate(task_name, model, tokenizer, eval_batch_size=train_config.eval_batch_size,
                                             device="cuda:0", use_fixed_seq_length=False, output_mode=output_mode,
                                             model_type=model_type, use_tqdm=True, data_dir=data_dir)
result = dict((f"{k}", v) for k, v in result.items())

eval_task_names = (task_name,) 

for eval_task_name in eval_task_names:
    result, preds, ex_ids = evaluate(eval_task_name, model, tokenizer, eval_batch_size=train_config.eval_batch_size,
                                     device="cuda:0", use_fixed_seq_length=False, output_mode=output_mode,
                                     model_type=model_type, use_tqdm=True, data_dir=data_dir,split="test", prefix="")
    
    processor = processors[eval_task_name]()
    if task_name == "record":
        answers = processor.get_answers(data_dir, "test")
        processor.write_preds(preds, ex_ids, output_dir, answers=answers)
    else:
        processor.write_preds(preds, ex_ids, output_dir)