# Paraphrase Classification Model

## Data Loading

Load the training and validation data into dataframes

Use the dataframes to create huggingface datasets

In [1]:
from datasets import load_dataset

cnn_dailymail_dataset = load_dataset("cnn_dailymail", '2.0.0')

Reusing dataset cnn_dailymail (C:\Users\phill\.cache\huggingface\datasets\cnn_dailymail\2.0.0\2.0.0\3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)
100%|██████████| 3/3 [00:00<00:00, 19.88it/s]


In [2]:
train_dataset, val_dataset, test_dataset = cnn_dailymail_dataset['train'], cnn_dailymail_dataset['validation'], cnn_dailymail_dataset['test']

In [3]:
list(cnn_dailymail_dataset.keys())

['train', 'validation', 'test']

## Tokenize Data

Load the tokenizer of the model that will be fine-tuned

In [4]:
#from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import BartTokenizer

checkpoint = 'facebook/bart-base'
tokenizer = BartTokenizer.from_pretrained(checkpoint)

This tokenize function tokenizes both sentences and concatenates them with a seperator token

In [5]:
encoder_max_length=512
decoder_max_length=128
batch_size = 8

def process_data_to_model_inputs(batch):
  # tokenize the inputs and labels
  inputs = tokenizer(batch["article"], padding="max_length", truncation=True, max_length=encoder_max_length)
  outputs = tokenizer(batch["highlights"], padding="max_length", truncation=True, max_length=decoder_max_length)

  batch["input_ids"] = inputs.input_ids
  batch["attention_mask"] = inputs.attention_mask
  batch["decoder_input_ids"] = outputs.input_ids
  batch["decoder_attention_mask"] = outputs.attention_mask
  batch["labels"] = outputs.input_ids.copy()

  # because BERT automatically shifts the labels, the labels correspond exactly to `decoder_input_ids`. 
  # We have to make sure that the PAD token is ignored
  batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]

  return batch

Apply the tokenize function to each dataset

In [6]:
tokenized_train = train_dataset.map(process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size,
    remove_columns=["article", "highlights", "id"],
    num_proc=8)
#data_collator = DataCollatorWithPadding(tokenizer)

NameError: name 'tokenizer' is not defined

In [None]:
tokenized_val = val_dataset.map(process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=["article", "highlights", "id"])

## Load model

Load the huggingface model for sequence classification with 2 labels

In [1]:
from transformers import BartForConditionalGeneration

model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

#model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2).to('cuda')

In [3]:
model.config.vocab_size

50265

In [6]:
model.config

BartConfig {
  "_name_or_path": "facebook/bart-base",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 1024,
  "model_type": "bart",
  "no_repeat_ng

## Get Trainer Config

Load the accuracy and f1 metrics and create a function that applies them to pass to the trainer

In [7]:
import numpy as np
from datasets import load_metric

metric = load_metric('accuracy', 'f1')

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

Create the TrainingArguments. This specifies checkpoint path, batch_size, epochs, when to apply the metrics, etc.

In [8]:
from transformers import TrainingArguments
batch_size = 8
training_args = TrainingArguments('./data/models/test_summarization_bart1', 
per_device_train_batch_size=batch_size, 
per_device_eval_batch_size=batch_size, 
num_train_epochs=1, 
evaluation_strategy='epoch',) 
#report_to="wandb")

# Train the model
Create the Trainer with everything created so far and run the train function.

In [11]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=cnn_dailymail_dataset['train'],
    eval_dataset=cnn_dailymail_dataset['validation'],
    #data_collator=data_collator,
    tokenizer=tokenizer,
    #compute_metrics=compute_metrics
)
train_output = trainer.train()

The following columns in the training set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: highlights, article, id.
***** Running training *****
  Num examples = 0
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 35890
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mphillipmerritt[0m (use `wandb login --relogin` to force relogin)
  warn("The `IPython.html` package has been deprecated since IPython 4.0. "
[34m[1mwandb[0m: wandb version 0.12.7 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


  0%|          | 0/35890 [00:00<?, ?it/s]

IndexError: Invalid key: 90427 is out of bounds for size 0

In [10]:
train_output

TrainOutput(global_step=3860, training_loss=0.22098883544842815, metrics={'train_runtime': 1111.6416, 'train_samples_per_second': 222.198, 'train_steps_per_second': 3.472, 'train_loss': 0.22098883544842815, 'epoch': 5.0})