# Training Script for BERT
This notebook trains BERT on either comments or posts collected from the subreddit r/amITheAsshole. To switch between the two, change the config file that is loaded in the "Load Config File" section below.
We provide two config files similar to the ones used to obtain our model results:
- `config_comments.json` to train on comments
- `config_posts.json` to train on posts

In [1]:
import copy
import datetime
import json
import os.path
import random
import time
import pprint
from aita.datasets import AITADatasetBERT
import numpy as np
import torch
from aita.utils import format_time
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import random_split
from transformers import BertForSequenceClassification, AdamW
from transformers import BertTokenizer
from transformers import get_linear_schedule_with_warmup
from aita.utils.metrics import Metrics
pp = pprint.PrettyPrinter(indent=4)


# Device Config

In [2]:

# Select the compute device once; it is later handed to the training and
# validation steps, which copy every batch tensor onto it.
if not torch.cuda.is_available():
    # CPU fallback when CUDA cannot be used.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # CUDA is usable: run on the GPU and report what was found.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce GTX 1050 Ti with Max-Q Design


# Prepare Dataset

## Load Config File

In [3]:
# Read the training configuration. The context manager closes the file handle
# as soon as the JSON has been parsed (the handle was previously left open for
# the lifetime of the kernel).
with open('training_config.json', encoding='utf-8') as config_file:
    config = json.load(config_file)
pp.pprint(config)

{   'batch_size': 2,
    'dataset_path': 'data/posts.csv',
    'epochs': 3,
    'eps': 1e-08,
    'lr': 2e-05,
    'model_name': 'model_comments.pt',
    'model_path': 'model_weights',
    'model_weights': 'model_weights/last_model_posts.pt',
    'token_length': 512,
    'undersample': True,
    'warmup_steps': 100}


In [4]:
# Tokenizer and dataset; path, token length and undersampling come from the config.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
dataset = AITADatasetBERT(dataset_path=config['dataset_path'],
                          tokenizer=tokenizer,
                          max_token_length=config['token_length'],
                          undersample=config['undersample'])

# 90% / 10% train / validation split.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Seed the split with its own generator. This cell runs before the global seeds
# are set in the training cell, so without an explicit generator the validation
# set would change from run to run and metrics would not be comparable.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(dataset, [train_size, val_size],
                                          generator=split_generator)

train_dataloader = DataLoader(train_dataset,  # The training samples.
                              sampler=RandomSampler(train_dataset),  # Select batches randomly
                              batch_size=config["batch_size"])

validation_dataloader = DataLoader(val_dataset,  # The validation samples.
                                   sampler=SequentialSampler(val_dataset),  # Pull out batches sequentially.
                                   batch_size=config["batch_size"])
print(f"Training Dataset Size: {len(train_dataset)}, Validation Dataset Size: {len(val_dataset)}")

Training Dataset Size: 8053, Validation Dataset Size: 895


# Prepare Model

In [5]:
# Pretrained `bert-base-uncased` wrapped for 2-label sequence classification;
# attentions and hidden states are not returned by the forward pass.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=2, output_attentions=False, output_hidden_states=False,
)

# Learning rate and Adam epsilon are both read from the config file.
optimizer_settings = dict(lr=config["lr"], eps=config["eps"])
optimizer = AdamW(model.parameters(), **optimizer_settings)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

## Initialize Scheduler

In [6]:
# Linear schedule with warmup, sized for one scheduler step per training batch
# across all epochs.
epochs = config['epochs']
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=config["warmup_steps"],
    num_training_steps=total_steps,
)

# Training

In [7]:
def train_step(train_dataloader, model, t0, device, optimizer, scheduler, log_every=10):
    """Run one pass over the training data, updating the model after every batch.

    This function does not switch the model to training mode; call
    ``model.train()`` before invoking it.

    Args:
        train_dataloader: Sized iterable of batches. Each batch is indexed as
            ``[0]`` input ids, ``[1]`` attention masks, ``[2]`` labels.
        model: Model that, when called with ``labels=...`` and
            ``return_dict=False``, returns a ``(loss, logits)`` pair.
        t0: ``time.time()`` stamp taken at the start of the epoch; only used
            for the elapsed-time progress message.
        device: Device every batch tensor is copied to before the forward pass.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        log_every: Print a progress line every ``log_every`` batches (the
            first batch is never reported). Defaults to 10, the interval that
            used to be hard-coded. A falsy value disables progress output.

    Returns:
        A tuple ``(average_loss, metrics)`` where ``average_loss`` is the
        summed batch loss divided by the number of batches and ``metrics`` is
        the result of ``Metrics.calculate_metrics()`` over all batches.

    Raises:
        ZeroDivisionError: If the dataloader contains no batches.
    """
    num_batches = len(train_dataloader)
    total_train_loss = 0
    training_metrics = Metrics()
    for step, batch in enumerate(train_dataloader):
        # Periodic progress update (skipping step 0).
        if log_every and step % log_every == 0 and not step == 0:
            # Calculate elapsed time since the start of the epoch.
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, num_batches, elapsed))

        # Unpack this training batch from our dataloader, copying each tensor
        # to the requested device with the `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous batch.
        model.zero_grad()

        # Passing `labels` makes the model compute and return the loss itself.
        # NOTE(review): the labels are indexed with `[:, 1]` below, so they
        # appear to be 2-D (presumably one-hot). Which loss the model applies
        # to such labels depends on the transformers version and label dtype;
        # confirm it is the intended one.
        loss, logits = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels,
                             return_dict=False)

        # `.item()` keeps the running total a plain Python float.
        total_train_loss += loss.item()

        logits = logits.detach().to("cpu").numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Predicted class = arg-max over the logits; reference label = column 1
        # of the label array. Both are accumulated into the running metrics.
        pred_flat = np.argmax(logits, axis=1).flatten().astype(np.float32)
        labels_flat = label_ids[:, 1].flatten().astype(np.float32)
        training_metrics.update_metrics(labels_flat, pred_flat)

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()
    return total_train_loss / num_batches, training_metrics.calculate_metrics()

In [8]:
def validation_step(validation_dataloader, model, device):
    """Evaluate the model on the validation data without updating it.

    This function does not switch the model to evaluation mode; call
    ``model.eval()`` before invoking it.

    Args:
        validation_dataloader: Sized iterable of batches. Each batch is
            indexed as ``[0]`` input ids, ``[1]`` attention masks,
            ``[2]`` labels.
        model: Model that, when called with ``labels=...`` and
            ``return_dict=False``, returns a ``(loss, logits)`` pair.
        device: Device every batch tensor is copied to before the forward pass.

    Returns:
        A tuple ``(average_loss, metrics)`` where ``average_loss`` is a Python
        float (summed batch loss divided by the number of batches) and
        ``metrics`` is the result of ``Metrics.calculate_metrics()`` over all
        batches.

    Raises:
        ZeroDivisionError: If the dataloader contains no batches.
    """
    total_validation_loss = 0
    validation_metrics = Metrics()

    for batch in validation_dataloader:

        # Unpack this validation batch from our dataloader, copying each
        # tensor to the requested device with the `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():
            # Forward pass. Passing `labels` makes the model return the loss
            # together with the logits (the raw, pre-activation outputs).
            # The documentation for this `model` function is here:
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            (loss, logits) = model(b_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_input_mask,
                                   labels=b_labels,
                                   return_dict=False)

            # Accumulate a plain Python float, as train_step does, instead of
            # a tensor that lives on `device`.
            total_validation_loss += loss.item()

            # Move logits and labels to CPU
            logits = logits.detach().to('cpu').numpy()
            label_ids = b_labels.to('cpu').numpy()

            # Predicted class = arg-max over the logits; reference label =
            # column 1 of the label array.
            pred_flat = np.argmax(logits, axis=1).flatten().astype(np.float32)
            labels_flat = label_ids[:, 1].flatten().astype(np.float32)
            validation_metrics.update_metrics(labels_flat, pred_flat)

    return total_validation_loss / len(validation_dataloader), validation_metrics.calculate_metrics()



In [9]:
# Move the model to the device selected in the "Device Config" cell. Unlike an
# unconditional `model.cuda()`, this also works on machines without a GPU.
model.to(device)

# Seed every random number generator in use.
# NOTE(review): these seeds only take effect from this cell onward; anything
# random in earlier cells is not covered by them.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Not used within this cell - presumably consumed by later cells; verify.
training_stats = []
total_t0 = time.time()

# Best validation results seen so far; a checkpoint is written only when an
# epoch improves on BOTH accuracy and loss.
best_val_acc, best_val_loss = 0, torch.finfo(torch.float32).max

model_full_path = os.path.join(config['model_path'], config['model_name'])

# Make sure the checkpoint directory exists so torch.save cannot fail on a
# missing folder after a full epoch of training.
model_dir = os.path.dirname(model_full_path)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

for epoch_i in range(epochs):

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    t0 = time.time()

    # set model to train
    model.train()
    # One full pass over the training data.
    training_loss, training_metrics = train_step(train_dataloader=train_dataloader,
                                                 device=device, t0=t0,
                                                 model=model,
                                                 optimizer=optimizer,
                                                 scheduler=scheduler)

    training_time = format_time(time.time() - t0)

    print("")
    print("  Training Metrics: ", training_metrics)
    print("  Average training loss: {0:.4f}".format(training_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # validation step

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode -- the dropout layers behave differently during evaluation.
    model.eval()
    avg_val_loss, validation_metrics = validation_step(model=model,
                                                       validation_dataloader=validation_dataloader,
                                                       device=device)

    validation_time = format_time(time.time() - t0)
    avg_val_accuracy = validation_metrics["accuracy"]
    print("")
    print("  Validation Metrics: ", validation_metrics)
    print("  Average validaton loss: {0:.4f}".format(avg_val_loss))
    print("  Validation epoch took: {:}".format(validation_time))

    # Keep the weights of the best epoch so far (better accuracy AND lower loss).
    if avg_val_accuracy > best_val_acc and avg_val_loss < best_val_loss:
        best_val_acc = avg_val_accuracy
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), model_full_path)


Training...

Running Validation...

  Validation Metrics:  {'accuracy': 0.5066964, 'precision': 0.5066964, 'recall': 1.0, 'f1': 0.59226286, 'MCC': 0.0}
  Average validaton loss: 0.7031
  Validation epoch took: 0:01:09

Training...

Running Validation...

  Validation Metrics:  {'accuracy': 0.5066964, 'precision': 0.5066964, 'recall': 1.0, 'f1': 0.59226286, 'MCC': 0.0}
  Average validaton loss: 0.7031
  Validation epoch took: 0:01:11

Training...

Running Validation...

  Validation Metrics:  {'accuracy': 0.5066964, 'precision': 0.5066964, 'recall': 1.0, 'f1': 0.59226286, 'MCC': 0.0}
  Average validaton loss: 0.7031
  Validation epoch took: 0:01:11
