# Attempt to fine-tune BERT with the very limited data available

**Result:** The code works, but posts of 510 or more tokens are currently dropped before training; more work is required to obtain embeddings for longer texts.

In [0]:
!nvidia-smi

Thu May 21 13:08:51 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.82       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P0    33W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [0]:
# NOTE(review): unpinned install. Later cells use `pad_to_max_length` and
# `transformers.AdamW`, which newer transformers releases deprecate — consider
# pinning the version used for the recorded run (TODO confirm which one).
!pip install transformers



In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import json
from bs4 import BeautifulSoup
from pprint import pprint
import pandas as pd
import re

def clean_text(raw_text: str):
    '''
    Strip HTML markup from a post body and flatten it onto a single line.

    Returns an empty string when `raw_text` is None. Otherwise the text
    extracted by BeautifulSoup's html.parser is returned with every newline
    and non-breaking space replaced by a regular space.
    '''
    if raw_text is None:
        return ''

    plain_text = BeautifulSoup(raw_text, features="html.parser").get_text()

    # Both characters are swapped for an ordinary space, one after the other.
    for unwanted_char in ('\n', '\xa0'):
        plain_text = plain_text.replace(unwanted_char, ' ')

    return plain_text


def read_json_as_df(path: str) -> pd.DataFrame:
    '''
    Load a file holding one JSON object per line into a DataFrame.

    For every line, the object's `post.body` (None when the key is absent) is
    passed through clean_text and stored in the `text` column; the object's
    `priority` value is stored unchanged in the `priority` column.
    '''
    with open(path, 'r', encoding='utf-8') as file:
        # map() is lazy, so each line is parsed and cleaned before the next
        # one is read.
        rows = [
            [clean_text(record['post'].get('body', None)), record['priority']]
            for record in map(json.loads, file)
        ]

    return pd.DataFrame(data=rows, columns=('text', 'priority'))

In [0]:
import random
import numpy as np
import pandas as pd
import transformers
import time
import datetime
from tqdm import tqdm

import torch

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU so that
# `device` is always defined for the cells that move tensors to it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [0]:
# Length every encoded example is padded to (passed to encode_plus as max_length).
MAX_LENGTH = 512
# Number of examples per batch in the train and test data loaders.
BATCH_SIZE = 16
# Number of full passes over the training data.
EPOCHS = 3

# Priority name -> integer class id used as the training target.
label_mapping = dict(green=0, amber=1, escalate=2, red=3)

In [0]:
# Tokenizer for the same 'bert-base-uncased' checkpoint the classifier is
# loaded from further down.
tokenizer = transformers.BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [0]:
# Each file is read line by line as JSON (see read_json_as_df); the resulting
# frames have a cleaned `text` column and a `priority` label column.
train_df = read_json_as_df('/content/drive/My Drive/temp_datasets/combined-train.json')
test_df = read_json_as_df('/content/drive/My Drive/temp_datasets/combined-test.json')

In [0]:
# Length of each cleaned post in characters (not tokens).
train_df['sentence_len'] = np.array(list(map(len, train_df.text)))
test_df['sentence_len'] = np.array(list(map(len, test_df.text)))

In [0]:
# Number of tokens the BERT tokenizer produces for each post (no special tokens
# are added by `tokenize`; those are added later by `encode_plus`).
train_df['tokenized_len'] = np.array(
    [len(pieces) for pieces in map(tokenizer.tokenize, train_df.text)])
test_df['tokenized_len'] = np.array(
    [len(pieces) for pieces in map(tokenizer.tokenize, test_df.text)])

In [0]:
# encode_plus(add_special_tokens=True) adds extra tokens around each post
# (presumably the two BERT markers, hence the "- 2"), so the token budget is
# derived from MAX_LENGTH instead of a separate magic number. Posts that do not
# fit are dropped, not truncated.
# NOTE(review): the strict `<` also drops posts of exactly MAX_LENGTH - 2
# tokens; kept unchanged so the filtered data set stays the same.
train_df = train_df[train_df.tokenized_len < MAX_LENGTH - 2]
test_df = test_df[test_df.tokenized_len < MAX_LENGTH - 2]

In [0]:
def tokenize(sentences, tokenizer, max_length=None):
    '''
    Encode sentences into padded input-id and attention-mask arrays.

    Parameters:
        sentences: iterable of strings to encode.
        tokenizer: object exposing `encode_plus` (here a BertTokenizer).
        max_length: length passed to `encode_plus`, which is also asked to pad
            to it; defaults to the notebook-wide MAX_LENGTH when None.

    Returns:
        A tuple `(input_ids, attention_masks)` of int32 NumPy arrays with one
        row per sentence (rows have equal length as long as the tokenizer pads
        every encoding to `max_length`).
    '''
    if max_length is None:
        # Resolved at call time so the default follows the config cell.
        max_length = MAX_LENGTH

    input_ids, input_masks = [], []
    for sentence in tqdm(sentences):
        inputs = tokenizer.encode_plus(sentence,
                                       add_special_tokens=True,
                                       max_length=max_length,
                                       pad_to_max_length=True,
                                       return_attention_mask=True,
                                       return_token_type_ids=False)
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])

    return np.asarray(input_ids, dtype='int32'), np.asarray(input_masks, dtype='int32')

In [0]:
# Encode both splits into (input ids, attention mask) NumPy arrays.
train_input_ids, train_input_masks = tokenize(train_df.text, tokenizer)
test_input_ids, test_input_masks = tokenize(test_df.text, tokenizer)

100%|██████████| 6301/6301 [00:07<00:00, 793.21it/s]
100%|██████████| 386/386 [00:00<00:00, 613.75it/s]


In [0]:
# Convert the NumPy arrays returned by tokenize() into torch tensors, keeping
# the same variable names.
train_input_ids, train_input_masks = map(torch.tensor, (train_input_ids, train_input_masks))
test_input_ids, test_input_masks = map(torch.tensor, (test_input_ids, test_input_masks))

In [0]:
def _encode_priorities(priorities):
    '''
    Map a Series of priority names to an int64 tensor of class ids.

    Raises ValueError when a name is missing from `label_mapping`; without the
    check Series.map would leave NaN there and produce a float tensor.
    '''
    encoded = priorities.map(label_mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        raise ValueError('Unknown priority labels: {}'.format(
            priorities[unmapped].unique().tolist()))
    return torch.tensor(encoded.astype('int64').values)


train_labels = _encode_priorities(train_df.priority)
test_labels = _encode_priorities(test_df.priority)

In [0]:
# Each dataset item is an (input_ids, attention_mask, label) triple.
train_dataset = torch.utils.data.TensorDataset(
    train_input_ids, train_input_masks, train_labels)

test_dataset = torch.utils.data.TensorDataset(
    test_input_ids, test_input_masks, test_labels)

In [0]:
# train_size = int(0.9 * len(train_dataset))
# val_size = len(train_dataset) - train_size

# train_dataset, val_dataset = torch.utils.data.random_split(train_dataset, [train_size, val_size])

# print(f'Train set size: {train_size}\nValid set size: {val_size}')

In [0]:
# Training batches are drawn in random order.
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    sampler=torch.utils.data.RandomSampler(train_dataset),
    batch_size=BATCH_SIZE,
)

# Validation loader, disabled together with the train/validation split:
# validation_dataloader = torch.utils.data.DataLoader(val_dataset,
#                                              sampler=torch.utils.data.RandomSampler(val_dataset),
#                                              batch_size=BATCH_SIZE)

# Test batches keep the data set order so predictions line up with the labels.
test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    sampler=torch.utils.data.SequentialSampler(test_dataset),
    batch_size=BATCH_SIZE,
)

In [0]:
# Pretrained BERT encoder with a classification head that has one output per
# priority class in `label_mapping`.
model = transformers.BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_mapping),
    output_attentions=False,
    output_hidden_states=False,
)

# Move the model to the same `device` the batches are sent to in the training
# and evaluation loops, instead of assuming CUDA.
model.to(device)

In [0]:
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# (heading, parameters) pairs, printed in this order.
sections = [
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
]

for heading, entries in sections:
    print(heading)
    # One line per parameter: name, then its shape as a tuple.
    for name, tensor in entries:
        print("{:<55} {:>12}".format(name, str(tuple(tensor.size()))))

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (30522, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

In [0]:
# AdamW over every model parameter (nothing is frozen).
optimizer = transformers.AdamW(
    model.parameters(),
    lr=5e-5,
    eps=1e-8,
)

In [0]:
# The scheduler is stepped once per batch, so the schedule spans
# (batches per epoch) * EPOCHS steps.
total_steps = EPOCHS * len(train_dataloader)

scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [0]:
def flat_accuracy(preds, labels):
    '''
    Fraction of rows in `preds` whose arg-max (along axis 1) equals the label.
    '''
    predicted_classes = np.argmax(preds, axis=1).ravel()
    expected_classes = labels.ravel()
    n_correct = np.sum(predicted_classes == expected_classes)
    return n_correct / len(expected_classes)

def format_time(elapsed):
    '''
    Convert a duration in seconds into a string such as 0:05:29 (h:mm:ss).
    '''
    # timedelta's string form provides the h:mm:ss layout; the duration is
    # rounded to the nearest whole second first.
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [0]:
# NOTE(review): the model and the data loaders are created in earlier cells,
# i.e. before these seeds are set, so their random state is not covered here.
seed_val = 42

# Seed Python, NumPy and PyTorch (CPU and all CUDA devices) with the same value.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(seed_val)

# Per-epoch statistics; only the (currently commented-out) validation code
# appends to this list.
training_stats = []

# Wall-clock start of the whole training run.
total_t0 = time.time()

# One pass over the training data per epoch; prints progress every 40 batches
# and the average training loss at the end of each epoch.
for epoch_i in range(EPOCHS):

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, EPOCHS))
    print('Training...')

    t0 = time.time()

    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        # Progress report every 40 batches, skipping the very first one.
        if step % 40 == 0 and step != 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Ids and masks were stored as int32, hence the cast to long here.
        b_input_ids = batch[0].long().to(device)
        b_input_mask = batch[1].long().to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous batch.
        model.zero_grad()

        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        # With labels passed, the loss comes first in the model output. It is
        # read by index (as the evaluation cell does) rather than by tuple
        # unpacking, so the code does not depend on how many values come back.
        loss = outputs[0]

        total_train_loss += loss.item()

        loss.backward()

        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        # The learning-rate schedule advances once per batch.
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)

    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
    
    # print("")
    # print("Running Validation...")
    
    # t0 = time.time()
    
    # model.eval()
    
    # total_eval_accuracy = 0
    # total_eval_loss = 0
    # nb_eval_steps = 0
    
    # for batch in validation_dataloader:
        
    #     b_input_ids = batch[0].long().to(device)
    #     b_input_mask = batch[1].long().to(device)
    #     b_labels = batch[2].to(device)
        
    #     with torch.no_grad():        
            
    #         (loss, logits) = model(b_input_ids, 
    #                                token_type_ids=None, 
    #                                attention_mask=b_input_mask,
    #                                labels=b_labels)
       
    # total_eval_loss += loss.item() 
    # logits = logits.detach().cpu().numpy()
    # label_ids = b_labels.to('cpu').numpy()
    
    # total_eval_accuracy += flat_accuracy(logits, label_ids)
    
    # avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    # print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    # validation_time = format_time(time.time() - t0)
    
    # print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    # print("  Validation took: {:}".format(validation_time))

    # training_stats.append(
    #     {
    #         'epoch': epoch_i + 1,
    #         'Training Loss': avg_train_loss,
    #         'Valid. Loss': avg_val_loss,
    #         'Valid. Accur.': avg_val_accuracy,
    #         'Training Time': training_time,
    #         'Validation Time': validation_time
    #     }
    # )

# Summary printed once all epochs have finished.
print("")
print("Training complete!")

total_training_time = format_time(time.time() - total_t0)
print("Total training took {:} (h:mm:ss)".format(total_training_time))
    


Training...


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)


  Batch    40  of    394.    Elapsed: 0:00:33.
  Batch    80  of    394.    Elapsed: 0:01:07.
  Batch   120  of    394.    Elapsed: 0:01:40.
  Batch   160  of    394.    Elapsed: 0:02:14.
  Batch   200  of    394.    Elapsed: 0:02:47.
  Batch   240  of    394.    Elapsed: 0:03:21.
  Batch   280  of    394.    Elapsed: 0:03:54.
  Batch   320  of    394.    Elapsed: 0:04:27.
  Batch   360  of    394.    Elapsed: 0:05:01.

  Average training loss: 0.72
  Training epcoh took: 0:05:29

Training...
  Batch    40  of    394.    Elapsed: 0:00:33.
  Batch    80  of    394.    Elapsed: 0:01:07.
  Batch   120  of    394.    Elapsed: 0:01:40.
  Batch   160  of    394.    Elapsed: 0:02:14.
  Batch   200  of    394.    Elapsed: 0:02:47.
  Batch   240  of    394.    Elapsed: 0:03:21.
  Batch   280  of    394.    Elapsed: 0:03:54.
  Batch   320  of    394.    Elapsed: 0:04:27.
  Batch   360  of    394.    Elapsed: 0:05:01.

  Average training loss: 0.48
  Training epcoh took: 0:05:29

Training...
  Ba

In [0]:
# Switch the model to evaluation mode before predicting on the test set.
model.eval()

# One array of logits / label ids per batch, in data-loader order.
predictions, true_labels = [], []

for batch in test_dataloader:

    # Every tensor in the batch is cast to long and moved to the target device.
    b_input_ids, b_input_mask, b_labels = (t.long().to(device) for t in batch)

    # No labels are passed, so the first output element holds the logits.
    with torch.no_grad():
        logits = model(b_input_ids,
                       token_type_ids=None,
                       attention_mask=b_input_mask)[0]

    # Results are collected on the CPU as NumPy arrays.
    predictions.append(logits.detach().cpu().numpy())
    true_labels.append(b_labels.to('cpu').numpy())

In [0]:
from sklearn.metrics import classification_report

In [0]:
# Stack the per-batch logits and take the highest-scoring class for each row.
flat_predictions = np.argmax(np.concatenate(predictions, axis=0), axis=1).flatten()

# Matching true class ids, in the same order.
flat_true_labels = np.concatenate(true_labels, axis=0)

In [0]:
# Class id -> priority name, derived from `label_mapping` so the two mappings
# cannot drift apart.
inverted_label_mapping = {class_id: name for name, class_id in label_mapping.items()}

# `.get(value, value)` leaves values that are not class ids untouched, so
# re-running this cell on already-decoded names no longer turns them into None.
flat_predictions = [inverted_label_mapping.get(label, label) for label in flat_predictions]
flat_true_labels = [inverted_label_mapping.get(label, label) for label in flat_true_labels]

In [0]:
# Per-class precision / recall / F1 over the four priority classes.
print(classification_report(y_true=flat_true_labels, y_pred=flat_predictions))

              precision    recall  f1-score   support

       amber       0.69      0.72      0.71       151
    escalate       0.44      0.25      0.32        16
       green       0.78      0.96      0.86       156
         red       0.58      0.24      0.34        63

    accuracy                           0.72       386
   macro avg       0.62      0.54      0.56       386
weighted avg       0.70      0.72      0.69       386



In [0]:
from sklearn.metrics import f1_score
# Macro-averaged F1 over the three non-green classes only.
print('Macro avgd f1 score: ',
      f1_score(flat_true_labels, flat_predictions,
               labels=['escalate', 'red', 'amber'], average='macro'))

Macro avgd f1 score:  0.4541934232694569


In [0]:
def map_flagged(label):
    '''
    Collapse a priority name into 'flagged' (escalate, red, amber) or 'green'.

    Any other value yields None.
    '''
    if label in ('escalate', 'red', 'amber'):
        return 'flagged'
    if label == 'green':
        return 'green'
    return None

# Binary view of the results: flagged (escalate / red / amber) versus green.
flagged_ytest = [map_flagged(label) for label in flat_true_labels]
flagged_predictions = [map_flagged(label) for label in flat_predictions]

print('Classification Report:\n', classification_report(flagged_ytest, flagged_predictions))

Classification Report:
               precision    recall  f1-score   support

     flagged       0.97      0.81      0.88       230
       green       0.78      0.96      0.86       156

    accuracy                           0.87       386
   macro avg       0.87      0.89      0.87       386
weighted avg       0.89      0.87      0.87       386



In [0]:
def map_urgent(label):
    '''
    Collapse a priority name into 'urgent' (escalate, red) or 'non-urgent'
    (green, amber).

    Any other value yields None.
    '''
    if label in ('escalate', 'red'):
        return 'urgent'
    if label in ('green', 'amber'):
        return 'non-urgent'
    return None

# Binary view of the results: urgent (escalate / red) versus non-urgent.
urgent_ytest = [map_urgent(label) for label in flat_true_labels]
urgent_predictions = [map_urgent(label) for label in flat_predictions]

print('Classification Report:\n', classification_report(urgent_ytest, urgent_predictions))

Classification Report:
               precision    recall  f1-score   support

  non-urgent       0.86      0.98      0.92       307
      urgent       0.86      0.38      0.53        79

    accuracy                           0.86       386
   macro avg       0.86      0.68      0.72       386
weighted avg       0.86      0.86      0.84       386

