<a href="https://colab.research.google.com/github/oaarnikoivu/dissertation/blob/master/BERT_torch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Detecting Emotions from Tweets with BERT 

First, we install the `transformers` Python package, which provides the pre-trained BERT models, and `fast-bert`, the training wrapper used throughout this notebook.

In [0]:
!pip install transformers
!pip install fast-bert

In [0]:
from transformers import BertTokenizer
from box import Box
from tqdm import tqdm, trange
from pathlib import Path
from sklearn.model_selection import train_test_split

import torch
import pandas as pd 
import collections 
import random
import numpy as np
import apex

import logging
import datetime

from fast_bert.modeling import BertForMultiLabelSequenceClassification
from fast_bert.data_cls import BertDataBunch, InputExample, InputFeatures, MultiLabelTextProcessor, convert_examples_to_features
from fast_bert.learner_cls import BertLearner
from fast_bert.metrics import accuracy_multilabel, accuracy_thresh, fbeta, roc_auc

In [54]:
# Mount Google Drive into the Colab filesystem at /content/drive; the data,
# label and model directories configured below all live under this mount.
from google.colab import drive 
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Emit INFO-and-above records with a timestamped "time - level - name - message" prefix.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
)

# Logger for this notebook; it is passed to the fast-bert learner further down.
logger = logging.getLogger(__name__)

In [0]:
# Ask PyTorch to release its cached CUDA memory before the model is built —
# presumably to reclaim memory left over from earlier runs in the same kernel.
torch.cuda.empty_cache()

In [0]:
# Wall-clock timestamp for this run, formatted so it is safe to embed in a file name.
run_start_time = format(datetime.datetime.today(), '%Y-%m-%d_%H-%M-%S')

In [0]:
# Root of the mounted Google Drive; every directory below is derived from it.
# (pathlib collapses the doubled slash produced by the concatenations.)
path = '/content/drive/My Drive/'

# Input locations: DATA_PATH and LABEL_PATH are passed to the data bunch below;
# BERT_PRETRAINED_PATH is defined here but not referenced in the cells shown.
DATA_PATH = Path(path + '/datasets/SemEval/')
LABEL_PATH = Path(path + '/labels/')
BERT_PRETRAINED_PATH = Path(path + '/BERT/')

# Output locations for model artefacts and logs.
MODEL_PATH = Path(path + '/models/')
LOG_PATH = Path(path + '/logs/')
OUTPUT_PATH = MODEL_PATH / 'output'

# Create the output directories if missing. MODEL_PATH comes before
# OUTPUT_PATH because mkdir is called without parents=True.
for _directory in (MODEL_PATH, LOG_PATH, OUTPUT_PATH):
    _directory.mkdir(exist_ok=True)

# GPU

In [0]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs (slowly)
# on a runtime without CUDA instead of failing later when the model is moved
# to a device that does not exist.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [0]:
# Emotion label columns, in the order they are passed to the data bunch as `label_col`.
label_cols = [
    'anger',
    'anticipation',
    'disgust',
    'fear',
    'joy',
    'love',
    'optimism',
    'pessimism',
    'sadness',
    'surprise',
    'trust',
]

In [85]:
# Build the fast-bert data bunch for multi-label classification.
# Train/validation CSVs are read relative to DATA_PATH (the log shows their
# feature caches under DATA_PATH/cache); the label file is presumably resolved
# under LABEL_PATH. Tweet text comes from the "Tweet" column and the eleven
# emotion columns in `label_cols` are the targets.
# NOTE(review): `test_data` receives a filename here, but fast-bert may expect
# an iterable of raw texts (the cache file in the log is named "..._test", not
# "..._test.csv") — confirm against the BertDataBunch signature.
databunch = BertDataBunch(DATA_PATH, 
                          LABEL_PATH,
                          tokenizer='bert-base-uncased',
                          train_file='train.csv',
                          val_file='val.csv',
                          test_data='test.csv',
                          label_file='labels.csv',
                          text_col="Tweet",
                          label_col=label_cols,
                          batch_size_per_gpu=16,
                          max_seq_length=128,
                          multi_gpu=True,
                          multi_label=True,
                          model_type='bert')

01/16/2020 22:00:38 - INFO - transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /root/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
01/16/2020 22:00:38 - INFO - root -   Loading features from cached file /content/drive/My Drive/datasets/SemEval/cache/cached_bert_train_multi_label_128_train.csv
01/16/2020 22:00:38 - INFO - root -   Loading features from cached file /content/drive/My Drive/datasets/SemEval/cache/cached_bert_dev_multi_label_128_val.csv
01/16/2020 22:00:38 - INFO - root -   Loading features from cached file /content/drive/My Drive/datasets/SemEval/cache/cached_bert_test_multi_label_128_test


In [86]:
# Sanity check: show element 3 of the first training item. Judging by the
# displayed 11-element 0/1 tensor this is the multi-hot label vector — confirm.
databunch.train_dl.dataset[0][3]

tensor([0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1.])

In [87]:
# Number of labels known to the data bunch; displayed as a check (11 in the recorded run).
num_labels = len(databunch.labels)
num_labels

11

In [0]:
# Evaluation metrics as name/function records; the list is handed to the
# learner via its `metrics` argument and reported after each epoch.
metrics = [
    {'name': 'accuracy_thresh', 'function': accuracy_thresh},
    {'name': 'roc_auc', 'function': roc_auc},
    {'name': 'fbeta', 'function': fbeta},
]

In [89]:
# Seed every RNG in play *before* the model is built: the load log reports that
# the classifier weights are not initialised from the pretrained checkpoint,
# and the model config enables dropout, so without a fixed seed each
# Restart & Run All produces different metrics.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Wrap the pretrained 'bert-base-uncased' weights in a fast-bert learner set up
# for multi-label classification. Outputs are written under OUTPUT_PATH; no
# previously fine-tuned weights are loaded and fp16 is disabled.
learner = BertLearner.from_pretrained_model(
    databunch,
    pretrained_path='bert-base-uncased',
    metrics=metrics,
    device=device,
    logger=logger,
    output_dir=OUTPUT_PATH,
    finetuned_wgts_path=None,
    warmup_steps=500,
    multi_gpu=True,
    is_fp16=False,
    multi_label=True,
    logging_steps=0)

01/16/2020 22:00:45 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /root/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
01/16/2020 22:00:45 - INFO - transformers.configuration_utils -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 11,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pru

bert-base-uncased
<class 'str'>


01/16/2020 22:00:47 - INFO - transformers.modeling_utils -   Weights of BertForMultiLabelSequenceClassification not initialized from pretrained model: ['classifier.weight', 'classifier.bias']
01/16/2020 22:00:47 - INFO - transformers.modeling_utils -   Weights from pretrained model not used in BertForMultiLabelSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']


In [90]:
# Fine-tuning hyper-parameters.
num_epochs = 4
learning_rate = 6e-5

# Train with a warmup + cosine learning-rate schedule and the LAMB optimizer,
# evaluating on the validation set after every epoch (validate=True).
# The call is the cell's last expression, so its return value is displayed; in
# the recorded run that is a 2-tuple whose first element equals the logged
# total of 1712 optimization steps (the second looks like a mean training
# loss — confirm).
learner.fit(num_epochs, 
            learning_rate, 
            schedule_type="warmup_cosine",
            optimizer_type="lamb",
            validate=True)

01/16/2020 22:00:51 - INFO - __main__ -   ***** Running training *****
01/16/2020 22:00:51 - INFO - __main__ -     Num examples = 6838
01/16/2020 22:00:51 - INFO - __main__ -     Num Epochs = 4
01/16/2020 22:00:51 - INFO - __main__ -     Total train batch size (w. parallel, distributed & accumulation) = 16
01/16/2020 22:00:51 - INFO - __main__ -     Gradient Accumulation steps = 1
01/16/2020 22:00:51 - INFO - __main__ -     Total optimization steps = 1712


01/16/2020 22:04:11 - INFO - __main__ -   Running evaluation
01/16/2020 22:04:11 - INFO - __main__ -     Num examples = 886
01/16/2020 22:04:11 - INFO - __main__ -     Batch size = 32


01/16/2020 22:04:19 - INFO - __main__ -   eval_loss after epoch 1: 0.5360515330518995: 
01/16/2020 22:04:19 - INFO - __main__ -   eval_accuracy_thresh after epoch 1: 0.7782679796218872: 
01/16/2020 22:04:19 - INFO - __main__ -   eval_roc_auc after epoch 1: 0.7060729593375952: 
01/16/2020 22:04:19 - INFO - __main__ -   eval_fbeta after epoch 1: 0.5877101421356201: 
01/16/2020 22:04:19 - INFO - __main__ -   lr after epoch 1: 5.136e-05
01/16/2020 22:04:19 - INFO - __main__ -   train_loss after epoch 1: 0.60032088901395
01/16/2020 22:04:19 - INFO - __main__ -   

01/16/2020 22:07:38 - INFO - __main__ -   Running evaluation
01/16/2020 22:07:38 - INFO - __main__ -     Num examples = 886
01/16/2020 22:07:38 - INFO - __main__ -     Batch size = 32


01/16/2020 22:07:45 - INFO - __main__ -   eval_loss after epoch 2: 0.4469392778617995: 
01/16/2020 22:07:45 - INFO - __main__ -   eval_accuracy_thresh after epoch 2: 0.8047403693199158: 
01/16/2020 22:07:45 - INFO - __main__ -   eval_roc_auc after epoch 2: 0.8036134666285568: 
01/16/2020 22:07:45 - INFO - __main__ -   eval_fbeta after epoch 2: 0.6121078729629517: 
01/16/2020 22:07:45 - INFO - __main__ -   lr after epoch 2: 4.8108228255680444e-05
01/16/2020 22:07:45 - INFO - __main__ -   train_loss after epoch 2: 0.4875621260883652
01/16/2020 22:07:45 - INFO - __main__ -   

01/16/2020 22:11:04 - INFO - __main__ -   Running evaluation
01/16/2020 22:11:04 - INFO - __main__ -     Num examples = 886
01/16/2020 22:11:04 - INFO - __main__ -     Batch size = 32


01/16/2020 22:11:12 - INFO - __main__ -   eval_loss after epoch 3: 0.40796645092112677: 
01/16/2020 22:11:12 - INFO - __main__ -   eval_accuracy_thresh after epoch 3: 0.8391134738922119: 
01/16/2020 22:11:12 - INFO - __main__ -   eval_roc_auc after epoch 3: 0.8364070077910779: 
01/16/2020 22:11:12 - INFO - __main__ -   eval_fbeta after epoch 3: 0.6505194902420044: 
01/16/2020 22:11:12 - INFO - __main__ -   lr after epoch 3: 1.664423060378799e-05
01/16/2020 22:11:12 - INFO - __main__ -   train_loss after epoch 3: 0.4323560443019199
01/16/2020 22:11:12 - INFO - __main__ -   

01/16/2020 22:14:31 - INFO - __main__ -   Running evaluation
01/16/2020 22:14:31 - INFO - __main__ -     Num examples = 886
01/16/2020 22:14:31 - INFO - __main__ -     Batch size = 32


01/16/2020 22:14:39 - INFO - __main__ -   eval_loss after epoch 4: 0.4019217299563544: 
01/16/2020 22:14:39 - INFO - __main__ -   eval_accuracy_thresh after epoch 4: 0.8439359664916992: 
01/16/2020 22:14:39 - INFO - __main__ -   eval_roc_auc after epoch 4: 0.8416226770669722: 
01/16/2020 22:14:39 - INFO - __main__ -   eval_fbeta after epoch 4: 0.6489752531051636: 
01/16/2020 22:14:39 - INFO - __main__ -   lr after epoch 4: 0.0
01/16/2020 22:14:39 - INFO - __main__ -   train_loss after epoch 4: 0.41246818953028347
01/16/2020 22:14:39 - INFO - __main__ -   



(1712, 0.4831768122336296)

In [91]:
# Persist the fine-tuned model; per the log, config.json and pytorch_model.bin
# are written to <OUTPUT_PATH>/model_out.
learner.save_model()

01/16/2020 22:15:04 - INFO - transformers.configuration_utils -   Configuration saved in /content/drive/My Drive/models/output/model_out/config.json
01/16/2020 22:15:05 - INFO - transformers.modeling_utils -   Model weights saved in /content/drive/My Drive/models/output/model_out/pytorch_model.bin


In [0]:
from fast_bert.prediction import BertClassificationPredictor

In [93]:
# Point at the directory the learner saved into and load it for inference.
# NOTE(review): this rebinds MODEL_PATH (previously the '<drive>/models' root)
# to '<drive>/models/output/model_out'; re-running the path-config cell resets it.
MODEL_PATH = OUTPUT_PATH/'model_out'

# Multi-label BERT predictor; do_lower_case=True matches the uncased tokenizer
# used for training.
predictor = BertClassificationPredictor(model_path=MODEL_PATH, 
                                        label_path=LABEL_PATH,
                                        multi_label=True,
                                        model_type='bert',
                                        do_lower_case=True)

01/16/2020 22:15:08 - INFO - transformers.tokenization_utils -   Model name '/content/drive/My Drive/models/output/model_out' not found in model shortcut name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese, bert-base-german-cased, bert-large-uncased-whole-word-masking, bert-large-cased-whole-word-masking, bert-large-uncased-whole-word-masking-finetuned-squad, bert-large-cased-whole-word-masking-finetuned-squad, bert-base-cased-finetuned-mrpc, bert-base-german-dbmdz-cased, bert-base-german-dbmdz-uncased, bert-base-finnish-cased-v1, bert-base-finnish-uncased-v1). Assuming '/content/drive/My Drive/models/output/model_out' is a path or url to a directory containing tokenizer files.
01/16/2020 22:15:08 - INFO - transformers.tokenization_utils -   loading file /content/drive/My Drive/models/output/model_out/vocab.txt
01/16/2020 22:15:08 - INFO - transformers.tokenization_utils -  

/content/drive/My Drive/models/output/model_out
<class 'str'>


In [98]:
# Smoke test: score one hand-written sentence with the fine-tuned model.
single_pred = predictor.predict("I am so thankful for everything.")

01/16/2020 22:16:45 - INFO - root -   Writing example 0 of 1


In [99]:
# Display the result: a list of (label, score) pairs, shown in descending score order.
single_pred

[('joy', 0.613877534866333),
 ('optimism', 0.5743357539176941),
 ('sadness', 0.2809685468673706),
 ('love', 0.24984759092330933),
 ('fear', 0.24792851507663727),
 ('disgust', 0.23301798105239868),
 ('anticipation', 0.2312241792678833),
 ('anger', 0.22087150812149048),
 ('pessimism', 0.16349482536315918),
 ('trust', 0.14730584621429443),
 ('surprise', 0.1007944643497467)]