In [1]:
!pip install transformers
!pip install datasets

Collecting transformers
  Downloading transformers-4.9.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 11.1 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 48.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 53.2 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 55.4 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled 

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [1]:
root_dir = "gdrive/MyDrive/Capstone/"

pretrained = "roberta-base"
model_checkpoint = pretrained
tokenizer_checkpoint = pretrained

num_labels = 10
batch_size = 8

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import AdamW, get_scheduler

import torch
from torch.utils.data import DataLoader

from datasets import load_dataset, load_metric, load_from_disk

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.metrics import accuracy_score, matthews_corrcoef, confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score

import pickle
from tqdm.auto import tqdm

In [3]:
dataset = load_dataset('csv', data_files=root_dir + 'df_cleaned.csv', split='train')

Using custom data configuration default-863e501dda6703d2
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-863e501dda6703d2/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff)


In [4]:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

def tokenize_function(examples):
    return tokenizer(examples["tweet_text_cleaned"], padding="max_length", truncation=True)

tokenized_datasets = dataset.train_test_split(test_size=0.1).map(tokenize_function, batched=True)

Loading cached split indices for dataset at /root/.cache/huggingface/datasets/csv/default-863e501dda6703d2/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-16cc27aa8ace637b.arrow and /root/.cache/huggingface/datasets/csv/default-863e501dda6703d2/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-9287c69e8d270862.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-863e501dda6703d2/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-0c9f25bc547d5809.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-863e501dda6703d2/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-5487cef7100ff762.arrow


In [5]:
tokenized_datasets = tokenized_datasets.remove_columns(['class_label', 
                                                        'data_type', 
                                                        'event',
                                                        'event_type',
                                                        'file_name',
                                                        'hashtags',
                                                        'processed_text_length',
                                                        #'token_type_ids',
                                                        'tweet_id',
                                                        'tweet_text',
                                                        'tweet_text_cleaned',
                                                        'year'])
tokenized_datasets = tokenized_datasets.rename_column("class_label_id", "labels")
tokenized_datasets.set_format("torch")

In [6]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=batch_size)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=batch_size)

Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/csv/default-863e501dda6703d2/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-bccbae190d62effe.arrow
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/csv/default-863e501dda6703d2/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-a70efc740d4f809d.arrow


In [7]:
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [8]:
device

device(type='cuda')

In [9]:
optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [10]:
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/375 [00:00<?, ?it/s]

In [11]:
metric= load_metric("accuracy")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.715}

In [12]:
metric_name = "accuracy"

args = TrainingArguments(
    "capstone_"+pretrained,
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    metric_for_best_model=metric_name,
)

In [13]:
metric= load_metric("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [14]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [15]:
trainer.train()

***** Running training *****
  Num examples = 68834
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 25815


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6865,0.707856,0.754478
2,0.5774,0.664842,0.771866
3,0.4746,0.726794,0.768205


Saving model checkpoint to capstone_roberta-base/checkpoint-500
Configuration saved in capstone_roberta-base/checkpoint-500/config.json
Model weights saved in capstone_roberta-base/checkpoint-500/pytorch_model.bin
tokenizer config file saved in capstone_roberta-base/checkpoint-500/tokenizer_config.json
Special tokens file saved in capstone_roberta-base/checkpoint-500/special_tokens_map.json
Saving model checkpoint to capstone_roberta-base/checkpoint-1000
Configuration saved in capstone_roberta-base/checkpoint-1000/config.json
Model weights saved in capstone_roberta-base/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in capstone_roberta-base/checkpoint-1000/tokenizer_config.json
Special tokens file saved in capstone_roberta-base/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to capstone_roberta-base/checkpoint-1500
Configuration saved in capstone_roberta-base/checkpoint-1500/config.json
Model weights saved in capstone_roberta-base/checkpoint-1500/pytorch_

TrainOutput(global_step=25815, training_loss=0.595012334823239, metrics={'train_runtime': 12899.9065, 'train_samples_per_second': 16.008, 'train_steps_per_second': 2.001, 'total_flos': 5.433686182368461e+16, 'train_loss': 0.595012334823239, 'epoch': 3.0})

In [16]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 7649
  Batch size = 8


{'epoch': 3.0,
 'eval_accuracy': 0.768204994116878,
 'eval_loss': 0.7267939448356628,
 'eval_runtime': 150.3837,
 'eval_samples_per_second': 50.863,
 'eval_steps_per_second': 6.364}

In [17]:
model.save_pretrained(root_dir + 'models/' + pretrained)
tokenizer.save_pretrained(root_dir + 'tokenizers/' + pretrained)

Configuration saved in gdrive/MyDrive/Capstone/models/roberta-base/config.json
Model weights saved in gdrive/MyDrive/Capstone/models/roberta-base/pytorch_model.bin
tokenizer config file saved in gdrive/MyDrive/Capstone/tokenizers/roberta-base/tokenizer_config.json
Special tokens file saved in gdrive/MyDrive/Capstone/tokenizers/roberta-base/special_tokens_map.json


('gdrive/MyDrive/Capstone/tokenizers/roberta-base/tokenizer_config.json',
 'gdrive/MyDrive/Capstone/tokenizers/roberta-base/special_tokens_map.json',
 'gdrive/MyDrive/Capstone/tokenizers/roberta-base/vocab.json',
 'gdrive/MyDrive/Capstone/tokenizers/roberta-base/merges.txt',
 'gdrive/MyDrive/Capstone/tokenizers/roberta-base/added_tokens.json',
 'gdrive/MyDrive/Capstone/tokenizers/roberta-base/tokenizer.json')

In [18]:
tokenized_datasets.save_to_disk(root_dir + 'assets/datasets/' + pretrained)

# Results Visualization

In [19]:
dataset = load_dataset('csv', data_files=root_dir + 'df_cleaned.csv', split='train')
tokenized_datasets = load_from_disk(root_dir + 'assets/datasets/' + pretrained)
tokenized_datasets.set_format("torch")

Using custom data configuration default-863e501dda6703d2
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-863e501dda6703d2/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff)


In [20]:
model_checkpoint = root_dir + "models/" + pretrained
tokenizer_checkpoint = root_dir + "tokenizers/" + pretrained

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=10)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

loading configuration file gdrive/MyDrive/Capstone/models/roberta-base/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8,
    "LABEL_9": 9
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
 

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [21]:
device

device(type='cuda')

In [22]:
metric = load_metric("accuracy")
eval_dataloader = DataLoader(tokenized_datasets["test"], batch_size=batch_size)

model.eval()
#prob = []
y_pred = []
total_loss = 0.0
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    total_loss += outputs.loss.item()
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
    #prob.extend(torch.nn.Softmax(dim=1)(outputs.logits))
    y_pred.extend([t.item() for t in predictions])

loss = total_loss/tokenized_datasets["test"].shape[0]
metric.compute()

{'accuracy': 0.768204994116878}

In [None]:
y_true = tokenized_datasets["test"]["labels"].numpy().tolist()

In [None]:
assert len(y_true) == len(y_pred)

In [None]:
def plot_confusion_matrix(y_true, y_pred, classes, normalize=False, title=None, path='cm', cmap=plt.cm.Reds):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
#     pickle.dump(cm, open(path, 'wb'))

    fig, ax = plt.subplots(figsize=(15,15))
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [None]:
label_name = sorted(pd.Series(dataset['class_label']).unique())

In [None]:
accuracy = accuracy_score(y_true, y_pred)
matthews = matthews_corrcoef(y_true, y_pred)

precisions = {}
recalls = {}
f1s = {}
aucrocs = {}

for i in range(len(label_name)):
    prediction_ = [1 if pred == i else 0 for pred in y_pred]
    true_ = [1 if label == i else 0 for label in y_true]
    f1s.update({label_name[i]: f1_score(true_, prediction_)})
    precisions.update({label_name[i]: precision_score(true_, prediction_)})
    recalls.update({label_name[i]: recall_score(true_, prediction_)})
    # aucrocs.update({label_name[i]: roc_auc_score(true_, list(t.item() for t in prob[:, i]))})

metrics_dict = {'loss': loss, 'accuracy': accuracy, 'matthews coef': matthews, 'precision': precisions,
                     'recall': recalls, 'f1': f1s, 
                # 'aucroc': aucrocs
                }

pickle.dump(metrics_dict, open(root_dir+'/output/evaluation_metrics/'+pretrained, 'wb'))

cm = plot_confusion_matrix(y_true, y_pred, label_name, normalize=False,
                      path='test_confusion_matrix', title='confusion matrix for test dataset')
plt.savefig(root_dir+'/output/confusion_matrix/'+pretrained, format='png')
cm_norm = plot_confusion_matrix(y_true, y_pred, label_name, normalize=True,
                      path='test normalized_confusion_matrix', title='normalized confusion matrix for test dataset')
plt.savefig(root_dir+'/output/normalized_confusion_matrix/'+pretrained, format='png')

print('loss: %.2f' % loss)
print('accuracy: %.2f' % accuracy)
print('matthews coef: %.2f' % matthews)
for i in range(len(label_name)):
    print('precision score for %s: %.2f' % (label_name[i], precisions[label_name[i]]))
    print('recall score for %s: %.2f' % (label_name[i], recalls[label_name[i]]))
    print('f1 score for %s: %.2f' % (label_name[i], f1s[label_name[i]]))
    # print('auc roc score for %s: %.2f' % (label_name[i], aucrocs[label_name[i]]))

In [28]:
with open(root_dir+'/output/evaluation_metrics/'+pretrained, 'rb') as file:
    p = pickle.load(file)