In [1]:
import torch
from tqdm import tqdm 

from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from transformers import BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import numpy as np
import random

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

seed = 0

cuda:0


Following code is focused on formatting the data into a dataframe

In [13]:
df_work = pd.read_csv('emailsWork.csv')[["Body", "Label"]]
df_promotions = pd.read_csv("emailsPromo.csv")[["Body", "Label"]]
df_blog = pd.read_csv("current.csv")[["Body", "Label"]]

maindf = pd.concat([df_work, df_promotions, df_blog])
labels = maindf["Label"].unique()

label_dict = {}
for index, label in enumerate(labels):
    label_dict[label] = index

print(maindf.Label.value_counts(), maindf.shape)

maindf.Label = maindf.Label.replace(label_dict)

Xtrain, Xtest, ytrain, ytest = train_test_split(maindf.Body.values, maindf.Label.values, test_size=0.15, random_state=seed)

  df_work = pd.read_csv('emailsWork.csv')[["Body", "Label"]]


Work         2718
Promotion    1985
school       1970
Blog          995
Name: Label, dtype: int64 (7668, 2)


Tokenize the train and test data to be fed into the model.

In [3]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case = True)

etrain = tokenizer.batch_encode_plus(Xtrain, add_special_tokens = True, return_attention_mask = True, pad_to_max_length = True, max_length=256, return_tensors = "pt")
etest = tokenizer.batch_encode_plus(
    Xtest, add_special_tokens=True, return_attention_mask=True, pad_to_max_length=True, max_length=256, return_tensors="pt")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [4]:
input_ids_train = etrain["input_ids"]
attention_masks_train = etrain["attention_mask"]
labels_train = torch.tensor(ytrain)

input_ids_test = etest["input_ids"]
attention_masks_test = etest["attention_mask"]
labels_test = torch.tensor(ytest)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

In [5]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_dict), output_attentions=False, output_hidden_states=False)
model = model.to(device)

dataloader_train = DataLoader(dataset_train, sampler=RandomSampler(dataset_train), batch_size=3)
dataloader_test = DataLoader(dataset_test, sampler=RandomSampler(dataset_test), batch_size=3)

optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

epochs = 5

scheduler = get_linear_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(dataloader_train)*epochs)

#scoring methods
def f1_score_(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average="weighted")
def accuracy(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

## Do Not Run the code below, It will run for more than an hour, this is for training the data 

In [6]:
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

def evaluate(dataloader):
    model.eval()

    loss_val_total = 0
    predictions, true_vals = [],[]

    for batch  in dataloader:
        batch = tuple(b.to(device) for b in batch)

        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "labels": batch[2],
        }

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs["labels"].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total/len(dataloader)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
    
    return loss_val_avg, predictions, true_vals

# for epoch in tqdm(range(1, epochs + 1)):
#     model.train()

#     loss_train_total = 0

#     progress = tqdm(dataloader_train, desc="Epoch {:1d}".format(epoch), leave=False, disable=False)
#     for batch in progress:
#         model.zero_grad()

#         batch = tuple(b.to(device) for b in batch)

#         inputs = {
#             "input_ids": batch[0],
#             "attention_mask": batch[1],
#             "labels": batch[2],
#         }

#         outputs = model(**inputs)

#         loss = outputs[0]
#         loss_train_total += loss.item()
#         loss.backward()

#         torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

#         optimizer.step()
#         scheduler.step()

#         progress.set_postfix({"training_loss": "{:.3f}".format(loss.item()/len(batch))})
    
#     torch.save(model.state_dict(),
#                f'./models/finetuned_BERT_epoch_{epoch}.model')

#     tqdm.write(f'\nEpoch {epoch}')

#     loss_training_avg = loss_train_total/len(dataloader_train)
#     tqdm.write(f'Training Loss: {loss_training_avg}')

#     val_loss, predictions, true_vals = evaluate(dataloader_test)
#     val_f1 = f1_score_(predictions, true_vals)
#     tqdm.write(f'Validation loss: {val_loss}')
#     tqdm.write(f'F1 Score (Weighted): {val_f1}')

Run this to test the model.

In [8]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = len(label_dict), output_attentions=False, output_hidden_states=False)

model.to(device)
model.load_state_dict(torch.load('./models/finetuned_BERT_epoch_5.model', map_location=torch.device('cuda:0')))

_, predictions, true_vals = evaluate(dataloader_test)
accuracy(predictions, true_vals)
val_f1 = f1_score_(predictions, true_vals)
print(f'F1 Score: {val_f1}')


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Class: Work
Accuracy: 428/430

Class: Promotion
Accuracy: 267/273

Class: Blog
Accuracy: 162/163

Class: school
Accuracy: 285/285

F1 Score: 0.9921705781220846
