In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import transformers
import tqdm.notebook as tq
from collections import defaultdict


In [2]:
# load data
# check if local env or paperspace
import os
if os.path.exists('/datasets/essays/essays_cleaned.csv'):
    dataset_path='/datasets/essays/essays_cleaned.csv'
else:
    dataset_path='../data/essays_cleaned.csv'
df = pd.read_csv(dataset_path,encoding="utf-8")

In [3]:
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
# Hyperparameters
MAX_LEN = 512
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16
EPOCHS = 10
LEARNING_RATE = 1e-05
THRESHOLD = 0.5 # threshold for the sigmoid
# intialize tokenizer
tokenizer = transformers.BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [4]:
# split into train and test
df_train, df_test = train_test_split(df, random_state=77, test_size=0.30, shuffle=True)
# split test into test and validation datasets
df_test, df_valid = train_test_split(df_test, random_state=88, test_size=0.50, shuffle=True)

In [5]:
# create a dataset class
class EssayDataset(Dataset):
    def __init__(self, df, max_len,target_list):
        self.df = df
        self.max_len = max_len
        self.text = list(df['TEXT'])
        self.tokenizer = transformers.BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.targets = self.df[target_list].values
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.text[idx]
        text_length = len(text)
        middle_start = text_length // 3
        middle_end = 2 * (text_length // 3)
        text_middle = text[middle_start:middle_end]
        text=text_middle
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt'
        )
        print(self.targets[idx])
        return {
            'text': text,
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs['token_type_ids'].flatten(),
            'targets': torch.FloatTensor(self.targets[idx])
        }

In [6]:
# target list is the list of big 5 personality traits that we want to predict
target_list= list(df.columns[1:])
target_list

['cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN']

In [7]:
# datasets
train_dataset = EssayDataset(df_train, MAX_LEN, target_list)
valid_dataset = EssayDataset(df_valid, MAX_LEN, target_list)
test_dataset = EssayDataset(df_test, MAX_LEN, target_list)

In [8]:
# create dataloaders
train_data_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
val_data_loader = DataLoader(valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=TEST_BATCH_SIZE, shuffle=True)

In [9]:
batch=next(iter(train_data_loader))

[1 0 1 0 0]
[0 1 0 0 0]
[1 0 1 1 0]
[0 1 1 0 1]
[0 0 0 1 1]
[0 1 0 0 0]
[0 0 1 0 1]
[0 0 0 0 0]
[0 0 1 1 1]
[1 0 1 1 0]
[1 1 1 1 1]
[0 1 1 0 1]
[1 0 1 0 0]
[0 0 0 0 0]
[0 1 1 1 1]
[1 0 0 1 1]


In [16]:
batch

{'text': ['o Ill write that This assignment is very weird and I do not know how anything can come of it What should I say  I do not consciously know what I am thinking so I do not know what to type I Am just sitting here trying to think of something profound to say That is stupid considering that I do not have to say anything important but I guess that is what I Am used to doing when I write an assignment Right now I Am staring at a picture of a whales tale as it dives into the ocean It really looks kind of stupid and I cannot think of anything else to say about it You psychology people are strange and I wonder what the heck you are going t',
  ' weird  sleep is good that girl next to me was hot today forgot her name though  oh well ill ask colin dude needs to shave funny though wonder what linda is doing I need to call her why upgrade things 13 minutes left SO long this is not acoustic liars oooh wrong song  what is up with internet downloading its there for a reason CDs are so damn e

In [17]:

class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.bert_model = transformers.AutoModel.from_pretrained(PRE_TRAINED_MODEL_NAME, return_dict=True)
        self.dropout = torch.nn.Dropout(0.3)
        self.linear = torch.nn.Linear(768, 5)

    def forward(self, input_ids, attn_mask, token_type_ids):
        output = self.bert_model(
            input_ids, 
            attention_mask=attn_mask, 
            token_type_ids=token_type_ids
        )
        output_dropout = self.dropout(output.pooler_output)
        output = self.linear(output_dropout)
        return output

model = BERTClass()



In [18]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BERTClass(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [19]:
# BCEWithLogitsLoss combines a Sigmoid layer and the BCELoss in one single class. 
# This version is more numerically stable than using a plain Sigmoid followed 
# by a BCELoss as, by combining the operations into one layer, 
# we take advantage of the log-sum-exp trick for numerical stability.
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [20]:
# define the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr = 1e-5)   

In [21]:
# Training of the model for one epoch
def train_model(training_loader, model, optimizer):

    losses = []
    correct_predictions = 0
    num_samples = 0
    # set model to training mode (activate droput, batch norm)
    model.train()
    # initialize the progress bar
    loop = tq.tqdm(enumerate(training_loader), total=len(training_loader), 
                      leave=True, colour='steelblue')
    for batch_ix, data in loop:
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        # forward
        outputs = model(ids, mask, token_type_ids) # (batch,predict)=(32,8)
        ids = ids.cpu().detach().numpy()
        mask = mask.cpu().detach().numpy()
        loss = loss_fn(outputs, targets)
        losses.append(loss.item())
        # training accuracy, apply sigmoid, round (apply thresh 0.5)
        outputs = torch.sigmoid(outputs).cpu().detach().numpy().round()
        targets = targets.cpu().detach().numpy()
        correct_predictions += np.sum(outputs==targets)
        num_samples += targets.size   # total number of elements in the 2D array

        # backward
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        # grad descent step
        optimizer.step()

        # Update progress bar
        #loop.set_description(f"")
        #loop.set_postfix(batch_loss=loss)

    # returning: trained model, model accuracy, mean loss
    return model, float(correct_predictions)/num_samples, np.mean(losses)

In [22]:

def eval_model(validation_loader, model, optimizer):
    losses = []
    correct_predictions = 0
    num_samples = 0
    # set model to eval mode (turn off dropout, fix batch norm)
    model.eval()

    with torch.no_grad():
        for btch_idx, data in enumerate(validation_loader, 0):
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)

            loss = loss_fn(outputs, targets)
            losses.append(loss.item())

            # validation accuracy
            # add sigmoid, for the training sigmoid is in BCEWithLogitsLoss
            outputs = torch.sigmoid(outputs).cpu().detach().numpy().round()
            targets = targets.cpu().detach().numpy()
            correct_predictions += np.sum(outputs==targets)
            num_samples += targets.size   # total number of elements in the 2D array

    return float(correct_predictions)/num_samples, np.mean(losses)

In [17]:

history = defaultdict(list)
best_accuracy = 0

for epoch in range(1, EPOCHS+1):
    print(f'Epoch {epoch}/{EPOCHS}')
    model, train_acc, train_loss = train_model(train_data_loader, model, optimizer)
    val_acc, val_loss = eval_model(val_data_loader, model, optimizer)

    print(f'train_loss={train_loss:.4f}, val_loss={val_loss:.4f} train_acc={train_acc:.4f}, val_acc={val_acc:.4f}')

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    # save the best model
    if val_acc > best_accuracy:
        torch.save(model.state_dict(),"ckpts/MLTC_model_state.bin")
        best_accuracy = val_acc

Epoch 1/10


  0%|          | 0/108 [00:00<?, ?it/s]

train_loss=0.6970, val_loss=0.6910 train_acc=0.5152, val_acc=0.5348
Epoch 2/10


  0%|          | 0/108 [00:00<?, ?it/s]

train_loss=0.6893, val_loss=0.6855 train_acc=0.5361, val_acc=0.5612
Epoch 3/10


  0%|          | 0/108 [00:00<?, ?it/s]

train_loss=0.6783, val_loss=0.6876 train_acc=0.5700, val_acc=0.5553
Epoch 4/10


  0%|          | 0/108 [00:00<?, ?it/s]

train_loss=0.6617, val_loss=0.6826 train_acc=0.6023, val_acc=0.5526
Epoch 5/10


  0%|          | 0/108 [00:00<?, ?it/s]

train_loss=0.6372, val_loss=0.6845 train_acc=0.6401, val_acc=0.5606
Epoch 6/10


  0%|          | 0/108 [00:00<?, ?it/s]

train_loss=0.6013, val_loss=0.6940 train_acc=0.6783, val_acc=0.5639
Epoch 7/10


  0%|          | 0/108 [00:00<?, ?it/s]

train_loss=0.5614, val_loss=0.7184 train_acc=0.7275, val_acc=0.5698
Epoch 8/10


  0%|          | 0/108 [00:00<?, ?it/s]

train_loss=0.5163, val_loss=0.7535 train_acc=0.7611, val_acc=0.5542
Epoch 9/10


  0%|          | 0/108 [00:00<?, ?it/s]

train_loss=0.4712, val_loss=0.7689 train_acc=0.7953, val_acc=0.5482
Epoch 10/10


  0%|          | 0/108 [00:00<?, ?it/s]

train_loss=0.4213, val_loss=0.8207 train_acc=0.8284, val_acc=0.5461


## Evaluate Model

In [29]:
# load the best model
model = BERTClass()
model.load_state_dict(torch.load("../checkpoints/MLTC_bert_model_state.bin", map_location=torch.device('cpu')))

<All keys matched successfully>

In [30]:
# Evaluate the model using the test data
test_acc, test_loss = eval_model(test_data_loader, model, optimizer)
test_acc, test_loss

(0.5827027027027027, 0.6750079989433289)

In [32]:
from sklearn.metrics import confusion_matrix, classification_report

In [38]:
def get_predictions(model,data_loader):
    model.eval()
    text=[]
    predictions = []
    predictions_probs = []
    real_values = []
    with torch.no_grad():
        for data in data_loader:
            text.extend(data['text'])
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            outputs = torch.sigmoid(outputs)
            predictions.extend(outputs.cpu().detach().round())
            real_values.extend(targets.cpu().detach())
            predictions_probs.extend(outputs.cpu().detach())      
    predictions = torch.stack(predictions)
    predictions_probs = torch.stack(predictions_probs)
    real_values = torch.stack(real_values)
    return text,predictions,predictions_probs, real_values

In [39]:
# get predictions for the test data
text, predictions, prediction_probs, real_values = get_predictions(model, test_data_loader)

In [44]:
# print size and shapes
print(f"predictions shape: {predictions.shape}, real values shape: {real_values.shape}, text length: {len(text)}, prediction_probs shape: {prediction_probs.shape}")

predictions shape: torch.Size([370, 5]), real values shape: torch.Size([370, 5]), text length: 370, prediction_probs shape: torch.Size([370, 5])


In [40]:
predictions

tensor([[1., 0., 1., 1., 1.],
        [0., 0., 0., 0., 1.],
        [0., 1., 0., 0., 1.],
        ...,
        [1., 1., 0., 0., 1.],
        [1., 1., 0., 0., 1.],
        [1., 1., 0., 1., 0.]])

In [41]:
print(classification_report(real_values, predictions, target_names=target_list))

              precision    recall  f1-score   support

        cEXT       0.50      0.81      0.62       178
        cNEU       0.55      0.85      0.67       190
        cAGR       0.58      0.60      0.59       195
        cCON       0.61      0.59      0.60       186
        cOPN       0.64      0.78      0.70       190

   micro avg       0.57      0.72      0.64       939
   macro avg       0.58      0.73      0.64       939
weighted avg       0.58      0.72      0.64       939
 samples avg       0.58      0.71      0.60       939



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
