In [1]:
!nvidia-smi


Thu Mar 28 00:59:22 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.116.04   Driver Version: 525.116.04   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro P5000        Off  | 00000000:00:05.0 Off |                  Off |
| 28%   45C    P8     7W / 180W |      2MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import transformers
import tqdm.notebook as tq
from collections import defaultdict

torch.cuda.empty_cache()

import gc
gc.collect()



20

In [3]:
!nvidia-smi

zsh:1: command not found: nvidia-smi


In [5]:
# load data
# check if local env or paperspace
import os
if os.path.exists('/datasets/essays/essays_cleaned.csv'):
    dataset_path='/datasets/essays/essays_cleaned.csv'
else:
    dataset_path='../data/essays_cleaned.csv'
df = pd.read_csv(dataset_path,encoding="utf-8")

In [6]:
PRE_TRAINED_MODEL_NAME = 'roberta-base'
# Hyperparameters
MAX_LEN = 512
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16
EPOCHS = 10
LEARNING_RATE = 1e-05
THRESHOLD = 0.5 # threshold for the sigmoid
# intialize tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [7]:
# split into train and test
df_train, df_test = train_test_split(df, random_state=77, test_size=0.30, shuffle=True)
# split test into test and validation datasets
df_test, df_valid = train_test_split(df_test, random_state=88, test_size=0.50, shuffle=True)

In [8]:
# create a dataset class
class EssayDataset(Dataset):
    def __init__(self, df, max_len,target_list):
        self.df = df
        self.max_len = max_len
        self.text = list(df['TEXT'])
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.targets = self.df[target_list].values
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.text[idx]
        text_length = len(text)
        middle_start = text_length // 3
        middle_end = 2 * (text_length // 3)
        text_middle = text[middle_start:middle_end]
        text=text_middle
        inputs = self.tokenizer.encode_plus(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt'
        )
        return {
            'text': text,
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs['token_type_ids'].flatten(),
            'targets': torch.FloatTensor(self.targets[idx])
        }

In [9]:
# target list is the list of big 5 personality traits that we want to predict
target_list= list(df.columns[1:])
target_list

['cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN']

In [10]:
# datasets
train_dataset = EssayDataset(df_train, MAX_LEN, target_list)
valid_dataset = EssayDataset(df_valid, MAX_LEN, target_list)
test_dataset = EssayDataset(df_test, MAX_LEN, target_list)

In [11]:
# create dataloaders
train_data_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
val_data_loader = DataLoader(valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=TEST_BATCH_SIZE, shuffle=True)

In [12]:
batch=next(iter(train_data_loader))

In [13]:
batch

{'text': ['pretty thick beer goggles on so I do not really know if she was as good looking as I thought OH well That just shows my dedication to crew IF I give up getting some then being on the crew team must be very important to me I felt like I was in too This girl was just absolutely digging me that is a shame and then on saturday I went to a lakehouse that was a neither a on a lake b not even a house it was a freakin streamcreek apartment that sucked These girls that were there were pretty annoying ONe was really cute but she was kind of snobbish I do not know Id like to talk to her but these girls were all too snobbish for me I was hoping maybe there would be some down to earth girls but apparently not I hope I meet a good girl pretty soon I want a girlfriend but I think I Am trying too ha',
  'd have taken advantage of that earlier and done this assignment then My attention span is short I keep finding myself drawn into her conversation It makes me think about talking to my frien

In [14]:

class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.bert_model = transformers.AutoModel.from_pretrained(PRE_TRAINED_MODEL_NAME, return_dict=True)
        self.dropout = torch.nn.Dropout(0.3)
        self.linear = torch.nn.Linear(768, 5)

    def forward(self, input_ids, attn_mask, token_type_ids):
        output = self.bert_model(
            input_ids, 
            attention_mask=attn_mask, 
            token_type_ids=token_type_ids
        )
        output_dropout = self.dropout(output.pooler_output)
        output = self.linear(output_dropout)
        return output

model = BERTClass()



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BERTClass(
  (bert_model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerN

In [16]:
# BCEWithLogitsLoss combines a Sigmoid layer and the BCELoss in one single class. 
# This version is more numerically stable than using a plain Sigmoid followed 
# by a BCELoss as, by combining the operations into one layer, 
# we take advantage of the log-sum-exp trick for numerical stability.
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [17]:
# define the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr = 1e-5)   

In [18]:
# Training of the model for one epoch
def train_model(training_loader, model, optimizer):

    losses = []
    correct_predictions = 0
    num_samples = 0
    # set model to training mode (activate droput, batch norm)
    model.train()
    # initialize the progress bar
    loop = tq.tqdm(enumerate(training_loader), total=len(training_loader), 
                      leave=True, colour='steelblue')
    for batch_ix, data in loop:
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        # forward
        outputs = model(ids, mask, token_type_ids) # (batch,predict)=(32,8)
        ids = ids.cpu().detach().numpy()
        mask = mask.cpu().detach().numpy()
        loss = loss_fn(outputs, targets)
        losses.append(loss.item())
        # training accuracy, apply sigmoid, round (apply thresh 0.5)
        outputs = torch.sigmoid(outputs).cpu().detach().numpy().round()
        targets = targets.cpu().detach().numpy()
        correct_predictions += np.sum(outputs==targets)
        num_samples += targets.size   # total number of elements in the 2D array

        # backward
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        # grad descent step
        optimizer.step()

        # Update progress bar
        #loop.set_description(f"")
        #loop.set_postfix(batch_loss=loss)

    # returning: trained model, model accuracy, mean loss
    return model, float(correct_predictions)/num_samples, np.mean(losses)

In [27]:

def eval_model(validation_loader, model, optimizer):
    losses = []
    correct_predictions = 0
    num_samples = 0
    # set model to eval mode (turn off dropout, fix batch norm)
    model.eval()

    with torch.no_grad():
        for btch_idx, data in enumerate(validation_loader, 0):
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)

            loss = loss_fn(outputs, targets)
            losses.append(loss.item())

            # validation accuracy
            # add sigmoid, for the training sigmoid is in BCEWithLogitsLoss
            outputs = torch.sigmoid(outputs).cpu().detach().numpy().round()
            targets = targets.cpu().detach().numpy()
            correct_predictions += np.sum(outputs==targets)
            num_samples += targets.size   # total number of elements in the 2D array

    return float(correct_predictions)/num_samples, np.mean(losses)

In [19]:

history = defaultdict(list)
best_accuracy = 0

for epoch in range(1, EPOCHS+1):
    print(f'Epoch {epoch}/{EPOCHS}')
    model, train_acc, train_loss = train_model(train_data_loader, model, optimizer)
    val_acc, val_loss = eval_model(val_data_loader, model, optimizer)

    print(f'train_loss={train_loss:.4f}, val_loss={val_loss:.4f} train_acc={train_acc:.4f}, val_acc={val_acc:.4f}')

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    # save the best model
    if val_acc > best_accuracy:
        torch.save(model.state_dict(),"ckpts/MLTC_roberta_model_state.bin")
        best_accuracy = val_acc

Epoch 1/10


  0%|          | 0/108 [00:00<?, ?it/s]

train_loss=0.6937, val_loss=0.6900 train_acc=0.5091, val_acc=0.5267
Epoch 2/10


  0%|          | 0/108 [00:00<?, ?it/s]

train_loss=0.6876, val_loss=0.6928 train_acc=0.5442, val_acc=0.5477
Epoch 3/10


  0%|          | 0/108 [00:00<?, ?it/s]

train_loss=0.6738, val_loss=0.6822 train_acc=0.5804, val_acc=0.5639
Epoch 4/10


  0%|          | 0/108 [00:00<?, ?it/s]

train_loss=0.6508, val_loss=0.7053 train_acc=0.6176, val_acc=0.5714
Epoch 5/10


  0%|          | 0/108 [00:00<?, ?it/s]

train_loss=0.6182, val_loss=0.7090 train_acc=0.6535, val_acc=0.5757
Epoch 6/10


  0%|          | 0/108 [00:00<?, ?it/s]

train_loss=0.5797, val_loss=0.7205 train_acc=0.6999, val_acc=0.5784
Epoch 7/10


  0%|          | 0/108 [00:00<?, ?it/s]

train_loss=0.5352, val_loss=0.7231 train_acc=0.7345, val_acc=0.5790
Epoch 8/10


  0%|          | 0/108 [00:00<?, ?it/s]

train_loss=0.4817, val_loss=0.8009 train_acc=0.7782, val_acc=0.5741
Epoch 9/10


  0%|          | 0/108 [00:00<?, ?it/s]

train_loss=0.4321, val_loss=0.8222 train_acc=0.8078, val_acc=0.5698
Epoch 10/10


  0%|          | 0/108 [00:00<?, ?it/s]

train_loss=0.3885, val_loss=0.8791 train_acc=0.8353, val_acc=0.5590


In [25]:
# load the best model
model = BERTClass()
model.load_state_dict(torch.load("../checkpoints/MLTC_roberta_model_state.bin",map_location=torch.device('cpu')),strict=False)
model.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTClass(
  (bert_model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerN

In [28]:
# Evaluate the model using the test data
test_acc, test_loss = eval_model(test_data_loader, model, optimizer)
test_acc, test_loss

(0.5843243243243244, 0.7146516864498457)

In [29]:
from sklearn.metrics import classification_report

In [30]:
def get_predictions(model,data_loader):
    model.eval()
    text=[]
    predictions = []
    predictions_probs = []
    real_values = []
    with torch.no_grad():
        for data in data_loader:
            text.extend(data['text'])
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            outputs = torch.sigmoid(outputs)
            predictions.extend(outputs.cpu().detach().round())
            real_values.extend(targets.cpu().detach())
            predictions_probs.extend(outputs.cpu().detach())      
    predictions = torch.stack(predictions)
    predictions_probs = torch.stack(predictions_probs)
    real_values = torch.stack(real_values)
    return text,predictions,predictions_probs, real_values

In [31]:
# get predictions for the test data
text, predictions, prediction_probs, real_values = get_predictions(model, test_data_loader)

In [32]:
# print size and shapes
print(f"predictions shape: {predictions.shape}, real values shape: {real_values.shape}, text length: {len(text)}, prediction_probs shape: {prediction_probs.shape}")

predictions shape: torch.Size([370, 5]), real values shape: torch.Size([370, 5]), text length: 370, prediction_probs shape: torch.Size([370, 5])


In [33]:
print(classification_report(real_values, predictions, target_names=target_list))

              precision    recall  f1-score   support

        cEXT       0.53      0.79      0.64       178
        cNEU       0.60      0.61      0.60       190
        cAGR       0.55      0.79      0.65       195
        cCON       0.57      0.85      0.68       186
        cOPN       0.62      0.65      0.63       190

   micro avg       0.57      0.74      0.64       939
   macro avg       0.57      0.74      0.64       939
weighted avg       0.57      0.74      0.64       939
 samples avg       0.58      0.71      0.60       939



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [1]:
#  calulate accuracy using sklearn for each trait
from sklearn.metrics import accuracy_score
accuracy_scores = {}
for i in range(len(target_list)):
    accuracy_scores[target_list[i]] = accuracy_score(real_values[:,i], predictions[:,i])
accuracy_scores

NameError: name 'target_list' is not defined