In [1]:
!nvidia-smi

Thu Mar 28 14:04:53 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.116.04   Driver Version: 525.116.04   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro P5000        Off  | 00000000:00:05.0 Off |                  Off |
| 26%   33C    P8     7W / 180W |      2MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import transformers
import tqdm.notebook as tq
from collections import defaultdict

torch.cuda.empty_cache()

import gc
gc.collect()



160

In [3]:
!nvidia-smi

Thu Mar 28 14:05:01 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.116.04   Driver Version: 525.116.04   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro P5000        Off  | 00000000:00:05.0 Off |                  Off |
| 26%   33C    P8     6W / 180W |      2MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
# load data
# check if local env or paperspace
import os
if os.path.exists('/datasets/essays/essays_cleaned.csv'):
    dataset_path='/datasets/essays/essays_cleaned.csv'
else:
    dataset_path='../data/essays_cleaned.csv'
df = pd.read_csv(dataset_path,encoding="utf-8")

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# create the transform
vectorizer = TfidfVectorizer(max_features=1000)
# tokenize and build vocab
vectorizer.fit(df['TEXT'])



In [6]:
PRE_TRAINED_MODEL_NAME = 'roberta-base'
# Hyperparameters
MAX_LEN = 512
TRAIN_BATCH_SIZE = 20
VALID_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16
EPOCHS = 20
LEARNING_RATE = 1e-05
THRESHOLD = 0.5 # threshold for the sigmoid
# intialize tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

Downloading tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [7]:
# split into train and test
df_train, df_test = train_test_split(df, random_state=77, test_size=0.30, shuffle=True)
# split test into test and validation datasets
df_test, df_valid = train_test_split(df_test, random_state=88, test_size=0.50, shuffle=True)

In [8]:
# create a dataset class
class EssayDataset(Dataset):
    def __init__(self, df, max_len,target_list):
        self.df = df
        self.max_len = max_len
        self.text = list(df['TEXT'])
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.targets = self.df[target_list].values
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.text[idx]
        text_length = len(text)
        middle_start = text_length // 3
        middle_end = 2 * (text_length // 3)
        text_middle = text[middle_start:middle_end]
        text=text_middle
        inputs = self.tokenizer.encode_plus(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt'
        )
        return {
            'text': text,
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs['token_type_ids'].flatten(),
            'targets': torch.FloatTensor(self.targets[idx]),
            'tf_idf_features': torch.FloatTensor(vectorizer.transform([text]).toarray().flatten())

        }

In [9]:
# target list is the list of big 5 personality traits that we want to predict
target_list= list(df.columns[1:])
target_list

['cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN']

In [10]:
# datasets
train_dataset = EssayDataset(df_train, MAX_LEN, target_list)
valid_dataset = EssayDataset(df_valid, MAX_LEN, target_list)
test_dataset = EssayDataset(df_test, MAX_LEN, target_list)

In [11]:
# create dataloaders
train_data_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
val_data_loader = DataLoader(valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=TEST_BATCH_SIZE, shuffle=True)

In [12]:
batch=next(iter(train_data_loader))

In [13]:
batch

{'text': [' first of all I Am not 18 yet so they will not let me get the fancy shmancy email account yet the other day they let me get a 2000 dollar loan which I have to pay off in 3 months or they sue me   wacky oif you ask me but then again who is  this campus is so huge no one knows anyone around here none of the professors none of the students  you smile at someone and immediately your somesort of wack job walking down the street stoopid people keep bothering me do not they know I Am trying to write   bno respect I tell you none from no body not my parents not my stoopid facking roommate who thinks hes my father  him and his girlfriend I tell yoiu first of all they are not very pretty people to begin with and then they are always here having sex and dammit it get annoying  always trying to include us not leaving alone when I do not want to ber apart of the picture always is it okay if I do not this or were goint to have sex now do not oing tin to the bed room I do not car ego and g

In [14]:
num_tfidf_features = len(vectorizer.get_feature_names_out())

class BERTClass(nn.Module):
    def __init__(self, num_classes=5):
        super(BERTClass, self).__init__()
        self.bert_model = transformers.AutoModel.from_pretrained(PRE_TRAINED_MODEL_NAME, return_dict=True)
        self.dropout = nn.Dropout(0.3)
        self.linear1 = nn.Linear(768, 256)  # Linear layer for BERT embeddings
        self.linear2 = nn.Linear(256 + num_tfidf_features, num_classes)  # Concatenated linear layer
        self.relu = nn.ReLU()

    def forward(self, input_ids, attn_mask, token_type_ids, tfidf_features):
        # BERT embeddings
        output = self.bert_model(input_ids, attention_mask=attn_mask, token_type_ids=token_type_ids)
        bert_embeddings = output.last_hidden_state.mean(dim=1)  # Example: Mean pooling of token embeddings
        bert_embeddings = self.relu(self.linear1(bert_embeddings))  # Apply linear layer and ReLU activation
        
        # Concatenate with TF-IDF features
        concatenated_features = torch.cat((bert_embeddings, tfidf_features), dim=1)
        
        # Final linear layer for classification
        output = self.linear2(self.dropout(concatenated_features))
        return output

# Example usage
model = BERTClass()


Downloading pytorch_model.bin:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BERTClass(
  (bert_model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768

In [16]:
# BCEWithLogitsLoss combines a Sigmoid layer and the BCELoss in one single class. 
# This version is more numerically stable than using a plain Sigmoid followed 
# by a BCELoss as, by combining the operations into one layer, 
# we take advantage of the log-sum-exp trick for numerical stability.
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [17]:
# define the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr = 1e-5)  


In [18]:
# Training of the model for one epoch
def train_model(training_loader, model, optimizer):

    losses = []
    correct_predictions = 0
    num_samples = 0
    # set model to training mode (activate droput, batch norm)
    model.train()
    # initialize the progress bar
    loop = tq.tqdm(enumerate(training_loader), total=len(training_loader), 
                      leave=True, colour='steelblue')
    for batch_ix, data in loop:
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)
        tfidf_features = data['tf_idf_features'].to(device, dtype=torch.float)

        # forward
        outputs = model(ids, mask, token_type_ids,tfidf_features) # output shape: (batch_size, num_classes)
        loss = loss_fn(outputs, targets)
        losses.append(loss.item())
        # training accuracy, apply sigmoid, round (apply thresh 0.5)
        outputs = torch.sigmoid(outputs).cpu().detach().numpy().round()
        targets = targets.cpu().detach().numpy()
        correct_predictions += np.sum(outputs==targets)
        num_samples += targets.size   # total number of elements in the 2D array

        # backward
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        # grad descent step
        optimizer.step()


        # Update progress bar
        #loop.set_description(f"")
        #loop.set_postfix(batch_loss=loss)

    # returning: trained model, model accuracy, mean loss
    return model, float(correct_predictions)/num_samples, np.mean(losses)


In [19]:

def eval_model(validation_loader, model):
    losses = []
    correct_predictions = 0
    num_samples = 0
    # set model to eval mode (turn off dropout, fix batch norm)
    model.eval()

    with torch.no_grad():
        for btch_idx, data in enumerate(validation_loader, 0):
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            tfidf_features = data['tf_idf_features'].to(device, dtype=torch.float)
            outputs = model(ids, mask, token_type_ids,tfidf_features)


            loss = loss_fn(outputs, targets)
            losses.append(loss.item())

            # validation accuracy
            # add sigmoid, for the training sigmoid is in BCEWithLogitsLoss
            outputs = torch.sigmoid(outputs).cpu().detach().numpy().round()
            targets = targets.cpu().detach().numpy()
            correct_predictions += np.sum(outputs==targets)
            num_samples += targets.size   # total number of elements in the 2D array

    return float(correct_predictions)/num_samples, np.mean(losses)

In [20]:

history = defaultdict(list)
best_accuracy = 0

for epoch in range(1, EPOCHS+1):
    print(f'Epoch {epoch}/{EPOCHS}')
    model, train_acc, train_loss = train_model(train_data_loader, model, optimizer)
    val_acc, val_loss = eval_model(val_data_loader, model)

    print(f'train_loss={train_loss:.4f}, val_loss={val_loss:.4f} train_acc={train_acc:.4f}, val_acc={val_acc:.4f}')

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    # save the best model
    if val_acc > best_accuracy:
        torch.save(model.state_dict(),"ckpts/MLTC_roberta_tfidf_model_state.bin")
        best_accuracy = val_acc

Epoch 1/20


  0%|          | 0/87 [00:00<?, ?it/s]

train_loss=0.6930, val_loss=0.6928 train_acc=0.5136, val_acc=0.5202
Epoch 2/20


  0%|          | 0/87 [00:00<?, ?it/s]

train_loss=0.6910, val_loss=0.6872 train_acc=0.5364, val_acc=0.5526
Epoch 3/20


  0%|          | 0/87 [00:00<?, ?it/s]

train_loss=0.6820, val_loss=0.6807 train_acc=0.5609, val_acc=0.5671
Epoch 4/20


  0%|          | 0/87 [00:00<?, ?it/s]

train_loss=0.6691, val_loss=0.6786 train_acc=0.5935, val_acc=0.5666
Epoch 5/20


  0%|          | 0/87 [00:00<?, ?it/s]

train_loss=0.6472, val_loss=0.6833 train_acc=0.6323, val_acc=0.5623
Epoch 6/20


  0%|          | 0/87 [00:00<?, ?it/s]

train_loss=0.6231, val_loss=0.6967 train_acc=0.6610, val_acc=0.5596
Epoch 7/20


  0%|          | 0/87 [00:00<?, ?it/s]

train_loss=0.5975, val_loss=0.6939 train_acc=0.6899, val_acc=0.5687
Epoch 8/20


  0%|          | 0/87 [00:00<?, ?it/s]

train_loss=0.5690, val_loss=0.7141 train_acc=0.7237, val_acc=0.5601
Epoch 9/20


  0%|          | 0/87 [00:00<?, ?it/s]

train_loss=0.5361, val_loss=0.7394 train_acc=0.7474, val_acc=0.5509
Epoch 10/20


  0%|          | 0/87 [00:00<?, ?it/s]

train_loss=0.5102, val_loss=0.7446 train_acc=0.7714, val_acc=0.5531
Epoch 11/20


  0%|          | 0/87 [00:00<?, ?it/s]

train_loss=0.4767, val_loss=0.7777 train_acc=0.8000, val_acc=0.5429
Epoch 12/20


  0%|          | 0/87 [00:00<?, ?it/s]

train_loss=0.4502, val_loss=0.7694 train_acc=0.8146, val_acc=0.5520
Epoch 13/20


  0%|          | 0/87 [00:00<?, ?it/s]

train_loss=0.4226, val_loss=0.8043 train_acc=0.8367, val_acc=0.5531
Epoch 14/20


  0%|          | 0/87 [00:00<?, ?it/s]

train_loss=0.3973, val_loss=0.8117 train_acc=0.8507, val_acc=0.5504
Epoch 15/20


  0%|          | 0/87 [00:00<?, ?it/s]

train_loss=0.3669, val_loss=0.8319 train_acc=0.8717, val_acc=0.5612
Epoch 16/20


  0%|          | 0/87 [00:00<?, ?it/s]

train_loss=0.3436, val_loss=0.8498 train_acc=0.8812, val_acc=0.5623
Epoch 17/20


  0%|          | 0/87 [00:00<?, ?it/s]

train_loss=0.3195, val_loss=0.8844 train_acc=0.8943, val_acc=0.5542
Epoch 18/20


  0%|          | 0/87 [00:00<?, ?it/s]

train_loss=0.2962, val_loss=0.9233 train_acc=0.9020, val_acc=0.5536
Epoch 19/20


  0%|          | 0/87 [00:00<?, ?it/s]

train_loss=0.2759, val_loss=0.9494 train_acc=0.9118, val_acc=0.5526
Epoch 20/20


  0%|          | 0/87 [00:00<?, ?it/s]

train_loss=0.2559, val_loss=0.9658 train_acc=0.9250, val_acc=0.5488


In [30]:
# load the best model
model = BERTClass()
model.load_state_dict(torch.load("ckpts/MLTC_roberta_tfidf_model_state.bin"))
model.to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RuntimeError: Error(s) in loading state_dict for BERTClass:
	size mismatch for linear2.weight: copying a param with shape torch.Size([5, 32260]) from checkpoint, the shape in current model is torch.Size([5, 1256]).

In [23]:
# Evaluate the model using the test data
test_acc, test_loss = eval_model(test_data_loader, model)
test_acc, test_loss

(0.5637837837837838, 0.7017953445514044)

In [24]:
from sklearn.metrics import classification_report

In [25]:
def get_predictions(model,data_loader):
    model.eval()
    text=[]
    predictions = []
    predictions_probs = []
    real_values = []
    with torch.no_grad():
        for data in data_loader:
            text.extend(data['text'])
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            tfidf_features = data['tf_idf_features'].to(device, dtype=torch.float)
            outputs = model(ids, mask, token_type_ids,tfidf_features)
            outputs = torch.sigmoid(outputs)
            predictions.extend(outputs.cpu().detach().round())
            real_values.extend(targets.cpu().detach())
            predictions_probs.extend(outputs.cpu().detach())      
    predictions = torch.stack(predictions)
    predictions_probs = torch.stack(predictions_probs)
    real_values = torch.stack(real_values)
    return text,predictions,predictions_probs, real_values

In [26]:
# get predictions for the test data
text, predictions, prediction_probs, real_values = get_predictions(model, test_data_loader)

In [27]:
# print size and shapes
print(f"predictions shape: {predictions.shape}, real values shape: {real_values.shape}, text length: {len(text)}, prediction_probs shape: {prediction_probs.shape}")

predictions shape: torch.Size([370, 5]), real values shape: torch.Size([370, 5]), text length: 370, prediction_probs shape: torch.Size([370, 5])


In [28]:
print(classification_report(real_values, predictions, target_names=target_list))

              precision    recall  f1-score   support

        cEXT       0.52      0.70      0.60       178
        cNEU       0.56      0.56      0.56       190
        cAGR       0.55      0.82      0.66       195
        cCON       0.56      0.85      0.68       186
        cOPN       0.63      0.46      0.53       190

   micro avg       0.56      0.68      0.61       939
   macro avg       0.56      0.68      0.60       939
weighted avg       0.56      0.68      0.60       939
 samples avg       0.56      0.65      0.57       939



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
#  calulate accuracy using sklearn for each trait
from sklearn.metrics import accuracy_score
accuracy_scores = {}
for i in range(len(target_list)):
    accuracy_scores[target_list[i]] = accuracy_score(real_values[:,i], predictions[:,i])
accuracy_scores

{'cEXT': 0.5459459459459459,
 'cNEU': 0.5486486486486486,
 'cAGR': 0.5486486486486486,
 'cCON': 0.5918918918918918,
 'cOPN': 0.5837837837837838}