In [1]:
!nvidia-smi

Fri Apr  5 14:17:12 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.116.04   Driver Version: 525.116.04   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro P5000        Off  | 00000000:00:05.0 Off |                  Off |
| 26%   41C    P8     7W / 180W |      2MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## RoBERTa Base Model

In [2]:
import os
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

from transformers import AutoTokenizer, AutoModel
import tqdm.notebook as tq

from utils import saveHistory,printHistory, plot_training_history,get_metrics,get_predictions



In [3]:
# Hugging Face identifier of the pre-trained model (also used to load the tokenizer)
PRE_TRAINED_MODEL_NAME = 'roberta-base'

# file name of the csv that holds the posts and their labels
dataset = 'MBTI 500 multi_label.csv'

# data path: use the /datasets mount when the file is there, else the repo-relative data folder
DATAPATH = (
    f'/datasets/mbti/{dataset}'
    if os.path.exists(f'/datasets/mbti/{dataset}')
    else f'../data/{dataset}'
)

# checkpoint path prefix: a local ckpts folder when present, else the one a level up
CHECKPOINTPATH = (
    'ckpts/Persnality_MBTI'
    if os.path.exists('ckpts')
    else '../ckpts/Persnality_MBTI'
)

# training parameters
MAX_LEN = 512            # sequence length every post is padded / truncated to
TRAIN_BATCH_SIZE = 20
VALID_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16
EPOCHS = 10              # upper bound on the number of training epochs
THRESHOLD = 0.5          # threshold for the sigmoid function

# tokenizer that matches the pre-trained model
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

# run on the GPU when one is visible, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# fix the torch (CPU + CUDA) and numpy random seeds
torch.manual_seed(99)
torch.cuda.manual_seed(99)
torch.cuda.manual_seed_all(99)
np.random.seed(99)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [4]:
DATAPATH,CHECKPOINTPATH

('/datasets/mbti/MBTI 500 multi_label.csv', 'ckpts/Persnality_MBTI')

### Loading the Data

In [5]:
# read the csv found at DATAPATH (decoded as UTF-8) into a pandas dataframe
data = pd.read_csv(DATAPATH,encoding='utf-8')

In [6]:
# create the dataset class
class MBTIDataset(Dataset):
    """Torch dataset that tokenizes one post per item and pairs it with its labels.

    Args:
        data: dataframe with a ``posts`` column and one column per label.
        labels_list: names of the label columns, in the order the targets are emitted.
        tokenizer: object exposing ``encode_plus`` (a Hugging Face tokenizer in this notebook).
        max_len: length every encoded post is padded / truncated to.
    """

    def __init__(self, data, labels_list, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        # label columns as a plain array, addressed by row position
        self.targets = self.data[labels_list].values

    def __len__(self):
        """Number of rows (posts) in the wrapped dataframe."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the encoded post and float targets of the row at position ``index``.

        Returns:
            dict with ``text`` (the raw post), ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` (flattened tensors from the tokenizer) and ``targets``
            (float tensor with one value per label).
        """
        # Positional lookup: `self.targets` is indexed by position, so the text must be
        # too. A label lookup (`posts[index]`) breaks as soon as the frame's index is
        # not 0..n-1, e.g. after filtering rows.
        text = self.data['posts'].iloc[index]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        return {
            'text': text,
            # the tokenizer is asked for 'pt' tensors; flatten() makes them 1-D per sample
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [7]:
# label names = every column after the first two
labels_list = list(data.columns[2:])
print(f'Labels list: {labels_list}')

Labels list: ['IE', 'NS', 'TF', 'JP']


In [8]:
# wrap the full dataframe in the tokenizing dataset
dataset = MBTIDataset(data, labels_list, tokenizer, MAX_LEN)

# 80% train, 10% validation, whatever is left goes to test
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - (train_size + val_size)

train_dataset, val_dataset, test_dataset = random_split(
    dataset,
    [train_size, val_size, test_size],
)

# one loader per split; each one reshuffles its split on every pass
train_data_loader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=TRAIN_BATCH_SIZE,
)
val_data_loader = DataLoader(
    val_dataset,
    shuffle=True,
    batch_size=VALID_BATCH_SIZE,
)
test_data_loader = DataLoader(
    test_dataset,
    shuffle=True,
    batch_size=TEST_BATCH_SIZE,
)

In [9]:
# pull a single batch from the training loader and display the shapes of its tensors
batch=next(iter(train_data_loader))
batch['input_ids'].shape, batch['attention_mask'].shape, batch['targets'].shape

(torch.Size([20, 512]), torch.Size([20, 512]), torch.Size([20, 4]))

In [10]:
# batch

In [11]:
class ROBERTAClass(torch.nn.Module):
    """Pre-trained transformer encoder followed by a dropout + linear multi-label head.

    The encoder is loaded with ``AutoModel.from_pretrained``; its ``pooler_output`` goes
    through dropout and a single linear layer that emits ``num_classes`` raw logits
    (``forward`` applies no sigmoid).

    Args:
        PRE_TRAINED_MODEL_NAME: model identifier handed to ``AutoModel.from_pretrained``.
        num_classes: number of logits produced per sample.
        dropout: dropout probability applied to the pooled encoder output.
    """

    def __init__(self, PRE_TRAINED_MODEL_NAME, num_classes=4, dropout=0.3):
        super(ROBERTAClass, self).__init__()
        self.bert_model = AutoModel.from_pretrained(PRE_TRAINED_MODEL_NAME, return_dict=True, output_attentions=True)
        self.dropout = torch.nn.Dropout(dropout)
        # Size the head from the loaded encoder instead of hard-coding 768, so an encoder
        # with a different hidden width can be plugged in through the name parameter.
        # 768 remains the fallback when the encoder exposes no `config.hidden_size`.
        encoder_config = getattr(self.bert_model, 'config', None)
        hidden_size = getattr(encoder_config, 'hidden_size', 768)
        self.linear = torch.nn.Linear(hidden_size, num_classes)

    def _encode(self, input_ids, attn_mask, token_type_ids):
        """Run the encoder once and return its full output object."""
        return self.bert_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids
        )

    def forward(self, input_ids, attn_mask, token_type_ids):
        """Return the raw logits computed from the encoder's pooled output."""
        encoded = self._encode(input_ids, attn_mask, token_type_ids)
        output_dropout = self.dropout(encoded.pooler_output)
        return self.linear(output_dropout)

    def getAttention(self, input_ids, attn_mask, token_type_ids):
        """Return the ``attentions`` field of the encoder output for the given inputs."""
        return self._encode(input_ids, attn_mask, token_type_ids).attentions

    def getPrediction(self, output):
        """Turn logits into 0/1 predictions: sigmoid, then round to the nearest integer."""
        # torch.round rounds halves to the nearest even integer, so a probability of
        # exactly 0.5 becomes 0
        prediction = torch.sigmoid(output)
        prediction = torch.round(prediction)
        return prediction

In [12]:
# train the model for one epoch
def train_epoch(dataloader, model, loss_fn, optimizer, scheduler=None):
    """Run one optimisation pass over ``dataloader``.

    Args:
        dataloader: iterable of batches; each batch is a mapping with ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``targets`` tensors.
        model: module called as ``model(input_ids=..., attn_mask=..., token_type_ids=...)``
            that also provides ``getPrediction(outputs)``.
        loss_fn: callable ``loss_fn(outputs, targets)`` returning a scalar loss tensor.
        optimizer: optimizer stepped once per batch.
        scheduler: optional scheduler stepped once per batch, right after the optimizer.

    Returns:
        ``(accuracy, mean_loss)``: the fraction of individual label predictions that match
        the targets over the whole pass, and the mean of the per-batch losses.
    """
    model = model.train()
    losses = []
    correct_predictions = 0
    sample_size = 0

    loop = tq.tqdm(enumerate(dataloader), total=len(dataloader),
                   leave=True, colour='steelblue')

    for _, d in loop:

        # move the inputs to the compute device
        input_ids = d["input_ids"].to(device, dtype=torch.long)
        attention_mask = d["attention_mask"].to(device, dtype=torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)

        # targets are cast to float, matching what eval_model does
        targets = d["targets"].to(device, dtype=torch.float)

        # forward pass and loss
        outputs = model(input_ids=input_ids, attn_mask=attention_mask, token_type_ids=token_type_ids)
        loss = loss_fn(outputs, targets)

        # backward pass with gradient-norm clipping
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0)

        # parameter update
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        # bookkeeping only, so the logits are detached from the graph
        preds = model.getPrediction(outputs.detach())
        correct_predictions += torch.sum(preds == targets).item()
        # numel() counts the label entries without copying the tensor to the host
        sample_size += targets.numel()
        losses.append(loss.item())

    # accuracy over every label entry seen, and the mean batch loss
    return correct_predictions * 1.0 / sample_size, np.mean(losses)

In [13]:
# evaluate the model
def eval_model(dataloader, model, loss_fn):
    """Score ``model`` on ``dataloader`` without updating any parameter.

    Args:
        dataloader: iterable of batches; each batch is a mapping with ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``targets`` tensors.
        model: module called as ``model(input_ids=..., attn_mask=..., token_type_ids=...)``
            that also provides ``getPrediction(outputs)``.
        loss_fn: callable ``loss_fn(outputs, targets)`` returning a scalar loss tensor.

    Returns:
        ``(accuracy, mean_loss)``: the fraction of individual label predictions that match
        the targets, and the mean of the per-batch losses.
    """
    model = model.eval()
    losses = []
    correct_predictions = 0
    sample_size = 0
    # no gradients are needed for evaluation
    with torch.no_grad():
        for d in dataloader:
            input_ids = d["input_ids"].to(device, dtype=torch.long)
            attention_mask = d["attention_mask"].to(device, dtype=torch.long)
            token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
            targets = d["targets"].to(device, dtype=torch.float)

            outputs = model(input_ids=input_ids, attn_mask=attention_mask, token_type_ids=token_type_ids)

            loss = loss_fn(outputs, targets)
            preds = model.getPrediction(outputs)
            correct_predictions += torch.sum(preds == targets).item()
            # numel() counts the label entries without copying the tensor to the host
            sample_size += targets.numel()
            losses.append(loss.item())

    return correct_predictions * 1.0 / sample_size, np.mean(losses)

In [14]:
# Binary cross-entropy on raw logits: the sigmoid is folded into the loss itself.
def loss_fn(outputs, targets):
    """Return the mean binary cross-entropy between the logits ``outputs`` and ``targets``."""
    return nn.functional.binary_cross_entropy_with_logits(outputs, targets)

In [15]:
import torch
import os

def save_checkpoint(model, optimizer, epoch, best_val_loss, best_accuracy, checkpoint_path,history):
    """Write a full training checkpoint to ``checkpoint_path``.

    The checkpoint is a dict with the keys ``epoch``, ``model_state_dict``,
    ``optimizer_state_dict``, ``best_val_loss``, ``best_accuracy`` and ``history``.

    Args:
        model: module whose ``state_dict()`` is stored.
        optimizer: optimizer whose ``state_dict()`` is stored.
        epoch: epoch number the checkpoint belongs to.
        best_val_loss: best validation loss seen so far.
        best_accuracy: validation accuracy recorded together with the best loss.
        checkpoint_path: destination file path (or anything else ``torch.save`` accepts).
        history: training history object stored as-is.
    """
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'best_val_loss': best_val_loss,
        'best_accuracy': best_accuracy,
        'history': history
    }

    # anything that is not a filesystem path (e.g. an open file object) is handed
    # straight to torch.save, as before
    if not isinstance(checkpoint_path, (str, os.PathLike)):
        torch.save(checkpoint, checkpoint_path)
        return

    target_path = os.fspath(checkpoint_path)
    parent_dir = os.path.dirname(target_path)
    if parent_dir:
        # a missing checkpoint folder should not cost a whole epoch of training
        os.makedirs(parent_dir, exist_ok=True)

    # Write to a sibling temp file and rename it over the target, so an interrupted
    # save never leaves a truncated archive under the final name.
    tmp_path = target_path + '.tmp'
    try:
        torch.save(checkpoint, tmp_path)
        os.replace(tmp_path, target_path)
    finally:
        # only still present when the save or the rename failed
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def load_checkpoint(model, optimizer, checkpoint_path, map_location='cpu'):
    """Restore model / optimizer state and the training bookkeeping from a checkpoint.

    Expects a dict with the keys ``epoch``, ``model_state_dict``,
    ``optimizer_state_dict``, ``best_val_loss``, ``best_accuracy`` and ``history``.

    Args:
        model: module that receives ``model_state_dict``.
        optimizer: optimizer that receives ``optimizer_state_dict``.
        checkpoint_path: path of the checkpoint file.
        map_location: forwarded to ``torch.load``. The default ``'cpu'`` lets a checkpoint
            written on a GPU machine be read on a host without one; ``load_state_dict``
            then copies the values into the existing parameters.

    Returns:
        ``(model, optimizer, epoch, best_val_loss, best_accuracy, history)``.

    Raises:
        FileNotFoundError: when ``checkpoint_path`` does not exist.
    """
    if not os.path.exists(checkpoint_path):
        raise FileNotFoundError(f"Checkpoint file '{checkpoint_path}' not found.")

    # SECURITY: the checkpoint carries non-tensor objects (the history container), so it
    # needs the full unpickler, which can run arbitrary code. Only load trusted files.
    # Passing weights_only explicitly keeps this independent of the installed torch default.
    checkpoint = torch.load(checkpoint_path, map_location=map_location, weights_only=False)

    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    best_val_loss = checkpoint['best_val_loss']
    best_accuracy = checkpoint['best_accuracy']
    history = checkpoint['history']

    return model, optimizer, epoch, best_val_loss, best_accuracy, history

In [16]:
from collections import defaultdict

def _latest_epoch_checkpoint(prefix, model_id, last_epoch):
    """Return the path of the highest-numbered per-epoch checkpoint on disk, or None."""
    for candidate_epoch in range(last_epoch, 0, -1):
        candidate = prefix + f'_{candidate_epoch}_{model_id}.bin'
        if os.path.exists(candidate):
            return candidate
    return None


def train(PRE_TRAINED_MODEL_NAME, model_name, early_stopping_patience=3,resume_training=False):
    """Fine-tune the classifier with early stopping on the validation loss.

    Each epoch writes a full checkpoint to ``CHECKPOINTPATH + '_{epoch}_{name}.bin'``.
    Whenever the validation loss improves, the bare model ``state_dict`` is also written
    to ``CHECKPOINTPATH + '_Best_{name}.bin'``.

    Args:
        PRE_TRAINED_MODEL_NAME: encoder identifier, also embedded in the checkpoint names.
        model_name: accepted for backward compatibility; not used by this function.
        early_stopping_patience: consecutive epochs without a better validation loss
            after which training stops.
        resume_training: when True, continue from the most recent per-epoch checkpoint
            found on disk instead of starting from epoch 1.

    Returns:
        The history mapping with the lists ``train_acc``, ``train_loss``, ``val_acc``
        and ``val_loss`` (one entry per completed epoch).

    Raises:
        FileNotFoundError: when ``resume_training`` is set and no per-epoch checkpoint exists.
    """
    model = ROBERTAClass(PRE_TRAINED_MODEL_NAME)
    model.to(device)

    # Setting the optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

    history = defaultdict(list)
    best_accuracy = 0
    best_val_loss = float('inf')
    epochs_without_improvement = 0  # consecutive epochs without a better validation loss
    start_epoch = 1

    best_model_path = CHECKPOINTPATH + f'_Best_{PRE_TRAINED_MODEL_NAME}.bin'

    if resume_training:
        # The `_Best_` file holds only a state_dict, so it cannot be resumed from.
        # The per-epoch files are full checkpoints (optimizer state, history, ...).
        resume_path = _latest_epoch_checkpoint(CHECKPOINTPATH, PRE_TRAINED_MODEL_NAME, EPOCHS)
        if resume_path is None:
            raise FileNotFoundError(
                f"No per-epoch checkpoint found for prefix '{CHECKPOINTPATH}' and model '{PRE_TRAINED_MODEL_NAME}'."
            )
        model, optimizer, last_epoch, best_val_loss, best_accuracy, history = load_checkpoint(model, optimizer, resume_path)
        start_epoch = last_epoch + 1  # Start from the next epoch
        # keep the auto-creating behaviour even if a plain dict was stored
        history = defaultdict(list, history)
        # rebuild the early-stopping counter from the recorded validation losses
        val_losses = list(history.get('val_loss', []))
        if val_losses:
            best_index = min(range(len(val_losses)), key=val_losses.__getitem__)
            epochs_without_improvement = len(val_losses) - 1 - best_index

    print(f'{PRE_TRAINED_MODEL_NAME}')
    for epoch in range(start_epoch, EPOCHS + 1):
        print(f'Epoch {epoch}/{EPOCHS}')
        train_acc, train_loss = train_epoch(train_data_loader, model, loss_fn, optimizer)
        val_acc, val_loss = eval_model(val_data_loader, model, loss_fn)

        print(f'train_loss={train_loss:.4f}, val_loss={val_loss:.4f} train_acc={train_acc:.4f}, val_acc={val_acc:.4f}')

        history['train_acc'].append(train_acc)
        history['train_loss'].append(train_loss)
        history['val_acc'].append(val_acc)
        history['val_loss'].append(val_loss)

        improved = val_loss < best_val_loss
        if improved:
            best_val_loss = val_loss
            best_accuracy = val_acc
            epochs_without_improvement = 0
            # Save the best model. Write to a temp file and rename it, so an interrupted
            # run cannot leave a truncated `_Best_` file that later cells treat as valid.
            tmp_best_path = best_model_path + '.tmp'
            torch.save(model.state_dict(), tmp_best_path)
            os.replace(tmp_best_path, best_model_path)
        else:
            epochs_without_improvement += 1

        # a full checkpoint is written after every epoch, improved or not
        save_checkpoint(model, optimizer, epoch, best_val_loss, best_accuracy, CHECKPOINTPATH + f'_{epoch}_{PRE_TRAINED_MODEL_NAME}.bin', history)

        # Check for early stopping
        if not improved and epochs_without_improvement >= early_stopping_patience:
            print(f'Early stopping triggered at epoch {epoch} after {epochs_without_improvement} epochs without improvement.')
            break

    print(f'{PRE_TRAINED_MODEL_NAME} Best val accuracy: {best_accuracy}')
    print(f'{PRE_TRAINED_MODEL_NAME} Best val loss: {best_val_loss}')
    return history


In [17]:
# Train only when no best-model checkpoint is on disk yet.
# When the file already exists, training is skipped and `history` stays None.

history = None
if not os.path.exists(CHECKPOINTPATH + f'_Best_{PRE_TRAINED_MODEL_NAME}.bin'):
    history = train(PRE_TRAINED_MODEL_NAME, 'Roberta MBTI')
    saveHistory(history)
    plot_training_history(history)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


roberta-base
Epoch 1/10


  0%|          | 0/4243 [00:00<?, ?it/s]

train_loss=0.2356, val_loss=0.1890 train_acc=0.9067, val_acc=0.9291
Epoch 2/10


  0%|          | 0/4243 [00:00<?, ?it/s]

train_loss=0.1625, val_loss=0.1626 train_acc=0.9386, val_acc=0.9409
Epoch 3/10


  0%|          | 0/4243 [00:00<?, ?it/s]

: 

In [None]:
# Rebuild the architecture and restore the best weights written during training.
model = ROBERTAClass(PRE_TRAINED_MODEL_NAME)
# map_location lets weights saved on a GPU be restored on whichever device is active
model.load_state_dict(
    torch.load(CHECKPOINTPATH + f'_Best_{PRE_TRAINED_MODEL_NAME}.bin', map_location=device)
)
model.to(device)
# inference mode: switches the dropout layers off for the evaluation cells that follow
model.eval()
print(f'{PRE_TRAINED_MODEL_NAME} loaded')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: PytorchStreamReader failed reading zip archive: failed finding central directory

In [None]:
printHistory(history)

Epoch 1/3
train_loss=0.2435, val_loss=0.1778 train_acc=0.9024, val_acc=0.9341
-----------------------------------------
Epoch 2/3
train_loss=0.1649, val_loss=0.1593 train_acc=0.9373, val_acc=0.9403
-----------------------------------------
Epoch 3/3
train_loss=0.1392, val_loss=0.1514 train_acc=0.9480, val_acc=0.9430
-----------------------------------------


### Testing the model

In [None]:
# get the metrics "in utils.py"
get_metrics(model,test_data_loader,labels_list,device)

Accuracy 0.9417892156862745
Accuracy Scores
{'IE': 0.9404223227752639, 'NS': 0.9759615384615384, 'TF': 0.9443815987933635, 'JP': 0.9063914027149321}
classification_report
              precision    recall  f1-score   support

          IE       0.93      0.81      0.87      2513
          NS       0.90      0.79      0.84       869
          TF       0.90      0.94      0.92      3621
          JP       0.89      0.95      0.92      6091

   micro avg       0.90      0.91      0.91     13094
   macro avg       0.91      0.87      0.89     13094
weighted avg       0.90      0.91      0.91     13094
 samples avg       0.73      0.72      0.72     13094



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Testing the model #2: evaluation on a second MBTI dataset

In [None]:
# read the second evaluation csv (decoded as UTF-8) into a dataframe
testing_dataset=pd.read_csv('/datasets/mbti_test_1/mbti_1_cleaned.csv',encoding='utf-8')

In [None]:
testing_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8674 entries, 0 to 8673
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   IE      8674 non-null   int64 
 1   NS      8674 non-null   int64 
 2   TF      8674 non-null   int64 
 3   JP      8674 non-null   int64 
 4   posts   8674 non-null   object
dtypes: int64(4), object(1)
memory usage: 339.0+ KB


In [None]:
# wrap the second evaluation dataframe in the same tokenizing dataset
testing_dataset = MBTIDataset(testing_dataset, labels_list, tokenizer, MAX_LEN)
# Evaluate in a fixed order. With shuffle=True and drop_last=True every run dropped a
# different random set of trailing samples, so the reported metrics were not reproducible.
# NOTE(review): drop_last=True is kept from the original and still discards the final
# partial batch; confirm whether get_metrics needs full batches, and drop it if not.
testing_data_loader = DataLoader(testing_dataset, batch_size=20, shuffle=False, drop_last=True)

In [None]:
get_metrics(model,testing_data_loader,labels_list,device)

Accuracy 0.8955254041570438
Accuracy Scores
{'IE': 0.9005773672055427, 'NS': 0.9333718244803695, 'TF': 0.8862586605080831, 'JP': 0.8618937644341801}
classification_report
              precision    recall  f1-score   support

          IE       0.84      0.70      0.77      1996
          NS       0.85      0.63      0.72      1194
          TF       0.87      0.93      0.90      4689
          JP       0.85      0.93      0.89      5227

   micro avg       0.86      0.87      0.86     13106
   macro avg       0.85      0.80      0.82     13106
weighted avg       0.86      0.87      0.86     13106
 samples avg       0.77      0.77      0.76     13106



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
