In [1]:
!nvidia-smi

Thu May  2 01:32:31 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 551.78                 Driver Version: 551.78         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3060 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   44C    P0             23W /  128W |       0MiB /   6144MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## Roberta Base Model

### setup

In [1]:
import os
import pandas as pd
import numpy as np

import torch

from transformers import AutoTokenizer, AutoModel

from utils import *

import sys
sys.path.append('../datasets')
from mbti_500 import getDataLoaders

In [2]:
# setting the model name
PRE_TRAINED_MODEL_NAME = 'roberta-base'

# setting the dataset
dataset='MBTI 500 multi_label.csv'


# setting the data path
if os.path.exists(f'/datasets/mbti/{dataset}'):
    DATAPATH=f'/datasets/mbti/{dataset}'
else:
    DATAPATH=f'../data/{dataset}'

# setting the checkpoint path 
if os.path.exists('ckpts'):
    CHECKPOINTPATH = 'ckpts/Persnality_MBTI'
else:
    CHECKPOINTPATH = '../ckpts/Persnality_MBTI'

# training parameters
MAX_LEN = 512

# TOKENIZER
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

# setting the device
device = "cpu"

# setting the random seed
torch.manual_seed(99)
torch.cuda.manual_seed(99)
torch.cuda.manual_seed_all(99)
np.random.seed(99)

In [4]:
DATAPATH,CHECKPOINTPATH,device

('../data/MBTI 500 multi_label.csv', '../ckpts/Persnality_MBTI', 'cpu')

### Loading the Data with predictions

In [5]:
# loading the test data with the predictions
df=pd.read_csv('save_test.csv')
df.head()

Unnamed: 0,text,IE,NS,TF,JP,IE_true,NS_true,TF_true,JP_true
0,diffiuc get sense without actually rift idea l...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
1,quiet one people like reason hard look like ey...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,think video book anyday however problem qualit...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,seatle look like alright place love like frien...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,understand sometimes need immensely private go...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0


### Get First 100 mispredictions with equal distribution of classes

In [6]:
# get 100 mispredicted samples IE samples
exctracted_misprediction_IE = df[df['IE'] != df['IE_true']]
# get 100 mispredicted samples IE samples with equal number of 1,0
exctracted_misprediction_IE_1 = exctracted_misprediction_IE[exctracted_misprediction_IE['IE']==1].head(50)
exctracted_misprediction_IE_0 = exctracted_misprediction_IE[exctracted_misprediction_IE['IE']==0].head(50)
exctracted_misprediction_IE = pd.concat([exctracted_misprediction_IE_1,exctracted_misprediction_IE_0])
exctracted_misprediction_IE = exctracted_misprediction_IE.sample(frac=1).reset_index(drop=True)
count_0= exctracted_misprediction_IE.eq(0).sum()
count_1= exctracted_misprediction_IE.eq(1).sum()
count_0,count_1

(text        0
 IE         50
 NS         96
 TF         57
 JP         23
 IE_true    50
 NS_true    89
 TF_true    62
 JP_true    23
 dtype: int64,
 text        0
 IE         50
 NS          4
 TF         43
 JP         77
 IE_true    50
 NS_true    11
 TF_true    38
 JP_true    77
 dtype: int64)

In [7]:
# get 100 mispredicted samples NS samples
exctracted_misprediction_NS = df[df['NS'] != df['NS_true']]
exctracted_misprediction_NS_1 = exctracted_misprediction_NS[exctracted_misprediction_NS['NS']==1].head(50)
exctracted_misprediction_NS_0 = exctracted_misprediction_NS[exctracted_misprediction_NS['NS']==0].head(50)
exctracted_misprediction_NS = pd.concat([exctracted_misprediction_NS_1,exctracted_misprediction_NS_0])
exctracted_misprediction_NS = exctracted_misprediction_NS.sample(frac=1).reset_index(drop=True)

In [8]:
count_0= exctracted_misprediction_NS.eq(0).sum()
count_1= exctracted_misprediction_NS.eq(1).sum()
count_0,count_1

(text        0
 IE         83
 NS         50
 TF         47
 JP         37
 IE_true    72
 NS_true    50
 TF_true    63
 JP_true    38
 dtype: int64,
 text        0
 IE         17
 NS         50
 TF         53
 JP         63
 IE_true    28
 NS_true    50
 TF_true    37
 JP_true    62
 dtype: int64)

In [9]:
# get 100 mispredicted samples TF samples
exctracted_misprediction_TF = df[df['TF'] != df['TF_true']]
exctracted_misprediction_TF_1 = exctracted_misprediction_TF[exctracted_misprediction_TF['TF']==1].head(50)
exctracted_misprediction_TF_0 = exctracted_misprediction_TF[exctracted_misprediction_TF['TF']==0].head(50)
exctracted_misprediction_TF = pd.concat([exctracted_misprediction_TF_1,exctracted_misprediction_TF_0])
exctracted_misprediction_TF = exctracted_misprediction_TF.sample(frac=1).reset_index(drop=True)

In [10]:
count_0= exctracted_misprediction_TF.eq(0).sum()
count_1= exctracted_misprediction_TF.eq(1).sum()
count_0,count_1

(text        0
 IE         89
 NS         94
 TF         50
 JP         35
 IE_true    76
 NS_true    88
 TF_true    50
 JP_true    45
 dtype: int64,
 text        0
 IE         11
 NS          6
 TF         50
 JP         65
 IE_true    24
 NS_true    12
 TF_true    50
 JP_true    55
 dtype: int64)

In [11]:
#  get 100 mispredicted samples JP samples
exctracted_misprediction_JP = df[df['JP'] != df['JP_true']]
exctracted_misprediction_JP_1 = exctracted_misprediction_JP[exctracted_misprediction_JP['JP']==1].head(50)
exctracted_misprediction_JP_0 = exctracted_misprediction_JP[exctracted_misprediction_JP['JP']==0].head(50)
exctracted_misprediction_JP = pd.concat([exctracted_misprediction_JP_1,exctracted_misprediction_JP_0])
exctracted_misprediction_JP = exctracted_misprediction_JP.sample(frac=1).reset_index(drop=True)

In [12]:
count_0= exctracted_misprediction_JP.eq(0).sum()
count_1= exctracted_misprediction_JP.eq(1).sum()
count_0,count_1

(text        0
 IE         96
 NS         99
 TF         57
 JP         50
 IE_true    84
 NS_true    91
 TF_true    62
 JP_true    50
 dtype: int64,
 text        0
 IE          4
 NS          1
 TF         43
 JP         50
 IE_true    16
 NS_true     9
 TF_true    38
 JP_true    50
 dtype: int64)

In [13]:
labels_list = ['IE','NS','TF','JP']

In [14]:
# calculate the accuracy of the extracted_df
def get_accuracy_extracted_df(df,label):
    accuracy = {}
    accuracy[label] = (df[label]==df[label+'_true']).sum()/len(df)
    return accuracy

In [15]:
get_accuracy_extracted_df(exctracted_misprediction_IE,"IE"),get_accuracy_extracted_df(exctracted_misprediction_NS,"NS"),get_accuracy_extracted_df(exctracted_misprediction_TF,"TF"),get_accuracy_extracted_df(exctracted_misprediction_JP,"JP")

({'IE': 0.0}, {'NS': 0.0}, {'TF': 0.0}, {'JP': 0.0})

### LIME

#### Load the model

In [16]:
# import model
import sys
sys.path.append('../Models')
import MBTI_model_lime as model

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
model_no_words = model.ROBERTAClass(PRE_TRAINED_MODEL_NAME)
model_no_words.load_state_dict(torch.load(CHECKPOINTPATH + f'_clean_Best_{PRE_TRAINED_MODEL_NAME}.bin', map_location=torch.device(device)))
model_no_words.to(device)
print(f'{PRE_TRAINED_MODEL_NAME}_no_words loaded')
model_no_words.eval()

torch.set_grad_enabled(False)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


roberta-base_no_words loaded


<torch.autograd.grad_mode.set_grad_enabled at 0x2433a658d90>

In [18]:
import pickle as pkl
lime_explanation_IE= model.explain_model(model_no_words, exctracted_misprediction_IE["text"],aspect='IE')
# save the explanation to a pkl file
with open('lime_explanation_IE.pkl', 'wb') as f:
    pkl.dump(lime_explanation_IE, f)

In [19]:
lime_explanation_NS= model.explain_model(model_no_words, exctracted_misprediction_NS["text"],aspect='NS')
# save the explanation to a pkl file
with open('lime_explanation_NS.pkl', 'wb') as f:
    pkl.dump(lime_explanation_NS, f)

In [20]:
lime_explanation_TF= model.explain_model(model_no_words, exctracted_misprediction_TF["text"],aspect='TF')
# save the explanation to a pkl file
with open('lime_explanation_TF.pkl', 'wb') as f:
    pkl.dump(lime_explanation_TF, f)    

In [21]:
lime_explanation_JP= model.explain_model(model_no_words, exctracted_misprediction_JP["text"],aspect='JP')
# save the explanation to a pkl file
with open('lime_explanation_JP.pkl', 'wb') as f:
    pkl.dump(lime_explanation_JP, f)

### Remove the top 100 features and check the performance

In [25]:
# add a column to the extracted_df that contains with the text with the most important 100 tokens removed 
def remove_words(text,words):
    for word in words:
        text = text.replace(word,'')
        # remove extra spaces
        text = ' '.join(text.split())
    return text

def remove_100_tokens(lime_explanations,aspect,extracted_df):
    for i in range(len(lime_explanations)):
        # sort the words by importance
        sorted_explanation = sorted(lime_explanations[i].as_list(),key=lambda x: x[1],reverse=True)
        words = [word for word,weight in sorted_explanation[:100]]
        extracted_df.loc[i,aspect+'_no_words'] = remove_words(extracted_df.loc[i,'text'],words)   

In [26]:
remove_100_tokens(lime_explanation_IE,"IE",exctracted_misprediction_IE)
remove_100_tokens(lime_explanation_NS,"NS",exctracted_misprediction_NS)
remove_100_tokens(lime_explanation_TF,"TF",exctracted_misprediction_TF)
remove_100_tokens(lime_explanation_JP,"JP",exctracted_misprediction_JP)

In [27]:
# save the dataframes
exctracted_misprediction_IE.to_csv('exctracted_misprediction_IE.csv',index=False)
exctracted_misprediction_NS.to_csv('exctracted_misprediction_NS.csv',index=False)
exctracted_misprediction_TF.to_csv('exctracted_misprediction_TF.csv',index=False)
exctracted_misprediction_JP.to_csv('exctracted_misprediction_JP.csv',index=False)

In [6]:
# load the dataframes
exctracted_misprediction_IE = pd.read_csv('exctracted_misprediction_IE.csv')
exctracted_misprediction_NS = pd.read_csv('exctracted_misprediction_NS.csv')
exctracted_misprediction_TF = pd.read_csv('exctracted_misprediction_TF.csv')
exctracted_misprediction_JP = pd.read_csv('exctracted_misprediction_JP.csv')

### Calculate the performance metrics for each class

In [4]:
# import model
import sys
sys.path.append('../Models')
import roberta_mbti as model

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
roberta_model = model.ROBERTAClass(PRE_TRAINED_MODEL_NAME)
roberta_model.load_state_dict(torch.load(CHECKPOINTPATH + f'_clean_Best_{PRE_TRAINED_MODEL_NAME}.bin', map_location=torch.device(device)))
roberta_model.to(device)
print(f'{PRE_TRAINED_MODEL_NAME}_no_words loaded')
roberta_model.eval()

torch.set_grad_enabled(False)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


roberta-base_no_words loaded


<torch.autograd.grad_mode.set_grad_enabled at 0x17020dfe750>

In [7]:
# get the predictions for the extracted_df IE_no_words , NS_no_words , TF_no_words , JP_no_words
IE_no_words_pred=roberta_model.getPrediction(exctracted_misprediction_IE['IE_no_words'].to_list())
NS_no_words_pred=roberta_model.getPrediction(exctracted_misprediction_NS['NS_no_words'].to_list())
TF_no_words_pred=roberta_model.getPrediction(exctracted_misprediction_TF['TF_no_words'].to_list())
JP_no_words_pred=roberta_model.getPrediction(exctracted_misprediction_JP['JP_no_words'].to_list())

In [8]:
# round the predictions
IE_no_words_pred = np.round(IE_no_words_pred)
NS_no_words_pred = np.round(NS_no_words_pred)
TF_no_words_pred = np.round(TF_no_words_pred)
JP_no_words_pred = np.round(JP_no_words_pred)

In [9]:
# add the predictions of the extracted_df IE_no_words , NS_no_words , TF_no_words , JP_no_words for each of the aspects
exctracted_misprediction_IE['IE_no_words_pred'] = IE_no_words_pred[:,0]
exctracted_misprediction_NS['NS_no_words_pred'] = NS_no_words_pred[:,1]
exctracted_misprediction_TF['TF_no_words_pred'] = TF_no_words_pred[:,2]
exctracted_misprediction_JP['JP_no_words_pred'] = JP_no_words_pred[:,3]

In [10]:
# calculate the accuracy of the extracted_df
def get_accuracy_extracted_df_after(df,label):
    accuracy = {}
    accuracy[label] = (df[label+'_no_words_pred']==df[label+'_true']).sum()/len(df)
    return accuracy

In [11]:
get_accuracy_extracted_df_after(exctracted_misprediction_IE,"IE"),get_accuracy_extracted_df_after(exctracted_misprediction_NS,"NS"),get_accuracy_extracted_df_after(exctracted_misprediction_TF,"TF"),get_accuracy_extracted_df_after(exctracted_misprediction_JP,"JP")

({'IE': 0.4}, {'NS': 0.5}, {'TF': 0.5}, {'JP': 0.44})

In [12]:
import json
# save the accuracy increase
accuracy_scores = {}
accuracy_scores['IE'] = get_accuracy_extracted_df_after(exctracted_misprediction_IE,"IE")
accuracy_scores['NS'] = get_accuracy_extracted_df_after(exctracted_misprediction_NS,"NS")
accuracy_scores['TF'] = get_accuracy_extracted_df_after(exctracted_misprediction_TF,"TF")
accuracy_scores['JP'] = get_accuracy_extracted_df_after(exctracted_misprediction_JP,"JP")

with open('accuracy_scores.json', 'w') as f:
    json.dump(accuracy_scores, f)
    