In [1]:
!nvidia-smi

zsh:1: command not found: nvidia-smi


## Roberta Base Model

### setup

In [2]:
import os
import pandas as pd
import numpy as np

import torch

from transformers import AutoTokenizer, AutoModel


import sys
sys.path.append('../datasets')

In [3]:
# setting the model name
PRE_TRAINED_MODEL_NAME = 'roberta-base'

# setting the dataset
dataset='MBTI 500 multi_label.csv'


# setting the data path
if os.path.exists(f'/datasets/mbti/{dataset}'):
    DATAPATH=f'/datasets/mbti/{dataset}'
else:
    DATAPATH=f'../data/{dataset}'

# setting the checkpoint path 
if os.path.exists('ckpts'):
    CHECKPOINTPATH = 'ckpts/Persnality_MBTI'
else:
    CHECKPOINTPATH = '../ckpts/Persnality_MBTI'

# training parameters
MAX_LEN = 512

# TOKENIZER
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

# setting the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# setting the random seed
torch.manual_seed(99)
torch.cuda.manual_seed(99)
torch.cuda.manual_seed_all(99)
np.random.seed(99)

In [4]:
DATAPATH,CHECKPOINTPATH,device

('../data/MBTI 500 multi_label.csv',
 '../ckpts/Persnality_MBTI',
 device(type='cpu'))

### Loading the Data

In [5]:
df=pd.read_csv('save_test.csv')

In [6]:
labels_list = ['IE', 'NS', 'TF', 'JP']

### Load the model

In [7]:
# import model
import sys
sys.path.append('../Models')
import MBTI_model_lime as model


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/omarahmed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/omarahmed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
model_no_words = model.ROBERTAClass(PRE_TRAINED_MODEL_NAME)
model_no_words.load_state_dict(torch.load(CHECKPOINTPATH + f'_clean_Best_{PRE_TRAINED_MODEL_NAME}.bin', map_location=torch.device(device)))
model_no_words.to(device)
print(f'{PRE_TRAINED_MODEL_NAME}_no_words loaded')
model_no_words.eval()

torch.set_grad_enabled(False)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


roberta-base_no_words loaded


<torch.autograd.grad_mode.set_grad_enabled at 0x14ad7c750>

### Get First 100 correct predictions with equal distribution of classes

In [9]:
# get the correct predictions 
df_correct = df[(df['IE']==df['IE_true']) & (df['NS']==df['NS_true']) & (df['TF']==df['TF_true']) & (df['JP']==df['JP_true'])]

In [10]:
extracted_df = pd.DataFrame(columns=df_correct.columns)
# sort with rows that has 1 in IE, 1 in NS, 1 in TF, 1 in JP
df_correct = df_correct.sort_values(by=['IE','NS','TF','JP'],ascending=False)
# count 0s and 1s
count_0 = [0,0,0,0]
count_1 = [0,0,0,0]
def increment_count(row):
    for i in range(4):
        if row[i]==0:
            count_0[i]+=1
        else:
            count_1[i]+=1
def decrement_count(row):
    for i in range(4):
        if row[i]==0:
            count_0[i]-=1
        else:
            count_1[i]-=1            
for i in range(len(df_correct)):
    increment_count(df_correct.iloc[i][1:5])
    # if any of the counts is greater than 50, do not add the row to the extracted_df
    if count_0[0]>50 or count_0[1]>50 or count_0[2]>50 or count_0[3]>50 or count_1[0]>50 or count_1[1]>50 or count_1[2]>50 or count_1[3]>50:
        decrement_count(df_correct.iloc[i][1:5])
        continue
    # add the row to the extracted_df
    extracted_df=pd.concat([extracted_df,df_correct.iloc[[i]]],ignore_index=True)

  if row[i]==0:
  extracted_df=pd.concat([extracted_df,df_correct.iloc[[i]]],ignore_index=True)
  if row[i]==0:


In [11]:
count_0,count_1

([50, 50, 50, 50], [50, 50, 50, 50])

In [12]:
extracted_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   text     100 non-null    object 
 1   IE       100 non-null    float64
 2   NS       100 non-null    float64
 3   TF       100 non-null    float64
 4   JP       100 non-null    float64
 5   IE_true  100 non-null    float64
 6   NS_true  100 non-null    float64
 7   TF_true  100 non-null    float64
 8   JP_true  100 non-null    float64
dtypes: float64(8), object(1)
memory usage: 7.2+ KB


In [13]:
# calculate the accuracy of the extracted_df
def get_accuracy_extracted_df(df,labels_list):
    accuracy = {}
    for label in labels_list:
        accuracy[label] = (df[label]==df[label+'_true']).sum()/len(df)
    return accuracy

get_accuracy_extracted_df(extracted_df,labels_list)

{'IE': 1.0, 'NS': 1.0, 'TF': 1.0, 'JP': 1.0}

### LIME

In [14]:
import pickle
lime_explanation_IE= model.explain_model(model_no_words, extracted_df["text"],aspect='IE')
# save the explanation to a pkl file
with open('lime_explanation_IE.pkl', 'wb') as f:
    pickle.dump(lime_explanation_IE, f)

In [15]:
lime_explanation_NS= model.explain_model(model_no_words, extracted_df["text"],aspect='NS')
# save the explanation to a pkl file
with open('lime_explanation_NS.pkl', 'wb') as f:
    pickle.dump(lime_explanation_NS, f)

In [16]:
lime_explanation_TF= model.explain_model(model_no_words, extracted_df["text"],aspect='TF')
# save the explanation to a pkl file
with open('lime_explanation_TF.pkl', 'wb') as f:
    pickle.dump(lime_explanation_TF, f)   

In [17]:
lime_explanation_JP= model.explain_model(model_no_words, extracted_df["text"],aspect='JP')
# save the explanation to a pkl file
with open('lime_explanation_JP.pkl', 'wb') as f:
    pickle.dump(lime_explanation_JP, f)

In [18]:
# print the lime explanation
print('IE')
print(lime_explanation_IE[0].as_list())

IE
[('ne', 0.042200318246221356), ('come', 0.03792475333483115), ('gonna', 0.03591114203671764), ('xxfps', 0.03580061438505622), ('remindme', 0.03285327692308008), ('mountain', -0.028835826201961096), ('superior', 0.027087850434445084), ('intuition', 0.02645462413674346), ('fuck', 0.02530716541660381), ('call', -0.024500213389414282), ('around', 0.02402212728601511), ('actually', 0.02393943019679887), ('snooki', 0.023161856700141568), ('company', 0.022833391867277248), ('another', -0.022794919797993664), ('say', 0.022041486098803756), ('s', -0.021264440574876326), ('sometimes', -0.0210357717783274), ('electronic', 0.02070474849247822), ('itj', -0.020701263747063), ('sense', -0.020039185027198342), ('importance', 0.019880936102095705), ('something', -0.01986346563495163), ('le', 0.019751364093689504), ('either', -0.019733162957734217), ('exposure', -0.019616667791578387), ('like', -0.019107003295604573), ('look', 0.018965085557991194), ('shit', 0.01884696041038901), ('subjective', 0.018

In [23]:
# sort the explanation
sorted_explanation_IE = sorted(lime_explanation_IE[0].as_list(),key=lambda x: x[1],reverse=True)
sorted_explanation_IE

[('ne', 0.042200318246221356),
 ('come', 0.03792475333483115),
 ('gonna', 0.03591114203671764),
 ('xxfps', 0.03580061438505622),
 ('remindme', 0.03285327692308008),
 ('superior', 0.027087850434445084),
 ('intuition', 0.02645462413674346),
 ('fuck', 0.02530716541660381),
 ('around', 0.02402212728601511),
 ('actually', 0.02393943019679887),
 ('snooki', 0.023161856700141568),
 ('company', 0.022833391867277248),
 ('say', 0.022041486098803756),
 ('electronic', 0.02070474849247822),
 ('importance', 0.019880936102095705),
 ('le', 0.019751364093689504),
 ('look', 0.018965085557991194),
 ('shit', 0.01884696041038901),
 ('subjective', 0.018537529818627167),
 ('world', 0.017997152069101453),
 ('etc', 0.017906039310130923),
 ('betsey', 0.017287971625063635),
 ('get', 0.017253019362686844),
 ('place', 0.016835594059180524),
 ('virtually', 0.01653430800623291),
 ('different', 0.016495322628115174),
 ('r', 0.016191960386013057),
 ('day', 0.01576731121137415),
 ('figure', 0.015358947705603391),
 ('dom

### Remove the top 100 features and check the performance

In [24]:
# add a column to the extracted_df that contains with the text with the most important 100 tokens removed 
def remove_words(text,words):
    for word in words:
        text = text.replace(word,'')
        # remove extra spaces
        text = ' '.join(text.split())
    return text
def remove_100_tokens(lime_explanations,aspect):
    for i in range(len(lime_explanations)):
        # sort the words by importance
        sorted_explanation = sorted(lime_explanations[i].as_list(),key=lambda x: x[1],reverse=True)
        words = [word for word,weight in sorted_explanation[:100]]
        extracted_df.loc[i,aspect+'_no_words'] = remove_words(extracted_df.loc[i,'text'],words)   

In [25]:
remove_100_tokens(lime_explanation_IE,'IE')
remove_100_tokens(lime_explanation_NS,'NS')
remove_100_tokens(lime_explanation_TF,'TF')
remove_100_tokens(lime_explanation_JP,'JP')

### Calculate the performance metrics for each class

In [33]:
# import model
import sys
sys.path.append('../Models')
import roberta_mbti as model

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/omarahmed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/omarahmed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [34]:
roberta_model = model.ROBERTAClass(PRE_TRAINED_MODEL_NAME)
roberta_model.load_state_dict(torch.load(CHECKPOINTPATH + f'_clean_Best_{PRE_TRAINED_MODEL_NAME}.bin', map_location=torch.device(device)))
roberta_model.to(device)
print(f'{PRE_TRAINED_MODEL_NAME}_no_words loaded')
roberta_model.eval()

torch.set_grad_enabled(False)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


roberta-base_no_words loaded


<torch.autograd.grad_mode.set_grad_enabled at 0x2b9e0b290>

In [35]:
# get the predictions for the extracted_df IE_no_words , NS_no_words , TF_no_words , JP_no_words
IE_no_words_pred=roberta_model.getPrediction(extracted_df['IE_no_words'].to_list())
NS_no_words_pred=roberta_model.getPrediction(extracted_df['NS_no_words'].to_list())
TF_no_words_pred=roberta_model.getPrediction(extracted_df['TF_no_words'].to_list())
JP_no_words_pred=roberta_model.getPrediction(extracted_df['JP_no_words'].to_list())

In [36]:
# round the predictions
IE_no_words_pred = np.round(IE_no_words_pred)
NS_no_words_pred = np.round(NS_no_words_pred)
TF_no_words_pred = np.round(TF_no_words_pred)
JP_no_words_pred = np.round(JP_no_words_pred)

In [37]:
# add the predictions of the extracted_df IE_no_words , NS_no_words , TF_no_words , JP_no_words for each of the aspects
extracted_df['IE_no_words_pred'] = IE_no_words_pred[:,0]
extracted_df['NS_no_words_pred'] = NS_no_words_pred[:,1]
extracted_df['TF_no_words_pred'] = TF_no_words_pred[:,2]
extracted_df['JP_no_words_pred'] = JP_no_words_pred[:,3]

In [39]:
extracted_df.head(n=2)

Unnamed: 0,text,IE,NS,TF,JP,IE_true,NS_true,TF_true,JP_true,IE_no_words,NS_no_words,TF_no_words,JP_no_words,IE_no_words_pred,NS_no_words_pred,TF_no_words_pred,JP_no_words_pred
0,understand relate whatsoever close friend eith...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,ndestand elate watsoeve fiend eite axiliay con...,destad e wtsoeve clo fied eithe domiat axiliay...,ersta whatsoever close frie dominant axiliary ...,undestand elate wsoeve close iend eithe inant ...,0.0,0.0,0.0,1.0
1,account net karma individual comment suggestio...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,account nt individual commnt bot many sub us d...,accout t karma idividual commt suggstio rddit ...,t karma idividual commt suggstio rddit bot may...,account net karma individual y u include defau...,1.0,0.0,0.0,0.0


In [40]:
# calculate the accuracy of the extracted_df
def get_accuracy_extracted_df_after(df,labels_list):
    accuracy = {}
    for label in labels_list:
        accuracy[label] = (df[label+'_no_words_pred']==df[label+'_true']).sum()/len(df)
    return accuracy

get_accuracy_extracted_df_after(extracted_df,labels_list)

{'IE': 0.77, 'NS': 0.87, 'TF': 0.58, 'JP': 0.84}

In [41]:
# save the accuracy drop in a json file
import json
accuracy_drop =get_accuracy_extracted_df_after(extracted_df,labels_list)
with open('accuracy_drop.json', 'w') as f:
    json.dump(accuracy_drop, f)
    

In [42]:
# save intermediate results of the extracted_df
extracted_df.to_csv('extracted_df.csv',index=False)