In [3]:
import numpy as np
import pandas as pd
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import warnings
import pickle
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv('../data/mbti_1.csv')

In [5]:
def clean(post):
    post_split = post.split('|||')
    post_split = [clean_dots(ind_post) for ind_post in post_split if clean_dots(ind_post)]
    # split the kaggle data set posts by |||
    post_split_split = [x.split(' ') for x in post_split]
    
    # removes any 'words' that have http:// or https:// in them
    return_list = [[item for item in sentence if ('http://' not in item and 'https://' not in item)] for sentence in post_split_split]
    
    # returns a list of posts if they are not empty after removing the links
    return [' '.join(sentence) for sentence in return_list if sentence]

In [6]:
def clean_dots(ind_post):
    if(re.search('[a-zA-Z]', ind_post)):
        if(ind_post.endswith('...') or ind_post.endswith("...'")):
            if(re.search('.*[\.?!]\s', ind_post)):
                return(re.search('.*[\.?!]\s', ind_post).group(0).strip())
            else:
                return(ind_post)
    return ''

In [7]:
df_dict = {'INTJ':0, 'INTP':0, 'ENTJ':1, 'ENTP':1, 
           'INFJ':0, 'INFP':0, 'ENFJ':1, 'ENFP':1, 
           'ISTJ':0, 'ISFJ':0, 'ESTJ':1, 'ESFJ':1, 
           'ISTP':0, 'ISFP':0, 'ESTP':1, 'ESFP':1}

In [8]:
df_split = {'INTJ':[], 'INTP':[], 'ENTJ':[], 'ENTP':[], 
           'INFJ':[], 'INFP':[], 'ENFJ':[], 'ENFP':[], 
           'ISTJ':[], 'ISFJ':[], 'ESTJ':[], 'ESFJ':[], 
           'ISTP':[], 'ISFP':[], 'ESTP':[], 'ESFP':[]}

In [9]:
def extract(mbti, post, list_):
    for ind_post in post:
        list_.append((ind_post, mbti))

In [10]:
for mbti in df_dict.keys():
    if(df[df['type'] == mbti].shape[0] < 200):
        df_dict[mbti] = df[df['type'] == mbti]
    else:
        df_dict[mbti] = df[df['type'] == mbti].sample(n=200)
    
    df_dict[mbti]['post_split'] = df_dict[mbti].posts.apply(clean)
    
    df_dict[mbti].post_split.apply(lambda x: extract(mbti, x, df_split[mbti]))
    print(mbti, df_dict[mbti].shape)

INTJ (200, 3)
INTP (200, 3)
ENTJ (200, 3)
ENTP (200, 3)
INFJ (200, 3)
INFP (200, 3)
ENFJ (190, 3)
ENFP (200, 3)
ISTJ (200, 3)
ISFJ (166, 3)
ESTJ (39, 3)
ESFJ (42, 3)
ISTP (200, 3)
ISFP (200, 3)
ESTP (89, 3)
ESFP (48, 3)


In [11]:
total_posts = 0
for mbti in df_split:
    print(mbti,len(df_split[mbti]))
    total_posts += len(df_split[mbti])
total_posts

INTJ 4988
INTP 5180
ENTJ 5337
ENTP 5130
INFJ 5705
INFP 5181
ENFJ 5461
ENFP 5645
ISTJ 5149
ISFJ 4491
ESTJ 1017
ESFJ 1190
ISTP 4834
ISFP 4732
ESTP 2079
ESFP 973


67092

In [12]:
df_split['INFP'][0]

("Yes, it's such a good game! I love most RPGs, MMORPGs, come to think of it every other genre too but those are my favorites.",
 'INFP')

In [13]:
df_dict['INFP'].post_split.iloc[0]

["Yes, it's such a good game! I love most RPGs, MMORPGs, come to think of it every other genre too but those are my favorites.",
 "Hello, I thought I should probably introduce myself since I've been here a couple of days, it isn't my first post but better late than never.  I love this site and the people here.",
 'All amazing suggestions!! Everyone has such a good point about each. Lots that I had never though of before.',
 'Wow I think you have just re-accounted the story of my life, I honestly agree with every single thing you have said.',
 'Definitely! really good points about the housing as well, and people are fairly friendly too unless you are brandishing a sword at them or god forbid your holding fire too close to them haha.',
 "The sims that's such a good one!",
 'I completely understand where everyone is coming from, for me it is so easy to feel this way as well. What keeps me going is hope.',
 'I definitely see your point about Bioshock, maybe more of a rare holiday that make

In [32]:
df_final_train = pd.DataFrame(columns=["text", "labels"])
df_final_test = df_final_train.copy()
df_final_train

Unnamed: 0,text,labels


In [33]:
df_final_test

Unnamed: 0,text,labels


In [30]:
for mbti in df_split.keys():
    test, train = train_test_split(pd.DataFrame(df_split[mbti], columns=["text", "labels"]), test_size=0.9)
    df_final_train = pd.concat([df_final_train, train])
    df_final_test = pd.concat([df_final_test, test])

In [31]:
df_final_test

Unnamed: 0,text,labels
3996,No. ENFPs and INTJs supposedly go well togeth...,0
1650,Logic is not necessarily the same as math. Ju...,0
2665,I have to admit it. to myself at least... I li...,0
1358,"No, not repeatedly. Maybe for another person, ...",0
3276,My mom is an INFJ. She would drive people who ...,0
...,...,...
389,I just love men who read. Cannot deal with som...,ESFP
595,How do you exactly become a fitness or persona...,ESFP
833,Thanks everybody :). Was nice to know. :happy:...,ESFP
892,$250 for 3 months actually sounds like a prett...,ESFP


In [18]:
df_final_train

Unnamed: 0,text,labels
2278,Sgt Pepper Their later albums were excellent ...,INTJ
3324,Perhaps asking the OP for clarification/expans...,INTJ
519,1 is more about imposition - IFP is more often...,INTJ
3229,you were first to request credit for being wis...,INTJ
4535,Do you ever ask him what's on his mind or what...,INTJ
...,...,...
398,A lot of people misunderstand the term 'valida...,ESFP
401,dealing with emotions is one big problem with ...,ESFP
761,im more for savoury foods than sweet. i can ea...,ESFP
740,The song Banshee Beat by Animal Collective has...,ESFP


In [28]:
df_final = pd.concat([df_final_train, df_final_test])

In [29]:
df_final

Unnamed: 0,text,labels
2278,Sgt Pepper Their later albums were excellent ...,0
3324,Perhaps asking the OP for clarification/expans...,0
519,1 is more about imposition - IFP is more often...,0
3229,you were first to request credit for being wis...,0
4535,Do you ever ask him what's on his mind or what...,0
...,...,...
403,I agree anyone who can take this shit (aka me)...,3
631,"Thanks, my dear. Personnally speaking, I am at...",3
461,I definitely prefer cute women to sexy. I've ...,3
325,If you meet anyone in real life who knows even...,3


In [19]:
def four_labels(type_):
#     analysts = ['INTJ', 'INTP', 'ENTJ', 'ENTP']
#     diplomats = ['INFJ', 'INFP', 'ENFJ', 'ENFP']
#     sentinels = ['ISTJ', 'ISFJ', 'ESTJ', 'ESFJ']
#     explorers = ['ISTP', 'ISFP', 'ESTP', 'ESFP']
    types = {'INTJ':0, 'INTP':0, 'ENTJ':0, 'ENTP':0, 
             'INFJ':1, 'INFP':1, 'ENFJ':1, 'ENFP':1, 
             'ISTJ':2, 'ISFJ':2, 'ESTJ':2, 'ESFJ':2, 
             'ISTP':3, 'ISFP':3, 'ESTP':3, 'ESFP':3}
    
    return types[type_]

In [20]:
df_final_train.labels = df_final_train.labels.apply(four_labels)

In [21]:
df_final_test.labels = df_final_test.labels.apply(four_labels)

In [22]:
df_final_test.head(2)

Unnamed: 0,text,labels
3996,No. ENFPs and INTJs supposedly go well togeth...,0
1650,Logic is not necessarily the same as math. Ju...,0


In [25]:
model_args = ClassificationArgs(num_train_epochs=1)

model = ClassificationModel(
    'distilbert',
    'distilbert-base-uncased',
    num_labels=4,
    args=model_args,
    use_cuda=False
) 

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [26]:
model.train_model(df_final_train, output_dir="4_cat_2.0_BERT_model/")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=60389.0), HTML(value='')))




HBox(children=(HTML(value='Epoch'), FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(HTML(value='Running Epoch 0 of 1'), FloatProgress(value=0.0, max=7549.0), HTML(value='')))





(7549, 1.3297366781935247)

In [27]:
result, model_outputs, wrong_predictions = model.eval_model(df_final_test)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6703.0), HTML(value='')))




HBox(children=(HTML(value='Running Evaluation'), FloatProgress(value=0.0, max=838.0), HTML(value='')))




In [164]:
result

{'mcc': 0.05235541569984611, 'eval_loss': 2.6152040983798237}

In [157]:
pred_sent = "I was always bullied by my family for being small and small-minded, but I don't think I ever took it too seriously. I don't ever think I would have considered trying to be an independent thinker if I'd still been able to be an independent thinker about my own life."

In [161]:
pred_sent1 = "I was always bullied by my family for being small, though.I am not sure what that means. I am not in love or I am lonely...just sad.  How are people feeling when they know something is wrong?It was not always like that. I was the only one who was really happy, and it made me feel a little better. I do not agree with those types.  They have no right to complain and everyone else just seems cool to each other. It is not fair.You're making me sound like a pussy and you're going to be very harsh, especially when you know what it is like to have my ass drilled deep into your body."

In [162]:
predictions, raw_outputs = model.predict([pred_sent, pred_sent1])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [163]:
le.inverse_transform(predictions)

array(['INTJ', 'ENFJ'], dtype=object)