In [166]:
import numpy as np
import pandas as pd
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import warnings
import pickle
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_scorecore
warnings.filterwarnings('ignore')

In [46]:
df = pd.read_csv('../data/mbti_1.csv')

In [47]:
def clean(post):
    post_split = post.split('|||')
    post_split = [clean_dots(ind_post) for ind_post in post_split if clean_dots(ind_post)]
    # split the kaggle data set posts by |||
    post_split_split = [x.split(' ') for x in post_split]
    
    # removes any 'words' that have http:// or https:// in them
    return_list = [[item for item in sentence if ('http://' not in item and 'https://' not in item)] for sentence in post_split_split]
    
    # returns a list of posts if they are not empty after removing the links
    return [' '.join(sentence) for sentence in return_list if sentence]

In [48]:
def clean_dots(ind_post):
    if(re.search('[a-zA-Z]', ind_post)):
        if(ind_post.endswith('...') or ind_post.endswith("...'")):
            if(re.search('.*[\.?!]\s', ind_post)):
                return(re.search('.*[\.?!]\s', ind_post).group(0).strip())
            else:
                return(ind_post)
    return ''

In [108]:
df_dict = {'INTJ':0, 'INTP':0, 'ENTJ':1, 'ENTP':1, 
           'INFJ':0, 'INFP':0, 'ENFJ':1, 'ENFP':1, 
           'ISTJ':0, 'ISFJ':0, 'ESTJ':1, 'ESFJ':1, 
           'ISTP':0, 'ISFP':0, 'ESTP':1, 'ESFP':1}

In [109]:
df_split = {'INTJ':[], 'INTP':[], 'ENTJ':[], 'ENTP':[], 
           'INFJ':[], 'INFP':[], 'ENFJ':[], 'ENFP':[], 
           'ISTJ':[], 'ISFJ':[], 'ESTJ':[], 'ESFJ':[], 
           'ISTP':[], 'ISFP':[], 'ESTP':[], 'ESFP':[]}

In [110]:
def extract(mbti, post, list_):
    for ind_post in post:
        list_.append((ind_post, mbti))

In [111]:
for mbti in df_dict.keys():
    if(df[df['type'] == mbti].shape[0] < 200):
        df_dict[mbti] = df[df['type'] == mbti]
    else:
        df_dict[mbti] = df[df['type'] == mbti].sample(n=200)
    
    df_dict[mbti]['post_split'] = df_dict[mbti].posts.apply(clean)
    
    df_dict[mbti].post_split.apply(lambda x: extract(mbti, x, df_split[mbti]))
    print(mbti, df_dict[mbti].shape)

INTJ (200, 3)
INTP (200, 3)
ENTJ (200, 3)
ENTP (200, 3)
INFJ (200, 3)
INFP (200, 3)
ENFJ (190, 3)
ENFP (200, 3)
ISTJ (200, 3)
ISFJ (166, 3)
ESTJ (39, 3)
ESFJ (42, 3)
ISTP (200, 3)
ISFP (200, 3)
ESTP (89, 3)
ESFP (48, 3)


In [112]:
total_posts = 0
for mbti in df_split:
    print(mbti,len(df_split[mbti]))
    total_posts += len(df_split[mbti])
total_posts

INTJ 5290
INTP 5436
ENTJ 5183
ENTP 5197
INFJ 5757
INFP 5512
ENFJ 5461
ENFP 5557
ISTJ 5117
ISFJ 4491
ESTJ 1017
ESFJ 1190
ISTP 4694
ISFP 4719
ESTP 2079
ESFP 973


67673

In [113]:
df_split['INFP'][0]

('wow, i used to LOVE this book when i read it in highschool. the girl who gave it to me to read said that she always thought of me as being just like Charlie... with less crying.',
 'INFP')

In [114]:
df_dict['INFP'].post_split.iloc[0]

['wow, i used to LOVE this book when i read it in highschool. the girl who gave it to me to read said that she always thought of me as being just like Charlie... with less crying.',
 "wow, before reading this thread i had no idea i was that hard to find, but now that it's been mentioned, if someone wanted to find someone like me it would be really difficult.",
 "I get drunk a few nights a week, and i generally go one of two ways. I'll either be sad and moody as all hell or an angry and cynical, embittered asshole, generally both in the same night.",
 'The worst nightmare i ever had was when i was about ten.',
 "At the moment i'm tossing up between type 4 or type 9. but i can't really decide on either one.",
 "Okay, so lately i've come to the realisation that my perception of myself so far has been an idealised fantasy - that of a fun loving extrovert (ENFP)- and after some deep and genuine introspection...",
 'Attention span? What attention spa...',
 'My best mate is an intp. He hates 

In [137]:
df_final_train = pd.DataFrame(columns=["text", "labels"])
df_final_test = df_final_train.copy()
df_final_train

Unnamed: 0,text,labels


In [138]:
df_final_test

Unnamed: 0,text,labels


In [139]:
for mbti in df_split.keys():
    test, train = train_test_split(pd.DataFrame(df_split[mbti], columns=["text", "labels"]), test_size=0.9)
    df_final_train = pd.concat([df_final_train, train])
    df_final_test = pd.concat([df_final_test, test])

In [140]:
df_final_test

Unnamed: 0,text,labels
557,I'm going to reply in a list format to this be...,INTJ
2279,"For me, I smile and say thank you. Then, life ...",INTJ
2522,I'm an INTJ and a Christian. I'm also the firs...,INTJ
1627,"Yeah, this aspect gets me into trouble with m...",INTJ
3441,"No, I don't blame the bed when I hit my toe. ...",INTJ
...,...,...
748,"Ah, that makes more sense. Just got out of a g...",ESFP
816,Thank you :).,ESFP
971,"I have to be real with you here, I don't think...",ESFP
367,Free Personality Test - Highly Accurate | See ...,ESFP


In [141]:
df_final_train

Unnamed: 0,text,labels
4736,The reason may be a certain lowered expectatio...,INTJ
2603,"I'm terrified of being on stage, but yet I kee...",INTJ
922,I know that there are certain rules within the...,INTJ
548,Saami family :) They are not real-life people...,INTJ
4484,"I know, as an INTJ, that I'm highly attracted ...",INTJ
...,...,...
590,"I don't read books, I hear them instead.",ESFP
320,"You know, I didn't actually expect anyone to g...",ESFP
234,"If anyone is curious to delve more into him, h...",ESFP
196,"O_O. I did not expect these answers, at all. W...",ESFP


In [143]:
le = LabelEncoder()
df_final_train.labels = le.fit_transform(df_final_train.labels)
df_final_train.head()

Unnamed: 0,text,labels
4736,The reason may be a certain lowered expectatio...,10
2603,"I'm terrified of being on stage, but yet I kee...",10
922,I know that there are certain rules within the...,10
548,Saami family :) They are not real-life people...,10
4484,"I know, as an INTJ, that I'm highly attracted ...",10


In [146]:
df_final_test.labels = le.transform(df_final_test.labels)

In [147]:
le.inverse_transform(range(0,16))

array(['ENFJ', 'ENFP', 'ENTJ', 'ENTP', 'ESFJ', 'ESFP', 'ESTJ', 'ESTP',
       'INFJ', 'INFP', 'INTJ', 'INTP', 'ISFJ', 'ISFP', 'ISTJ', 'ISTP'],
      dtype=object)

In [148]:
model_args = ClassificationArgs(num_train_epochs=1)

model = ClassificationModel(
    'distilbert',
    'distilbert-base-uncased',
    num_labels=16,
    args=model_args,
    use_cuda=False
) 

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [151]:
model.train_model(df_final_train, output_dir="2.0_BERT_model/")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=60913.0), HTML(value='')))




HBox(children=(HTML(value='Epoch'), FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(HTML(value='Running Epoch 0 of 1'), FloatProgress(value=0.0, max=7615.0), HTML(value='')))





(7615, 2.642461184083985)

In [168]:
result, model_outputs, wrong_predictions = model.eval_model(df_final_test,f1=f1_score)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6760.0), HTML(value='')))




HBox(children=(HTML(value='Running Evaluation'), FloatProgress(value=0.0, max=845.0), HTML(value='')))




ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

In [164]:
result

{'mcc': 0.05235541569984611, 'eval_loss': 2.6152040983798237}

In [157]:
pred_sent = "I was always bullied by my family for being small and small-minded, but I don't think I ever took it too seriously. I don't ever think I would have considered trying to be an independent thinker if I'd still been able to be an independent thinker about my own life."

In [161]:
pred_sent1 = "I was always bullied by my family for being small, though.I am not sure what that means. I am not in love or I am lonely...just sad.  How are people feeling when they know something is wrong?It was not always like that. I was the only one who was really happy, and it made me feel a little better. I do not agree with those types.  They have no right to complain and everyone else just seems cool to each other. It is not fair.You're making me sound like a pussy and you're going to be very harsh, especially when you know what it is like to have my ass drilled deep into your body."

In [170]:
predictions, raw_outputs = model.predict([pred_sent, pred_sent1])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [163]:
le.inverse_transform(predictions)

array(['INTJ', 'ENFJ'], dtype=object)

In [171]:
# save the model to disk
filename = '2.0_BERT_model.sav'
pickle.dump(model, open(filename, 'wb'))
 
# some time later...
 
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))

In [172]:
type(loaded_model)

simpletransformers.classification.classification_model.ClassificationModel

In [174]:
predictions, raw_outputs = loaded_model.predict([pred_sent, pred_sent1])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [176]:
le.inverse_transform(predictions)

array(['INTJ', 'ENFJ'], dtype=object)