### Text Analysis using BERT to classify MIBT Personality

##### Including imports

In [1]:
import pandas as pd
import numpy as np
import regex as re
import string as st

# visualization imports
import matplotlib.pyplot as plt

#splitting into 70 30
from sklearn.model_selection import train_test_split

#BERT
import tensorflow as tf
import transformers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input
from transformers import BertTokenizer
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tqdm.notebook import tqdm

In [2]:
data = pd.read_csv("mbti_1.csv")

### Data Cleaning

In [3]:
### Cleaning data to remove url and symbols. The pipe is used to separate posts so will be replaced with spaces
data.posts

0       'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1       'I'm finding the lack of me in these posts ver...
2       'Good one  _____   https://www.youtube.com/wat...
3       'Dear INTP,   I enjoyed our conversation the o...
4       'You're fired.|||That's another silly misconce...
                              ...                        
8670    'https://www.youtube.com/watch?v=t8edHB_h908||...
8671    'So...if this thread already exists someplace ...
8672    'So many questions when i do these things.  I ...
8673    'I am very conflicted right now when it comes ...
8674    'It has been too long since I have been on per...
Name: posts, Length: 8675, dtype: object

In [4]:
def cleaningPosts(data):
    cleanPosts = []
    for post in data.posts:
        post = post.lower()
        post = re.sub('(https|http):\/\/[0-9a-zA-Z\.\-]+\.[a-zA-Z]{1,5}(\/\S*)?', ' ', post)
        post = re.sub('[^0-9a-zA-Z]', ' ', post)   
        cleanPosts.append(post)
    return cleanPosts

In [5]:
data.posts = cleaningPosts(data)

### Data Preview

### 16 Personality Types in this Dataset

In [6]:
data

Unnamed: 0,type,posts
0,INFJ,and intj moments sportscenter not top t...
1,ENTP,i m finding the lack of me in these posts ver...
2,INTP,good one course to which i say i ...
3,INTJ,dear intp i enjoyed our conversation the o...
4,ENTJ,you re fired that s another silly misconce...
...,...,...
8670,ISFP,just because i always think of cats as fi d...
8671,ENFP,so if this thread already exists someplace ...
8672,INTP,so many questions when i do these things i ...
8673,INFP,i am very conflicted right now when it comes ...


### The number of posts classified based on the personality type

In [7]:
### The data is not as balanced therefore there might be some bias

data['type'].value_counts()

INFP    1832
INFJ    1470
INTP    1304
INTJ    1091
ENTP     685
ENFP     675
ISTP     337
ISFP     271
ENTJ     231
ISTJ     205
ENFJ     190
ISFJ     166
ESTP      89
ESFP      48
ESFJ      42
ESTJ      39
Name: type, dtype: int64

### Preparation for training

In [8]:
#splitting
trainData, testData = train_test_split(data, random_state = 0, test_size=0.3)

In [9]:
### using Tokenizer

def tokenizing(posts, tokenizer, maxLen = 150):
    allTokens = []
    for post in tqdm(posts):
        token = tokenizer.encode(post, add_special_tokens=True, max_length = maxLen)
        allTokens.append(token)
    return allTokens
        

In [10]:
def create_model(bertLayer, maxLen = 150):
    input_word_ids = Input(shape=(maxLen,), dtype=tf.int32, name="input_word_ids")
    bertOutput = bertLayer(input_word_ids)[0]
    output = Dense(16, activation='softmax')(bertLayer(input_word_ids)[0][:,0,:])
    model = Model(inputs=input_word_ids, outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.000002), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [11]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
trainInputs = tokenizing(trainData['posts'], tokenizer)
trainInputs = pad_sequences(trainInputs, maxlen=150, dtype="long", truncating='post', padding = 'post')
testInputs = tokenizing(testData['posts'], tokenizer)
testInputs = pad_sequences(testInputs, maxlen=150, dtype="long", truncating='post', padding = 'post')

  0%|          | 0/6072 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


  0%|          | 0/2603 [00:00<?, ?it/s]

In [12]:
types = np.unique(data.type.values)

def get_type_index(string):
    return list(types).index(string)

In [13]:
trainData['type_index'] = data['type'].apply(get_type_index)
trainData

Unnamed: 0,type,posts,type_index
3162,INFP,nvm are you good at helping other people s...,9
320,ENTP,well sorry but i just think this is another...,3
6248,INFJ,lol hahaha first of all stop address...,8
2843,ENTP,i thought personality cafe had a rate my pic ...,3
5281,INFP,same problem here i cant get onto my origina...,9
...,...,...,...
4373,INFP,hey it seems like you have a great foundatio...,9
7891,INFJ,dear istj mother when i started my very fi...,8
4859,INTP,oh entjs how can you be scary and exciting a...,11
3264,ENFJ,hi entp and welcome to the forum wink f...,0


### Building the model

In [14]:
bertLayer= transformers.TFBertModel.from_pretrained('bert-large-uncased')

Some layers from the model checkpoint at bert-large-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-large-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [15]:
model = create_model(bertLayer, maxLen = 150)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_word_ids (InputLayer)  [(None, 150)]            0         
                                                                 
 tf_bert_model (TFBertModel)  TFBaseModelOutputWithPoo  335141888
                             lingAndCrossAttentions(l            
                             ast_hidden_state=(None,             
                             150, 1024),                         
                              pooler_output=(None, 10            
                             24),                                
                              past_key_values=None, h            
                             idden_states=None, atten            
                             tions=None, cross_attent            
                             ions=None)                          
                                                             

In [16]:
one_hot_labels = tf.keras.utils.to_categorical(trainData.type_index.values, num_classes=16)

In [17]:
model.fit(
np.array(trainInputs), one_hot_labels, verbose=1, epochs = 20, batch_size=2, callbacks = [tf.keras.callbacks.EarlyStopping(patience = 5)])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x2b1e1d4cd60>

In [18]:
model.save('bertPersonality.h5')

In [19]:
import pickle
s = pickle.dumps(model)



INFO:tensorflow:Assets written to: ram://0cfa1ea6-5b9e-4ca4-9918-71bf05e4e8f8/assets


INFO:tensorflow:Assets written to: ram://0cfa1ea6-5b9e-4ca4-9918-71bf05e4e8f8/assets


In [20]:
pickle.dump(model, open('bertPersonality.pkl', 'wb'))



INFO:tensorflow:Assets written to: ram://a1def7de-a2b4-4885-8c1a-d34f7f80ef19/assets


INFO:tensorflow:Assets written to: ram://a1def7de-a2b4-4885-8c1a-d34f7f80ef19/assets


In [21]:
testData['type_index'] = data['type'].apply(get_type_index)
testData

Unnamed: 0,type,posts,type_index
4587,ISFP,dear isfj mother i wish you were less of a w...,13
2786,INFJ,to me i think you guys may be over analyzing...,8
2813,ENFP,nihm while nihm has her intj husband i ve go...,1
3705,INTP,i want 5 kids an astro nuclear theoretical...,11
5957,ISFP,i have the same thing as well i ve noticed t...,13
...,...,...,...
2346,INTP,yikes when i wall posted you i hadn t read...,11
1814,ISFJ,i like eggs i m sure there is at least prim...,12
7695,INFJ,i totally understand i m also strange and th...,8
3769,INTP,do you have money to have someone else do the...,11


In [24]:
testLabels = tf.keras.utils.to_categorical(testData.type_index.values, num_classes=16)

In [32]:
trainInputs

array([[ 101, 1050, 2615, ..., 2065, 2023,  102],
       [ 101, 2092, 3374, ..., 1045, 2179,  102],
       [ 101, 8840, 2140, ..., 4299, 1045,  102],
       ...,
       [ 101, 2821, 4372, ..., 1037, 8549,  102],
       [ 101, 7632, 4372, ..., 2017, 3305,  102],
       [ 101, 5875, 2017, ..., 4840, 2250,  102]])

In [30]:
testLabels

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [28]:
model.evaluate(np.array(testInputs), np.array(testLabels))

ValueError: Data cardinality is ambiguous:
  x sizes: 6072
  y sizes: 2603
Make sure all arrays contain the same number of samples.