<a href="https://colab.research.google.com/github/TiffanyNgai/Personality-classification/blob/main/MBTI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from google.colab import drive
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Mount Google Drive into the Colab runtime so files stored there can be read.
drive.mount('/content/drive')
# Location of the MBTI dataset CSV inside the mounted Drive.
path = "/content/drive/MyDrive/Machine learning projects/MBTI/mbti_1.csv"
# Load the raw dataset into a DataFrame.
data_df = pd.read_csv(path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Preview the raw frame: one row per user with the MBTI `type` and the raw `posts` text.
data_df

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...
...,...,...
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...
8671,ENFP,'So...if this thread already exists someplace ...
8672,INTP,'So many questions when i do these things. I ...
8673,INFP,'I am very conflicted right now when it comes ...


In [None]:
# Inspect the raw `posts` string of the first row; individual posts are joined by "|||".
data_df.iloc[0,1]

"'http://www.youtube.com/watch?v=qsXHcwe3krw|||http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg|||enfp and intj moments  https://www.youtube.com/watch?v=iz7lE1g4XM4  sportscenter not top ten plays  https://www.youtube.com/watch?v=uCdfze1etec  pranks|||What has been the most life-changing experience in your life?|||http://www.youtube.com/watch?v=vXZeYwwRDw8   http://www.youtube.com/watch?v=u8ejam5DP3E  On repeat for most of today.|||May the PerC Experience immerse you.|||The last thing my INFJ friend posted on his facebook before committing suicide the next day. Rest in peace~   http://vimeo.com/22842206|||Hello ENFJ7. Sorry to hear of your distress. It's only natural for a relationship to not be perfection all the time in every moment of existence. Try to figure the hard times as times of growth, as...|||84389  84390  http://wallpaperpassion.com/upload/23700/friendship-boy-and-girl-wallpaper.jpg  http://assets.dornob.com/wp-content/uploads/2010/04/round-home-design.jpg ...

## Data preprocessing

In [None]:
# Lookup table giving each of the 16 MBTI codes an integer class id.
# The id is simply the position of the code in this list (0-15).
mbti_codes = [
    "ISTJ", "ISFJ", "INFJ", "INTJ",
    "ISTP", "ISFP", "INFP", "INTP",
    "ESTP", "ESFP", "ENFP", "ENTP",
    "ESTJ", "ESFJ", "ENFJ", "ENTJ",
]
mbti_df = pd.DataFrame({
    'mbti': mbti_codes,
    'labelled_type': list(range(len(mbti_codes))),
})
mbti_df

Unnamed: 0,mbti,labelled_type
0,ISTJ,0
1,ISFJ,1
2,INFJ,2
3,INTJ,3
4,ISTP,4
5,ISFP,5
6,INFP,6
7,INTP,7
8,ESTP,8
9,ESFP,9


In [None]:
# Attach the integer class id to every row by matching the textual `type`
# against the lookup table, then keep only the posts and the numeric label.
labelled_data_df = (
    data_df
    .merge(mbti_df, how='left', left_on='type', right_on='mbti')
    .drop(columns=['type', 'mbti'])
)
labelled_data_df

Unnamed: 0,posts,labelled_type
0,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,2
1,'I'm finding the lack of me in these posts ver...,11
2,'Good one _____ https://www.youtube.com/wat...,7
3,"'Dear INTP, I enjoyed our conversation the o...",3
4,'You're fired.|||That's another silly misconce...,15
...,...,...
8670,'https://www.youtube.com/watch?v=t8edHB_h908||...,5
8671,'So...if this thread already exists someplace ...,10
8672,'So many questions when i do these things. I ...,7
8673,'I am very conflicted right now when it comes ...,6


In [None]:
def clean_post_text(text):
    """Normalise one user's raw `posts` string for tokenisation.

    Steps, in order:
      1. lower-case the text;
      2. turn the "|||" post separator into a space, so that neighbouring
         posts are not glued together and a URL followed by "|||" does not
         swallow the first word of the next post (the URL pattern runs until
         the next whitespace);
      3. strip @mentions, URLs, a leading "rt" and every character that is not
         a letter, digit, space or tab;
      4. strip the remaining digits.
    """
    text = text.lower().replace("|||", " ")
    text = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", text)
    # Remove numbers
    return re.sub(r"\d+", "", text)


labelled_data_df['clean_post'] = labelled_data_df['posts'].apply(clean_post_text)
labelled_data_df

Unnamed: 0,posts,labelled_type,clean_post
0,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,2,and intj moments sportscenter not top ten ...
1,'I'm finding the lack of me in these posts ver...,11,im finding the lack of me in these posts very ...
2,'Good one _____ https://www.youtube.com/wat...,7,good one course to which i say i know tha...
3,"'Dear INTP, I enjoyed our conversation the o...",3,dear intp i enjoyed our conversation the oth...
4,'You're fired.|||That's another silly misconce...,15,youre firedthats another silly misconception t...
...,...,...,...
8670,'https://www.youtube.com/watch?v=t8edHB_h908||...,5,just because i always think of cats as fi dom...
8671,'So...if this thread already exists someplace ...,10,soif this thread already exists someplace else...
8672,'So many questions when i do these things. I ...,7,so many questions when i do these things i wo...
8673,'I am very conflicted right now when it comes ...,6,i am very conflicted right now when it comes t...


In [None]:
# Every user's text is padded / truncated to this many tokens.
maxlen = 500
# Vocabulary cap. Must stay equal to the Embedding input_dim (`max_features`)
# used when the model is built: token ids at or above that value would index
# past the end of the embedding table.
max_features = 20000

# Tokenize and encode text, keeping only the most frequent words so that every
# emitted id is below `max_features`.
t = Tokenizer(num_words=max_features)
t.fit_on_texts(labelled_data_df['clean_post'])
posts_sequence = t.texts_to_sequences(labelled_data_df['clean_post'])
# Bring every sequence to exactly `maxlen` ids.
encoded_posts = pad_sequences(posts_sequence, maxlen=maxlen)

In [None]:
# Sanity check: prints True if at least one NaN is present in the encoded matrix
# (expected: False). `any` is required here; `all` would only report True when
# every single entry is NaN.
print(np.isnan(encoded_posts).any())
# Show the encoded sequence of the last row.
print(encoded_posts[-1])

False
[     1     30     78      2     81  12169      4   1906     32     52
     24      4    828    771   6217   8633    196      5    359     23
      4     61    679      5    906    123     24      3    612    455
     20   1490    566     57   2788     10      9     71   2128      5
      3    679    156    229     41   7853  10023      4    415    123
     70    724     34     10      1    297     68     52    667      5
   1507    623      3    187  28986      7     75      7     15    584
     39   1487      1    538    323     56      7     66      1     42
     75     23     11      4    389    332    125     96      5     15
    584      3    112      8     46   1423     17    118     12   1487
      5     12    188      1 307370     77      1     35      3    692
  18481 307371     48    173    174     16      4   3723   2788    906
    184      2  61734     48     18     16      3  17507     43      3
     80    119    201      6     63      5  27574  67987     27    118


In [None]:
# One-hot encode the integer class ids. The width is pinned to the 16 MBTI
# classes so it always matches the 16-unit output layer, even if the highest
# class id happens to be absent from the data.
one_hot_label = keras.utils.to_categorical(labelled_data_df['labelled_type'], num_classes=16)

In [None]:
# Store the encoded inputs and one-hot targets next to the text they came from,
# one Python list per row, so they can be inspected side by side.
for column_name, values in (('encoded_posts', encoded_posts),
                            ('one_hot_label', one_hot_label)):
    labelled_data_df[column_name] = values.tolist()
labelled_data_df

Unnamed: 0,posts,labelled_type,clean_post,encoded_posts,one_hot_label
0,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,2,and intj moments sportscenter not top ten ...,"[453, 187, 741, 11, 1384, 353, 316, 2, 401, 6,...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,'I'm finding the lack of me in these posts ver...,11,im finding the lack of me in these posts very ...,"[49, 145, 85, 3715, 3716, 31, 1, 27, 4, 3851, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,'Good one _____ https://www.youtube.com/wat...,7,good one course to which i say i know tha...,"[442, 24, 4, 206, 1952, 23, 25, 2648, 11, 1324...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
3,"'Dear INTP, I enjoyed our conversation the o...",3,dear intp i enjoyed our conversation the oth...,"[4875, 3, 74, 8, 4463, 178, 8, 133, 51, 42, 92...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,'You're fired.|||That's another silly misconce...,15,youre firedthats another silly misconception t...,"[2, 832, 45, 262, 814, 32, 5627, 43, 141, 17, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...
8670,'https://www.youtube.com/watch?v=t8edHB_h908||...,5,just because i always think of cats as fi dom...,"[4, 1198, 307248, 6, 3, 656, 2942, 31693, 754,...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
8671,'So...if this thread already exists someplace ...,10,soif this thread already exists someplace else...,"[7468, 40, 1, 207, 1060, 32, 527, 38473, 10, 1...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8672,'So many questions when i do these things. I ...,7,so many questions when i do these things i wo...,"[307314, 194, 1, 558, 307315, 22, 4, 263, 2642...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
8673,'I am very conflicted right now when it comes ...,6,i am very conflicted right now when it comes t...,"[123, 1, 711, 2, 568, 12, 109, 16, 1419, 8, 1,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."


In [None]:
# Hold out 30% of the rows for validation.
# - random_state makes the split (and therefore the reported metrics) reproducible
#   across kernel restarts.
# - stratify keeps the proportion of each of the 16 classes the same in both splits.
X_train, X_val, Y_train, Y_val = train_test_split(
    encoded_posts,
    one_hot_label,
    test_size=0.3,
    random_state=42,
    stratify=labelled_data_df['labelled_type'],
)

In [None]:
# Report the shapes of the training inputs and targets.
for label, split in (("X_train:", X_train), ("Y_train:", Y_train)):
    print(label, split.shape)

X_train: (6072, 500)
Y_train: (6072, 16)


In [None]:
# Ensure there is no NaN value: each line prints True if at least one NaN is
# present (expected: False). `any` is required; `all` would only flag an array
# made up entirely of NaNs.
print(np.isnan(X_train).any())
print(np.isnan(Y_train).any())

False
False


In [None]:
# Eyeball the training inputs (integer token ids, one row per user).
X_train

array([[ 1140,   186,    22, ...,   439,    32,     4],
       [ 1263, 12442,   141, ...,  1137,  1490,     9],
       [ 3684,    13,    31, ...,  1268,     3,  1046],
       ...,
       [  143,     4,   398, ...,  6645, 16096,    10],
       [ 9299,   119,    38, ...,   354,   159,    39],
       [    1,   525,    14, ..., 17293,   725,   119]], dtype=int32)

In [None]:
# Eyeball the training targets (one-hot rows).
Y_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

## Create model

In [None]:
# Size of the embedding table. Every token id fed to the model must be smaller
# than this value, so the tokenizer's vocabulary has to be capped accordingly.
max_features = 20000
# Input for variable-length sequences of integers
inputs = keras.Input(shape=(None,), dtype="int32")
# Embed each integer in a 128-dimensional vector
x = layers.Embedding(max_features, 128)(inputs)
# Two stacked bidirectional LSTMs; the first returns the full sequence so the
# second can consume it, the second returns only its final state.
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Dropout(rate=0.4)(x)
x = layers.Bidirectional(layers.LSTM(64))(x)
x = layers.Dropout(rate=0.4)(x)

# Add a classifier: exactly one of the 16 classes applies to each sample, so the
# output is a softmax distribution (what categorical cross-entropy expects)
# rather than 16 independent sigmoids.
outputs = layers.Dense(16, activation="softmax")(x)
model = keras.Model(inputs, outputs)
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_1 (Embedding)     (None, None, 128)         2560000   
                                                                 
 bidirectional_2 (Bidirectio  (None, None, 128)        98816     
 nal)                                                            
                                                                 
 dropout (Dropout)           (None, None, 128)         0         
                                                                 
 bidirectional_3 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 128)               0   

## Model training

In [None]:
# Adam optimiser with a learning rate of 1e-3.
opt = keras.optimizers.Adam(learning_rate=0.001)
# Multi-class objective on one-hot targets; accuracy is tracked for monitoring.
model.compile(
    optimizer=opt,
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
# Train for 10 epochs in mini-batches of 32, evaluating on the held-out split
# after every epoch.
model.fit(
    x=X_train,
    y=Y_train,
    validation_data=(X_val, Y_val),
    epochs=10,
    batch_size=32,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fe537215390>

## References

- https://machinelearningmastery.com/multi-class-classification-tutorial-keras-deep-learning-library/
- https://www.kdnuggets.com/2020/03/tensorflow-keras-tokenization-text-data-prep.html
- https://keras.io/examples/nlp/bidirectional_lstm_imdb/


In [None]:
# Count how many "|||" post separators the first row's raw text contains.
first_post_text = str(data_df.iat[0, 1])
separator = "|||"
print(first_post_text.count(separator))
#TODO: split the string into different columns of substring with ||| as separator

49
