# MBTI (Myers-Briggs Type Indicator) RNN

In [15]:
!pip show tensorflow

Name: tensorflow
Version: 1.14.0
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: packages@tensorflow.org
License: Apache 2.0
Location: /opt/conda/lib/python3.6/site-packages
Requires: keras-applications, tensorflow-estimator, gast, absl-py, numpy, grpcio, tensorboard, google-pasta, keras-preprocessing, termcolor, wheel, wrapt, astor, protobuf, six
Required-by: fancyimpute


In [1]:
# Import Libraries
import numpy as np
import pandas as pd
import tensorflow as tf
import pickle

## Load dataset

The dataset is a 'csv' file, so we'll use pandas to load it. We shall print the shape and the first few entries of the dataset to understand what we're working with. Accordingly, we need to choose what strategy to use to clean the data.

In [2]:
# Load Dataset
text=pd.read_csv("../input/mbti_1.csv" ,index_col='type')
print(text.shape)
print(text[0:5])
print(text.iloc[2])

(8675, 1)
                                                  posts
type                                                   
INFJ  'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
ENTP  'I'm finding the lack of me in these posts ver...
INTP  'Good one  _____   https://www.youtube.com/wat...
INTJ  'Dear INTP,   I enjoyed our conversation the o...
ENTJ  'You're fired.|||That's another silly misconce...
posts    'Good one  _____   https://www.youtube.com/wat...
Name: INTP, dtype: object


## Preprocessing labels
The neural letwork cannot understand string labels, so we one-hot-encode them using sklearn.preprocessing.LabelBinarizer. I'm displaying the first few labels to see if everything's okay.

In [3]:
from sklearn.preprocessing import LabelBinarizer

# One hot encode labels
labels=text.index.tolist()
encoder=LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)
labels=encoder.fit_transform(labels)
labels=np.array(labels)
print(labels[50:55])

[[0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]]


In [4]:
mbti_dict={0:'ENFJ',1:'ENFP',2:'ENTJ',3:'ENTP',4:'ESFJ',5:'ESFP',6:'ESTJ',7:'ESTP',8:'INFJ',9:'INFP',10:'INTJ',11:'INTP',12:'ISFJ',13:'ISFP',14:'ISFP',15:'ISTP'}

### Preprocessing posts

We can see that the posts are very noisy, so they need to be cleaned. For this I'm doing the following:

1. Converting all letters to lowercase.
2. Remove '|||'
3. Removing punctuation.
4. Removing URLs, links etc..
5. Convert words to integers

We'll leave unicode emojis alone.

In [5]:
import re

# Function to clean data ... will be useful later
def post_cleaner(post):
    """cleans individual posts`.
    Args:
        post-string
    Returns:
         cleaned up post`.
    """
    # Covert all uppercase characters to lower case
    post = post.lower() 
    
    # Remove |||
    post=post.replace('|||',"") 

    # Remove URLs, links etc
    post = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', '', post, flags=re.MULTILINE) 
    # This would have removed most of the links but probably not all 

    # Remove puntuations 
    puncs1=['@','#','$','%','^','&','*','(',')','-','_','+','=','{','}','[',']','|','\\','"',"'",';',':','<','>','/']
    for punc in puncs1:
        post=post.replace(punc,'') 

    puncs2=[',','.','?','!','\n']
    for punc in puncs2:
        post=post.replace(punc,' ') 
    # Remove extra white spaces
    post=re.sub( '\s+', ' ', post ).strip()
    return post

In [6]:
# Clean up posts
# Covert pandas dataframe object to list. I prefer using lists for prepocessing. 
posts=text.posts.tolist()
posts=[post_cleaner(post) for post in posts]

In [7]:
# Count total words
from collections import Counter

word_count=Counter()
for post in posts:
    word_count.update(post.split(" "))

In [8]:
# Size of the vocabulary available to the RNN
vocab_len=len(word_count)
print(vocab_len)

print(len(posts[0]))

172984
3094


### Convert words to integers

In [9]:
# Create a look up table 
vocab = sorted(word_count, key=word_count.get, reverse=True)
# Create your dictionary that maps vocab words to integers here
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

posts_ints=[]
for post in posts:
    posts_ints.append([vocab_to_int[word] for word in post.split()])

print(posts_ints[0])
print(len(posts_ints[0]))

[5, 141, 1287, 61293, 22, 703, 1850, 2069, 61294, 89, 72, 2, 84, 13390, 286, 11, 39, 108, 24, 2176, 14, 84, 6, 375, 196, 2, 723, 286, 12342, 7, 2, 241, 115, 12, 133, 148, 574, 24, 134, 1035, 185, 5881, 2140, 2, 459, 189, 762, 11, 61295, 61296, 279, 3, 416, 6, 39, 7396, 34, 86, 814, 14, 4, 238, 3, 22, 20, 3969, 43, 2, 59, 11, 217, 475, 6, 1558, 154, 3, 524, 2, 205, 242, 26, 242, 6, 2559, 26, 61297, 61298, 308, 5, 274, 492, 712, 1216, 12655, 61299, 41, 257, 7322, 987, 6, 1077, 39, 3300, 5, 1, 31, 161, 1077, 64, 169, 1108, 11, 39, 130, 2803, 3858, 1989, 11, 7024, 150, 154, 28515, 26, 4, 5882, 2233, 510, 211, 62, 16, 565, 3459, 443, 2804, 8, 309, 95, 32, 6508, 218, 7, 81, 3, 33, 49, 47, 90, 408, 221, 626, 309, 218, 630, 342, 5, 4236, 40, 532, 73, 43, 76, 11, 7024, 5185, 9, 1027, 4, 464, 492, 5, 4, 75, 46, 41, 8, 940, 4, 75, 46, 41, 8, 9, 833, 1489, 11, 8, 1, 56, 22, 354, 10262, 2, 684, 6, 111, 626, 11286, 613, 194, 35, 103, 39, 417, 464, 568, 1120, 62, 5, 35, 21, 39, 92, 954, 417, 464, 568

 ### Make posts uniform
We can see that the lengths of the posts aren't uniform, so we'll limit number of words in each post to 1000.For posts with less than 1000 words, we'll pad with zeros.

In [10]:
posts_lens = Counter([len(x) for x in posts])
print("Zero-length reviews: {}".format(posts_lens[0]))
print("Maximum review length: {}".format(max(posts_lens)))
print("Minimum review length: {}".format(min(posts_lens)))

seq_len = 500
features=np.zeros((len(posts_ints),seq_len),dtype=int)
for i, row in enumerate(posts_ints):
    features[i, -len(row):] = np.array(row)[:seq_len]
print(features[:10])

Zero-length reviews: 0
Maximum review length: 9588
Minimum review length: 13
[[   5  141 1287 ...  222  278    1]
 [  18  751    2 ...    2 1660 4189]
 [  75   46  386 ...   24 2234   75]
 ...
 [   1  259    3 ...   17  631    3]
 [  18   22  120 ... 4330  659   11]
 [  11   19 1197 ...   47 2496  112]]


### Preparing tranining, test and validation datasets

In [11]:
# Split data into training, test and validation

split_frac = 0.8

num_ele=int(split_frac*len(features))
rem_ele=len(features)-num_ele
train_x, val_x = features[:num_ele],features[num_ele:int(rem_ele/2)+num_ele]
train_y, val_y = labels[:num_ele],labels[num_ele:int(rem_ele/2)+num_ele]

test_x =features[num_ele+int(rem_ele/2):]
test_y = labels[num_ele+int(rem_ele/2):]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(6940, 500) 
Validation set: 	(867, 500) 
Test set: 		(868, 500)


## The RNN

In [12]:
lstm_size = 256
lstm_layers = 1
batch_size = 256
learning_rate = 0.01
embed_dim=250

In [13]:
n_words = len(vocab_to_int) + 1 # Adding 1 because we use 0's for padding, dictionary started at 1

# Create the graph object
graph = tf.Graph()
# Add nodes to the graph
with graph.as_default():
    input_data = tf.placeholder(tf.int32, [None, None], name='inputs')
    labels_ = tf.placeholder(tf.int32, [None, None], name='labels')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

In [14]:
n_words

172985

In [15]:
# Embedding
with graph.as_default():
    embedding= tf.Variable(tf.random_uniform(shape=(n_words,embed_dim),minval=-1,maxval=1))
    embed=tf.nn.embedding_lookup(embedding,input_data)
    print(embed.shape)

(?, ?, 250)


In [16]:
# LSTM cell
with graph.as_default():
    # basic LSTM cell
    lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    
    # Add dropout to the cell
    drop = tf.contrib.rnn.DropoutWrapper(lstm,output_keep_prob=keep_prob)
    
    # Stack up multiple LSTM layers, for deep learning
    cell = tf.contrib.rnn.MultiRNNCell([drop]* lstm_layers)
    
    # Getting an initial state of all zeros
    initial_state = cell.zero_state(batch_size, tf.float32)

In [17]:
with graph.as_default():
    outputs,final_state=tf.nn.dynamic_rnn(cell,embed,dtype=tf.float32 )



In [18]:
with graph.as_default():
    
    pre = tf.layers.dense(outputs[:,-1], 16, activation=tf.nn.relu)
    predictions=tf.layers.dense(pre, 16, activation=tf.nn.softmax)
    
    cost = tf.losses.mean_squared_error(labels_, predictions)
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)



In [19]:
with graph.as_default():
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [20]:
def get_batches(x, y, batch_size=100):    
    n_batches = len(x)//batch_size
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]

## Training

In [21]:
epochs = 3

with graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        state = sess.run(initial_state)
        
        for ii, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
            feed = {input_data: x,
                    labels_: y,
                    keep_prob: 1.0,
                    initial_state: state}
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)
            
            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            if iteration%25==0:
                val_acc = []
                val_state = sess.run(cell.zero_state(batch_size, tf.float32))
                for x, y in get_batches(val_x, val_y, batch_size):
                    feed = {input_data: x,
                            labels_: y,
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
    saver.save(sess, "checkpoints/mbti.ckpt")

Epoch: 0/3 Iteration: 5 Train loss: 0.058
Epoch: 0/3 Iteration: 10 Train loss: 0.057
Epoch: 0/3 Iteration: 15 Train loss: 0.057
Epoch: 0/3 Iteration: 20 Train loss: 0.056
Epoch: 0/3 Iteration: 25 Train loss: 0.055
Val acc: 0.938
Epoch: 1/3 Iteration: 30 Train loss: 0.054
Epoch: 1/3 Iteration: 35 Train loss: 0.056
Epoch: 1/3 Iteration: 40 Train loss: 0.055
Epoch: 1/3 Iteration: 45 Train loss: 0.052
Epoch: 1/3 Iteration: 50 Train loss: 0.052
Val acc: 0.937
Epoch: 2/3 Iteration: 55 Train loss: 0.051
Epoch: 2/3 Iteration: 60 Train loss: 0.051
Epoch: 2/3 Iteration: 65 Train loss: 0.052
Epoch: 2/3 Iteration: 70 Train loss: 0.050
Epoch: 2/3 Iteration: 75 Train loss: 0.048
Val acc: 0.933
Epoch: 2/3 Iteration: 80 Train loss: 0.045


## Testing

In [21]:
test_acc = []
with tf.Session(graph=graph) as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    test_state = sess.run(cell.zero_state(batch_size, tf.float32))
    for ii, (x, y) in enumerate(get_batches(test_x, test_y, batch_size), 1):
        feed = {input_data: x,
                labels_: y,
                keep_prob: 1,
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))

Test accuracy: 0.937


In [23]:
import pickle

# Save the vocab_to_int dictionary
with open('vocab_to_int.pickle', 'wb') as handle:
    pickle.dump(vocab_to_int, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [22]:
# After training, save the model
output_dir = '/kaggle/working/'
with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())  # Initialize all variables
    saver.save(sess, output_dir + "mbti.ckpt")


In [26]:
# Reload the saved model
with tf.Session(graph=graph) as sess:
    saver.restore(sess, output_dir + "mbti.ckpt")

    # Example of making predictions
    test_state = sess.run(cell.zero_state(batch_size, tf.float32))
    test_predictions = []
    for x, y in get_batches(test_x, test_y, batch_size):
        feed = {input_data: x,
                labels_: y,
                keep_prob: 1,
                initial_state: test_state}
        batch_predictions, test_state = sess.run([predictions, final_state], feed_dict=feed)
        test_predictions.extend(batch_predictions.argmax(axis=1))

    # Convert predicted indices back to labels
    predicted_labels = [mbti_dict[i] for i in test_predictions]

    # Process predicted_labels as needed for your application
    print("Predicted labels:", predicted_labels)


Predicted labels: ['ESTP', 'ESTJ', 'ENFJ', 'ESTJ', 'ENTJ', 'ENFJ', 'ESTJ', 'ENTP', 'ESTJ', 'ESTJ', 'ESTP', 'ESTJ', 'ESFJ', 'ESFJ', 'ESFJ', 'ENTP', 'ESTP', 'ISFP', 'ESTP', 'ESTJ', 'ESTP', 'ESTP', 'INFP', 'ESTP', 'ESTJ', 'ENFJ', 'ESTP', 'ESTP', 'ISFP', 'ESTJ', 'ESFJ', 'ENTJ', 'ISFP', 'ESTP', 'INFP', 'ESFP', 'ENTJ', 'ENFJ', 'ENTJ', 'ESFJ', 'ESTP', 'ESTP', 'INFJ', 'ESTP', 'ESFJ', 'ENFJ', 'ESTJ', 'INTP', 'ENTJ', 'INFJ', 'ESFP', 'INFJ', 'ESTP', 'ESTJ', 'ENTJ', 'ESTP', 'ENFJ', 'ENTP', 'ESTP', 'ISFP', 'ENTJ', 'ESTP', 'ESTJ', 'ENTP', 'ENFJ', 'ENFJ', 'INFJ', 'ENTP', 'ESTJ', 'INFP', 'ENFJ', 'ESTJ', 'ESFJ', 'ESFJ', 'ESFP', 'ESFJ', 'ENFJ', 'ESTJ', 'INFP', 'ESFJ', 'ESTP', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ', 'ESTJ', 'ENTJ', 'ESTP', 'ESTJ', 'ENFJ', 'ENTJ', 'INFP', 'ESTJ', 'ESTJ', 'ESTP', 'ENTP', 'ISFP', 'ESTJ', 'ESTP', 'ESTJ', 'ENFJ', 'ESTJ', 'ESTP', 'ESTP', 'ENTJ', 'ENTJ', 'ESTJ', 'ESFJ', 'ENFJ', 'ENFJ', 'INFP', 'ESTJ', 'INFP', 'ENTJ', 'INFJ', 'ENTJ', 'ESTP', 'ESFJ', 'ESTJ', 'ESFJ', 'ESFJ', 'ESTJ', 'ESTP