In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import pickle

In [2]:
text=pd.read_csv("mbti_1.csv" ,index_col='type')
print(text.shape)
print(text[0:5])
#print(text.iloc[2])

(8675, 1)
                                                  posts
type                                                   
INFJ  'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
ENTP  'I'm finding the lack of me in these posts ver...
INTP  'Good one  _____   https://www.youtube.com/wat...
INTJ  'Dear INTP,   I enjoyed our conversation the o...
ENTJ  'You're fired.|||That's another silly misconce...


In [3]:
from sklearn.preprocessing import LabelBinarizer

# One hot encode labels
labels=text.index.tolist()
encoder=LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)
labels=encoder.fit_transform(labels)
labels=np.array(labels)
print(labels[50:55])

[[0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]]


In [4]:
mbti_dict={0:'ENFJ',1:'ENFP',2:'ENTJ',3:'ENTP',4:'ESFJ',5:'ESFP',6:'ESTJ',7:'ESTP',8:'INFJ',9:'INFP',10:'INTJ',11:'INTP',12:'ISFJ',13:'ISFP',14:'ISFP',15:'ISTP'}

In [5]:
import re

# Function to clean data..
def post_cleaner(post):
    """cleans individual posts`.
    Args:
        post-string
    Returns:
         cleaned up post`.
    """
    # Covert all uppercase characters to lower case
    post = post.lower() 
    
    # Remove |||
    post=post.replace('|||',"") 

    # Remove URLs, links etc
    post = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', '', post, flags=re.MULTILINE) 
    # This would have removed most of the links but probably not all 

    # Remove puntuations 
    puncs1=['@','#','$','%','^','&','*','(',')','-','_','+','=','{','}','[',']','|','\\','"',"'",';',':','<','>','/']
    for punc in puncs1:
        post=post.replace(punc,'') 

    puncs2=[',','.','?','!','\n']
    for punc in puncs2:
        post=post.replace(punc,' ') 
    # Remove extra white spaces
    post=re.sub( '\s+', ' ', post ).strip()
    return post

In [6]:
posts=text.posts.tolist()
posts=[post_cleaner(post) for post in posts]

In [7]:
from collections import Counter

word_count=Counter()
for post in posts:
    word_count.update(post.split(" "))
    
vocab_len=len(word_count)
print(vocab_len)

172989


In [8]:
vocab = sorted(word_count, key=word_count.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}
posts_ints=[]

for post in posts:
    posts_ints.append([vocab_to_int[word] for word in post.split()])

In [9]:
print(posts_ints[0][:10])
print(len(posts_ints[0]))

[5, 141, 1287, 91380, 22, 703, 1852, 2075, 139374, 89]
566


In [10]:
posts_lens = Counter([len(x) for x in posts])
print(np.mean(posts_lens.keys())- (0.5*np.std(posts_lens.keys())))

5321.75576391


In [11]:
seq_len = 1000
features=np.zeros((len(posts_ints),seq_len),dtype=int)
for i, row in enumerate(posts_ints):
    features[i, -len(row):] = np.array(row)[:seq_len]
#print(features[1555])

In [12]:
split_frac = 0.8

num_ele=int(split_frac*len(features))
rem_ele=len(features)-num_ele
train_x, val_x = features[:num_ele],features[num_ele: num_ele + int(rem_ele/2)]
train_y, val_y = labels[:num_ele],labels[num_ele:num_ele + int(rem_ele/2)]

test_x =features[num_ele + int(rem_ele/2):]
test_y = labels[num_ele + int(rem_ele/2):]

In [13]:
lstm_size = 256
lstm_layers = 1
batch_size = 256
learning_rate = 0.01
embed_dim=250
n_words = len(vocab_to_int) + 1

In [14]:
import gensim
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
int_to_vocab = {ii: word for word,ii in vocab_to_int.iteritems()}

In [27]:
#model[int_to_vocab[2]]

In [17]:
wv_emb = []
unk=[]
for k in range(1,len(int_to_vocab)):
    try:
        wv_emb.append(model[int_to_vocab[k]])
    except:
        unk.append(int_to_vocab[k])
len(unk)

116753

In [26]:
unk[160:170]

['hisher',
 'winki',
 'thatd',
 'keirsey',
 'enxp',
 'niti',
 '65',
 '36',
 'favourites',
 'travelling']

In [16]:
graph = tf.Graph()
with graph.as_default():
    input_data = tf.placeholder(tf.int32, [None, None], name='inputs')
    labels_ = tf.placeholder(tf.int32, [None, None], name='labels')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    
    embedding= tf.Variable(tf.random_uniform(shape=(n_words,embed_dim),minval=-1,maxval=1))
    embed=tf.nn.embedding_lookup(embedding,input_data)
    #print(embed.shape)
    
    lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    drop = tf.contrib.rnn.DropoutWrapper(lstm,output_keep_prob=keep_prob)
    cell = tf.contrib.rnn.MultiRNNCell([drop]* lstm_layers)
    initial_state = cell.zero_state(batch_size, tf.float32)
    
    outputs,final_state=tf.nn.dynamic_rnn(cell,embed,dtype=tf.float32 )
    pre = tf.layers.dense(outputs[:,-1], 16, activation=tf.nn.relu)
    predictions=tf.layers.dense(pre, 16, activation=tf.nn.softmax)
    
    cost = tf.losses.mean_squared_error(labels_, predictions)
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
    
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [17]:
def get_batches(x, y, batch_size=100):
    
    n_batches = len(x)//batch_size
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]

In [18]:
epochs = 3

with graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        state = sess.run(initial_state)
        
        for ii, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
            feed = {input_data: x,
                    labels_: y,
                    keep_prob: 1.0,
                    initial_state: state}
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)
            
            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            if iteration%25==0:
                val_acc = []
                val_state = sess.run(cell.zero_state(batch_size, tf.float32))
                for x, y in get_batches(val_x, val_y, batch_size):
                    feed = {input_data: x,
                            labels_: y,
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
    saver.save(sess, "checkpoints/mbti.ckpt")


('Epoch: 0/3', 'Iteration: 5', 'Train loss: 0.058')
('Epoch: 0/3', 'Iteration: 10', 'Train loss: 0.056')
('Epoch: 0/3', 'Iteration: 15', 'Train loss: 0.056')
('Epoch: 0/3', 'Iteration: 20', 'Train loss: 0.055')
('Epoch: 0/3', 'Iteration: 25', 'Train loss: 0.055')
Val acc: 0.938
('Epoch: 1/3', 'Iteration: 30', 'Train loss: 0.054')
('Epoch: 1/3', 'Iteration: 35', 'Train loss: 0.055')
('Epoch: 1/3', 'Iteration: 40', 'Train loss: 0.054')
('Epoch: 1/3', 'Iteration: 45', 'Train loss: 0.051')
('Epoch: 1/3', 'Iteration: 50', 'Train loss: 0.054')
Val acc: 0.937
('Epoch: 2/3', 'Iteration: 55', 'Train loss: 0.052')
('Epoch: 2/3', 'Iteration: 60', 'Train loss: 0.053')
('Epoch: 2/3', 'Iteration: 65', 'Train loss: 0.052')
('Epoch: 2/3', 'Iteration: 70', 'Train loss: 0.050')
('Epoch: 2/3', 'Iteration: 75', 'Train loss: 0.046')
Val acc: 0.930
('Epoch: 2/3', 'Iteration: 80', 'Train loss: 0.048')


In [19]:
test_acc = []
with tf.Session(graph=graph) as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    test_state = sess.run(cell.zero_state(batch_size, tf.float32))
    for ii, (x, y) in enumerate(get_batches(test_x, test_y, batch_size), 1):
        feed = {input_data: x,
                labels_: y,
                keep_prob: 1,
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))

INFO:tensorflow:Restoring parameters from checkpoints/mbti.ckpt
Test accuracy: 0.933
