In [141]:
import json
import re
import nltk

In [142]:
# This file contains 8600ish users Data
# In the form: {'MBTI Type', 'Social Media Posts'}
dataFile = open('formatted_data.json', 'r')

In [143]:
data = []

In [144]:
# loading the json data into a list
for line in dataFile:
    data.append(json.loads(line))

In [145]:
types = []
posts = []

In [146]:
# Separating data into 2 separate lists for preprocessing
# For the most part we process the 'post' data
# we skip the first element as that is only the label
for i in range(1, len(data)):
    types.append(data[i]['Type'])
    posts.append(data[i]['Post'])

In [147]:
# List shuffling to make sure our models are valid
# Commented out for debugging purposes
import random
SEED = 673

random.seed(SEED)
random.shuffle(types)
random.shuffle(posts)

In [148]:
# Turning the posts from: 'post1|||post2|||post3'
#                     to: ['post1', 'post2', 'post3']
# expects a list of posts as strings
# returns a list of lists of posts
def vectorize_post_data(posts):
    for index in range(0, len(posts)):
        posts[index] = posts[index].split("|||")
        
    return posts
        
posts = vectorize_post_data(posts)

In [149]:
def remove_hyperTextLinksFromPosts(posts):
    for index in range(0, len(posts)):
        usable_post = [post for post in posts[index] if not re.search(r'^(.)*http(.)*$', post)]
        posts[index] = usable_post
    
    return posts
        
posts = remove_hyperTextLinksFromPosts(posts)

In [150]:
def remove_MBTIClassifiersFromPosts(posts):
    MBTI_regex = r'[\w]*(i|e)(s|n)(f|t)(p|j)[\w]*'
    for i in range(0, len(posts)):
        for j in range(0, len(posts[i])):
            posts[i][j] = re.sub(MBTI_regex, ' ', posts[i][j], flags=re.IGNORECASE)
        
    return posts
    
posts = remove_MBTIClassifiersFromPosts(posts)

In [151]:
import nltk

In [152]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def isInt(s):
    try: 
        int(s)
        return True
    except ValueError:
        return False

def tokenize_posts(posts):
    tokenizer = RegexpTokenizer(r'\w+')
    
    for i in range(0, len(posts)):
        user_words = []
        for j in range(0, len(posts[i])):
            post = tokenizer.tokenize(posts[i][j])
            for word in post:
                if not isInt(word) and len(word) > 1:
                    user_words.append(word.lower())
                user_words = [w for w in user_words if not w in stop_words]
        posts[i] = user_words
    
    return posts

posts = tokenize_posts(posts)

In [153]:
all_words_list = []

for user in posts:
    for word in user:
        all_words_list.append(word)


In [154]:
from collections import Counter

In [155]:
freq_list = Counter(all_words_list)
dictionary = freq_list.most_common(10000)

In [156]:
dictionary = list(zip(*dictionary))[0]

In [157]:
nums = range(0, 10000)
word_int = dict(zip(dictionary, nums))

In [158]:
x_vals = []

for user in posts:
    x_vals.append([word_int[x] for x in user if x in word_int.keys()])

In [159]:
intro_extro = [word[0:1] for word in types[0:len(types)]]
bin_intro_extro = []
for letter in intro_extro:
    if (letter == 'I'):
        bin_intro_extro.append(0)
    else:
        bin_intro_extro.append(1)

In [160]:
import numpy as np
x = np.array(x_vals)
test_data = x[:1500]
train_data = x[1500:]

def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results

x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)

In [161]:
from keras.utils.np_utils import to_categorical

y = np.asarray(bin_intro_extro).astype('float32')
y_test = y[:1500]
y_train = y[1500:]

In [162]:
x_val = x_train[:1500]
x_partial_train = x_train[1500:]

y_val = y_train[:1500]
y_partial_train = y_train[1500:]

In [181]:
from keras import models
from keras import layers
import tensorflow as tf

def build_model():
    model = models.Sequential()
    model.add(layers.Dense(16, activation='relu', input_shape = (10000,)))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(16, activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer = 'rmsprop', loss = 'binary_crossentropy', metrics = ['accuracy'])
    return model

In [187]:
#Iterated K-Fold Validation
k = 4

# // is floor div operator
num_val_samples = len(x_train) // k
num_epochs = 20
all_scores = []

for i in range(1):
    random.shuffle(x_train)
    random.shuffle(y_train)
    for i in range(k):
        print('processing fold #', i)
        val_data = x_train[i * num_val_samples: (i + 1) * num_val_samples]
        val_targets = y_train[i * num_val_samples: (i + 1) * num_val_samples]

        partial_train_data = np.concatenate(
            [x_train[:i * num_val_samples],
             x_train[(i + 1) * num_val_samples:]],
            axis=0)
        partial_train_targets = np.concatenate(
            [y_train[:i * num_val_samples],
             y_train[(i + 1) * num_val_samples:]],
            axis=0)

        model = build_model()
        with tf.device('/gpu:0'):
            model.fit(partial_train_data, partial_train_targets,
                epochs=num_epochs, batch_size=len(partial_train_data), verbose=0)
            
            val_loss, val_acc = model.evaluate(val_data, val_targets)
        print("Accuracy:", val_acc)
        all_scores.append(val_acc)
    
print ("Accuracy Mean:", np.mean(all_scores))

processing fold # 0
Accuracy: 0.781372002230898
processing fold # 1
Accuracy: 0.7791411042944786
processing fold # 2
Accuracy: 0.7657557166759621
processing fold # 3
Accuracy: 0.7512548800892359
Accuracy Mean: 0.7693809258226437


In [185]:
model = build_model()
with tf.device('/gpu:0'):
    model.fit(x_train, y_train, epochs = 25, batch_size = 512)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [186]:
results = model.evaluate(x_test, y_test)
results



[0.5835738066037496, 0.7620000003178914]