In [None]:
import json
import re

In [None]:
# This file contains 8600ish users Data
# In the form: {'MBTI Type', 'Social Media Posts'}
# Each line of the file is parsed as one JSON document and collected in `data`.
# The `with` block closes the file handle as soon as loading finishes, and
# blank lines (e.g. a trailing newline) are skipped instead of crashing
# json.loads.
with open('formatted_data.json', 'r') as dataFile:
    data = [json.loads(line) for line in dataFile if line.strip()]

In [None]:
# Separating data into 2 separate lists for preprocessing
# For the most part we process the 'post' data
# we skip the first element as that is only the label
# Both lists are rebuilt from `data` on every run, so re-executing this cell
# cannot append the same records a second time.
types = [record['Type'] for record in data[1:]]
posts = [record['Post'] for record in data[1:]]

In [None]:
def vectorize_post_data(posts):
    """Split each user's '|||'-delimited post string into a list of posts.

    Turns 'post1|||post2|||post3' into ['post1', 'post2', 'post3'].

    Args:
        posts: list of strings, one per user. Modified in place.

    Returns:
        The same list object, now holding one list of post strings per user.
    """
    for position, joined_posts in enumerate(posts):
        posts[position] = joined_posts.split("|||")

    return posts

In [None]:
def remove_hyperTextLinksFromPosts(posts):
    """Drop every post that contains a hyperlink.

    Args:
        posts: list with one list of post strings per user. The outer list is
            updated in place; each entry is replaced by a new, filtered list.

    Returns:
        The same outer list, with every post containing the substring 'http'
        removed from each user's list.
    """
    for index, user_posts in enumerate(posts):
        # A plain substring test is used rather than r'^(.)*http(.)*$':
        # '.' does not match a newline, so that pattern failed to catch
        # links inside multi-line posts.
        posts[index] = [post for post in user_posts if 'http' not in post]

    return posts
        
# Drop link-bearing posts from every user's list.
# NOTE(review): this step iterates over posts[i] as a list of posts, but no
# visible cell calls vectorize_post_data(posts) first -- confirm that the
# '|||' split has been run before this cell.
posts = remove_hyperTextLinksFromPosts(posts)

In [None]:
def remove_MBTIClassifiersFromPosts(posts):
    """Replace every word that embeds an MBTI type code with a single space.

    A word is replaced when it contains, case-insensitively, one letter from
    each of (i|e), (s|n), (f|t), (p|j) in that order (e.g. 'INTJ', 'enfps').

    Args:
        posts: list with one list of post strings per user. The inner lists
            are updated in place.

    Returns:
        The same outer list that was passed in.
    """
    mbti_pattern = re.compile(r'[\w]*(i|e)(s|n)(f|t)(p|j)[\w]*', flags=re.IGNORECASE)
    for user_posts in posts:
        for position, text in enumerate(user_posts):
            user_posts[position] = mbti_pattern.sub(' ', text)

    return posts
    
# Replace every word that embeds an MBTI type code with a space, in all posts.
posts = remove_MBTIClassifiersFromPosts(posts)

In [None]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# English stop words from NLTK's stopwords corpus, kept in a set for
# membership tests during tokenization.
stop_words = set(stopwords.words('english'))

def isInt(s):
    """Return True if ``int(s)`` succeeds, False if it raises ValueError."""
    try:
        int(s)
    except ValueError:
        return False
    else:
        return True

def tokenize_posts(posts):
    """Collapse each user's posts into one flat list of cleaned word tokens.

    Every post is split into runs of word characters. A token is kept only if
    it is longer than one character, is not an integer literal, and its
    lowercase form is not in the module-level ``stop_words`` set.

    Args:
        posts: list with one list of post strings per user. Updated in place.

    Returns:
        The same outer list, where each entry is now that user's list of
        lowercased tokens in their original order.
    """
    tokenizer = RegexpTokenizer(r'\w+')

    for i, user_posts in enumerate(posts):
        user_words = []
        for post in user_posts:
            for word in tokenizer.tokenize(post):
                if isInt(word) or len(word) <= 1:
                    continue
                lowered = word.lower()
                # Filter stop words while collecting. Re-filtering the whole
                # list after every token gives the same result but is
                # quadratic in the user's word count.
                if lowered not in stop_words:
                    user_words.append(lowered)
        posts[i] = user_words

    return posts

# Replace each user's list of posts with a flat list of cleaned tokens.
posts = tokenize_posts(posts)

In [None]:
import pickle

# Load the tokenized posts from a pickle file.
# NOTE(review): no visible cell writes 'tokenized_formatted_data.txt', and the
# `posts` list produced by tokenize_posts is not used here -- confirm how this
# file is generated.
# NOTE(security): pickle.load can execute arbitrary code; only load files that
# come from a trusted source.
tokenized_posts = []
with open ('tokenized_formatted_data.txt', 'rb') as fp:
    tokenized_posts = pickle.load(fp)

In [None]:
# Flatten every user's tokens into one corpus-wide list of words.
all_words_list = [word for user in tokenized_posts for word in user]

In [None]:
from collections import Counter

In [None]:
# Build the vocabulary: the 10000 most frequent words, each mapped to its
# frequency rank (0 = most common).
freq_list = Counter(all_words_list)
# The word tuple is derived in a single assignment so that re-running the cell
# cannot transpose an already-transposed `dictionary`. It also copes with an
# empty corpus, where list(zip(*[]))[0] would raise IndexError.
dictionary = tuple(word for word, _count in freq_list.most_common(10000))
nums = range(0, 10000)
word_int = dict(zip(dictionary, nums))

In [None]:
# Encode each user's tokens as vocabulary indices; words that are not in
# `word_int` are dropped.
x_vals = [
    [word_int[token] for token in user if token in word_int]
    for user in tokenized_posts
]

In [None]:
# First character of every MBTI type string.
intro_extro = [mbti_type[:1] for mbti_type in types]
# Binary target: 0 when the type starts with 'I', 1 for any other value.
bin_intro_extro = [0 if letter == 'I' else 1 for letter in intro_extro]


In [None]:
import numpy as np
import random
random.seed(500)

# Hold the variable-length index lists in a 1-D object array. Filling it
# element by element avoids np.array(x_vals), which raises ValueError on
# ragged input in NumPy >= 1.24 and would build a 2-D array (whose rows
# random.shuffle cannot swap safely) if every user had the same length.
x = np.empty(len(x_vals), dtype=object)
for position, sequence in enumerate(x_vals):
    x[position] = sequence
random.shuffle(x)
# The first 1500 shuffled users are held out as the test split.
test_data = x[:1500]
train_data = x[1500:]

def vectorize_sequences(sequences, dimension=10000):
    """Multi-hot encode index sequences into a 2-D float array.

    Args:
        sequences: sized collection of integer index sequences.
        dimension: number of columns in the result.

    Returns:
        Array of shape (len(sequences), dimension) that is 1.0 at every
        (row, index) position named by the row's sequence and 0.0 elsewhere.
    """
    multi_hot = np.zeros((len(sequences), dimension))
    for row, token_ids in zip(multi_hot, sequences):
        # `row` is a view into `multi_hot`, so this writes into the result.
        row[token_ids] = 1.
    return multi_hot

# Multi-hot encode both splits: one row per user, one column per vocabulary
# index (default width 10000).
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)

In [None]:
from keras.utils.np_utils import to_categorical

y = np.asarray(bin_intro_extro).astype('float32')
# Labels must be reordered with the SAME permutation as the features.
# Shuffling `y` with its own random.shuffle call draws a different
# permutation, which breaks the pairing between each user's features and
# label. Re-seeding with the seed used before shuffling `x` and shuffling an
# index list of equal length replays the identical sequence of swaps.
# NOTE(review): assumes len(y) == len(x) -- confirm `types` and
# `tokenized_posts` describe the same users in the same order.
random.seed(500)
label_order = list(range(len(y)))
random.shuffle(label_order)
y = y[label_order]
y_test = y[:1500]
y_train = y[1500:]

In [None]:
# Carve the first 1500 training rows off as a validation set; the remainder
# is what the model is fitted on while tuning.
x_val, x_partial_train = x_train[:1500], x_train[1500:]
y_val, y_partial_train = y_train[:1500], y_train[1500:]

In [None]:
from keras import models 
from keras import layers
def build_model():
    """Build and compile the binary classifier.

    Returns:
        A compiled Sequential model: Embedding(10000, 128) -> LSTM(128) ->
        Dense(1, sigmoid), optimised with rmsprop on binary cross-entropy and
        reporting accuracy.

    NOTE(review): the notebook feeds this model the 10000-wide 0/1 matrix
    produced by vectorize_sequences, whereas an Embedding layer looks up
    integer indices -- confirm this pairing is intended.
    """
    network = models.Sequential([
        layers.Embedding(10000, 128),
        layers.LSTM(128),
        layers.Dense(1, activation='sigmoid'),
    ])
    network.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    return network

# Fit a fresh model on the partial training set for 12 epochs with a batch
# size of 10, evaluating on the validation split after every epoch.
model = build_model()
history = model.fit(x_partial_train, y_partial_train, epochs = 12, batch_size = 10, validation_data=(x_val,y_val))
# Test-set evaluation is left disabled here:
#results = model.evaluate(x_test, y_test)
#results

In [None]:
# Rebuild the model and repeat the 12-epoch fit with batch_size=512 and
# progress logging silenced (verbose=0). This rebinds `model` and `history`,
# so the results of any earlier fit are replaced.
model = build_model()
#with tf.device('/gpu:0'):
history = model.fit(x_partial_train, y_partial_train, epochs = 12, batch_size = 512, validation_data=(x_val, y_val), verbose=0)

In [None]:
import matplotlib.pyplot as plt

# Plot the per-epoch training/validation curves recorded by model.fit.
history_dict = history.history
train_loss = history_dict['loss']
val_loss = history_dict['val_loss']

# Keras reports the accuracy metric as 'acc' in older releases and as
# 'accuracy' in newer ones; use whichever key this run produced instead of
# assuming 'acc' (which raises KeyError on newer versions).
acc_key = 'acc' if 'acc' in history_dict else 'accuracy'
train_acc = history_dict[acc_key]
val_acc   = history_dict['val_' + acc_key]

epochs = range(1, len(train_acc) + 1)

plt.plot(epochs, train_loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

# Start the accuracy plot from a clean figure.
plt.clf()

plt.plot(epochs, train_acc, 'bo', label='Training Accuracy')
plt.plot(epochs, val_acc, 'b', label='Validation Accuracy')
plt.title('Training and validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()

In [None]:
 model = build_model()
with tf.device('/gpu:0'):
    model.fit(x_train, y_train, epochs= 4, batch_size=512)
results = model.evaluate(x_test, y_test)
results