In [1]:
import json
import re

### Feed Forward Neural Network Parameters

In [2]:
# Seed reused for every shuffle so data and labels stay aligned.
SEED = 600

# Layer 1: units, activation and width of the input vectors.
LAYER1_SIZE, LAYER1_ACTIVATION, LAYER1_INPUT_DIMENSION = 16, 'relu', 10000
LAYER1_PARAMS = [str(LAYER1_SIZE), LAYER1_ACTIVATION, LAYER1_INPUT_DIMENSION]

# Layer 2: units and activation.
LAYER2_SIZE, LAYER2_ACTIVATION = 16, 'relu'
LAYER2_PARAMS = [str(LAYER2_SIZE), LAYER2_ACTIVATION]

# Layer 3 (output): units and activation.
LAYER3_SIZE, LAYER3_ACTIVATION = 1, 'sigmoid'
LAYER3_PARAMS = [str(LAYER3_SIZE), LAYER3_ACTIVATION]

# Regularizers
DROPOUT_RATE = 0.5
L1 = L2 = 0.001

# Training
EPOCHS, BATCH_SIZE = 50, 4000
TRAIN_PARAMS = [EPOCHS, BATCH_SIZE]

# Compilation
OPTIMIZER, LOSS, METRICS = 'rmsprop', 'binary_crossentropy', 'accuracy'
COMPILATION_PARAMS = [OPTIMIZER, LOSS, METRICS]

In [3]:
# This file contains data for roughly 8600 users, one JSON record per line,
# in the form: {'MBTI Type', 'Social Media Posts'}
# Read every line inside a `with` block so the file handle is always closed.
# `dataFile` stays iterable line by line (it is now a list of lines rather
# than an open file object), so later cells can still loop over it.
with open('formatted_data.json', 'r') as data_handle:
    dataFile = data_handle.readlines()

In [4]:
# Accumulator for the parsed JSON records (filled by the loading cell below).
data = list()

In [5]:
# Parse each line of the data file as one JSON document and collect the
# results in `data`, preserving file order.
data.extend(json.loads(raw_line) for raw_line in dataFile)

In [6]:
# Parallel accumulators: MBTI type labels and raw post text.
types, posts = [], []

In [7]:
# Pull the 'Type' label of every record into `types` for preprocessing.
# The first element of `data` is skipped because it only holds the label row.
# `posts` is intentionally left empty here; the raw 'Post' text is not collected.
types.extend(record['Type'] for record in data[1:])

In [8]:
import pickle

# Restore the pre-tokenized posts from their pickle file.
# NOTE: unpickling executes code embedded in the file; only load trusted data.
tokenized_posts = []
with open('tokenized_formatted_data.txt', 'rb') as pickle_file:
    tokenized_posts = pickle.load(pickle_file)

In [9]:
# Flatten the per-user token lists into one long list of words,
# keeping user order and word order within each user.
all_words_list = [token for user_tokens in tokenized_posts for token in user_tokens]

In [10]:
from collections import Counter

In [11]:
# Count every word, then keep the most frequent ones as the vocabulary.
# The vocabulary size is tied to LAYER1_INPUT_DIMENSION (instead of a second
# hard-coded 10000) because each vocabulary word becomes one input column of
# the first layer.
freq_list = Counter(all_words_list)
dictionary = freq_list.most_common(LAYER1_INPUT_DIMENSION)

In [12]:
# `dictionary` holds (word, count) pairs; transpose them and keep only the
# tuple of words, still ordered from most to least frequent.
words_then_counts = list(zip(*dictionary))
dictionary = words_then_counts[0]

In [13]:
# Map each vocabulary word to its rank (0 = most frequent).
# The index range is derived from the vocabulary itself rather than a
# hard-coded 10000, so the mapping always covers exactly the words present.
nums = range(len(dictionary))
word_int = {word: index for index, word in enumerate(dictionary)}

In [14]:
# Encode each user's tokens as vocabulary indices, silently dropping any
# token that is not part of the vocabulary.
x_vals = [
    [word_int[token] for token in user_tokens if token in word_int]
    for user_tokens in tokenized_posts
]

In [15]:
# First character of every type label.
intro_extro = [mbti_type[:1] for mbti_type in types]
# Binary target: 0 when the first character is 'I', 1 for anything else.
bin_intro_extro = [0 if first_letter == 'I' else 1 for first_letter in intro_extro]


In [16]:
import numpy as np
import random

# Build a 1-D object array holding one list of word indices per user.
# `np.array(x_vals)` is not safe here: the inner lists have different lengths,
# which raises ValueError on recent NumPy releases, and if they ever all had
# the same length NumPy would build a 2-D array that `random.shuffle` cannot
# shuffle correctly (row swaps operate on views and duplicate rows).
x = np.empty(len(x_vals), dtype=object)
for user_index, user_indices in enumerate(x_vals):
    x[user_index] = user_indices

# Keep `random.seed` + `random.shuffle` exactly as they are: the labels are
# shuffled separately with the same seed and must receive the same permutation.
random.seed(SEED)
random.shuffle(x)

# First 1500 shuffled users are held out for testing; the rest are for training.
test_data = x[:1500]
train_data = x[1500:]

def vectorize_sequences(sequences, dimension=10000):
    """Multi-hot encode index sequences into a 2-D float array.

    Args:
        sequences: sized collection in which each item is a list of integer
            column indices.
        dimension: number of columns in the result.

    Returns:
        Array of shape ``(len(sequences), dimension)`` where row ``i`` holds
        1.0 at every index listed in ``sequences[i]`` and 0.0 elsewhere.
        Repeated indices within a sequence still yield 1.0.
    """
    encoded = np.zeros(shape=(len(sequences), dimension))
    for row, active_columns in enumerate(sequences):
        encoded[row, active_columns] = 1.0
    return encoded

# Multi-hot encode both splits. The width is passed explicitly so it always
# matches the input dimension the model's first layer is built with, instead
# of relying on the function's default.
x_train = vectorize_sequences(train_data, dimension=LAYER1_INPUT_DIMENSION)
x_test = vectorize_sequences(test_data, dimension=LAYER1_INPUT_DIMENSION)

In [17]:
from keras.utils.np_utils import to_categorical

# Labels as a float32 vector, shuffled with the same seed as the inputs so
# both receive the same permutation (this relies on equal lengths).
y = np.array(bin_intro_extro, dtype='float32')
random.seed(SEED)
random.shuffle(y)
# Same 1500 / remainder split as the inputs.
y_test, y_train = y[:1500], y[1500:]

Using TensorFlow backend.


In [18]:
# Carve a 1500-sample validation slice off the front of the training split;
# the remainder is the partial training set. Inputs and labels are sliced
# identically so they stay paired.
x_val, x_partial_train = x_train[:1500], x_train[1500:]
y_val, y_partial_train = y_train[:1500], y_train[1500:]

In [19]:
# FEED FORWARD MODEL

from keras import models
from keras import layers
from keras import regularizers
import tensorflow as tf
#kernel_regularizer=regularizers.l1_l2(l1=0.001, l2=0.001)
def build_model():
    """Build and compile the three-layer feed-forward classifier.

    The architecture is taken entirely from the notebook's configuration
    constants: two hidden Dense layers (LAYER1_*, LAYER2_*) followed by an
    output Dense layer (LAYER3_*), compiled with OPTIMIZER, LOSS and METRICS.

    Returns:
        A compiled Keras ``Sequential`` model.
    """
    model = models.Sequential()
    model.add(layers.Dense(LAYER1_SIZE, activation=LAYER1_ACTIVATION, input_shape = (LAYER1_INPUT_DIMENSION,)))
    model.add(layers.Dense(LAYER2_SIZE, activation=LAYER2_ACTIVATION))
    # Use LAYER3_SIZE rather than a literal 1 so the model that is built always
    # matches the Layer3 parameters written to the results log.
    model.add(layers.Dense(LAYER3_SIZE, activation=LAYER3_ACTIVATION))
    model.compile(optimizer = OPTIMIZER, loss = LOSS, metrics = [METRICS])
    return model

In [20]:
# Fresh model for this run.
model = build_model()

# Fit on the whole training split and score the held-out test split, both
# inside the '/gpu:0' device scope.
with tf.device('/gpu:0'):
    history = model.fit(
        x_train,
        y_train,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
    )
    results = model.evaluate(x_test, y_test)

# Display the evaluation results as the cell output.
str(results)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


'[0.5870801531473796, 0.7719999996821085]'

In [21]:
import datetime
now = datetime.datetime.now()

# Timestamp in the log's existing format: month/day/year hour:minute:second,
# without zero padding, so new entries match the ones already in the file.
timestamp = "{}/{}/{} {}:{}:{}".format(
    now.month, now.day, now.year, now.hour, now.minute, now.second
)

# One log entry: separator, timestamp, every hyper-parameter group, and the
# evaluation results (results[0] is logged as loss, results[1] as accuracy).
stat = "".join([
    "=================================\n",
    timestamp + "\n",
    "\nSEED:\t" + str(SEED) + " \n",
    "Layer1:\t" + str(LAYER1_PARAMS) + " \n",
    "Layer2:\t" + str(LAYER2_PARAMS) + " \n",
    "Layer3:\t" + str(LAYER3_PARAMS) + " \n",
    "Generalizers:\t" + "\n",
    "Compilation:\t" + str(COMPILATION_PARAMS) + " \n",
    "Training: " + "EPOCHS " + str(EPOCHS) + " | " + "BATCH SIZE " + str(BATCH_SIZE) + "\n",
    "\tRESULTS:\t" + "LOSS:" + str(results[0]) + " | " + "ACCURACY:" + str(results[1]) + "\n",
    "\n",
])

print(stat)

# Open the log only once the entry has been built, and let `with` close it
# even if the write fails, so no file handle is leaked.
with open('results.log', 'a+') as log_file:
    log_file.write(stat)

6/27/2018 1:3:45

SEED:	600 
Layer1:	['16', 'relu', 10000] 
Layer2:	['16', 'relu'] 
Layer3:	['1', 'sigmoid'] 
Generalizers:	
Compilation:	['rmsprop', 'binary_crossentropy', 'accuracy'] 
Training: EPOCHS 50 | BATCH SIZE 4000
	RESULTS:	LOSS:0.5870801531473796 | ACCURACY:0.7719999996821085


