In [1]:
import json
import re

### Feed Forward Neural Network Parameters

In [2]:
SEED = 600
# 1st Layer
LAYER1_SIZE = 16
LAYER1_ACTIVATION = 'relu'
LAYER1_INPUT_DIMENSION = 10000

LAYER1_PARAMS = [str(LAYER1_SIZE), LAYER1_ACTIVATION, LAYER1_INPUT_DIMENSION]

# 2nd LAYER
LAYER2_SIZE = 16
LAYER2_ACTIVATION = 'relu'

LAYER2_PARAMS = [str(LAYER2_SIZE), LAYER2_ACTIVATION]

# 3rd LAYER
LAYER3_SIZE = 1
LAYER3_ACTIVATION = 'sigmoid'

LAYER3_PARAMS = [str(LAYER3_SIZE), LAYER3_ACTIVATION]

# Geralizers
DROPOUT_RATE = 0.5
L1 = 0.001
L2 = 0.001

# TRAINING
EPOCHS = 20
BATCH_SIZE = 32

TRAIN_PARAMS = [EPOCHS, BATCH_SIZE]

# COMPILATION
OPTIMIZER = 'rmsprop'
LOSS = 'binary_crossentropy'
METRICS = 'accuracy'

COMPILATION_PARAMS = [OPTIMIZER, LOSS, METRICS]

In [3]:
# This file contains 8600ish users Data
# In the form: {'MBTI Type', 'Social Media Posts'}
dataFile = open('formatted_data.json', 'r')

In [4]:
data = []

In [5]:
# loading the json data into a list
for line in dataFile:
    data.append(json.loads(line))

In [6]:
types = []
posts = []

In [7]:
# Separating data into 2 separate lists for preprocessing
# For the most part we process the 'post' data
# we skip the first element as that is only the label
for i in range(1, len(data)):
    types.append(data[i]['Type'])
    posts.append(data[i]['Post'])

In [8]:
# Turning the posts from: 'post1|||post2|||post3'
#                     to: ['post1', 'post2', 'post3']
# expects a list of posts as strings
# returns a list of lists of posts
def vectorize_post_data(posts):
    for index in range(0, len(posts)):
        posts[index] = posts[index].split("|||")
        
    return posts

posts = vectorize_post_data(posts)

In [9]:
def remove_hyperTextLinksFromPosts(posts):
    for index in range(0, len(posts)):
        usable_post = [post for post in posts[index] if not re.search(r'^(.)*http(.)*$', post)]
        posts[index] = usable_post
    
    return posts
        
posts = remove_hyperTextLinksFromPosts(posts)

In [10]:
def remove_MBTIClassifiersFromPosts(posts):
    MBTI_regex = r'[\w]*(i|e)(s|n)(f|t)(p|j)[\w]*'
    for i in range(0, len(posts)):
        for j in range(0, len(posts[i])):
            posts[i][j] = re.sub(MBTI_regex, ' ', posts[i][j], flags=re.IGNORECASE)
        
    return posts
    
posts = remove_MBTIClassifiersFromPosts(posts)

In [11]:

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
maxlen = 50
max_words = 10000
tokenizer = Tokenizer(num_words = max_words)
tokenizer.fit_on_texts(posts)
sequences = tokenizer.texts_to_sequences(posts)

Using TensorFlow backend.


In [12]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' %len(word_index))

Found 391958 unique tokens.


In [13]:
posts_data = pad_sequences(sequences, maxlen=maxlen)

In [14]:
labels = np.asarray(types)
print('shape of data tensor:', posts_data.shape)
print('shape of label tensor', labels.shape)

shape of data tensor: (8675, 50)
shape of label tensor (8675,)


In [15]:
labels

array(['INFJ', 'ENTP', 'INTP', ..., 'INTP', 'INFP', 'INFP'], dtype='<U4')

In [16]:
import random
random.seed(SEED)
random.shuffle(posts_data)
random.seed(SEED)
random.shuffle(labels)

In [17]:
intro_extro = [word[0:1] for word in types[0:len(labels)]]
bin_intro_extro = []
for letter in intro_extro:
    if (letter == 'I'):
        bin_intro_extro.append(0)
    else:
        bin_intro_extro.append(1)


In [18]:
len(bin_intro_extro)

8675

In [19]:
x_train = posts_data[1500:]
y_train = bin_intro_extro[1500:]
x_val = x_train[1500:]
y_val = y_train[1500:]
x_test = posts_data[:1500]
y_test = bin_intro_extro[:1500]

In [20]:
embeddings_index = {}
f = open('glove.6B.100d.txt', encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype = 'float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [36]:
import pickle

with open('LSTM_formated_data.txt', 'wb') as fp:
    pickle.dump(posts_data, fp)
    


In [49]:
len(posts_data)

8675

In [21]:
embedding_dim = 100

embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i<max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            

In [33]:
from keras.layers import LSTM
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

model = Sequential()
model.add(Embedding(10000, 16))
model.add(LSTM(16))
model.add(Dense(1,activation='sigmoid'))
model.compile(optimizer = 'rmsprop', loss = "binary_crossentropy", metrics = ['accuracy'])

history = model.fit(x_train, y_train, epochs = 20, batch_size = BATCH_SIZE)
results = model.evaluate(x_test, y_test)
model.summary()


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, None, 16)          160000    
_________________________________________________________________
lstm_7 (LSTM)                (None, 16)                2112      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 17        
Total params: 162,129
Trainable params: 162,129
Non-trainable params: 0
_________________________________________________________________


In [34]:
results

[0.6578556467692057, 0.6606666669845581]

In [24]:
model.layers[0].set_weights([embedding_marix])
model.layers[0].trainable = False

NameError: name 'embedding_marix' is not defined

In [None]:
model.compile(optimizer= 'rmsprop', loss = 'binary_crossentropy', metrics = ['accuaracy'])
history = model.fit(x_train, y_train, 40, batch_size= 32, validation-data = (x_val, y_val))
model.save_weights('pre_trained_glove_model.h5')

In [None]:
model = build_model()
with tf.device('/gpu:0'):
    history = model.fit(x_train, y_train, epochs = EPOCHS, batch_size = BATCH_SIZE)
    results = model.evaluate(x_test, y_test)
str(results)

In [None]:
import datetime
now = datetime.datetime.now()

log_file = open('results.log', 'a+')

stat = "=================================\n" + \
str(now.month) + "/" + str(now.day) + "/" + str(now.year) + " " + \
str(now.hour) + ":" + str(now.minute) + ":" + str(now.second) + "\n" + \
"\nSEED:\t" + str(SEED) +" \n" + \
"Layer1:\t" + str(LAYER1_PARAMS) +" \n" + \
"Layer2:\t" + str(LAYER2_PARAMS) +" \n" + \
"Layer3:\t" + str(LAYER3_PARAMS) +" \n" + \
"Generalizers:\t" + "\n" + \
"Compilation:\t" + str(COMPILATION_PARAMS) +" \n" + \
"Training: " + "EPOCHS " + str(EPOCHS) + " | " + "BATCH SIZE " + str(BATCH_SIZE) + "\n" + \
"\tRESULTS:\t" + "LOSS:" + str(results[0]) +  " | "+"ACCURACY:" + (str(results[1])) + "\n" + \
"\n"

print(stat)
log_file.write(stat)
log_file.close()

In [None]:
import matplotlib.pyplot as plt

history_dict = history.history
train_loss = history_dict['loss']
val_loss = history_dict['val_loss']

train_acc = history_dict['acc']
val_acc   = history_dict['val_acc']

epochs = range(1, len(history_dict['acc']) + 1)

plt.plot(epochs, train_loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

plt.clf()

plt.plot(epochs, train_acc, 'bo', label='Training Accuracy')
plt.plot(epochs, val_acc, 'b', label='Validation Accuracy')
plt.title('Training and validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()