In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM, Dropout, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
import tensorflow_hub as hub
from tensorboard.plugins.hparams import api as hp
import numpy as np
import pandas as pd
import json
import random
import matplotlib.pyplot as plt

In [2]:
# Load Pretrained Word2Vec
embed = hub.load("https://tfhub.dev/google/Wiki-words-500/2")

In [3]:
#Method to return the maximum length of elems in the dataframe.
#This is used to create a fixed length input for the first RNN cell.
def get_max_length(df):
    max_length = 0
    for row in df['feature']:
        if len(row.split(" ")) > max_length:
            max_length = len(row.split(" "))
    return max_length

#Return word2vec value for each word in sentence.
#add encoded words to numpy array for the first RNN cell.
def get_word2vec_enc(features):
    encoded_features = []
    for feature in features:
        tokens = feature.split(" ")
        word2vec_embedding = embed(tokens)
        encoded_features.append(word2vec_embedding)
    return encoded_features


#adding zero padding to all elems such that, they are all the same length
def get_padded_encoded_features(encoded_features):
    #Longest feature (longest question is hardcoded (from get_max_length()))
    max_length = 684
    padded_features_encoding = []
    for enc_feature in encoded_features:
        zero_padding_cnt = max_length - enc_feature.shape[0]
        pad = np.zeros((1, 500))
        for i in range(zero_padding_cnt):
            enc_feature = np.concatenate((pad, enc_feature), axis=0)
        padded_features_encoding.append(enc_feature)
    return padded_features_encoding

#return one hot encoding for isCorrect (Y value for RNN).
def answer_encode(answer):
    if answer:
        return [1,0]
    else:
        return [0,1]
    

#encode strings to numeric value
def preprocess(df):
    # encode words into word2vec
    features = df['feature'].tolist()
    
    encoded_features = get_word2vec_enc(features)
    padded_encoded_features = get_padded_encoded_features(encoded_features)
    # encoded answers
    answers = df['correct'].tolist()
    encoded_answer = [answer_encode(answer) for answer in answers]
    X = np.array(padded_encoded_features)
    Y = np.array(encoded_answer)
    return X, Y

In [4]:
#Read in features from JSON file
with open('all_plus_500.json', 'r', encoding="utf-8") as myFileAll:
     dataAll = myFileAll.read()

#Load all into objects
objAll = json.loads(dataAll)
print("Size of data: ")
print(len(objAll))

##Shuffle our data so our model "doesn't learn patterns"
random.shuffle(objAll)

#Split data into train/test (80/20) 
print(objAll[1])
train_data = objAll[:3757]
test_data = objAll[3757:]

Size of data: 
4696
{'feature': ' The following events took place in a state that  does not recognize common law marriage. The  state does recognize the common law estate of  tenancy by the entirety and has no statute on the  subject. Wade Sloan and Mary Isaacs, who were never  formally married, lived together over a sevenyear period. During this time Mary identified  herself as “Mrs. Sloan” with the knowledge and  consent of Wade. Wade and Mary maintained  several charge accounts at retail stores under  the names “Mr. and Mrs. Wade Sloan,” and  they filed joint income tax returns as Mr. and  Mrs. Sloan. Within this period Wade decided  to buy a home. The deed was in proper form  and identified the grantees as “Wade Sloan and  Mary Sloan his wife, and their heirs and assigns  forever as tenants by the entirety.” Wade made  a down payment of $10,000 and gave a note and  mortgage for the unpaid balance. Both Wade  and Mary signed the note and mortgage for the  unpaid balance. Both Wade a

In [5]:
print(type(objAll))
print(len(train_data))
print(len(test_data))


<class 'list'>
3757
939


In [6]:
#Preprocessing for Train/Test data frames for model
df = pd.DataFrame(train_data)
train_X, train_Y = preprocess(df)
test_df = pd.DataFrame(test_data)
test_X, test_Y = preprocess(test_df)

MemoryError: Unable to allocate 9.57 GiB for an array with shape (3757, 684, 500) and data type float64

In [None]:
#Building the model two layers of Bi-LSTMs
model = Sequential()
model.add(Bidirectional(LSTM(64,  return_sequences=True)))
model.add(Dropout(0.1))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(2, activation='softmax'))

In [None]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
#Train on data
history = model.fit(train_X, train_Y,epochs=3, verbose=1)

In [None]:
model.summary()

In [None]:
#Evaluate on test_data
random.shuffle(test_X)
random.shuffle(test_Y)
score, acc = model.evaluate(test_X, test_Y, verbose=1)
print('Test score:', score)
print('Test accuracy:', acc)

In [None]:
plt.figure(figsize=[10,5])
plt.plot(history.history['accuracy'], label='accuracy')
#plt.plot(history.history['val_accuracy'], label = 'val_accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.title('Accuracy Curves - RNN')

In [None]:
model1 = Sequential()
model1.add(LSTM(128,  return_sequences=True))
model1.add(Dropout(0.1))
model1.add(LSTM(32))
model1.add(Dense(2, activation='softmax'))

In [None]:
model1.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
history1 = model1.fit(train_X, train_Y,epochs=3, verbose=1)

In [None]:
model2 = Sequential()
model2.add(LSTM(128,  return_sequences=True))
model2.add(Dropout(0.15))
model2.add(LSTM(32))
model2.add(Dense(2, activation='softmax'))

model2.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

history2 = model2.fit(train_X, train_Y,epochs=5, verbose=1)