In [97]:
import numpy as np
import pandas as pd
np.random.seed(7)
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Embedding, LSTM, Input, Concatenate, Dropout, Bidirectional, Reshape, Flatten
from keras import optimizers
from keras.models import load_model, Model
from keras import callbacks
from matplotlib import pyplot
import emoji
import json, argparse, os
import re
import io
import sys
sys.path.append(os.getcwd())
from helper_functions import *
from bidir_model import *

#get all test data
print("Processing test/train data for first model...")
trainIndices, trainTexts, labels, u1_train, u2_train, u3_train, smil_train = preprocessData(trainDataPath, mode="train")
validationIndices, validationTexts, validationLabels, u1_val, u2_val, u3_val, smil_val = preprocessData(validationDataPath, mode="train")
testIndices, testTexts, testLabels, u1_test, u2_test, u3_test, smil_test = preprocessData(testDataPath, mode="train")

print("Extracting tokens...")
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(u1_train+u2_train+u3_train)

print("Right format...")
u1_trainSequences, u2_trainSequences, u3_trainSequences, smil_trainSeq = tokenizer.texts_to_sequences(u1_train), tokenizer.texts_to_sequences(u2_train), tokenizer.texts_to_sequences(u3_train), tokenizer.texts_to_sequences(smil_train)
u1_testSequences, u2_testSequences, u3_testSequences, smil_testSeq = tokenizer.texts_to_sequences(u1_test), tokenizer.texts_to_sequences(u2_test), tokenizer.texts_to_sequences(u3_test), tokenizer.texts_to_sequences(smil_test)
u1_valSequences, u2_valSequences, u3_valSequences, smil_valSeq = tokenizer.texts_to_sequences(u1_val), tokenizer.texts_to_sequences(u2_val), tokenizer.texts_to_sequences(u3_val), tokenizer.texts_to_sequences(smil_val)
u1_data = pad_sequences(u1_trainSequences, maxlen=MAX_SEQUENCE_LENGTH)
u2_data = pad_sequences(u2_trainSequences, maxlen=MAX_SEQUENCE_LENGTH)
u3_data = pad_sequences(u3_trainSequences, maxlen=MAX_SEQUENCE_LENGTH)
smil_data = pad_sequences(smil_trainSeq, maxlen=20)
labels = to_categorical(np.asarray(labels))
u1_valData = pad_sequences(u1_valSequences, maxlen=MAX_SEQUENCE_LENGTH)
u2_valData = pad_sequences(u2_valSequences, maxlen=MAX_SEQUENCE_LENGTH)
u3_valData = pad_sequences(u3_valSequences, maxlen=MAX_SEQUENCE_LENGTH)
smil_valData = pad_sequences(smil_valSeq, maxlen=20)
validationLabels = to_categorical(np.asarray(validationLabels))
u1_testData = pad_sequences(u1_testSequences, maxlen=MAX_SEQUENCE_LENGTH)
u2_testData = pad_sequences(u2_testSequences, maxlen=MAX_SEQUENCE_LENGTH)
u3_testData = pad_sequences(u3_testSequences, maxlen=MAX_SEQUENCE_LENGTH)
smil_testData = pad_sequences(smil_testSeq, maxlen=20)
testLabels = to_categorical(np.asarray(testLabels))
    
print("Preprocess for second model...")
trainIndices, labels2, t1, t2, t3, len1, len2, len3, case1, case2, case3, smil1, smil2, smil3 = preprocessData2(trainDataPath)
ind_val, labels2_val, t1_val, t2_val, t3_val, len1_val, len2_val, len3_val, case1_val, case2_val, case3_val, smil1_val, smil2_val, smil3_val = preprocessData2(validationDataPath)
ind_test, labels2_test, t1_test, t2_test, t3_test, len1_test, len2_test, len3_test, case1_test, case2_test, case3_test, smil1_test, smil2_test, smil3_test = preprocessData2(testDataPath)

print("Extracting tokens for second model...")
tokenizerModelTwo = Tokenizer(num_words=MAX_NB_WORDS)
tokenizerModelTwo.fit_on_texts(t1+t2+t3)
wordIndexModelTwo = tokenizerModelTwo.word_index

t1, t2, t3 = tokenizerModelTwo.texts_to_sequences(t1), tokenizerModelTwo.texts_to_sequences(t2), tokenizerModelTwo.texts_to_sequences(t3)
t1_val, t2_val, t3_val = tokenizerModelTwo.texts_to_sequences(t1_val), tokenizerModelTwo.texts_to_sequences(t2_val), tokenizerModelTwo.texts_to_sequences(t3_val)
t1_test, t2_test, t3_test = tokenizerModelTwo.texts_to_sequences(t1_test), tokenizerModelTwo.texts_to_sequences(t2_test), tokenizerModelTwo.texts_to_sequences(t3_test)

t1, t2, t3 = pad_sequences(t1, maxlen=MAX_SEQUENCE_LENGTH), pad_sequences(t2, maxlen=MAX_SEQUENCE_LENGTH), pad_sequences(t3, maxlen=MAX_SEQUENCE_LENGTH)
t1_val, t2_val, t3_val = pad_sequences(t1_val, maxlen=MAX_SEQUENCE_LENGTH), pad_sequences(t2_val, maxlen=MAX_SEQUENCE_LENGTH), pad_sequences(t3_val, maxlen=MAX_SEQUENCE_LENGTH)
t1_test, t2_test, t3_test = pad_sequences(t1_test, maxlen=MAX_SEQUENCE_LENGTH), pad_sequences(t2_test, maxlen=MAX_SEQUENCE_LENGTH), pad_sequences(t3_test, maxlen=MAX_SEQUENCE_LENGTH)

meta_data = np.asarray([len1, len2, len3, case1, case2, case3, smil1, smil2, smil3]).T
meta_data_val = np.asarray([len1_val, len2_val, len3_val, case1_val, case2_val, case3_val, smil1_val, smil2_val, smil3_val]).T
meta_data_test = np.asarray([len1_test, len2_test, len3_test, case1_test, case2_test, case3_test, smil1_test, smil2_test, smil3_test]).T
metrics = {"accuracy" : [], "microPrecision" : [], "microRecall" : [], "microF1" : []}

#Load models
print("Load models...")
model1 = load_model('EP100_LR100e-5_LDim128_BS2500.h5')
model2 = load_model('../b_smileyZeroEmbeddings_F1_73/EP2_LR100e-5_LDim128_BS200.h5')


Processing test/train data for first model...
Extracting tokens...
Right format...
Preprocess for second model...
Extracting tokens for second model...
Load models...


In [30]:
print("Train and val data preds...")
preds1_train = model1.predict([t1, t2, t3, meta_data], batch_size=BATCH_SIZE)
preds2_train = model2.predict([u1_data,u2_data,u3_data, smil_data], batch_size=BATCH_SIZE)
preds1_val = model1.predict([t1_val, t2_val, t3_val, meta_data_val], batch_size=BATCH_SIZE)
preds2_val = model2.predict([u1_valData,u2_valData,u3_valData, smil_valData], batch_size=BATCH_SIZE)


Train and val data preds...


In [None]:
print("Make predictions...")
preds1 = model1.predict([t1_test, t2_test, t3_test, meta_data_test], batch_size=BATCH_SIZE)
preds2 = model2.predict([u1_testData, u2_testData, u3_testData, smil_testData], batch_size=BATCH_SIZE)

In [75]:
def recode_to_other(preds):
    recoded_preds = np.zeros((len(preds),2))
    for row in range(0,len(preds)):
        if (np.argmax(preds[row,:]) == 0):
            recoded_preds[row,0] = 1
        else:
            recoded_preds[row,1] = 1
    return recoded_preds

In [99]:
def buildModel():
    inp = Input(shape=(6,), dtype='float32')
    out = Dense(4, activation='sigmoid')(inp)
    model = Model([inp], out)
    adam = optimizers.adam(lr=LEARNING_RATE)
    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['acc'])
    return model

model = buildModel()
train_data = np.concatenate((preds1_train, preds2_train), axis=1)
val_data = np.concatenate((preds1_val, preds2_val), axis=1)
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
mc = ModelCheckpoint('EP%d_LR%de-5_LDim%d_BS%d.h5'%(NUM_EPOCHS, int(LEARNING_RATE*(10**5)), LSTM_DIM, BATCH_SIZE), monitor='val_loss', mode='min', verbose=1, save_best_only=True)
# fit model
history = model.fit(train_data, labels, validation_data=(val_data, validationLabels), epochs=100, batch_size=BATCH_SIZE, verbose=2, callbacks=[es, mc])

Train on 30160 samples, validate on 2755 samples
Epoch 1/100
 - 6s - loss: 1.2807 - acc: 0.5677 - val_loss: 1.0492 - val_acc: 0.8211

Epoch 00001: val_loss improved from inf to 1.04923, saving model to EP2_LR100e-5_LDim128_BS200.h5
Epoch 2/100
 - 0s - loss: 1.1371 - acc: 0.6726 - val_loss: 0.9055 - val_acc: 0.8701

Epoch 00002: val_loss improved from 1.04923 to 0.90552, saving model to EP2_LR100e-5_LDim128_BS200.h5
Epoch 3/100
 - 0s - loss: 0.9981 - acc: 0.7187 - val_loss: 0.7716 - val_acc: 0.8733

Epoch 00003: val_loss improved from 0.90552 to 0.77159, saving model to EP2_LR100e-5_LDim128_BS200.h5
Epoch 4/100
 - 0s - loss: 0.8664 - acc: 0.8370 - val_loss: 0.6571 - val_acc: 0.9009

Epoch 00004: val_loss improved from 0.77159 to 0.65711, saving model to EP2_LR100e-5_LDim128_BS200.h5
Epoch 5/100
 - 0s - loss: 0.7517 - acc: 0.8703 - val_loss: 0.5682 - val_acc: 0.8995

Epoch 00005: val_loss improved from 0.65711 to 0.56820, saving model to EP2_LR100e-5_LDim128_BS200.h5
Epoch 6/100
 - 0s - 

In [110]:
#model_best = load_model('/Users/rasmushallen/Desktop/master_thesis_code/natemusMasters/b_LSTM_F1_73/b_smileyZeroEmbeddings_F1_73/EP2_LR100e-5_LDim128_BS200.h5')
#best_pred = model_best.predict([u1_testData, u2_testData, u3_testData, smil_testData], batch_size=BATCH_SIZE)

test_data = np.concatenate((preds1, preds2), axis=1)
predictions = model.predict(test_data)
getMetrics(predictions, testLabels)

True Positives per class :  [4275.  194.  204.  251.]
False Positives per class :  [151. 119. 123. 192.]
False Negatives per class :  [402.  90.  46.  47.]
Class happy : Precision : 0.620, Recall : 0.683, F1 : 0.650
Class sad : Precision : 0.624, Recall : 0.816, F1 : 0.707
Class angry : Precision : 0.567, Recall : 0.842, F1 : 0.677
Ignoring the Others class, Macro Precision : 0.6034, Macro Recall : 0.7805, Macro F1 : 0.6806
Ignoring the Others class, Micro TP : 649, FP : 434, FN : 183
Accuracy : 0.8938, Micro Precision : 0.5993, Micro Recall : 0.7800, Micro F1 : 0.6778


(0.8938101288800145, 0.5992613, 0.7800481, 0.6778067400289556)