In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Embedding, LSTM
from keras import optimizers
from keras.models import load_model
import tensorflow_hub as hub
from matplotlib import pyplot
import tf_metrics
from scipy.special import softmax
import emoji
import json, argparse, os
import re
import io
import sys
sys.path.append(os.getcwd())
from helper_functions import *

Using TensorFlow backend.
W0606 19:20:59.758970 4500112832 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [2]:
# Seed numpy's global RNG so the random draws made later in the notebook are repeatable.
np.random.seed(7)

# Every name assigned below is already a module-level (notebook-global) variable, so no
# `global` declarations are needed at this scope.
# The config file path is fixed here; the original script's command-line '-config'
# option is not used in the notebook.
with open('testBaseline.config') as configfile:
    config = json.load(configfile)

# Data locations.
trainDataPath = config["train_data_path"]
validationDataPath = config["validation_data_path"]
testDataPath = config["test_data_path"]
solutionPath = config["solution_path"]
gloveDir = config["glove_dir"]

# Model / training hyper-parameters.
NUM_FOLDS = config["num_folds"]
NUM_CLASSES = config["num_classes"]
MAX_NB_WORDS = config["max_nb_words"]
MAX_SEQUENCE_LENGTH = config["max_sequence_length"]
EMBEDDING_DIM = config["embedding_dim"]
BATCH_SIZE = config["batch_size"]
LSTM_DIM = config["lstm_dim"]
DROPOUT = config["dropout"]
LEARNING_RATE = config["learning_rate"]
NUM_EPOCHS = config["num_epochs"]
EARLY_STOPPING = config["early_stopping"]

# Class index <-> emotion name. The reverse mapping is derived from the forward one so
# the two cannot drift apart.
label2emotion = {0:"others", 1:"happy", 2: "sad", 3:"angry"}
emotion2label = {emotion: label for label, emotion in label2emotion.items()}

In [3]:
def preprocessData(dataFilePath, mode):
    """Load a tab-separated conversation file and return its parsed columns as separate lists.

    The first line of the file is skipped (header). Every other non-blank line is expected to
    hold an id, three conversation turns and, in "train" mode, a label, separated by tabs.

    Input:
        dataFilePath : Path to train/test file to be processed
        mode : "train" mode returns labels. Any other value (e.g. "test") doesn't return labels.
    Output:
        indices : Unique conversation ID list (integers)
        conversations : List of 3 turn conversations, processed, lower-cased and each turn
                        separated by the <eos> tag
        labels : [Only available in "train" mode] List of integer labels (via emotion2label)
        u1, u2, u3 : Lower-cased text of the first, second and third turn of every conversation
        smileys : For every conversation, whatever the helper remove_text() returns for it
    Raises:
        KeyError : In "train" mode, when a label is not a key of emotion2label
        IndexError : When a line has fewer tab-separated columns than required
    """
    indices = []
    conversations = []
    labels = []
    u1 = []
    u2 = []
    u3 = []
    smileys = []

    # Loop invariants: build them once instead of once per input line.
    repeatedChars = ['.', '?', '!', ',']
    duplicateSpacePattern = re.compile(r'\ +')

    with io.open(dataFilePath, encoding="utf8") as finput:
        finput.readline()  # skip the header line
        for line in finput:
            # A blank line (e.g. a trailing newline at the end of the file) carries no
            # conversation; skip it instead of failing on the column lookups below.
            if not line.strip():
                continue

            # Convert multiple instances of . ? ! , to single instance
            # okay...sure -> okay . sure
            # okay???sure -> okay ? sure
            # Add whitespace around such punctuation
            # okay!sure -> okay ! sure
            for c in repeatedChars:
                # Dropping the empty pieces is what collapses runs of the same character.
                lineSplit = [piece for piece in line.split(c) if piece != '']
                line = (' ' + c + ' ').join(lineSplit)

            line = line.strip().split('\t')
            if mode == "train":
                # Train data contains id, 3 turns and label
                labels.append(emotion2label[line[4]])

            conv = ' <eos> '.join(line[1:4])

            # Replace non-unicode smileys with unicode
            conv = str2emoji(conv)

            # Separate smileys with unicode
            conv = add_space(conv)

            # Many of the words not in embeddings are problematic due to apostrophes e.g. didn't
            conv = fix_apos(conv)

            # Remove any duplicate spaces
            conv = duplicateSpacePattern.sub(' ', conv)

            # Find all smileys in the conversation
            row_smileys = remove_text(conv)

            # Split once and reuse the pieces for the per-turn lists
            turns = conv.split(' <eos> ')
            u1_line, u2_line, u3_line = turns[0], turns[1], turns[2]

            u1.append(duplicateSpacePattern.sub(' ', u1_line.lower()))
            u2.append(duplicateSpacePattern.sub(' ', u2_line.lower()))
            u3.append(duplicateSpacePattern.sub(' ', u3_line.lower()))
            smileys.append(row_smileys)

            indices.append(int(line[0]))
            conversations.append(conv.lower())

    if mode == "train":
        return indices, conversations, labels, u1, u2, u3, smileys
    else:
        return indices, conversations, u1, u2, u3, smileys

In [4]:
def getEmbeddingMatrix(wordIndex, gloveFile=None, embeddingDim=None):
    """Populate an embedding matrix using a word-index. If the word "happy" has an index 19,
       the 19th row in the embedding matrix should contain the embedding vector for the word "happy".
    Input:
        wordIndex : A dictionary of (word : index) pairs, extracted using a tokeniser
        gloveFile : Optional path of the embedding text file (one "word v1 v2 ..." entry per
                    line, separated by single spaces). Defaults to
                    'glove.twitter.27B.200d.txt' inside the module-level gloveDir.
        embeddingDim : Optional length of every embedding vector. Defaults to the
                       module-level EMBEDDING_DIM.
    Output:
        embeddingMatrix : A (len(wordIndex) + 1, embeddingDim) matrix. Row 0 stays all-zeros,
                          rows of words found in the file hold their vector, and rows of
                          words missing from the file are filled with uniform random
                          values in [0, 1).
    """
    if gloveFile is None:
        gloveFile = os.path.join(gloveDir, 'glove.twitter.27B.200d.txt')
    if embeddingDim is None:
        embeddingDim = EMBEDDING_DIM

    embeddingsIndex = {}
    # Load the embedding vectors from the GloVe file
    with io.open(gloveFile, encoding="utf8") as f:
        for line in f:
            # Only the line terminator is removed; splitting stays on single spaces.
            values = line.rstrip('\n').split(' ')
            word = values[0]
            embeddingsIndex[word] = np.asarray(values[1:], dtype='float32')

    print('Found %s word vectors.' % len(embeddingsIndex))

    # Minimum word index of any word is 1, so row 0 is never assigned.
    embeddingMatrix = np.zeros((len(wordIndex) + 1, embeddingDim))
    for word, i in wordIndex.items():
        embeddingVector = embeddingsIndex.get(word)
        if embeddingVector is not None:
            embeddingMatrix[i] = embeddingVector
        else:
            # Words not found in the embedding index get a random vector (not zeros).
            embeddingMatrix[i] = np.random.random(embeddingDim)

    return embeddingMatrix

######### Smiley embeddings #################
def getSmileyEmbeddings(wordIndex, emojiFile=None, embeddingDim=300):
    """Populate an embedding matrix for smileys using a word-index.
    Input:
        wordIndex : A dictionary of (token : index) pairs, extracted using a tokeniser
        emojiFile : Optional path of the emoji embedding text file (one whitespace-separated
                    "token v1 v2 ..." entry per line). Defaults to 'emoji2vec.txt' inside
                    the module-level gloveDir.
        embeddingDim : Length of every embedding vector (default 300)
    Output:
        smileyEmbeddings : A (len(wordIndex) + 1, embeddingDim) matrix. Rows of tokens found
                           in the file hold their vector; every other row, including row 0,
                           stays all-zeros.
    """
    if emojiFile is None:
        emojiFile = os.path.join(gloveDir, 'emoji2vec.txt')

    embeddingsIndex = {}
    with io.open(emojiFile, encoding="utf8") as f:
        for line in f:
            values = line.split()
            # Skip anything that is not "token + embeddingDim numbers": empty lines, and
            # e.g. a word2vec-style "count dim" header if the file has one. A short vector
            # stored here could otherwise be broadcast over a whole matrix row below.
            if len(values) != embeddingDim + 1:
                continue
            embeddingsIndex[values[0]] = np.asarray(values[1:], dtype='float32')

    smileyEmbeddings = np.zeros((len(wordIndex) + 1, embeddingDim))
    for smiley, i in wordIndex.items():
        embeddingVector = embeddingsIndex.get(smiley)
        if embeddingVector is not None:
            smileyEmbeddings[i] = embeddingVector
        # Tokens not found in the embedding index keep their all-zeros row.

    return smileyEmbeddings
#############################################

In [5]:
# ---- Parse the three splits. All of them are read in "train" mode, i.e. with labels. ----
print("Processing training data...")
(trainIndices, trainTexts, labels,
 u1_train, u2_train, u3_train, smil_train) = preprocessData(trainDataPath, mode="train")

print("Processing validation data...")
(validationIndices, validationTexts, validationLabels,
 u1_val, u2_val, u3_val, smil_val) = preprocessData(validationDataPath, mode="train")

print("Processing test data...")
(testIndices, testTexts, testLabels,
 u1_test, u2_test, u3_test, smil_test) = preprocessData(testDataPath, mode="train")

# ---- Fit the vocabulary on the training turns only, then encode every split with it. ----
print("Extracting tokens...")
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(u1_train + u2_train + u3_train)

u1_trainSequences = tokenizer.texts_to_sequences(u1_train)
u2_trainSequences = tokenizer.texts_to_sequences(u2_train)
u3_trainSequences = tokenizer.texts_to_sequences(u3_train)
smil_trainSeq = tokenizer.texts_to_sequences(smil_train)

u1_valSequences = tokenizer.texts_to_sequences(u1_val)
u2_valSequences = tokenizer.texts_to_sequences(u2_val)
u3_valSequences = tokenizer.texts_to_sequences(u3_val)
smil_valSeq = tokenizer.texts_to_sequences(smil_val)

u1_testSequences = tokenizer.texts_to_sequences(u1_test)
u2_testSequences = tokenizer.texts_to_sequences(u2_test)
u3_testSequences = tokenizer.texts_to_sequences(u3_test)
smil_testSeq = tokenizer.texts_to_sequences(smil_test)

wordIndex = tokenizer.word_index
print("Found %s unique tokens." % len(wordIndex))

# ---- Both embedding matrices are indexed by the same tokenizer vocabulary. ----
print("Populating embedding matrix...")
embeddingMatrix = getEmbeddingMatrix(wordIndex)
smileyEmbeddings = getSmileyEmbeddings(wordIndex)

# ---- Pad to fixed lengths (turns: MAX_SEQUENCE_LENGTH, smileys: 20) and one-hot the labels. ----
# Training split
u1_data = pad_sequences(u1_trainSequences, maxlen=MAX_SEQUENCE_LENGTH)
u2_data = pad_sequences(u2_trainSequences, maxlen=MAX_SEQUENCE_LENGTH)
u3_data = pad_sequences(u3_trainSequences, maxlen=MAX_SEQUENCE_LENGTH)
smil_data = pad_sequences(smil_trainSeq, maxlen=20)
labels_cat = to_categorical(np.asarray(labels))

# Validation split
u1_valData = pad_sequences(u1_valSequences, maxlen=MAX_SEQUENCE_LENGTH)
u2_valData = pad_sequences(u2_valSequences, maxlen=MAX_SEQUENCE_LENGTH)
u3_valData = pad_sequences(u3_valSequences, maxlen=MAX_SEQUENCE_LENGTH)
smil_valData = pad_sequences(smil_valSeq, maxlen=20)
validationLabels_cat = to_categorical(np.asarray(validationLabels))

# Test split
u1_testData = pad_sequences(u1_testSequences, maxlen=MAX_SEQUENCE_LENGTH)
u2_testData = pad_sequences(u2_testSequences, maxlen=MAX_SEQUENCE_LENGTH)
u3_testData = pad_sequences(u3_testSequences, maxlen=MAX_SEQUENCE_LENGTH)
smil_testData = pad_sequences(smil_testSeq, maxlen=20)
testLabels_cat = to_categorical(np.asarray(testLabels))

Processing training data...
Processing validation data...
Processing test data...
Extracting tokens...
Found 14612 unique tokens.
Populating embedding matrix...
Found 1193514 word vectors.


In [6]:
# Load the saved model and score the three splits, in the order train, validation, test.
# Each input list is [turn 1, turn 2, turn 3, smileys].
model = load_model('EP100_LR100e-5_LDim128_BS200.h5')
predictions_train, predictions_val, predictions_test = (
    model.predict(splitInputs, batch_size=BATCH_SIZE)
    for splitInputs in (
        [u1_data, u2_data, u3_data, smil_data],
        [u1_valData, u2_valData, u3_valData, smil_valData],
        [u1_testData, u2_testData, u3_testData, smil_testData],
    )
)

Instructions for updating:
Colocations handled automatically by placer.


W0606 19:22:36.189707 4500112832 deprecation.py:323] From /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


W0606 19:22:36.347696 4500112832 deprecation.py:506] From /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.cast instead.


W0606 19:22:39.001325 4500112832 deprecation.py:323] From /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow/python/ops/math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.


In [7]:
def getMetrics(predictions, ground):
    """Given predicted labels and the respective ground truth labels, display some metrics
    Relies on the module-level NUM_CLASSES and label2emotion. Class 0 ("others") is left out
    of every precision / recall / F1 figure; it only contributes to the accuracy.
    Input: shape [# of samples, NUM_CLASSES]
        predictions : Model output. Every row has 4 decimal values, with the highest belonging to the predicted class
        ground : Ground truth labels, converted to one-hot encodings. A sample belonging to Happy class will be [0, 1, 0, 0]
    Output:
        accuracy : Average accuracy
        microPrecision : Precision calculated on a micro level. Ref - https://datascience.stackexchange.com/questions/15989/micro-average-vs-macro-average-performance-in-a-multiclass-classification-settin/16001
        microRecall : Recall calculated on a micro level
        microF1 : Harmonic mean of microPrecision and microRecall. Higher value implies better classification
    """
    def _safeDivide(numerator, denominator):
        # A class that is never predicted (precision) or never present (recall) gives 0/0;
        # report 0 instead of NaN so the averages stay meaningful.
        return numerator / denominator if denominator > 0 else 0.0

    predictedClasses = predictions.argmax(axis=1)
    trueClasses = ground.argmax(axis=1)

    # [0.1, 0.3 , 0.2, 0.1] -> [0, 1, 0, 0]
    # Both encodings are built with exactly NUM_CLASSES columns, so they always line up,
    # even when the highest class index never occurs in the predictions or in the labels.
    oneHot = np.eye(NUM_CLASSES)
    discretePredictions = oneHot[predictedClasses]
    discreteGround = oneHot[trueClasses]

    truePositives = np.sum(discretePredictions * discreteGround, axis=0)
    falsePositives = np.sum(np.clip(discretePredictions - discreteGround, 0, 1), axis=0)
    falseNegatives = np.sum(np.clip(discreteGround - discretePredictions, 0, 1), axis=0)

    print("True Positives per class : ", truePositives)
    print("False Positives per class : ", falsePositives)
    print("False Negatives per class : ", falseNegatives)

    # ------------- Macro level calculation ---------------
    macroPrecision = 0
    macroRecall = 0
    # We ignore the "Others" class during the calculation of Precision, Recall and F1
    for c in range(1, NUM_CLASSES):
        precision = _safeDivide(truePositives[c], truePositives[c] + falsePositives[c])
        macroPrecision += precision
        recall = _safeDivide(truePositives[c], truePositives[c] + falseNegatives[c])
        macroRecall += recall
        f1 = _safeDivide(2 * recall * precision, precision + recall)
        print("Class %s : Precision : %.3f, Recall : %.3f, F1 : %.3f" % (label2emotion[c], precision, recall, f1))

    # Average over the classes that were actually summed (every class except "others").
    numEmotionClasses = NUM_CLASSES - 1
    macroPrecision = _safeDivide(macroPrecision, numEmotionClasses)
    macroRecall = _safeDivide(macroRecall, numEmotionClasses)
    macroF1 = _safeDivide(2 * macroRecall * macroPrecision, macroPrecision + macroRecall)
    print("Ignoring the Others class, Macro Precision : %.4f, Macro Recall : %.4f, Macro F1 : %.4f" % (macroPrecision, macroRecall, macroF1))

    # ------------- Micro level calculation ---------------
    truePositives = truePositives[1:].sum()
    falsePositives = falsePositives[1:].sum()
    falseNegatives = falseNegatives[1:].sum()

    print("Ignoring the Others class, Micro TP : %d, FP : %d, FN : %d" % (truePositives, falsePositives, falseNegatives))

    microPrecision = _safeDivide(truePositives, truePositives + falsePositives)
    microRecall = _safeDivide(truePositives, truePositives + falseNegatives)
    microF1 = _safeDivide(2 * microRecall * microPrecision, microPrecision + microRecall)
    # -----------------------------------------------------

    accuracy = np.mean(predictedClasses == trueClasses)

    print("Accuracy : %.4f, Micro Precision : %.4f, Micro Recall : %.4f, Micro F1 : %.4f" % (accuracy, microPrecision, microRecall, microF1))
    return accuracy, microPrecision, microRecall, microF1

In [8]:
# Score the test-split predictions against their one-hot labels; getMetrics also prints
# the per-class and aggregate figures shown below.
accuracy, microPrecision, microRecall, microF1 = getMetrics(np.asarray(predictions_test), testLabels_cat)


True Positives per class :  [4403.  204.  204.  239.]
False Positives per class :  [166.  91.  93. 109.]
False Negatives per class :  [274.  80.  46.  59.]
Class happy : Precision : 0.692, Recall : 0.718, F1 : 0.705
Class sad : Precision : 0.687, Recall : 0.816, F1 : 0.746
Class angry : Precision : 0.687, Recall : 0.802, F1 : 0.740
Ignoring the Others class, Macro Precision : 0.6884, Macro Recall : 0.7788, Macro F1 : 0.7308
Ignoring the Others class, Micro TP : 647, FP : 293, FN : 185
Accuracy : 0.9167, Micro Precision : 0.6883, Micro Recall : 0.7776, Micro F1 : 0.7302


In [9]:
# Same evaluation for the validation split. Note that this overwrites the four result
# variables assigned by the test-split evaluation.
accuracy, microPrecision, microRecall, microF1 = getMetrics(np.asarray(predictions_val), validationLabels_cat)


True Positives per class :  [2213.  105.   99.  127.]
False Positives per class :  [80. 54. 24. 53.]
False Negatives per class :  [125.  37.  26.  23.]
Class happy : Precision : 0.660, Recall : 0.739, F1 : 0.698
Class sad : Precision : 0.805, Recall : 0.792, F1 : 0.798
Class angry : Precision : 0.706, Recall : 0.847, F1 : 0.770
Ignoring the Others class, Macro Precision : 0.7236, Macro Recall : 0.7927, Macro F1 : 0.7566
Ignoring the Others class, Micro TP : 331, FP : 131, FN : 86
Accuracy : 0.9234, Micro Precision : 0.7165, Micro Recall : 0.7938, Micro F1 : 0.7531


In [10]:
# Predicted class index (position of the highest score) for every test row.
y_p = np.asarray(predictions_test).argmax(axis=1)

In [11]:
# True integer class labels of the test rows, as returned by preprocessData.
y_t = np.asarray(testLabels)

In [12]:
# Confusion matrix of the test split: y_t is passed as the true labels and y_p as the
# predictions (rows = true class, columns = predicted class in scikit-learn's convention).
from sklearn.metrics import confusion_matrix
confusion_matrix(y_t, y_p)

array([[4403,   91,   85,   98],
       [  77,  204,    1,    2],
       [  37,    0,  204,    9],
       [  52,    0,    7,  239]])

In [13]:
# Re-read the raw test file as a DataFrame and attach the model output to every row.
testData = pd.read_csv(testDataPath, sep='\t')
discretePredictions = to_categorical(predictions_test.argmax(axis=1))

# Name of the highest-scoring class per row.
pred_emotion = [label2emotion[np.argmax(pred)] for pred in predictions_test]
# Per-row class scores, each formatted as a two-decimal string.
probabilities = [['%.2f' % elem for elem in pred] for pred in predictions_test]

testData['predictions'] = pred_emotion
testData['probabilities'] = probabilities

In [14]:
# Raise pandas' row display limit to 6000 — presumably so the frames shown further down
# are printed in full rather than truncated.
pd.set_option('display.max_rows', 6000)
def avg_fault(col):
    """Return half of the gap between the two largest values in ``col``.

    Input:
        col : Sequence of at least two numbers or numeric strings (e.g. '0.70')
    Output:
        Half the difference between the largest and the second largest value
    Raises:
        IndexError : When ``col`` holds fewer than two values
    """
    # Convert before sorting: sorting the raw strings would order them lexicographically
    # ('9.00' > '10.00'), which is only right by accident for same-width values.
    ranked = sorted((float(value) for value in col), reverse=True)
    return (ranked[0] - ranked[1]) / 2

def certainty(col):
    """Return the sum of all values in ``col`` after converting each one to float."""
    return sum(float(value) for value in col)

def softmax_(col):
    """Cast every value of ``col`` to float16 and return the softmax of the result.

    NOTE(review): the callers pass the '%.2f'-formatted model outputs. If those are
    already probabilities, this applies a second softmax and flattens them
    (e.g. [1, 0, 0, 0] -> ~[0.475, 0.175, 0.175, 0.175]) — confirm this is intended.
    """
    halfPrecision = [np.float16(value) for value in col]
    return softmax(halfPrecision)
    
# Derived per-row columns, all computed from the formatted 'probabilities' lists:
#   topTwo           - half the gap between the two highest scores (avg_fault)
#   otherProbability - sum of all scores (certainty)
#   Probs            - softmax_ of the scores
testData['topTwo'] = testData['probabilities'].apply(avg_fault)
testData['otherProbability'] = testData['probabilities'].apply(certainty)
testData['Probs'] = testData['probabilities'].apply(softmax_)

# Spread the four 'Probs' values of every row over one column per class.
testData[['others','happy','sad','angry']] = pd.DataFrame(
    testData['Probs'].tolist(),
    index=testData.index,
)

In [16]:
# Rows predicted as 'angry' whose true label is something else (false positives for
# 'angry'), shown with the three turns and the per-class columns.
testData[(testData.label!='angry')&(testData.predictions=='angry')][['turn1','turn2','turn3','label','others','happy','sad','angry']]
#testData[(testData.label==testData.predictions)][['turn1','turn2','turn3','label','others','happy','sad','angry']]

Unnamed: 0,turn1,turn2,turn3,label,others,happy,sad,angry
2,Yes,How so?,I want to fuck babu,others,0.266602,0.165039,0.166626,0.401855
388,Ok... No problem,"ok i hope what you stay ok, be safe:)",Fuck off,others,0.22937,0.163208,0.163208,0.443848
427,Chatting with you,what are you chatting? Who invited you ?,A stupid bot,others,0.176147,0.174438,0.174438,0.474121
521,I asked seelfi7,what?,Selfish,others,0.241455,0.175293,0.177002,0.406006
634,Can you teach Java?,define teach me,Are you stupid?,others,0.17749,0.174072,0.174072,0.473389
668,But I hate basket ball I love football,you forgot to say 'fantasy',Okk fantasy my lord 😒,others,0.19751,0.171753,0.182251,0.448486
682,You are super parson,I second that notion.,"Stop , comments",others,0.229736,0.165161,0.165161,0.440186
709,No I'm not going to stop as I told you,you just did,Ummmmaaaah,others,0.341309,0.15332,0.15332,0.351807
716,I have more friends. I don't need you anymore.,better be talking about me,I don't need you anymore.,others,0.301514,0.158813,0.168579,0.371582
818,I am in your heart,Thank you!!! 😭❤️,Where are you nothing any message and reply an...,others,0.284668,0.157715,0.157715,0.399902


In [18]:
# Run the extra file 'testing.txt' through the same pipeline as the other splits:
# parse (with labels), encode with the already fitted tokenizer, pad, one-hot the labels.
(testingIndices, testingTexts, testingLabels,
 u1_testing, u2_testing, u3_testing, smil_testing) = preprocessData('testing.txt', mode="train")

u1_testingSequences = tokenizer.texts_to_sequences(u1_testing)
u2_testingSequences = tokenizer.texts_to_sequences(u2_testing)
u3_testingSequences = tokenizer.texts_to_sequences(u3_testing)
smil_testingSeq = tokenizer.texts_to_sequences(smil_testing)

# From here on the *_testing names hold the padded index arrays, not the raw texts.
u1_testing = pad_sequences(u1_testingSequences, maxlen=MAX_SEQUENCE_LENGTH)
u2_testing = pad_sequences(u2_testingSequences, maxlen=MAX_SEQUENCE_LENGTH)
u3_testing = pad_sequences(u3_testingSequences, maxlen=MAX_SEQUENCE_LENGTH)
smil_testing = pad_sequences(smil_testingSeq, maxlen=20)
testinglabels_cat = to_categorical(np.asarray(testingLabels))


In [19]:
# Load the saved model again and score the 'testing.txt' conversations.
model = load_model('EP100_LR100e-5_LDim128_BS200.h5')
predictions_testing = model.predict(
    [u1_testing, u2_testing, u3_testing, smil_testing],
    batch_size=BATCH_SIZE,
)

In [20]:
# Read 'testing.txt' as a DataFrame and attach the model output to every row.
testingData = pd.read_csv('testing.txt', sep='\t')
discretePredictions = to_categorical(predictions_testing.argmax(axis=1))

# Name of the highest-scoring class per row.
pred_emotion = [label2emotion[np.argmax(pred)] for pred in predictions_testing]
# Per-row class scores, each formatted as a two-decimal string.
probabilities = [['%.2f' % elem for elem in pred] for pred in predictions_testing]

testingData['predictions'] = pred_emotion
testingData['probabilities'] = probabilities

In [25]:
# avg_fault, certainty and softmax_ are already defined earlier in this notebook (in the
# cell that builds the testData probability columns), and 'display.max_rows' is already
# raised there. Redefining them here only shadowed identical definitions, and two of the
# three were not even used in this cell, so the cell now simply reuses softmax_.
testingData['Probs'] = testingData['probabilities'].apply(softmax_)

# Spread the four 'Probs' values of every row over one column per class.
testingData[['others','happy','sad','angry']] = pd.DataFrame(testingData.Probs.values.tolist(), index= testingData.index)

In [26]:
# Show the turns, the true label and the per-class columns for every 'testing.txt' row.
testingData[['turn1','turn2','turn3','label','others','happy','sad','angry']]

Unnamed: 0,turn1,turn2,turn3,label,others,happy,sad,angry
0,You are such a good friend,Thanks!,😁,happy,0.221191,0.445312,0.167114,0.167114
1,Why don’t you ever write me back?,Cause you’re a creep.,I know... I am... 😔,sad,0.247925,0.193115,0.344971,0.213379
2,I don’t get it… why can’t I ever get a date?,🙃 too ugly?,Screw you!,angry,0.192017,0.175537,0.175537,0.458252
3,Too much pollution in the world,Or too many people…,Or both…,others,0.475098,0.174805,0.174805,0.174805
4,You are such a good friend,Thanks!,I really fucking mean it! 😁,happy,0.264404,0.165283,0.158813,0.410889
5,Why don’t you ever write me back?,Cause you’re a creep.,I know... I am...,sad,0.445801,0.177734,0.177734,0.198486
6,I don’t get it… why can’t I ever get a date?,🙃 too ugly?,Screw you! 😭,angry,0.190308,0.188477,0.419434,0.202148
7,Too much pollution in the world 😭,Or too many people…,Or both…,others,0.294434,0.173462,0.360107,0.173462
