In [None]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import thinc.extra.datasets
import pandas as pd
import spacy
from spacy.util import minibatch, compounding
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
from numpy import array, unique, array_equal, zeros, int8, append
max_fatures = 4000


def load_data():
    print("Loading Data...")
    PrimaryEmotion = pd.read_csv('emotion.xls.csv')
    print("Data Loaded")
    return (PrimaryEmotion['sentence'], array(PrimaryEmotion['emotion']))



def sort_to_2_emotions(sentence_list, emotion_list):
    sorted_list = []
    sorted_emo = []
    for (data, emo) in zip(sentence_list, emotion_list):
        if (emo == 'Joy' or emo == 'Love' or emo == 'Optimism' or emo == 'Awe' or emo == 'Trust'):
            sorted_list.append(data)
            sorted_emo.append("positive")
        if (emo == 'Anger' or emo == 'Disgust' or emo == 'Sadness' or emo == 'Aggression' or emo == 'Contempt' or emo == 'Disapproval' or emo == 'Remorse'):
            sorted_list.append(data)
            sorted_emo.append("negative")
    return (sorted_list, sorted_emo)



def tokenize(sentences):
    print("Tokenizing...")
    tokenizer = Tokenizer(num_words=max_fatures, split=' ')
    tokenizer.fit_on_texts(sentences)
    X = tokenizer.texts_to_sequences(sentences)
    X = pad_sequences(X)
    print("Tokenization Completed")
    return X



def train(tokenized_sentences, emotion_list):
    embed_dim = 128
    lstm_out = 196
    test_percent=.20
    batch_size = 32

    model = Sequential()
    model.add(Embedding(max_fatures, embed_dim,input_length = tokenized_sentences.shape[1]))
    model.add(SpatialDropout1D(0.4))
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
    
    # len(set(emotion_list)) is a hacky way of geting the number of unique elements
    # in a regualar python list (non-numpy)
    model.add(Dense(unique(emotion_list).size,activation='softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
    #print(model.summary())

    Y = pd.get_dummies(emotion_list).values
    model.fit(tokenized_sentences, Y, epochs = 30, batch_size=batch_size, verbose = 1)
    #X_train, X_test, Y_train, Y_test = train_test_split(tokened_sentences,Y, test_size = test_percent, random_state = 42)
    #print(X_train.shape,Y_train.shape)
    #print(X_test.shape,Y_test.shape)
    return model

### MISTAKE_LIST FORMAT::
### MISTAKE_LIST [VALUES FOR WHICH MISTAKE WAS MADE][INDEX OF MISTAKES]

def test(model, tokenized_data, sentence_list, emotion_list, mistake_list):
    predictions = model.predict(tokenized_data, batch_size=32)
    error = 0
    for i in range(len(predictions)):
            pos = list(predictions[i]).index(max(predictions[i]))
            if (unique(emotion_list)[pos] != emotion_list[i]):
                mistake_list[list(unique(emotion_list)).index(emotion_list[i])][pos] += 1
    return mistake_list

    
    
#################################
#############MAIN################
#################################
#   jupyter notebook is weird   #




(sentence_list, emotion_list) = load_data()


### Comment the Below line for all 18 emotions. This sorts into "positive" and "negative"
#sentence_list, emotion_list = sort_to_2_emotions(sentence_list, emotion_list)

tokenized_data = tokenize(sentence_list)

mistake_list = zeros((18, 18), dtype=int8)



#breaks
print("Training may take a while. Please be patient")
for i in range(0,4):
    print('Starting training {} / 5 ...'.format(str(i+1)))
    testing_data_start = int((i*len(tokenized_data))/5)
    testing_data_end = int((((i+1)*len(tokenized_data))/5))
    
    testing_data = tokenized_data[testing_data_start:testing_data_end]
    testing_emo = emotion_list[testing_data_start:testing_data_end]
                       
    training_data = array(list(tokenized_data[:testing_data_start]) +  list( tokenized_data[testing_data_end:] ))
    training_emo = array(list(emotion_list[:testing_data_start]) + list(emotion_list[testing_data_end:]    ))
    model = train(training_data, training_emo)
    mistake_list = test(model, testing_data, sentence_list, emotion_list, mistake_list)
print("Training Complete!")


In [9]:
from numpy import delete, zeros

sums = zeros(18, dtype=int)

for i in range(len(mistake_list)):
    for j in range(len(mistake_list)):
        mistake_list[i][j] = mistake_list[i][j] * 100 / list(emotion_list).count(unique(emotion_list)[i]) 

print(mistake_list)

for i in range(len(mistake_list)):
    sums[i] = sum(mistake_list[i])


strings = []
for i in range(len(mistake_list)):
    temp_list = mistake_list[i]
    strings.append("\\textbf{")
    strings[i] +=  unique(emotion_list)[i] + "}&"
    for j in range(5):
        strings[i] += unique(emotion_list)[list(temp_list).index(max(temp_list))] + " & "  + str(max(temp_list)) + "&"
        temp_list[list(temp_list).index(max(temp_list))] = 0
    strings[i] += "\\\ \\hline "
print("hello")
h = "".join(strings)
print(h)

[[ 0  2  4  9  0  6  9  0  0  2  0 22  9  4  6  0  4  0]
 [ 0  0  7  6  0  6 17  1  6  8  0 38 11  0  4  3  5  1]
 [ 1  3  0  2  0  5 10  1  7  5  2 27  6  0  5  1  2  0]
 [ 0  2  4  0  0  1  5  0  4  7  1 14  8  0  1  0  1  0]
 [ 0  6  3  6  0  0  3  0  3  3  0 25  6  0  3  3  0  0]
 [ 0  3  6  7  0  0 13  1  7  4  0 27  4  1  2  0  2  0]
 [ 0  2  7  6  0  2  0  0  4  5  2 22 11  0  4  2  3  0]
 [ 2  2  6  2  0  2  9  0  4  9  2 23 16  0  6  0  4  0]
 [ 0  3  6  4  0  1 13  0  0  1  3 17  3  0  0  0  0  0]
 [ 1  2  5  4  0  1  8  0  3  0  1 20  5  0  0  2  2  0]
 [ 0  4  4  4  0  2 11  0  5  2  0 11 10  0  2  1  2  0]
 [ 0  2  6  5  0  4  7  0  5  3  1  0  7  0  2  2  2  0]
 [ 0  2  2  6  0  4  9  0  2  6  2 21  0  0  2  0  1  0]
 [ 0 11  6  9  0  4  6  2  0  2  0 31  6  0  6  2  0  0]
 [ 1  3  6  4  0  2 10  0  3  2  5 23 11  0  0  2  0  0]
 [ 0  3  6  5  0  3  3  0  6  6  1 16  2  0  2  0  1  0]
 [ 0  0  5  6  0  0 11  0  1  5  0 21  7  0  3  1  0  0]
 [ 3  0  0  3  0  3  0  0  0  3