In [7]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import thinc.extra.datasets
import pandas as pd
import spacy
from spacy.util import minibatch, compounding
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
from numpy import array, unique, array_equal, zeros, int8, append
max_fatures = 1000


def load_data(path='emotion.xls.csv'):
    """Read the labelled emotion corpus from a CSV file.

    Args:
        path: Location of the CSV file to read. The file must provide a
            ``sentence`` column and an ``emotion`` column. The default is the
            file name this notebook has always used, so a bare ``load_data()``
            call behaves exactly as before.

    Returns:
        A ``(sentences, emotions)`` tuple: the ``sentence`` column as a pandas
        Series and the ``emotion`` column converted to a numpy array.

    Raises:
        KeyError: if the ``sentence`` or ``emotion`` column is missing.
    """
    print("Loading Data...")
    primary_emotion = pd.read_csv(path)
    print("Data Loaded")
    return (primary_emotion['sentence'], array(primary_emotion['emotion']))



# Lower-cased primary emotions that collapse into each polarity bucket.
_POSITIVE_EMOTIONS = frozenset(['joy', 'love', 'optimism', 'awe', 'trust'])
_NEGATIVE_EMOTIONS = frozenset(['anger', 'disgust', 'sadness', 'aggression',
                                'contempt', 'disapproval', 'remorse'])


def sort_to_2_emotions(sentence_list, emotion_list):
    """Collapse fine-grained emotion labels into "positive" / "negative".

    Sentences whose emotion belongs to neither bucket are dropped. Labels are
    compared case-insensitively: the previous version spelled every label
    capitalised except ``'anger'``, so one casing of that label was always
    discarded silently.

    Args:
        sentence_list: Iterable of sentences.
        emotion_list: Iterable of emotion labels, parallel to ``sentence_list``.

    Returns:
        A ``(sentences, polarities)`` tuple of two equally long Python lists,
        in the original order, containing only the sentences that were kept.
    """
    sorted_list = []
    sorted_emo = []
    for data, emo in zip(sentence_list, emotion_list):
        # str() keeps non-string labels (e.g. a missing value) from raising;
        # they simply match neither bucket and are skipped.
        key = str(emo).lower()
        if key in _POSITIVE_EMOTIONS:
            polarity = "positive"
        elif key in _NEGATIVE_EMOTIONS:
            polarity = "negative"
        else:
            continue
        sorted_list.append(data)
        sorted_emo.append(polarity)
    return (sorted_list, sorted_emo)



def tokenize(sentences):
    """Turn raw sentences into a padded matrix of word indices.

    A new Keras ``Tokenizer`` capped at ``max_fatures`` words and splitting on
    single spaces is fitted on ``sentences``; the same sentences are then
    converted to index sequences and padded with ``pad_sequences`` using its
    default settings.

    Args:
        sentences: The texts to fit the tokenizer on and to convert.

    Returns:
        The padded sequences produced by ``pad_sequences``.
    """
    print("Tokenizing...")
    word_indexer = Tokenizer(num_words=max_fatures, split=' ')
    word_indexer.fit_on_texts(sentences)
    padded = pad_sequences(word_indexer.texts_to_sequences(sentences))
    print("Tokenization Completed")
    return padded



def train(tokenized_sentences, emotion_list):
    """Build and fit an Embedding -> LSTM -> softmax emotion classifier.

    The network is trained for 10 epochs on everything it is given and is then
    scored on that same data. No hold-out split is made here; the caller is
    responsible for keeping test data out of ``tokenized_sentences``.

    Args:
        tokenized_sentences: 2-D array of padded word-index sequences; its
            second dimension is used as the embedding input length.
        emotion_list: Labels parallel to ``tokenized_sentences``.

    Returns:
        The fitted Keras ``Sequential`` model. It has one output unit per
        distinct label found in ``emotion_list``.
    """
    print("Training...")
    embed_dim = 128
    lstm_out = 196
    batch_size = 32

    model = Sequential()
    model.add(Embedding(max_fatures, embed_dim, input_length=tokenized_sentences.shape[1]))
    model.add(SpatialDropout1D(0.4))
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))

    # One softmax unit per distinct label present in the training labels.
    model.add(Dense(unique(emotion_list).size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # One-hot targets. pandas orders the dummy columns by sorted label, the
    # same order numpy.unique() returns, so output unit k corresponds to
    # unique(emotion_list)[k].
    Y = pd.get_dummies(emotion_list).values

    model.fit(tokenized_sentences, Y, epochs=10, batch_size=batch_size, verbose=1)
    print("Training Completed")

    # The score below is measured on the training data itself. The previous
    # message announced a held-out "control" percentage that was never split
    # off, which made the numbers look like test accuracy.
    print("Evaluating on the training data (no held-out split)...")
    score, acc = model.evaluate(tokenized_sentences, Y, verbose=1, batch_size=batch_size)
    print("Score   :", score)
    print("Accuracy:", acc)
    return model

### MISTAKE_LIST FORMAT:
### mistake_list[index of the ACTUAL emotion][index of the PREDICTED emotion] = number of such misclassifications
### (both indices are positions in unique(emotion_list))

def test(model, tokenized_data, sentence_list, emotion_list, mistake_list):
    predictions = model.predict(tokenized_data, batch_size=32)
    error = 0
    for i in range(len(predictions)):
     
        #  "If the current tokenized data array is in X_test (untrained tokenized arrays)
            pos = list(predictions[i]).index(max(predictions[i]))
            print("\n\n")
            print(sentence_list[i])
            print("\nPredicted emotion: ", unique(emotion_list)[pos])
            print("Actual emotion   : ", emotion_list[i])
            if (unique(emotion_list)[pos] != emotion_list[i]):
                mistake_list[list(unique(emotion_list)).index(emotion_list[i])][pos] += 1
    return mistake_list

    
    
#################################
#############MAIN################
#################################
#   jupyter notebook is weird   #




(sentence_list, emotion_list) = load_data()


### Uncomment the line below to collapse the labels into "positive" and "negative";
### leave it commented out to keep all 18 emotions.
#sentence_list, emotion_list = sort_to_2_emotions(sentence_list, emotion_list)

tokenized_data = tokenize(sentence_list)

# Confusion counts: one row (actual) and one column (predicted) per distinct
# label. A platform int is used instead of int8, which wraps around once a
# cell passes 127 mistakes.
n_labels = unique(emotion_list).size
mistake_list = zeros((n_labels, n_labels), dtype=int)

# Number of cross-validation folds. Every fold is held out exactly once; the
# previous loop stopped after four of the five folds, so the last fifth of
# the data was never evaluated.
N_FOLDS = 5
n_samples = len(tokenized_data)

# Plain list so that slicing below is purely positional.
all_sentences = list(sentence_list)

for i in range(N_FOLDS):
    testing_data_start = (i * n_samples) // N_FOLDS
    testing_data_end = ((i + 1) * n_samples) // N_FOLDS
    print("hello", testing_data_start)

    testing_data = tokenized_data[testing_data_start:testing_data_end]
    testing_emo = emotion_list[testing_data_start:testing_data_end]
    testing_sentences = all_sentences[testing_data_start:testing_data_end]

    training_data = array(list(tokenized_data[:testing_data_start]) + list(tokenized_data[testing_data_end:]))
    training_emo = array(list(emotion_list[:testing_data_start]) + list(emotion_list[testing_data_end:]))
    model = train(training_data, training_emo)

    # test() looks sentences and actual labels up by position within the fold,
    # but derives the label vocabulary from the distinct values of the label
    # sequence it receives. Passing the fold's labels first, followed by the
    # training labels, keeps row i aligned with prediction i while the
    # vocabulary still covers every label. (Passing the full, unsliced lists,
    # as before, compared every fold after the first against the wrong rows.)
    # NOTE(review): if a label occurs only inside the held-out fold, the model
    # has fewer outputs than there are labels and predicted indices shift --
    # confirm every label appears in each training split.
    fold_first_emo = array(list(testing_emo) + list(training_emo))
    mistake_list = test(model, testing_data, testing_sentences, fold_first_emo, mistake_list)


Loading Data...
Data Loaded
Tokenizing...
Tokenization Completed
hello 0
Training...
Epoch 1/10
  64/2020 [..............................] - ETA: 1:14 - loss: 2.8886 - acc: 0.0781

KeyboardInterrupt: 

In [6]:
from numpy import delete
#print(mistake_list)
def _descending_nonzero(counts):
    """Return the indices of the non-zero entries of ``counts``, largest first.

    Ties keep their original index order, which is the order obtained by
    repeatedly picking the first maximum and zeroing it.
    """
    nonzero = [k for k in range(len(counts)) if counts[k] != 0]
    return sorted(nonzero, key=lambda k: -int(counts[k]))


labels = unique(emotion_list)
n_rows = len(mistake_list)

# Reserved for per-emotion percentages (count * 100 / occurrences of the
# actual emotion); it is not filled in yet.
std_mistake_list = zeros((n_rows, n_rows), dtype=float)

print(mistake_list)

# Total number of mistakes per actual emotion (row sums). ndarray.sum()
# accumulates in a wide integer even if the matrix itself is int8.
sums = mistake_list.sum(axis=1)

# A real copy; a plain assignment would only alias ``sums``.
sums_copy = sums.copy()

# Emotions ranked by how often they were misclassified. Nothing is zeroed out
# while ranking, so ``sums`` and ``mistake_list`` survive and this cell can be
# re-run with the same result.
for k in _descending_nonzero(sums):
    print(labels[k], sums[k], )

# For each actual emotion, the wrongly predicted emotions with their counts,
# most frequent first.
# NOTE(review): consecutive "label & count" pairs are concatenated without a
# separator, as the original expression did -- confirm the intended format.
strings = []
for row in mistake_list:
    # str() replaces a .to_string() call that numpy integers do not have and
    # that raised AttributeError as soon as any count was non-zero.
    strings.append("".join(labels[k] + " & " + str(row[k]) for k in _descending_nonzero(row)))
print("hello")
print(strings)

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
hello
['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
