In [None]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import thinc.extra.datasets
import pandas as pd
import spacy
from spacy.util import minibatch, compounding
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from sklearn.utils import class_weight
import numpy as np
from numpy import array, unique, array_equal
# Vocabulary cap shared by tokenize() (Tokenizer num_words) and train() (Embedding input size).
max_fatures = 1500
import matplotlib.pyplot as plt
import seaborn as sns
from keras import metrics
import keras
def load_data(path='emotion.xls.csv'):
    """Load the labelled sentences from a CSV file.

    Args:
        path: Location of the CSV file. It must contain a 'sentence' column
            and an 'emotion' column. Defaults to the original hard-coded
            file name so existing callers keep working.

    Returns:
        A tuple ``(sentences, emotions)`` where ``sentences`` is the
        'sentence' column as a pandas Series and ``emotions`` is the
        'emotion' column converted to a numpy array.
    """
    print("Loading Data...")
    primary_emotion = pd.read_csv(path)
    print("Data Loaded")
    return (primary_emotion['sentence'], array(primary_emotion['emotion']))

def sort_to_2_emotions(sentence_list, emotion_list):
    """Collapse the fine-grained emotion labels into "positive" / "negative".

    Sentences whose label belongs to neither group are dropped.

    Args:
        sentence_list: Iterable of sentences.
        emotion_list: Iterable of emotion labels, aligned with sentence_list.

    Returns:
        A tuple ``(sentences, polarities)`` of two equally long lists holding
        the kept sentences and their "positive"/"negative" label.
    """
    positive = {'Joy', 'Love', 'Optimism', 'Awe', 'Trust'}
    # Both 'Sadness' and 'Saddness' are accepted: the two sort_to_* helpers in
    # this file disagreed on the spelling, so one of them silently dropped
    # every such sentence depending on how the data set spells the label.
    negative = {'Anger', 'Disgust', 'Sadness', 'Saddness', 'Aggression',
                'Contempt', 'Disapproval', 'Remorse'}

    sorted_list = []
    sorted_emo = []
    for data, emo in zip(sentence_list, emotion_list):
        if emo in positive:
            polarity = "positive"
        elif emo in negative:
            polarity = "negative"
        else:
            # Label belongs to neither polarity group: skip the sentence.
            continue
        sorted_list.append(data)
        sorted_emo.append(polarity)
    return (sorted_list, sorted_emo)

# Below, the data is sorted into emotion groups. The groups come from the outer layer of the wheel,
# i.e. the combinations of two emotion groups (the "Remorse" group is currently commented out).
# The final group is "Ambiguous" and "Neutral" put together, labelled "Neutral".
def sort_to_9_emotions(sentence_list, emotion_list):
    """Re-label sentences with the combined emotion groups they belong to.

    Each group is made of its own label plus two neighbouring labels, so a
    single input label can belong to two groups; in that case the sentence is
    emitted once per matching group. Labels that match no group are dropped.

    Args:
        sentence_list: Iterable of sentences.
        emotion_list: Iterable of emotion labels, aligned with sentence_list.

    Returns:
        A tuple ``(sentences, groups)`` of two equally long lists.
    """
    # Order matters: it fixes the order in which duplicated sentences are
    # appended. 'Sadness' is accepted next to the misspelt 'Saddness' so this
    # helper agrees with sort_to_2_emotions whichever spelling the data uses.
    # NOTE: the "Remorse" group (Remorse / Sadness / Disgust) is intentionally
    # left out, as it was disabled in the original code.
    groups = (
        ("Contempt", {'Contempt', 'Anger', 'Disgust'}),
        ("Aggression", {'Aggression', 'Anticipation', 'Anger'}),
        ("Optimism", {'Optimism', 'Anticipation', 'Joy'}),
        ("Love", {'Love', 'Joy', 'Trust'}),
        ("Submission", {'Submission', 'Trust', 'Fear'}),
        ("Awe", {'Awe', 'Fear', 'Surprise'}),
        ("Disapproval", {'Disapproval', 'Surprise', 'Saddness', 'Sadness'}),
        ("Neutral", {'Neutral', 'Ambiguous'}),
    )

    sorted_list = []
    sorted_emo = []
    for data, emo in zip(sentence_list, emotion_list):
        for group_name, members in groups:
            if emo in members:
                sorted_list.append(data)
                sorted_emo.append(group_name)
    return (sorted_list, sorted_emo)



def tokenize(sentences):
    """Turn raw sentences into padded integer sequences.

    A Keras ``Tokenizer`` limited to ``max_fatures`` words is fitted on the
    sentences, which are then converted to index sequences and passed through
    ``pad_sequences`` with its default settings.

    Args:
        sentences: Iterable of text strings.

    Returns:
        The output of ``pad_sequences`` for the tokenized sentences.
    """
    print("Tokenizing...")
    word_indexer = Tokenizer(num_words=max_fatures, split=' ')
    word_indexer.fit_on_texts(sentences)
    padded = pad_sequences(word_indexer.texts_to_sequences(sentences))
    print("Tokenization Completed")
    return padded



def train(tokened_sentences, emotion_list, weigh_even=False):
    """Build, train and evaluate an LSTM emotion classifier.

    Args:
        tokened_sentences: 2-D array of padded token ids (its second
            dimension is used as the embedding input length).
        emotion_list: Labels aligned with the rows of tokened_sentences.
        weigh_even: When True, classes are weighted inversely to their
            frequency in the training split.

    Returns:
        A tuple ``(model, X_test)``: the fitted model and the held-out
        token rows that were not used for training.
    """
    print("Training...")
    embed_dim = 128
    lstm_out = 196
    test_percent = 0.1
    epo = 1000
    batch_size = 128

    model = Sequential()
    model.add(Embedding(max_fatures, embed_dim, input_length=tokened_sentences.shape[1]))
    model.add(SpatialDropout1D(0.4))
    model.add(LSTM(lstm_out, dropout=0.35, recurrent_dropout=0.35))

    # One output unit per distinct label. The targets are one-hot, single-label
    # vectors, so the layer must emit a probability distribution (softmax) and
    # be trained with categorical cross-entropy; the previous softplus +
    # binary cross-entropy pairing does not produce probabilities.
    model.add(Dense(unique(emotion_list).size, activation='softmax'))

    model.compile(
        loss='categorical_crossentropy',
        optimizer=keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999,
                                        epsilon=None, decay=0.1, amsgrad=True),
        metrics=[metrics.categorical_accuracy])
    #print(model.summary())

    # get_dummies orders its columns by sorted label, which matches the order
    # of unique(emotion_list) used elsewhere to map indices back to labels.
    # Cast explicitly because recent pandas versions return boolean dummies.
    Y = pd.get_dummies(emotion_list).values.astype('float32')
    X_train, X_test, Y_train, Y_test = train_test_split(
        tokened_sentences, Y, test_size=test_percent, random_state=42)

    if weigh_even:
        y_integers = np.argmax(Y_train, axis=1)
        # Only classes that actually occur in the training split may be passed
        # to compute_class_weight; Keras expects a {class_index: weight} dict.
        present_classes = np.unique(y_integers)
        balanced = class_weight.compute_class_weight(
            'balanced', classes=present_classes, y=y_integers)
        class_weights = dict(zip(present_classes.tolist(), balanced.tolist()))
        model.fit(X_train, Y_train, epochs=epo, batch_size=batch_size, verbose=1,
                  class_weight=class_weights, validation_split=0.1)
    else:
        model.fit(X_train, Y_train, epochs=epo, batch_size=batch_size, verbose=1,
                  validation_split=0.1)

    print("Training Completed")
    print("Testing Against Control... (% of the data) ", test_percent)
    score, acc = model.evaluate(X_test, Y_test, verbose=2, batch_size=batch_size)
    print("Score   :", score)
    print("Accuracy:", acc)
    return (model, X_test)

### When splitting data (train_test_split), we don't retain where in the
### original set the data is located, thus it takes
### a little trickery to see the results while only testing against untrained data.
### The top_predictions variable changes how many predictions are given.
def test(model, X_test, tokenized_data, sentence_list, emotion_list, top_predictions=3):
    """Print the top predictions for every sentence that is in the held-out set.

    The whole data set is run through the model, but only rows whose token
    sequence also appears in X_test are reported.

    Args:
        model: Object exposing ``predict(data, batch_size=...)`` that returns
            one score vector per row.
        X_test: Held-out token rows (as returned by ``train``).
        tokenized_data: Token rows for the full data set.
        sentence_list: Sentences aligned with tokenized_data.
        emotion_list: Labels aligned with tokenized_data.
        top_predictions: How many of the highest-scoring labels to print per
            sentence (capped at the number of model outputs).

    Returns:
        The number of reported sentences whose top prediction differs from
        the actual label.
    """
    predictions = model.predict(tokenized_data, batch_size=32)
    labels = unique(emotion_list)

    # Hash the held-out rows once instead of scanning X_test for every row.
    held_out = {tuple(row) for row in np.asarray(X_test).tolist()}
    token_rows = np.asarray(tokenized_data).tolist()

    error = 0
    for i, scores in enumerate(predictions):
        if tuple(token_rows[i]) not in held_out:
            continue
        print("\n\n")

        # Highest score first; the stable sort keeps the lowest index on ties.
        # Ranking a negated copy leaves the prediction array itself untouched.
        ranked = np.argsort(-np.asarray(scores), kind='stable')
        if labels[ranked[0]] != emotion_list[i]:
            error += 1

        print(sentence_list[i])

        for rank, pos in enumerate(ranked[:top_predictions], start=1):
            print("\n # {} Predicted emotion : ".format(rank), labels[pos])
        print("Actual emotion   : ", emotion_list[i])
    return error
    
def count_errors(model, X_test, tokenized_data, sentence_list, emotion_list):
    """Build a confusion matrix over the held-out rows.

    Args:
        model: Object exposing ``predict(data, batch_size=...)`` that returns
            one score vector per row.
        X_test: Held-out token rows; only rows of tokenized_data that also
            appear here are counted.
        tokenized_data: Token rows for the full data set.
        sentence_list: Unused; kept for signature compatibility.
        emotion_list: Labels aligned with tokenized_data.

    Returns:
        A square integer numpy array where entry ``[a][p]`` counts held-out
        rows whose actual label has index ``a`` and whose predicted label has
        index ``p`` (indices follow the sorted order of unique labels).
    """
    labels = unique(emotion_list)
    label_index = {label: idx for idx, label in enumerate(labels.tolist())}
    dims = len(labels)
    # Plain ``int`` instead of ``np.int``, which was removed from NumPy.
    mistake_list = np.zeros((dims, dims), dtype=int)

    predictions = model.predict(tokenized_data, batch_size=32)

    # Hash the held-out rows once instead of scanning X_test for every row.
    held_out = {tuple(row) for row in np.asarray(X_test).tolist()}
    token_rows = np.asarray(tokenized_data).tolist()

    for i, scores in enumerate(predictions):
        if tuple(token_rows[i]) in held_out:
            predicted = int(np.argmax(scores))
            mistake_list[label_index[emotion_list[i]]][predicted] += 1

    print(mistake_list)
    return mistake_list


def graph_errors(mistake_list, emotion_list):
    """Print and plot per-emotion correct / false-positive / false-negative counts.

    Args:
        mistake_list: Square confusion matrix as produced by ``count_errors``:
            rows are actual labels, columns are predicted labels.
        emotion_list: Labels of the data set; their sorted unique values name
            the rows, and their counts (integer-divided by 15) are plotted
            alongside for scale.
    """
    confusion = np.asarray(mistake_list)

    correct = np.diagonal(confusion)
    # Row i holds the samples that ARE class i, so its off-diagonal total is
    # the number of class-i samples that were missed (false negatives).
    # Column i holds the samples PREDICTED as class i, so its off-diagonal
    # total is the number wrongly assigned to class i (false positives).
    # The previous implementation had these two labels swapped.
    false_negatives = confusion.sum(axis=1) - correct
    false_positives = confusion.sum(axis=0) - correct

    labels, counts = np.unique(emotion_list, return_counts=True)

    df = pd.DataFrame(
        {
            "Emotion": labels,
            # Integer division keeps the whole-number scaling used originally.
            "Amount of Data / 15": counts // 15,
            "Correct": correct,
            "False Positives": false_positives,
            "False Negatives": false_negatives,
        },
        columns=["Emotion", "Amount of Data / 15", "Correct",
                 "False Positives", "False Negatives"],
    )
    print(df)
    df.plot.bar(x='Emotion', figsize=(12, 6))



#################################
#############MAIN################
#################################
#   jupyter notebook is weird   #




# Driver: load -> (optionally) regroup labels -> tokenize -> train -> evaluate.
(sentence_list, emotion_list) = load_data()


### Comment out the line below to train on the original, ungrouped emotion labels.
### As written, it sorts the data into "positive" and "negative" and drops every other label.
sentence_list, emotion_list = sort_to_2_emotions(sentence_list, emotion_list)

tokenized_data = tokenize(sentence_list)

# train() also returns the held-out rows so the evaluation helpers can
# restrict themselves to data the model was not fitted on.
(model, X_test) = train(tokenized_data, emotion_list)

# Uncomment to print per-sentence predictions for the held-out rows.
#test(model, X_test, tokenized_data, sentence_list, emotion_list)

mistake_list = count_errors(model, X_test, tokenized_data, sentence_list, emotion_list)

graph_errors(mistake_list, emotion_list)



Loading Data...
Data Loaded
Tokenizing...
Tokenization Completed
Training...
Train on 1142 samples, validate on 127 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000

In [None]:
 # Re-draw the error chart from the mistake_list computed by the cell above.
 graph_errors(mistake_list, emotion_list)
