In [16]:
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.simplefilter("ignore") 

from matplotlib import test
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from collections import Counter

import json
import matplotlib.pyplot as plt
import gzip
import pandas as pd
import copy
import os
import numpy as np
from pretty_confusion_matrix import pp_matrix_from_data

# Part 1

In [2]:
sentiments = []
emotions = []
comments = []

commentsTrainVector = None
commentsTestVector = None

In [3]:
#1.2
emotionsGZIP = gzip.open("./goemotions.json.gz", "rb")
emotionsJSON = json.load(emotionsGZIP)

In [4]:
def createPieChart(dict, dictName):
    labels = []
    values = []
    for x,y in dict.items():
        labels.append(x)
        values.append(y)
    plt.pie(values, labels=labels, autopct=lambda p:f'{p:.2f}%, {p*sum(values)/100 :.0f}')
    plt.savefig(fname='./graphs/'+dictName+'.pdf', format='pdf')
    plt.clf()
    

In [5]:
#1.3
for value in emotionsJSON:
        emotions.append(value[1])
        sentiments.append(value[2])
        comments.append(value[0])

createPieChart(Counter(emotions), 'emotions_with_values')
createPieChart(Counter(sentiments), 'sentiments_with_values')

<Figure size 640x480 with 0 Axes>

# Part 2

In [6]:
#2.1
vectorizer = CountVectorizer()
X  = vectorizer.fit(comments)
print("Vocabulary size: ", len(X.vocabulary_))

Vocabulary size:  30449


In [7]:
#2.2
comments_train, comments_test, sentiments_train, sentiments_test, emotions_train, emotions_test = train_test_split(comments, sentiments, emotions, test_size=0.2, random_state=0)

## 2.3

In [8]:
#2.3
vectorizer = CountVectorizer()
commentsTrainVector = vectorizer.fit_transform(comments_train)
commentsTestVector = vectorizer.transform(comments_test)


def getBaseClassifiersPredictions(classifier, commentsTrainVector, commentsTestVector, sentiments_train, emotions_train):

    #emotions
    classifier.fit(commentsTrainVector, emotions_train)
    emotions_classifier = copy.deepcopy(classifier)
    emotions_pred = classifier.predict(commentsTestVector)
    print(emotions_pred)

    #sentiments
    classifier.fit(commentsTrainVector, sentiments_train)
    sentiments_classifier = copy.deepcopy(classifier)
    sentiments_pred = classifier.predict(commentsTestVector)
    print(sentiments_pred)

    return emotions_pred, sentiments_pred, emotions_classifier, sentiments_classifier

def getGridSearchWithModelAndParams(model, params, cvCount, jobs, commentsTrainVector, commentsTestVector, sentiments_train, emotions_train):
    #Setup GridSearch and hyperparams
    tunedClassifier = GridSearchCV(model, params, cv=cvCount, n_jobs=jobs)
    hyperparams = list(params.keys())
    hyperparamsWithParams = ['param_' + paramInList for paramInList in hyperparams]
    
    #emotions
    tunedClassifier.fit(commentsTrainVector, emotions_train)
    emotions_tunedClassifier = copy.deepcopy(tunedClassifier)
    emotions_cv_results = tunedClassifier.cv_results_
    df = pd.DataFrame(emotions_cv_results)
    
    emotions_pred = tunedClassifier.predict(commentsTestVector)
    
    print(df[hyperparamsWithParams])
    print(tunedClassifier.best_score_)
    print(tunedClassifier.best_params_)
    print(tunedClassifier.predict(commentsTestVector))

    #sentiments
    tunedClassifier.fit(commentsTrainVector, sentiments_train)
    sentiments_tunedClassifier = copy.deepcopy(tunedClassifier)
    sentiments_cv_results = tunedClassifier.cv_results_
    df = pd.DataFrame(sentiments_cv_results)

    sentiments_pred = tunedClassifier.predict(commentsTestVector)
    
    print(df[hyperparamsWithParams])
    print(tunedClassifier.best_score_)
    print(tunedClassifier.best_params_)
    print(tunedClassifier.predict(commentsTestVector))

    return emotions_pred, sentiments_pred, emotions_tunedClassifier, sentiments_tunedClassifier

In [9]:
#2.3.1
emotions_baseMNB_pred, sentiments_baseMNB_pred, emotions_baseMNB_classifier, sentiments_baseMNB_classifier = getBaseClassifiersPredictions(MultinomialNB(), commentsTrainVector, commentsTestVector, sentiments_train, emotions_train)

['neutral' 'neutral' 'neutral' ... 'neutral' 'neutral' 'neutral']
['neutral' 'neutral' 'neutral' ... 'neutral' 'neutral' 'neutral']


In [19]:
#2.3.2
# emotions_baseDT_pred, sentiment_baseDT_pred, emotions_baseDT_classifier, sentiments_baseDT_classifier = getBaseClassifiersPredictions(DecisionTreeClassifier(), commentsTrainVector, commentsTestVector, sentiments_train, emotions_train)

In [20]:
#2.3.3
# emotions_baseMLP_pred, sentiment_baseMLP_pred, emotions_baseMLP_classifier, sentiments_baseMLP_classifier = getBaseClassifiersPredictions(MLPClassifier(max_iter=2), commentsTrainVector, commentsTestVector, sentiments_train, emotions_train)

In [21]:
 #2.3.4
mnb_classifier = MultinomialNB()
mnb_params = {
    'alpha': [0, 0.5, 1, 10]
    }
# emotions_topMNB_pred, sentiments_topMNB_pred, emotions_topMNB_tunedClassifier, sentiments_topMNB_tunedClassifier = getGridSearchWithModelAndParams(mnb_classifier, mnb_params, 10, 2, commentsTrainVector, commentsTestVector, sentiments_train, emotions_train)

In [22]:
#2.3.5
dt_classifier = DecisionTreeClassifier()
dt_params = {
    'criterion': ['gini', 'entropy'], # explain why gini or entropy https://quantdare.com/decision-trees-gini-vs-entropy/
    'max_depth': [2, 5],
    'min_samples_split': [1,2,3]
}
# emotions_topDT_pred, sentiments_topDT_pred, emotions_topDT_tunedClassifier, sentiments_topDT_tunedClassifier = getGridSearchWithModelAndParams(dt_classifier, dt_params, 5, 2, commentsTrainVector, commentsTestVector, sentiments_train, emotions_train)

In [13]:
#2.3.6
mlp_classifier = MLPClassifier(max_iter = 1)
mlp_params = {
    'activation': ['logistic', 'tanh', 'relu', 'identity'],
    'hidden_layer_sizes': [(10, 30), (5,5,5)],
    'solver': ['adam', 'sgd']
    }
# emotions_topMLP_pred, sentiments_topMLP_pred, emotions_topMLP_tunedClassifier, sentiments_topMLP_tunedClassifier = getGridSearchWithModelAndParams(mlp_classifier, mlp_params, 5, 2, commentsTrainVector, commentsTestVector, sentiments_train, emotions_train)

## 2.4

In [35]:
from sklearn.metrics import plot_confusion_matrix


def createNewFile(fName):
    i = 0
    fullFileName = "./precision/"+fName+"_%s.txt"
    while os.path.exists( fullFileName % i):
        i += 1

    fh = open(fullFileName % i, "w")
    return fh

def generateConfusionMatrix(clf, X_test, y_test):
    # confusionMatrix = confusion_matrix(y_test, y_pred)
    # # print(confusionMatrix.shape)
    # print(len(set(y_test)))
    # print(len(set(y_pred)))
    # # print(len(y_pred))
    # # print(pp_matrix_from_data(y_test, y_pred))
    # return confusionMatrix
    plot_confusion_matrix(clf, X_test, y_test)
    plt.show()


def createPrecisionReport(fName, modelName=None, hyperParams=None, emotions_pred=None, sentiments_pred=None, emotions_classifier=None, sentiments_classifier=None):
    file = createNewFile(fName)

    file.write('Model: \t'+modelName+'\n')
    file.write('HyperParams: \n')
    if hyperParams is not None:
        file.write(json.dumps(hyperParams, indent=4)+'\n')
    file.write('\n')
    file.write('Classifications: Emotions\n')
    file.write(np.array2string(generateConfusionMatrix(emotions_test, emotions_pred)))



    file.close()


# https://www.educative.io/answers/how-to-create-a-confusion-matrix-in-python-using-scikit-learn

In [36]:
#BaseMNB
generateConfusionMatrix(emotions_baseMNB_classifier, comments_test, emotions_test)
# createPrecisionReport("baseMNB", modelName = "base_MultinomialNB", hyperParams=mlp_params, emotions_pred = emotions_baseMNB_pred, sentiments_pred = sentiments_baseMNB_pred, emotions_classifier = emotions_baseMNB_classifier, sentiments_classifier = sentiments_baseMNB_classifier)

ValueError: Expected 2D array, got 1D array instead:
array=['Who would win: The Green Arrow or 1 Electric Boi'
 "Level 50 and I started playing a few days after launch. And I've been playing whenever I had free time to finish those daily orders."
 'So why not leave it in his garage with the doors locked?' ...
 'I don’t think she beat [NAME] but I def believe she hits this dude and [NAME].. none of this is news tho'
 'Growing up there were people in our church who hated the Smurfs because they were magic using communists.'
 'The photos are haunting'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.