In [2]:
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.simplefilter("ignore") 

from matplotlib import test
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from collections import Counter

import json
import matplotlib.pyplot as plt
import gzip
import pandas as pd

In [3]:
emotionsGZIP = gzip.open("./goemotions.json.gz", "rb")
emotionsJSON = json.load(emotionsGZIP)

classification = []
emotions = []
comments = []

commentsTrainVector = None
commentsTestVector = None

In [4]:
def createPieChart(dict, dictName):
    labels = []
    values = []
    for x,y in dict.items():
        labels.append(x)
        values.append(y)
    plt.pie(values, labels=labels, autopct=lambda p:f'{p:.2f}%, {p*sum(values)/100 :.0f}')
    plt.savefig('./graphs/'+dictName)
    plt.clf()


def readData():
    for value in emotionsJSON:
        emotions.append(value[1])
        classification.append(value[2])
        comments.append(value[0])

In [5]:
def run_q1():
    createPieChart(Counter(emotions), 'emotions_with_values')
    createPieChart(Counter(classification), 'classification_with_values')

In [31]:
def getVocabulary():
    vectorizer = CountVectorizer()
    X  = vectorizer.fit(comments)
    print("Vocabulary size: ", len(X.vocabulary_))

def createTrainTestSplit():
    global comments_train
    global comments_test
    global classification_train
    global classification_test
    global emotions_train
    global emotions_test
    comments_train, comments_test, classification_train, classification_test, emotions_train, emotions_test = train_test_split(comments, classification, emotions, test_size=0.2, random_state=0)
    
    
def getBaseClassifiersPredictions(classifier, commentsTrainVector, commentsTestVector, classification_train, emotions_train):

    #emotions
    classifier.fit(commentsTrainVector, emotions_train)
    emotions_pred = classifier.predict(commentsTestVector)
    print(emotions_pred)

    #classifications
    classifier.fit(commentsTrainVector, classification_train)
    classifications_pred = classifier.predict(commentsTestVector)
    print(classifications_pred)

    return emotions_pred, classifications_pred

def getGridSearchWithModelAndParams(model, params, cvCount, iterations, commentsTrainVector, commentsTestVector, classification_train, emotions_train):
    tunedClassifier = GridSearchCV(model, params, cv=10, n_jobs=2)
    hyperparams = list(params.keys())
    hyperparamsWithParams = ['param_' + paramInList for paramInList in hyperparams]
    
    #emotions
    tunedClassifier.fit(commentsTrainVector, emotions_train)
    df = pd.DataFrame(tunedClassifier.cv_results_)
    
    emotions_pred = tunedClassifier.predict(commentsTestVector)
    
    print(df[hyperparamsWithParams])
    print(tunedClassifier.best_score_)
    print(tunedClassifier.best_params_)
    print(tunedClassifier.predict(commentsTestVector))

    #classifications
    tunedClassifier.fit(commentsTrainVector, classification_train)
    df = pd.DataFrame(tunedClassifier.cv_results_)

    classifications_pred = tunedClassifier.predict(commentsTestVector)
    
    print(df[hyperparamsWithParams])
    print(tunedClassifier.best_score_)
    print(tunedClassifier.best_params_)
    print(tunedClassifier.predict(commentsTestVector))

    return emotions_pred, classifications_pred




In [7]:
def baseMNB():
    getBaseClassifiersPredictions(MultinomialNB(), commentsTrainVector, commentsTestVector, classification_train, emotions_train)    

In [8]:
def baseDT():
    getBaseClassifiersPredictions(DecisionTreeClassifier(), commentsTrainVector, commentsTestVector, classification_train, emotions_train)

In [9]:
def baseMLP():
    getBaseClassifiersPredictions(MLPClassifier(), commentsTrainVector, commentsTestVector, classification_train, emotions_train)

In [13]:
def run_q2():
    #2.1
    getVocabulary()

    #2.2
    createTrainTestSplit()
    
    #2.3
    vectorizer = CountVectorizer()
    global commentsTrainVector
    global commentsTestVector

    commentsTrainVector = vectorizer.fit_transform(comments_train)
    commentsTestVector = vectorizer.transform(comments_test)

    #2.3.1
    # baseMNB()
    
    #2.3.2
    # baseDT()
    
    #2.3.3
    # baseMLP()

    #2.3.4
    mnb_classifier = MultinomialNB()
    mnb_params = {'alpha': [0, 0.5, 1, 10]}
    # getGridSearchWithModelAndParams(mnb_classifier, mnb_params, 10, 2, commentsTrainVector, commentsTestVector, classification_train, emotions_train)
    
    #2.3.5
    dt_classifier = DecisionTreeClassifier()
    dt_params = {
        'criterion': ['gini', 'entropy'], # explain why gini or entropy https://quantdare.com/decision-trees-gini-vs-entropy/
        'max_depth': [2, 5],
        'min_samples_split': [1,2,3]
    }
    getGridSearchWithModelAndParams(dt_classifier, dt_params, 5, 1, commentsTrainVector, commentsTestVector, classification_train, emotions_train)

    #2.3.6
    
    mlp_params = {
        'activation': ['logistic', 'tanh', 'relu', 'identity'],
        'hidden_layer_sizes': [(25, 45), (10,15,20)],
        'solver': ['adam', 'sgd']
    }

In [32]:
#1.2
readData()
#1.3
# run_q1()
run_q2()

Vocabulary size:  30449
   param_criterion param_max_depth param_min_samples_split
0             gini               2                       1
1             gini               2                       2
2             gini               2                       3
3             gini               5                       1
4             gini               5                       2
5             gini               5                       3
6          entropy               2                       1
7          entropy               2                       2
8          entropy               2                       3
9          entropy               5                       1
10         entropy               5                       2
11         entropy               5                       3
0.37798640966087393
{'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 2}
['neutral' 'neutral' 'neutral' ... 'neutral' 'neutral' 'neutral']
   param_criterion param_max_depth param_min_samples_split
0 