# Imports

In [None]:
import gzip
import json
import matplotlib.pyplot as plt
import numpy as np
import gensim.downloader
import copy
import warnings
import nltk
nltk.download('punkt')
#warnings.filterwarnings('ignore') #comment out to see warnings
from sklearn import tree
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.neural_network import MLPClassifier
from nltk.tokenize import word_tokenize

# Load File

In [None]:
# Read the gzip-compressed JSON dataset. The context manager guarantees the
# file handle is closed even if reading fails (the original never closed it).
with gzip.open('goemotions.json.gz', 'rb') as f:
    jsonFile = f.read()
y = json.loads(jsonFile)  # Parse the JSON payload into nested Python lists
y = np.array(y)  # Convert to a NumPy array so columns can be sliced with y[:, k]

# Generate Pie Charts

In [None]:
# Columns 1 and 2 of the dataset hold the emotion and sentiment labels
emotions = y[:, 1]
sentiments = y[:, 2]

# Tally how often each distinct emotion / sentiment label occurs
eValues, eCounts = np.unique(emotions, return_counts=True)
sValues, sCounts = np.unique(sentiments, return_counts=True)

# Emotion distribution: draw, display, then export to PDF
efig = plt.figure(figsize=(6, 6))
plt.pie(eCounts, labels=eValues)
plt.show()
efig.savefig('emotions.pdf', dpi=efig.dpi)

# Sentiment distribution: same steps on a fresh figure
sfig = plt.figure(figsize=(6, 6))
plt.pie(sCounts, labels=sValues)
plt.show()
sfig.savefig('sentiments.pdf', dpi=sfig.dpi)

# Process Dataset

In [None]:
# Column 0 of the dataset holds the raw post text
corpus = y[:, 0]
# Learn the vocabulary and build the sparse word-count matrix in one pass
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(corpus)
# Report the vocabulary size (one entry per distinct token/feature)
print(len(vectorizer.vocabulary_))

# Process Dataset (NO STOP WORDS VERSION)

In [None]:
#Set REMOVE_STOP_WORDS to True to re-process the dataset with English stop
#words removed (part 2.5). This replaces the previous unterminated string
#literal, which disabled the cell only by making it a syntax error.
REMOVE_STOP_WORDS = False
if REMOVE_STOP_WORDS:
    corpus = y[:,0]
    #Transform corpus to word count sparse matrix, ignoring English stop words
    vectorizer = CountVectorizer(stop_words='english')
    x = vectorizer.fit_transform(corpus)
    #Print the number of words
    print(len(vectorizer.get_feature_names_out()))

# Split Dataset

In [None]:
#80/20 train/test split (seed 1) applied in lockstep to the raw posts, the
#count matrix and both label vectors, so that rows stay aligned across all four
(
    corpus_nonvector_train, corpus_nonvector_test,
    corpus_train, corpus_test,
    emotions_train, emotions_test,
    sentiments_train, sentiments_test,
) = train_test_split(
    corpus, x, emotions, sentiments,
    test_size=0.2,
    random_state=1,
)

# Base-MNB

In [None]:
#Create and train model (MultinomialNB defaults, i.e. alpha=1)
classifierMNBemotions = MultinomialNB()
modelMNBemotions = classifierMNBemotions.fit(corpus_train, emotions_train)
predictMNBemotions = modelMNBemotions.predict(corpus_test)

#Create confusion matrix and metrics report (built once, reused for console and file)
confusionMatrixMNBemotions = confusion_matrix(emotions_test, predictMNBemotions)
reportMNBemotions = classification_report(emotions_test, predictMNBemotions, zero_division=0)
print(reportMNBemotions)

#Write everything to file; the context manager closes it even if a write fails
with open("Base_MNB_emotions.txt", "w") as f:
    f.write("Base Multinomial Naive Bayes for emotions with alpha=1 \n")
    f.write(np.array2string(confusionMatrixMNBemotions))
    f.write("\n")
    f.write(reportMNBemotions)

In [None]:
#Create and train model (MultinomialNB defaults, i.e. alpha=1)
classifierMNBsentiments = MultinomialNB()
modelMNBsentiments = classifierMNBsentiments.fit(corpus_train, sentiments_train)
predictMNBsentiments = modelMNBsentiments.predict(corpus_test)

#Create confusion matrix and metrics report (built once, reused for console and file)
confusionMatrixMNBsentiments = confusion_matrix(sentiments_test, predictMNBsentiments)
reportMNBsentiments = classification_report(sentiments_test, predictMNBsentiments, zero_division=0)
print(reportMNBsentiments)

#Write everything to file; the context manager closes it even if a write fails
with open("Base_MNB_sentiments.txt", "w") as f:
    f.write("Base Multinomial Naive Bayes for sentiments with alpha=1 \n")
    f.write(np.array2string(confusionMatrixMNBsentiments))
    f.write("\n")
    f.write(reportMNBsentiments)

# Top-MNB

In [None]:
#Grid-search the smoothing parameter alpha for the emotions classifier
#NOTE(review): alpha=0 disables smoothing; how scikit-learn handles it may
#depend on the installed version - confirm the warnings are acceptable
classifierTopMNBemotions = MultinomialNB()
TMNBE = GridSearchCV(
    estimator=classifierTopMNBemotions,
    param_grid={'alpha': [0, 0.1, 0.5, 1]},
    verbose=1,
)
TMNBE.fit(corpus_train, emotions_train)
#Show the best alpha found by the search
print(TMNBE.best_params_)

In [None]:
#Predict with the best estimator found by the grid search
predictTopMNBemotions = TMNBE.predict(corpus_test)

#Create confusion matrix and metrics report (built once, reused for console and file)
confusionMatrixTopMNBemotions = confusion_matrix(emotions_test, predictTopMNBemotions)
reportTopMNBemotions = classification_report(emotions_test, predictTopMNBemotions, zero_division=0)
print(reportTopMNBemotions)

#Write everything to file; the context manager closes it even if a write fails
with open("Top_MNB_emotions.txt", "w") as f:
    f.write("Top Multinomial Naive Bayes (using GridSearch) for emotions with the following parameters: \n")
    f.write(str(TMNBE.best_params_))
    f.write("\n")
    f.write(np.array2string(confusionMatrixTopMNBemotions))
    f.write("\n")
    f.write(reportTopMNBemotions)

In [None]:
#Grid-search the smoothing parameter alpha for the sentiments classifier
#NOTE(review): alpha=0 disables smoothing; how scikit-learn handles it may
#depend on the installed version - confirm the warnings are acceptable
classifierTopMNBsentiments = MultinomialNB()
TMNBS = GridSearchCV(
    estimator=classifierTopMNBsentiments,
    param_grid={'alpha': [0, 0.1, 0.5, 1]},
    verbose=1,
)
TMNBS.fit(corpus_train, sentiments_train)
#Show the best alpha found by the search
print(TMNBS.best_params_)

In [None]:
#Predict with the best estimator found by the grid search
predictTopMNBsentiments = TMNBS.predict(corpus_test)

#Create confusion matrix and metrics report (built once, reused for console and file)
confusionMatrixTopMNBsentiments = confusion_matrix(sentiments_test, predictTopMNBsentiments)
reportTopMNBsentiments = classification_report(sentiments_test, predictTopMNBsentiments, zero_division=0)
print(reportTopMNBsentiments)

#Write everything to file; the context manager closes it even if a write fails
with open("Top_MNB_sentiments.txt", "w") as f:
    f.write("Top Multinomial Naive Bayes (using GridSearch) for sentiments with the following parameters: \n")
    f.write(str(TMNBS.best_params_))
    f.write("\n")
    f.write(np.array2string(confusionMatrixTopMNBsentiments))
    f.write("\n")
    f.write(reportTopMNBsentiments)

# Base-DT

In [None]:
#Create and train model (DecisionTreeClassifier with default parameters)
classifierDTemotions = tree.DecisionTreeClassifier()
modelDTemotions = classifierDTemotions.fit(corpus_train, emotions_train)
predictDTemotions = modelDTemotions.predict(corpus_test)

#Create confusion matrix and metrics report (built once, reused for console and file)
confusionMatrixDTemotions = confusion_matrix(emotions_test, predictDTemotions)
reportDTemotions = classification_report(emotions_test, predictDTemotions, zero_division=0)
print(reportDTemotions)

#Write everything to file; the context manager closes it even if a write fails.
#The header previously claimed "alpha=1", which was copied from the Naive Bayes
#cell; this tree is built with default parameters.
with open("Base_DT_emotions.txt", "w") as f:
    f.write("Base Decision Tree for emotions with default parameters \n")
    f.write(np.array2string(confusionMatrixDTemotions))
    f.write("\n")
    f.write(reportDTemotions)

In [None]:
#Create and train model (DecisionTreeClassifier with default parameters)
classifierDTsentiments = tree.DecisionTreeClassifier()
modelDTsentiments = classifierDTsentiments.fit(corpus_train, sentiments_train)
predictDTsentiments = modelDTsentiments.predict(corpus_test)

#Create confusion matrix and metrics report (built once, reused for console and file)
confusionMatrixDTsentiments = confusion_matrix(sentiments_test, predictDTsentiments)
reportDTsentiments = classification_report(sentiments_test, predictDTsentiments, zero_division=0)
print(reportDTsentiments)

#Write everything to file; the context manager closes it even if a write fails.
#The header previously claimed "alpha=1", which was copied from the Naive Bayes
#cell; this tree is built with default parameters.
with open("Base_DT_sentiments.txt", "w") as f:
    f.write("Base Decision Tree for sentiments with default parameters \n")
    f.write(np.array2string(confusionMatrixDTsentiments))
    f.write("\n")
    f.write(reportDTsentiments)

# Top-DT

In [None]:
#Hyper-parameter grid searched for the decision tree
parametersDT = {'criterion':('entropy', 'gini'), 'max_depth':[40, 120], 'min_samples_split':[40, 80, 120]}
#NOTE: these names are also assigned by the Base-DT emotions cell, so running
#this cell rebinds them to the grid-search objects
classifierDTemotions = GridSearchCV(tree.DecisionTreeClassifier(), parametersDT)

modelDTemotions = classifierDTemotions.fit(corpus_train, emotions_train)
predictDTemotions = modelDTemotions.predict(corpus_test)

#Create confusion matrix and metrics report (built once, reused for console and file)
confusionMatrixDTemotions = confusion_matrix(emotions_test, predictDTemotions)
reportTopDTemotions = classification_report(emotions_test, predictDTemotions, zero_division=0)
print(reportTopDTemotions)

#Write everything to file; the context manager closes it even if a write fails
#(header typo "with with" corrected)
with open("Top_DT_emotions.txt", "w") as f:
    f.write("Top Decision Tree for emotions with the following parameters: \n")
    f.write(str(modelDTemotions.best_params_))
    f.write("\n")
    f.write(np.array2string(confusionMatrixDTemotions))
    f.write("\n")
    f.write(reportTopDTemotions)

In [None]:
#Hyper-parameter grid searched for the decision tree
parametersDT = {'criterion':('entropy', 'gini'), 'max_depth':[40, 120], 'min_samples_split':[40, 80, 120]}
#NOTE: these names are also assigned by the Base-DT sentiments cell, so running
#this cell rebinds them to the grid-search objects
classifierDTsentiments = GridSearchCV(tree.DecisionTreeClassifier(), parametersDT)

modelDTsentiments = classifierDTsentiments.fit(corpus_train, sentiments_train)
predictDTsentiments = modelDTsentiments.predict(corpus_test)

#Create confusion matrix and metrics report (built once, reused for console and file)
confusionMatrixDTsentiments = confusion_matrix(sentiments_test, predictDTsentiments)
reportTopDTsentiments = classification_report(sentiments_test, predictDTsentiments, zero_division=0)
print(reportTopDTsentiments)

#Write everything to file; the context manager closes it even if a write fails
with open("Top_DT_sentiments.txt", "w") as f:
    f.write("Top Decision Tree for sentiments with the following parameters: \n")
    f.write(str(modelDTsentiments.best_params_))
    f.write("\n")
    f.write(np.array2string(confusionMatrixDTsentiments))
    f.write("\n")
    f.write(reportTopDTsentiments)

# Base-MLP

In [None]:
#Create and train model (MLPClassifier with default parameters)
classifierMLPemotions = MLPClassifier()
modelMLPemotions = classifierMLPemotions.fit(corpus_train, emotions_train)
predictMLPemotions = modelMLPemotions.predict(corpus_test)

#Create confusion matrix and metrics report (built once, reused for console and file)
confusionMatrixMLPemotions = confusion_matrix(emotions_test, predictMLPemotions)
reportMLPemotions = classification_report(emotions_test, predictMLPemotions, zero_division=0)
print(reportMLPemotions)

#Write everything to file; the context manager closes it even if a write fails
with open("Base_MLP_emotions.txt", "w") as f:
    f.write("Base Multi-Layered Perceptron for emotions with default parameters \n")
    f.write(np.array2string(confusionMatrixMLPemotions))
    f.write("\n")
    f.write(reportMLPemotions)

In [None]:
#Create and train model (MLPClassifier with default parameters)
classifierMLPsentiments = MLPClassifier()
modelMLPsentiments = classifierMLPsentiments.fit(corpus_train, sentiments_train)
predictMLPsentiments = modelMLPsentiments.predict(corpus_test)

#Create confusion matrix and metrics report (built once, reused for console and file)
confusionMatrixMLPsentiments = confusion_matrix(sentiments_test, predictMLPsentiments)
reportMLPsentiments = classification_report(sentiments_test, predictMLPsentiments, zero_division=0)
print(reportMLPsentiments)

#Write everything to file; the context manager closes it even if a write fails
with open("Base_MLP_sentiments.txt", "w") as f:
    f.write("Base Multi-Layered Perceptron for sentiments with default parameters \n")
    f.write(np.array2string(confusionMatrixMLPsentiments))
    f.write("\n")
    f.write(reportMLPsentiments)

# Top-MLP

In [None]:
# Hyper-parameter grid explored by the Top-MLP grid searches:
# four activation functions x two hidden-layer layouts x two solvers
parametersMLP = dict(
    activation=('logistic', 'tanh', 'relu', 'identity'),
    hidden_layer_sizes=((30, 50), (10, 10, 10)),
    solver=('adam', 'sgd'),
)


In [None]:
#Grid-search the MLP hyper-parameters on the emotions labels, then predict
#with the best estimator found
topMLPemotions = GridSearchCV(MLPClassifier(), parametersMLP)
topMLPemotions.fit(corpus_train, emotions_train)
predictTopMLPemotions = topMLPemotions.predict(corpus_test)

#Create confusion matrix and metrics report (built once, reused for console and file)
confusionMatrixTopMLPemotions = confusion_matrix(emotions_test, predictTopMLPemotions)
reportTopMLPemotions = classification_report(emotions_test, predictTopMLPemotions, zero_division=0)
print(reportTopMLPemotions)

#Write everything to file; the context manager closes it even if a write fails
with open("Top_MLP_emotions.txt", "w") as f:
    f.write("Top Multi-Layered Perceptron for emotions with the following parameters: \n")
    f.write(str(topMLPemotions.best_params_))  #selected hyper-parameters
    f.write("\n")
    f.write(np.array2string(confusionMatrixTopMLPemotions))
    f.write("\n")
    f.write(reportTopMLPemotions)

In [None]:
#Grid-search the MLP hyper-parameters on the sentiments labels, then predict
#with the best estimator found
topMLPsentiments = GridSearchCV(MLPClassifier(), parametersMLP)
topMLPsentiments.fit(corpus_train, sentiments_train)
predictTopMLPsentiments = topMLPsentiments.predict(corpus_test)

#Create confusion matrix and metrics report (built once, reused for console and file)
confusionMatrixTopMLPsentiments = confusion_matrix(sentiments_test, predictTopMLPsentiments)
reportTopMLPsentiments = classification_report(sentiments_test, predictTopMLPsentiments, zero_division=0)
print(reportTopMLPsentiments)

#Write everything to file; the context manager closes it even if a write fails
with open("Top_MLP_sentiments.txt", "w") as f:
    f.write("Top Multi-Layered Perceptron for sentiments with the following parameters: \n")
    f.write(str(topMLPsentiments.best_params_))  #selected hyper-parameters
    f.write("\n")
    f.write(np.array2string(confusionMatrixTopMLPsentiments))
    f.write("\n")
    f.write(reportTopMLPsentiments)

# Embeddings

In [None]:
#Load the pre-trained embedding model used by the cells below.
#To explore a different model, replace the name with one of:
#  "fasttext-wiki-news-subwords-300"
#  "glove-twitter-200"
embeddings = gensim.downloader.load(name="word2vec-google-news-300")

# Tokenize

In [None]:
#Tokenize each training post; the result is one list of tokens per post
trainTokens = [word_tokenize(post) for post in corpus_nonvector_train]

#Same for the test posts
testTokens = [word_tokenize(post) for post in corpus_nonvector_test]

In [None]:
#Total the per-post token counts of each split, then print the overall number
traintokencount = sum(len(tokens) for tokens in trainTokens)
testtokencount = sum(len(tokens) for tokens in testTokens)
print(traintokencount+testtokencount)

# Embedding of each post - Hits and Misses

In [None]:
#Count the training tokens that have no vector in the embedding model
trainmissesCount = sum(
    1
    for tokens in trainTokens
    for token in tokens
    if token not in embeddings
)
print("%d hits and %d misses - %2.2f%% hit-rate for the training set" % (traintokencount-trainmissesCount, trainmissesCount, (1-trainmissesCount/traintokencount)*100))

#Same count for the test tokens, followed by the combined statistics
testmissesCount = sum(
    1
    for tokens in testTokens
    for token in tokens
    if token not in embeddings
)
print("%d hits and %d misses - %2.2f%% hit-rate for the test set" % (testtokencount-testmissesCount, testmissesCount, (1-testmissesCount/testtokencount)*100))
print("%d hits and %d misses - %2.2f%% hit-rate overall" % (traintokencount-trainmissesCount+testtokencount-testmissesCount, trainmissesCount+testmissesCount, (1-((trainmissesCount+testmissesCount)/(traintokencount+testtokencount)))*100))

# Embedding of each post

In [None]:
#Define embedding function - converts a 2D array of tokens to their embeddings
def embed(arr, model=None):
    """Replace every known token in *arr* with its embedding vector.

    Args:
        arr: Sequence of posts, each post being a sequence of tokens.
        model: Mapping-like object supporting ``token in model`` and
            ``model[token]``. Defaults to the module-level ``embeddings``
            model, so existing ``embed(arr)`` calls behave as before.

    Returns:
        A list with one inner list per post, holding the vectors of the
        tokens found in the model, in their original order. Tokens missing
        from the model are skipped, so an inner list may be empty.
    """
    if model is None:
        #Fall back to the globally loaded model (looked up at call time)
        model = embeddings
    return [[model[token] for token in tokens if token in model] for tokens in arr]

#Embed the tokens of the training set (one list of vectors per post)
embeddedPostsTrain = embed(trainTokens)

#Embed the tokens of the test set
embeddedPostsTest = embed(testTokens)

In [None]:
#Clean up the embedded vectors (remove posts with no words appearing in the model)
#as well as the corresponding entries in the y vectors.
#A boolean mask is used instead of repeated np.delete calls: np.delete first
#converts the ragged list of posts to an array (an error on recent NumPy
#versions) and copies everything on every deletion.
#NOTE: run this cell once per embedding pass; the masks are sized from the
#current embeddedPosts* lists and must match the length of the label vectors.

#True for every post that kept at least one embedded token
trainKeep = np.array([len(post) > 0 for post in embeddedPostsTrain], dtype=bool)
testKeep = np.array([len(post) > 0 for post in embeddedPostsTest], dtype=bool)

#Filter the training posts and labels with the same mask so rows stay aligned.
#Boolean indexing returns new arrays, so the original label vectors are untouched.
embeddedPostsTrain = [post for post, keep in zip(embeddedPostsTrain, trainKeep) if keep]
embedEmotionsTrain = np.asarray(emotions_train)[trainKeep]
embedSentimentsTrain = np.asarray(sentiments_train)[trainKeep]

#Repeat for the test sets
embeddedPostsTest = [post for post, keep in zip(embeddedPostsTest, testKeep) if keep]
embedEmotionsTest = np.asarray(emotions_test)[testKeep]
embedSentimentsTest = np.asarray(sentiments_test)[testKeep]

# Compute average embedding

In [None]:
#Average the token embeddings of each post in the training set
#(element-wise sum of the post's vectors divided by the number of vectors)
embeddedTrain = [sum(vectors)/len(vectors) for vectors in embeddedPostsTrain]

#Average the token embeddings of each post in the test set
embeddedTest = [sum(vectors)/len(vectors) for vectors in embeddedPostsTest]

# Base-MLP: Embeddings

In [None]:
#Create and train model on the averaged post embeddings (default parameters)
embedBaseMLPEmotions = MLPClassifier().fit(embeddedTrain, embedEmotionsTrain)
embedBaseMLPEmotions_pred = embedBaseMLPEmotions.predict(embeddedTest)

#Create confusion matrix and metrics report (built once, reused for console and file)
embedBaseMLPemotions_matrix = confusion_matrix(embedEmotionsTest, embedBaseMLPEmotions_pred)
embedBaseMLPEmotions_report = classification_report(embedEmotionsTest, embedBaseMLPEmotions_pred, zero_division=0)
print(embedBaseMLPEmotions_report)

#Write everything to file; the context manager closes it even if a write fails
#(header typo "defaut" corrected)
with open("Base_MLP_emotions-Embeddings(word2vec).txt", "w") as f:
    f.write("Base Multi-Layered Perceptron for emotions from the embedded Reddit posts with default parameters \n")
    f.write(np.array2string(embedBaseMLPemotions_matrix))
    f.write("\n")
    f.write(embedBaseMLPEmotions_report)

In [None]:
#Create and train model on the averaged post embeddings (default parameters)
embedBaseMLPSentiments = MLPClassifier().fit(embeddedTrain, embedSentimentsTrain)
embedBaseMLPSentiments_pred = embedBaseMLPSentiments.predict(embeddedTest)

#Create confusion matrix and metrics report (built once, reused for console and file)
embedBaseMLPSentiments_matrix = confusion_matrix(embedSentimentsTest, embedBaseMLPSentiments_pred)
embedBaseMLPSentiments_report = classification_report(embedSentimentsTest, embedBaseMLPSentiments_pred, zero_division=0)
print(embedBaseMLPSentiments_report)

#Write everything to file; the context manager closes it even if a write fails
with open("Base_MLP_sentiments-Embeddings(word2vec).txt", "w") as f:
    f.write("Base Multi-Layered Perceptron for sentiments from the embedded Reddit posts with default parameters \n")
    f.write(np.array2string(embedBaseMLPSentiments_matrix))
    f.write("\n")
    f.write(embedBaseMLPSentiments_report)

# Top-MLP: Embeddings

In [None]:
# Hyper-parameter grid for the embeddings-based Top-MLP grid searches:
# four activation functions x two hidden-layer layouts x two solvers
parametersMLP = dict(
    activation=('logistic', 'tanh', 'relu', 'identity'),
    hidden_layer_sizes=((30, 50), (10, 10, 10)),
    solver=('adam', 'sgd'),
)

In [None]:
#Grid-search the MLP (3-fold CV, scored by accuracy) on the averaged post
#embeddings, then predict with the best estimator found
embedTopMLPEmotions = GridSearchCV(MLPClassifier(), parametersMLP, cv=3, scoring='accuracy').fit(embeddedTrain, embedEmotionsTrain)
embedTopMLPEmotions_pred = embedTopMLPEmotions.predict(embeddedTest)

#Create confusion matrix and metrics report (built once, reused for console and file)
embedTopMLPEmotions_matrix = confusion_matrix(embedEmotionsTest, embedTopMLPEmotions_pred)
embedTopMLPEmotions_report = classification_report(embedEmotionsTest, embedTopMLPEmotions_pred, zero_division=0)
print(embedTopMLPEmotions_report)

#Write everything to file; the context manager closes it even if a write fails
with open("Top_MLP_emotions-Embeddings(word2vec).txt", "w") as f:
    f.write("Top Multi-Layered Perceptron for emotions from the embedded Reddit posts with the following parameters: \n")
    f.write(str(embedTopMLPEmotions.best_params_))
    f.write("\n")
    f.write(np.array2string(embedTopMLPEmotions_matrix))
    f.write("\n")
    f.write(embedTopMLPEmotions_report)

In [None]:
#Grid-search the MLP (3-fold CV, scored by accuracy) on the averaged post
#embeddings, then predict with the best estimator found
embedTopMLPSentiments = GridSearchCV(MLPClassifier(), parametersMLP, cv=3, scoring='accuracy').fit(embeddedTrain, embedSentimentsTrain)
embedTopMLPSentiments_pred = embedTopMLPSentiments.predict(embeddedTest)

#Create confusion matrix and metrics report (built once, reused for console and file)
embedTopMLPSentiments_matrix = confusion_matrix(embedSentimentsTest, embedTopMLPSentiments_pred)
embedTopMLPSentiments_report = classification_report(embedSentimentsTest, embedTopMLPSentiments_pred, zero_division=0)
print(embedTopMLPSentiments_report)

#Write everything to file; the context manager closes it even if a write fails
with open("Top_MLP_sentiments-Embeddings(word2vec).txt", "w") as f:
    f.write("Top Multi-Layered Perceptron for sentiments from the embedded Reddit posts with the following parameters: \n")
    f.write(str(embedTopMLPSentiments.best_params_))
    f.write("\n")
    f.write(np.array2string(embedTopMLPSentiments_matrix))
    f.write("\n")
    f.write(embedTopMLPSentiments_report)