In [None]:
import numpy  as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn           as sns

import re

import nltk
from   nltk.stem   import WordNetLemmatizer
from   nltk.corpus import stopwords

from   sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

from   sklearn.ensemble        import RandomForestClassifier
from   sklearn.tree            import DecisionTreeClassifier
from   sklearn.neural_network  import MLPClassifier

from   sklearn.feature_extraction.text import TfidfVectorizer

from   sklearn.metrics import classification_report, accuracy_score, hamming_loss, multilabel_confusion_matrix

import pickle

In [None]:
# CONFIG
# Tunable settings for the whole notebook; the cells below only read these names.

# pandas display limits: None removes the cap on rendered columns / rows.
# NOTE(review): with max_rows unset, displaying a full frame by accident renders every row.
pd.options.display.max_columns = None
pd.options.display.max_rows    = None


# fraction of the dataset used for training; the remainder becomes the test split
TRAIN_SIZE       = 0.95

# TfidfVectorizer settings (passed as ngram_range, smooth_idf and sublinear_tf)
VEC_NGRAM_RANGE  = (1, 3)
VEC_SMOOTH_IDF   = True
VEC_SUBLINEAR_TF = True


# RandomForestClassifier: number of trees (passed as n_estimators)
TRAIN_N_RFTREES  = 20


# DecisionTreeClassifier: split quality criterion (passed as criterion)
TRAIN_DECI_TREE_CRITERION = "gini"


# MLPClassifier: optimiser (passed as solver) and iteration cap (passed as max_iter)
TRAIN_MLPC_SOLVER = "adam"
TRAIN_MLPC_EPOCHS = 11

In [None]:
# load dataset "GoEmotions" : https://github.com/google-research/google-research/tree/master/goemotions/data/full_dataset
# https://arxiv.org/pdf/2005.00547.pdf

df1 = pd.read_csv("../../datasets/goemotions_1.csv")
df2 = pd.read_csv("../../datasets/goemotions_2.csv")
df3 = pd.read_csv("../../datasets/goemotions_3.csv")

# merge all the three segments into one pd.DataFrame
# ignore_index=True renumbers the rows 0..n-1; without it every segment keeps its
# own 0-based index and the merged frame ends up with duplicated row labels
df  = pd.concat([df1, df2, df3], ignore_index=True)

print("shape: ", df.shape)
df.tail()

In [None]:
# clean-up dataset: keep only the text and label columns of rows that raters could judge

df = (
    df
    # metadata columns that are not used as features or targets
    .drop(columns=["id", "author", "subreddit", "link_id", "parent_id", "created_utc", "rater_id"])
    # discard rows flagged as "very unclear"
    .loc[lambda frame: frame["example_very_unclear"] == False]
    # the flag itself is no longer needed once the rows are filtered
    .drop(columns=["example_very_unclear"])
)


print("shape: ", df.shape)
print("null values: ", df.isnull().sum().sum())

In [None]:
# data pre-processing

nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
# stored as a set: preprocess() tests every token for membership, and a set lookup
# is constant-time whereas scanning the original list is linear in its length
stop_words = set(stopwords.words("english"))


def preprocess(text):
    """Normalise one raw string before vectorisation.

    The text is lower-cased, URLs are replaced by the token `` URL``, ``@``
    mentions by `` USER`` and every remaining non-alphanumeric character by a
    space. Tokens found in the module-level ``stop_words`` are dropped and the
    rest are passed through the module-level ``lemmatizer``.

    Returns the surviving tokens, each followed by a single space (so a
    non-empty result ends with a trailing space).
    """
    # substitutions are applied in this order: urls, user tags, special chars
    substitutions = (
        (r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)", " URL"),
        (r"@[^\s]+", " USER"),
        (r"[^a-zA-Z0-9]", " "),
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # drop stopwords first, lemmatize whatever is left
    lemmas = (
        lemmatizer.lemmatize(word)
        for word in cleaned.split()
        if word not in stop_words
    )

    return "".join(lemma + ' ' for lemma in lemmas)




# normalise every text entry in place, then preview the last rows
df["text"] = df["text"].apply(preprocess)
df.tail()

In [None]:
# save the label names (every column after the first) for frontend use
# wrapping the column index in a list writes them as one comma-separated row

np.savetxt("../model/emotions.csv", [df.columns[1:]], fmt="%s", delimiter=",")

In [None]:
# split the dataset into training data (95%) and testing data (5%)

# The training fraction is passed directly. `1 - TRAIN_SIZE` evaluates to
# 0.050000000000000044 in floating point, and because the test size is rounded up
# that can make the test set one sample larger than the intended 5%.
X_train_text, X_test_text, y_train_target, y_test_target = train_test_split(
    df["text"],
    df[df.columns[1:]],
    train_size=TRAIN_SIZE,
    random_state=0,
)


print("Dataset Size:       ", len(df))
print("Training Data Size: ", len(X_train_text))
print("Testing Data Size:  ", len(X_test_text))

In [None]:
# vectorize
vectorizer = TfidfVectorizer(ngram_range=VEC_NGRAM_RANGE, smooth_idf=VEC_SMOOTH_IDF, sublinear_tf=VEC_SUBLINEAR_TF)


# fit on the training text and transform it in the same pass, so the training
# corpus is only tokenised once; the test text reuses the fitted vocabulary
X_train_text = vectorizer.fit_transform(X_train_text)
X_test_text  = vectorizer.transform(X_test_text)

In [None]:
ngrams_accepted = vectorizer.get_feature_names_out()
# NOTE(review): `stop_words_` does not appear to be available on every scikit-learn
# release, so fall back to an empty set instead of raising AttributeError.
ngrams_ignored  = getattr(vectorizer, "stop_words_", set())

print("Number of ngrams accepted (sizes: 1-3): ", len(ngrams_accepted))
print("Number of ngrams ignored  (sizes: 1-3): ", len(ngrams_ignored))

In [None]:
# save the vectorizer to disk

# the context manager closes the file even if pickling raises
with open("../model/vectorizer.pickle", "wb") as f:
    pickle.dump(vectorizer, f)

In [None]:
def examine_model(model):
    """Evaluate a fitted multi-label classifier on the held-out test split.

    Prints the classification report and the Hamming loss, then draws one 2x2
    confusion-matrix heatmap per label, annotated with each cell's share of
    that label's samples.

    Reads the notebook-level ``X_test_text``, ``y_test_target`` and ``df``
    (whose columns after the first are used as the label names).

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    """
    y_pred = model.predict(X_test_text)

    print(classification_report(y_test_target, y_pred, zero_division=0))
    print("Hamming Loss: ", round(hamming_loss(y_test_target, y_pred), 4))

    labels = df.columns[1:]
    mlcm = multilabel_confusion_matrix(y_test_target, y_pred)

    # Size the grid from the number of labels instead of hard-coding 7 x 4, so a
    # different label set neither overflows the grid nor leaves stray empty plots.
    n_cols = 4
    n_rows = max(1, -(-len(labels) // n_cols))  # ceiling division
    # squeeze=False keeps `axs` two-dimensional even when there is a single row
    fig, axs = plt.subplots(nrows=n_rows, ncols=n_cols, squeeze=False)
    plt.subplots_adjust(wspace=0.5, hspace=0.9)
    fig.set_figheight(16)
    fig.set_figwidth(16)

    axes = list(axs.flat)
    # order matches the flattened 2x2 matrix: row 0 = actual negative, row 1 = actual positive
    group_names = ["True Neg.", "False Pos.", "False Neg.", "True Pos."]

    for ax, label, matrix in zip(axes, labels, mlcm):
        group_percs = ["{0:.2%}".format(value) for value in matrix.flatten() / np.sum(matrix)]
        annots = [f"{v1}\n{v2}" for v1, v2 in zip(group_names, group_percs)]
        annots = np.asarray(annots).reshape(2, 2)

        sns.heatmap(data=matrix, annot=annots, fmt="", cmap="Blues", cbar=False, ax=ax)

        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        ax.title.set_text(label)

    # hide grid cells that have no label to show
    for ax in axes[len(labels):]:
        ax.set_visible(False)

    plt.show()

In [None]:
# TRAIN: Random Forest Classifier

# fixed seed (same value as the train/test split) so that re-running the notebook
# from a fresh kernel rebuilds the same forest
RFCModel = RandomForestClassifier(verbose=100, n_jobs=-1, n_estimators=TRAIN_N_RFTREES, random_state=0)
RFCModel.fit(X_train_text, y_train_target)

In [None]:
# examine Random Forest Classifier Model
# (prints the report and Hamming loss, then plots the per-label confusion matrices)

print("Random Forest Classifier Model")
examine_model(RFCModel)

In [None]:
# save the Random Forest Classifier Model to disk

# the context manager closes the file even if pickling raises
with open("../model/RFCModel.pickle", "wb") as f:
    pickle.dump(RFCModel, f)

In [None]:
# TRAIN: Decision Tree Classifier

# fixed seed (same value as the train/test split) so that re-running the notebook
# from a fresh kernel rebuilds the same tree
DTCModel = DecisionTreeClassifier(criterion=TRAIN_DECI_TREE_CRITERION, random_state=0)
DTCModel.fit(X_train_text, y_train_target)

In [None]:
# examine Decision Tree Classifier Model
# (prints the report and Hamming loss, then plots the per-label confusion matrices)

print("Decision Tree Classifier Model")
examine_model(DTCModel)

In [None]:
# save the Decision Tree Classifier Model to disk

# the context manager closes the file even if pickling raises
with open("../model/DTCModel.pickle", "wb") as f:
    pickle.dump(DTCModel, f)

In [None]:
# TRAIN: Neural Network - Multi-layer Perceptron Classifier

# fixed seed (same value as the train/test split) so that weight initialisation and
# batch shuffling are repeatable when the notebook is re-run from a fresh kernel
MLPCModel = MLPClassifier(solver=TRAIN_MLPC_SOLVER, max_iter=TRAIN_MLPC_EPOCHS, verbose=100, random_state=0)
MLPCModel.fit(X_train_text, y_train_target)

In [None]:
# examine MLP Classifier Model
# (prints the report and Hamming loss, then plots the per-label confusion matrices)

print("Neural Network: MLP Classifier Model")
examine_model(MLPCModel)

In [None]:
# save the MLP Classifier Model to disk

# the context manager closes the file even if pickling raises
with open("../model/MLPCModel.pickle", "wb") as f:
    pickle.dump(MLPCModel, f)

In [None]:
def _positive_label_probabilities(model, X):
    """Return an (n_samples, n_labels) array holding P(label present) for every sample."""
    proba = model.predict_proba(X)

    if isinstance(proba, list):
        # one (n_samples, n_classes) array per label; column 1 is the positive class
        return np.column_stack([np.asarray(per_label)[:, 1] for per_label in proba])

    # a single 2-D array is taken to be (n_samples, n_labels) already
    # NOTE(review): this is the layout expected from MLPClassifier on multi-label targets - confirm
    return np.asarray(proba)


def gen_output_df(query_strs, model):
    """Build a table of per-label probabilities (in percent) for raw query strings.

    Each query is run through ``preprocess`` and the notebook-level
    ``vectorizer`` before being scored by ``model``.

    Parameters
    ----------
    query_strs : sequence of str
        Raw input texts.
    model : fitted estimator exposing ``predict_proba``
        Either form of result is accepted: a list with one array per label, or a
        single (n_samples, n_labels) array.

    Returns
    -------
    pandas.DataFrame
        One row per query with the same columns as ``df``: the original text
        first, then one probability per label, rounded to two decimals. An
        empty input yields an empty frame with those columns.
    """
    query_strs = list(query_strs)

    # nothing to score: skip the vectorizer and model entirely
    if not query_strs:
        return pd.DataFrame(columns=df.columns)

    X = vectorizer.transform([preprocess(text) for text in query_strs])
    percentages = np.round(_positive_label_probabilities(model, X) * 100, 2)

    # build the whole frame at once instead of filling it cell by cell
    labels = df.columns[1:]
    output_df = pd.DataFrame(percentages, columns=labels[:percentages.shape[1]])
    output_df.insert(0, df.columns[0], query_strs)

    # labels the model produced no output for stay in the frame as NaN columns
    return output_df.reindex(columns=df.columns)




def vizualize_output_df(output_df):
    """Draw one bar chart of label probabilities per row of ``output_df``.

    Parameters
    ----------
    output_df : pandas.DataFrame
        A frame laid out like ``df``: the query text in the first column and
        one probability (in percent) per label in the remaining columns.

    The charts are stacked vertically and shown with ``plt.show()``; nothing is
    returned. An empty frame draws nothing.
    """
    n_queries = output_df.shape[0]

    # plt.subplots rejects a grid with zero rows, so there is nothing to draw
    if n_queries == 0:
        return

    # squeeze=False always returns a 2-D array of axes; with the default, a single
    # query yields a bare Axes object that has no `.flat`
    fig, axs = plt.subplots(nrows=n_queries, ncols=1, squeeze=False)
    plt.subplots_adjust(hspace=0.5)
    fig.set_figheight(8 * n_queries)
    fig.set_figwidth(8)

    text_column = df.columns[0]
    label_columns = df.columns[1:]

    for i in range(n_queries):
        axis = axs.flat[i]
        # positional access, so the frame does not need a 0..n-1 index
        row = output_df.iloc[i]

        p = sns.barplot(data=row[label_columns], ax=axis, color="dodgerblue")
        # label only the non-zero bars to keep the chart readable
        p.bar_label(p.containers[0], fmt=(lambda x: f"{round(x)}%" if x > 0 else ""), fontsize=6)

        axis.set_xlabel("LABELS")
        axis.set_ylabel("PROBABILITY (%)")
        axis.tick_params(axis='x', labelrotation=90)
        axis.set(ylim=(0, 100))
        # f-string formatting also copes with a non-string query value
        axis.title.set_text(f'string: "{row[text_column]}"\n')

    plt.show()

In [None]:
# at this stage, our model is ready. let's run some queries

# `query_strs` and `model` are reused by the predict_labels call in the last cell
query_strs = ["This cat looks very cute!", "I hate YouTube's ad. revenue model"]
model      = RFCModel

# one row per query: the original text followed by a probability (in percent) per label
output_df  = gen_output_df(query_strs, model)

# one bar chart per query
vizualize_output_df(output_df)

In [None]:
def predict_labels(query_strs, model):
    """Print every query string followed by the labels the model predicts for it.

    Queries are passed through ``preprocess`` and the notebook-level
    ``vectorizer``; label names are the columns of ``df`` after the first.
    Output format, one line per query: ``<query> ===> [<label>, ...]``.
    """
    cleaned = [preprocess(query) for query in query_strs]
    predictions = model.predict(vectorizer.transform(cleaned))
    label_names = df.columns[1:]

    for row, query in enumerate(query_strs):
        flags = predictions[row]
        # keep the names of the labels whose indicator is set
        predicted = [label_names[col] for col in range(len(flags)) if flags[col] == 1]

        print(query, predicted, sep=" ===> ")

In [None]:
# print the predicted emotion labels for the queries and model selected above
predict_labels(query_strs, model)