### Traditional machine learning models

In [2]:
#imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import hamming_loss
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import json

Importing our datasets and setting the variables

In [3]:
# Read the three GO emotions datasets
df_train = pd.read_csv('./data/goemotions1.csv')
df_test = pd.read_csv('./data/goemotions2.csv')
df_dev = pd.read_csv('./data/goemotions3.csv')

# Combine the datasets into a single dataframe
df_combined = pd.concat([df_train, df_test, df_dev], ignore_index=True)

data cleaning and preprocessing

In [4]:
df_combined = df_combined.drop_duplicates()

# Drop observations where "example_very_unclear" is True these observations have no emotion label and therefore no predictive value
df_combined = df_combined[df_combined['example_very_unclear'] == False]
#remove duplicates in the text column
df_combined = df_combined.drop_duplicates(subset=['text'])
print(f"Number of observations {len(df_combined)}")

Number of observations 57730


Mapping ekman emotions to the corresponding GO emotion label

In [5]:
# Define the Ekman mapping as specified in the GO emotions dataset
# Load the Ekman mapping from a JSON file in the data folder
with open('./data/ekman_mapping.json', 'r') as file:
    ekman_mapping = json.load(file)

# Create a new DataFrame with Ekman categories, including the 'text' column
df_ekman = df_combined[['text']].copy()

# Populate the Ekman categories based on the mapping
for ekman_emotion, go_emotions in ekman_mapping.items():
    # Sum the columns for each GO emotion that maps to this Ekman emotion
    df_ekman[ekman_emotion] = df_combined[go_emotions].sum(axis=1).clip(upper=1)

Splitting the dataset into train and test sets

In [6]:
#split the df_ekman int 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(df_ekman['text'], df_ekman.drop(columns=['text']), test_size=0.2, random_state=42)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

# Fix the sample sizes to match (using 1000 for consistency)
X_train_small = X_train[:2500]
y_train_small = y_train[:2500]
X_test_small = X_test[:1000]
y_test_small = y_test[:1000]

# Print shapes of the small datasets
print(f"X_train_small shape: {X_train_small.shape}")
print(f"y_train_small shape: {y_train_small.shape}")
print(f"X_test_small shape: {X_test_small.shape}")
print(f"y_test_small shape: {y_test_small.shape}")

X_train shape: (46184,)
X_test shape: (11546,)
y_train shape: (46184, 7)
y_test shape: (11546, 7)
X_train_small shape: (2500,)
y_train_small shape: (2500, 7)
X_test_small shape: (1000,)
y_test_small shape: (1000, 7)


## Building the models

Logistic regression - making classifier and model hyperparameters tuning

In [21]:
# Create a pipeline with TF-IDF vectorization and MultiOutputClassifier
# Define the pipeline
ekman_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('classifier', MultiOutputClassifier(LogisticRegression()))
])

# Params for the grid search
param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf__max_features': [500, 1000, None],
    'classifier__estimator__C': [0.1, 1, 2, 3, 10],
    'classifier__estimator__solver': ['liblinear', 'lbfgs', 'saga'],
    'classifier__estimator__max_iter': [500, 1000, 2000]
}

#gridseach for best params on small dataset due to very long processing time on the training 
grid_search = GridSearchCV(estimator=ekman_pipeline, param_grid=param_grid, cv=3, scoring='f1_weighted', error_score='raise')
grid_search.fit(X_train_small, y_train_small)
print("Best parameters found: ", grid_search.best_params_)




Best parameters found:  {'classifier__estimator__C': 10, 'classifier__estimator__max_iter': 2000, 'classifier__estimator__solver': 'saga', 'tfidf__max_features': 1000, 'tfidf__ngram_range': (1, 1)}


Logistic regression - creation of tuned model and training

In [None]:
#converting data to tfidf with the best params from the grid search
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
    max_features=(1000)
)

# Fit the vectorizer on the training data and transform both train and test data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

lr_classifier = MultiOutputClassifier(LogisticRegression(
    C=10,
    solver='saga',
    max_iter=2000
))

# Fit the classifier on the training data
lr_classifier.fit(X_train_tfidf, y_train)

#make predictions
pred = lr_classifier.predict(X_test_tfidf)

Logistic regression model evaluation

In [25]:
#calculate accuracy and hamming loss
accuracy = accuracy_score(y_test, pred)
hamming = hamming_loss(y_test, pred)
print(f"Accuracy: {accuracy}")
print(f"Hamming Loss: {hamming}")

# Print classification report
lr_classification_report = classification_report(y_test, pred, target_names=['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise'], zero_division=0)
print(classification_report(y_test, pred, target_names=ekman_mapping.keys(), zero_division=0))


Accuracy: 0.31889831976442057
Hamming Loss: 0.12760139565959763
              precision    recall  f1-score   support

       anger       0.61      0.16      0.25      1588
     disgust       0.43      0.07      0.13       258
        fear       0.58      0.14      0.22       254
         joy       0.79      0.60      0.68      4728
     sadness       0.66      0.21      0.32      1010
    surprise       0.62      0.11      0.18      1608
     neutral       0.47      0.21      0.29      3144

   micro avg       0.69      0.33      0.45     12590
   macro avg       0.60      0.21      0.30     12590
weighted avg       0.65      0.33      0.42     12590
 samples avg       0.36      0.34      0.35     12590



Decision tree - making the classifer and hyperparameter tuning 

In [29]:
# Define the pipeline for Decision Tree
ekman_pipeline_dt = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('classifier', MultiOutputClassifier(DecisionTreeClassifier()))
])

# Parameters for the grid search for Decision Tree
param_grid_dt = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)], # Reduced options to speed up
    'tfidf__max_features': [500, 1000, None], # Reduced options
    'classifier__estimator__max_depth': [1, 4, 10, 20],
    'classifier__estimator__min_samples_split': [2, 5, 7],
    'classifier__estimator__min_samples_leaf': [1, 2, 3],
}

# GridSearchCV for best params on small dataset (lack of computational resources)
grid_search_dt = GridSearchCV(estimator=ekman_pipeline_dt, param_grid=param_grid_dt, cv=3, scoring='f1_weighted', error_score='raise', verbose=1)
grid_search_dt.fit(X_train_small, y_train_small)

print("Best parameters for Decision Tree found: ", grid_search_dt.best_params_)


Fitting 3 folds for each of 324 candidates, totalling 972 fits
Best parameters for Decision Tree found:  {'classifier__estimator__max_depth': 20, 'classifier__estimator__min_samples_leaf': 1, 'classifier__estimator__min_samples_split': 2, 'tfidf__max_features': None, 'tfidf__ngram_range': (1, 3)}


creating tuned decision tree model and making predictions

In [30]:
#define TFIDF vectorizer with the best params from the grid search
tfidf_vectorizer_dt = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 3,)
)

#create the decision tree classifier with the best params from the grid search
dt_classifier = MultiOutputClassifier(DecisionTreeClassifier(
    max_depth=20,
    min_samples_split=2,
    min_samples_leaf=1
))

# Fit the vectorizer on the training data and transform both train and test data
X_train_tfidf_dt = tfidf_vectorizer_dt.fit_transform(X_train)
X_test_tfidf_dt = tfidf_vectorizer_dt.transform(X_test)

# Fit the classifier on the training data
dt_classifier.fit(X_train_tfidf_dt, y_train)

#make predictions
pred_dt = dt_classifier.predict(X_test_tfidf_dt)


Decision tree - model evaluation

In [31]:
#hamming loss and accuracy for decision tree
accuracy_dt = accuracy_score(y_test, pred_dt)
hamming_dt = hamming_loss(y_test, pred_dt)
print(f"Accuracy Decision Tree: {accuracy_dt}")
print(f"Hamming Loss Decision Tree: {hamming_dt}")

#classification report for decision tree
dt_classification_report = classification_report(y_test, pred_dt, target_names=['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise'], zero_division=0)
print(classification_report(y_test, pred_dt, target_names=ekman_mapping.keys(), zero_division=0))

Accuracy Decision Tree: 0.21635198337086436
Hamming Loss Decision Tree: 0.13243918734997898
              precision    recall  f1-score   support

       anger       0.51      0.19      0.27      1588
     disgust       0.29      0.08      0.12       258
        fear       0.58      0.26      0.36       254
         joy       0.82      0.49      0.61      4728
     sadness       0.62      0.26      0.36      1010
    surprise       0.55      0.10      0.17      1608
     neutral       0.11      0.00      0.00      3144

   micro avg       0.72      0.25      0.37     12590
   macro avg       0.50      0.20      0.27     12590
weighted avg       0.54      0.25      0.33     12590
 samples avg       0.26      0.25      0.25     12590



Random forest - making classifier and mode hyperparameter tuning

In [8]:
#initialize pipeline for Random Forest
ekman_pipeline_rf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('classifier', MultiOutputClassifier(RandomForestClassifier(random_state=42))),
])
# Parameters for the grid search for Random Forest
param_grid_rf = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)], # Reduced options to speed up
    'tfidf__max_features': [500, 1000, None], # Reduced options
    'classifier__estimator__max_depth': [5, 20, 50,],
    'classifier__estimator__min_samples_split': [2, 5, 8],
    'classifier__estimator__min_samples_leaf': [1, 2, 4],
}

# GridSearchCV for best params on small dataset (lack of computational resources)
grid_search_rf = GridSearchCV(estimator=ekman_pipeline_rf, param_grid=param_grid_rf, cv=3, scoring='f1_weighted', error_score='raise')
grid_search_rf.fit(X_train_small, y_train_small)

#best params for random forest
print("Best parameters for Random Forest found: ", grid_search_rf.best_params_)

  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters for Random Forest found:  {'classifier__estimator__max_depth': 50, 'classifier__estimator__min_samples_leaf': 2, 'classifier__estimator__min_samples_split': 5, 'tfidf__max_features': 500, 'tfidf__ngram_range': (1, 1)}


random forest - tuning the model and making predictions

In [9]:
#define TFIDF vectorizer with the best params from the grid search
tfidf_vectorizer_rf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
    max_features=(500)
)

# Fit the vectorizer on the training data and transform both train and test data
X_train_tfidf_rf = tfidf_vectorizer_rf.fit_transform(X_train)
X_test_tfidf_rf = tfidf_vectorizer_rf.transform(X_test)

# Fit the classifier on the training data
rf_classifier = MultiOutputClassifier(RandomForestClassifier(
    n_estimators=100,
    max_depth=50,
    min_samples_split=5,
    min_samples_leaf=2
))
rf_classifier.fit(X_train_tfidf_rf, y_train)

#make predictions
pred_rf = rf_classifier.predict(X_test_tfidf_rf)

Evaluating the random forest model

In [14]:
#hamming loss and accuracy for decision tree
accuracy_rf = accuracy_score(y_test, pred_rf)
hamming_rf = hamming_loss(y_test, pred_rf)
print(f"Accuracy Decision Tree: {accuracy_rf}")
print(f"Hamming Loss Decision Tree: {hamming_rf}")

# Print classification report for Decision Tree
rf_classification_report = classification_report(y_test, pred_rf, target_names=['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise'], zero_division=0)
print(classification_report(y_test, pred_rf, target_names=ekman_mapping.keys(), zero_division=0))

Accuracy Decision Tree: 0.22830417460592414
Hamming Loss Decision Tree: 0.12781173442874466
              precision    recall  f1-score   support

       anger       0.62      0.10      0.17      1588
     disgust       0.65      0.04      0.08       258
        fear       0.69      0.08      0.14       254
         joy       0.81      0.56      0.66      4728
     sadness       0.69      0.18      0.28      1010
    surprise       0.74      0.06      0.10      1608
     neutral       0.00      0.00      0.00      3144

   micro avg       0.79      0.24      0.37     12590
   macro avg       0.60      0.14      0.20     12590
weighted avg       0.56      0.24      0.31     12590
 samples avg       0.26      0.25      0.25     12590



KNN - making classifier and hyperparameter tuning

In [34]:
#create a pipeline for KNN
ekman_pipeline_knn = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('classifier', MultiOutputClassifier(KNeighborsClassifier()))
])

# Parameters for the grid search for KNN
param_grid_knn = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)], # Reduced options to speed up
    'tfidf__max_features': [500, 1000, None], # Reduced options
    'classifier__estimator__n_neighbors': [3, 5, 7],
    'classifier__estimator__weights': ['uniform', 'distance'],
    'classifier__estimator__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
}

# GridSearchCV for best params on small dataset (lack of computational resources)
grid_search_knn = GridSearchCV(estimator=ekman_pipeline_knn, param_grid=param_grid_knn, cv=3, scoring='f1_weighted', error_score='raise', verbose=1)
grid_search_knn.fit(X_train_small, y_train_small)
#best params for KNN
print("Best parameters for KNN found: ", grid_search_knn.best_params_)

Fitting 3 folds for each of 216 candidates, totalling 648 fits




Best parameters for KNN found:  {'classifier__estimator__algorithm': 'auto', 'classifier__estimator__n_neighbors': 3, 'classifier__estimator__weights': 'uniform', 'tfidf__max_features': 500, 'tfidf__ngram_range': (1, 2)}


KNN - tuning the model and making predictions

In [36]:
#define TFIDF vectorizer with the best params from the grid search
tfidf_vectorizer_knn = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=(500)
)

# Fit the vectorizer on the training data and transform both train and test data
X_train_tfidf_knn = tfidf_vectorizer_knn.fit_transform(X_train)
X_test_tfidf_knn = tfidf_vectorizer_knn.transform(X_test)

# Fit the classifier on the training data
knn_classifier = MultiOutputClassifier(KNeighborsClassifier(
    n_neighbors=3,
    weights='uniform',
    algorithm='auto'
))
knn_classifier.fit(X_train_tfidf_knn, y_train)
#make predictions
pred_knn = knn_classifier.predict(X_test_tfidf_knn)

KNN - model evaluation

In [37]:
#hamming loss and accuracy for decision tree
accuracy_knn = accuracy_score(y_test, pred_knn)
hamming_knn = hamming_loss(y_test, pred_knn)
print(f"Accuracy KNN: {accuracy_knn}")
print(f"Hamming Loss KNN: {hamming_knn}")

#print classification report for knn
knn_classification_report = classification_report(y_test, pred_knn, target_names=['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise'], zero_division=0)
print(classification_report(y_test, pred_knn, target_names=ekman_mapping.keys(), zero_division=0))

Accuracy KNN: 0.31681967781049714
Hamming Loss KNN: 0.16381678255920418
              precision    recall  f1-score   support

       anger       0.30      0.15      0.20      1588
     disgust       0.25      0.05      0.08       258
        fear       0.48      0.11      0.18       254
         joy       0.66      0.46      0.54      4728
     sadness       0.42      0.13      0.20      1010
    surprise       0.31      0.12      0.18      1608
     neutral       0.35      0.41      0.38      3144

   micro avg       0.46      0.32      0.38     12590
   macro avg       0.40      0.21      0.25     12590
weighted avg       0.46      0.32      0.37     12590
 samples avg       0.35      0.34      0.34     12590

