In [None]:
import os
import re
import warnings

import numpy as np
import pandas as pd
import sklearn
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import ParameterGrid
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from tqdm import tqdm

import utils

pd.set_option('display.max_rows', 100)
warnings.filterwarnings("ignore")

In [None]:
# Load the three pre-split labelled datasets (train / validation / test).
train, val, test = map(
    pd.read_csv,
    (
        "../data/labelled_data/train_old.csv",
        "../data/labelled_data/val_old.csv",
        "../data/labelled_data/test_old.csv",
    ),
)

# Preprocessing

In [None]:
# # decode emojis from text
# train = utils.one_hot_encode_emojis(train, "phrase")
# val = utils.one_hot_encode_emojis(val, "phrase")
# test = utils.one_hot_encode_emojis(test, "phrase")

In [None]:
# clean phrases
# For every split, derive the lemmatised and stemmed variants from the raw
# phrase first, and only then overwrite `phrase` with the plainly cleaned text
# (the two derived columns must be computed from the uncleaned input).
for split_df in (train, val, test):
    raw_phrases = split_df["phrase"]
    split_df["phrase_lemma"] = raw_phrases.apply(
        lambda text: utils.clean_phrase(text, lemmatize=True, stem=False)
    )
    split_df["phrase_stem"] = raw_phrases.apply(
        lambda text: utils.clean_phrase(text, lemmatize=False, stem=True)
    )
    split_df["phrase"] = raw_phrases.apply(
        lambda text: utils.clean_phrase(text, lemmatize=False, stem=False)
    )

In [None]:
# Keep only the rows whose cleaned phrase still contains at least one character.
train = train[train["phrase"].str.len().gt(0)]
val = val[val["phrase"].str.len().gt(0)]
test = test[test["phrase"].str.len().gt(0)]

In [None]:
# if nan label, replace with 0
# Vectorised `fillna` replaces the per-element Python lambda, and `assign`
# builds a new frame instead of writing a column onto a filtered slice
# (which raises SettingWithCopyWarning, hidden here by the warnings filter).
train = train.assign(label=train["label"].fillna(0))
val = val.assign(label=val["label"].fillna(0))
test = test.assign(label=test["label"].fillna(0))

In [None]:
# Train and validation rows stacked together; used to refit a model before
# it is scored on the test split.
trainval = pd.concat([train, val])

# Per-aspect subsets: keep only the rows that carry a value for the aspect.
# Suffix 1 -> `new_aspect_1` is present, suffix 2 -> `new_aspect_2` is present.
train1 = train[train["new_aspect_1"].notna()]
train2 = train[train["new_aspect_2"].notna()]

val1 = val[val["new_aspect_1"].notna()]
val2 = val[val["new_aspect_2"].notna()]

trainval1 = trainval[trainval["new_aspect_1"].notna()]
trainval2 = trainval[trainval["new_aspect_2"].notna()]

test1 = test[test["new_aspect_1"].notna()]
test2 = test[test["new_aspect_2"].notna()]

# Modelling

## Initialise Parameters

In [None]:
# TF-IDF vectoriser settings to sweep: word-level, lower-cased tokens with
# n-gram ranges from unigrams only up to unigrams + bigrams + trigrams.
tfidf_params = {
    "analyzer": ["word"],
    "lowercase": [True],
    "ngram_range": [(1, 1), (1, 2), (1, 3)],
}
tfidf_paramgrid = [*ParameterGrid(tfidf_params)]

# Which data to feed the models:
#   "type"   - text variant (cleaned phrase, lemmatised phrase, stemmed phrase)
#   "aspect" - row subset (all rows, or rows with new_aspect_1 / new_aspect_2)
phrase_params = {
    "type": ["original", "lemma", "stem"],
    "aspect": ["original", "new_aspect_1", "new_aspect_2"],
}
phrase_paramgrid = [*ParameterGrid(phrase_params)]

In [None]:
# logistic regression
# scikit-learn spells "no regularisation" differently across releases: the
# string "none" was deprecated in 1.2 and removed in 1.4 in favour of `None`,
# while releases before 1.2 reject `None`. Pick whichever spelling the
# installed version accepts so the grid runs on both old and new environments.
_sklearn_major_minor = tuple(
    int(part) for part in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups()
)
_no_penalty = None if _sklearn_major_minor >= (1, 2) else "none"

logreg_params = {
    # NOTE: C is ignored by scikit-learn when the penalty is disabled, so the
    # unpenalised configurations are repeated once per C value.
    "C": [0.5, 1.0, 1.5],
    "solver": ["lbfgs", "newton-cg"],
    "penalty": ["l2", _no_penalty],
    "class_weight": ["balanced", None],
}
logreg_paramgrid = list(ParameterGrid(logreg_params))

In [None]:
# Multinomial naive Bayes: sweep the additive smoothing strength
# (alpha=0 requests no smoothing, alpha=1 is add-one smoothing).
nb_params = {"alpha": [0, 1]}
nb_paramgrid = [*ParameterGrid(nb_params)]

In [None]:
# Support vector classifier: regularisation strength, kernel, kernel
# coefficient and optional class re-weighting.
svm_params = {
    "C": [0.5, 1.0, 1.5],
    "kernel": ["poly", "rbf", "sigmoid"],
    "gamma": ["scale", "auto"],
    "class_weight": ["balanced", None],
}
svm_paramgrid = [*ParameterGrid(svm_params)]

In [None]:
# Random forest: split criterion, minimum samples needed to split a node and
# optional class re-weighting.
rf_params = [
    {
        "criterion": ["gini", "entropy"],
        "min_samples_split": [2, 5, 10],
        "class_weight": ["balanced", "balanced_subsample", None],
        # Forests are randomised; a fixed seed keeps the search reproducible
        # across kernel restarts and makes configurations comparable.
        # The seed is also recorded as a column in the results table.
        "random_state": [42],
    }
]
rf_paramgrid = list(ParameterGrid(rf_params))

In [None]:
# Baseline classifier: a single configuration using the "prior" strategy.
dummy_params = {"strategy": ["prior"]}
dummy_paramgrid = [*ParameterGrid(dummy_params)]

## Grid Search

In [None]:
def prepare_datasets(tfidf_param):
    """Build TF-IDF matrices for every text variant and aspect subset.

    For each combination of text variant ("original", "lemma", "stem") and
    aspect subset ("original", "new_aspect_1", "new_aspect_2") a fresh
    ``TfidfVectorizer`` is fitted on the train split only; the val, test and
    trainval splits are then transformed with that fitted vectoriser.

    Reads the notebook-level frames ``train``/``val``/``test``/``trainval``
    and their ``*1`` / ``*2`` aspect subsets.

    Parameters
    ----------
    tfidf_param : dict
        Keyword arguments forwarded to ``TfidfVectorizer``.

    Returns
    -------
    dict
        ``result[variant][aspect]`` is the list
        ``[train_matrix, val_matrix, test_matrix, trainval_matrix]``.
    """
    # text variant key -> dataframe column holding that version of the phrase
    text_columns = {
        "original": "phrase",
        "lemma": "phrase_lemma",
        "stem": "phrase_stem",
    }
    # aspect key -> (train, val, test, trainval) frames for that row subset
    split_frames = {
        "original": (train, val, test, trainval),
        "new_aspect_1": (train1, val1, test1, trainval1),
        "new_aspect_2": (train2, val2, test2, trainval2),
    }

    def _vectorise(column, fit_frame, *other_frames):
        """Fit a new vectoriser on ``fit_frame[column]`` and transform the rest."""
        vectoriser = TfidfVectorizer(**tfidf_param)
        fitted = vectoriser.fit_transform(fit_frame[column])
        return [fitted] + [vectoriser.transform(frame[column]) for frame in other_frames]

    return {
        variant: {
            aspect: _vectorise(column, *frames)
            for aspect, frames in split_frames.items()
        }
        for variant, column in text_columns.items()
    }

In [None]:
# Target vectors per aspect subset, ordered [train, val, test, trainval] so
# they line up with the matrices returned by prepare_datasets.
labels = {
    aspect_key: [split_df.label for split_df in split_dfs]
    for aspect_key, split_dfs in (
        ("original", (train, val, test, trainval)),
        ("new_aspect_1", (train1, val1, test1, trainval1)),
        ("new_aspect_2", (train2, val2, test2, trainval2)),
    )
}

### Logistic Regression

In [None]:
model_name = "logreg"
model_fn = LogisticRegression
model_paramgrid = logreg_paramgrid

# Create the output folder up front: DataFrame.to_csv does not create missing
# directories, and failing at the very end would throw away the whole search.
os.makedirs("model_results/tfidf", exist_ok=True)

gridsearch_results = []
for tfidf_param in tfidf_paramgrid:
    # TF-IDF matrices for every (text variant, aspect) pair under this setting
    datasets = prepare_datasets(tfidf_param)

    for phrase_param in phrase_paramgrid:
        data_type = phrase_param["type"]
        data_aspect = phrase_param["aspect"]

        # features and labels are both ordered [train, val, test, trainval]
        train_set, val_set, test_set, trainval_set = datasets[data_type][data_aspect]
        train_label, val_label, test_label, trainval_label = labels[data_aspect]

        for model_param in model_paramgrid:
            # fit on train, predict on val
            model = model_fn(**model_param)
            model.fit(train_set, train_label)
            val_pred = model.predict(val_set)

            # refit a fresh model on train+val, predict on test
            model = model_fn(**model_param)
            model.fit(trainval_set, trainval_label)
            test_pred = model.predict(test_set)

            # one record per configuration: settings first, then val and test scores
            results = {"model": model_name, **tfidf_param, **phrase_param, **model_param}
            for split, truth, pred in (("val", val_label, val_pred), ("test", test_label, test_pred)):
                results[f"{split}_f1"] = f1_score(truth, pred, average="weighted")
                results[f"{split}_recall"] = recall_score(truth, pred, average="weighted")
                results[f"{split}_precision"] = precision_score(truth, pred, average="weighted")
                results[f"{split}_accuracy"] = accuracy_score(truth, pred)
            gridsearch_results.append(results)

# best validation F1 first (test F1 breaks ties), then persist
final_logreg_results = pd.DataFrame.from_records(gridsearch_results).sort_values(
    by=["val_f1", "test_f1"], ascending=False
)
final_logreg_results.to_csv("model_results/tfidf/logreg.csv", index=False)

### Naive Bayes

In [None]:
model_name = "nb"
model_fn = MultinomialNB
model_paramgrid = nb_paramgrid

# Create the output folder up front: DataFrame.to_csv does not create missing
# directories, and failing at the very end would throw away the whole search.
os.makedirs("model_results/tfidf", exist_ok=True)

gridsearch_results = []
for tfidf_param in tfidf_paramgrid:
    # TF-IDF matrices for every (text variant, aspect) pair under this setting
    datasets = prepare_datasets(tfidf_param)

    for phrase_param in phrase_paramgrid:
        data_type = phrase_param["type"]
        data_aspect = phrase_param["aspect"]

        # features and labels are both ordered [train, val, test, trainval]
        train_set, val_set, test_set, trainval_set = datasets[data_type][data_aspect]
        train_label, val_label, test_label, trainval_label = labels[data_aspect]

        for model_param in model_paramgrid:
            # fit on train, predict on val
            model = model_fn(**model_param)
            model.fit(train_set, train_label)
            val_pred = model.predict(val_set)

            # refit a fresh model on train+val, predict on test
            model = model_fn(**model_param)
            model.fit(trainval_set, trainval_label)
            test_pred = model.predict(test_set)

            # one record per configuration: settings first, then val and test scores
            results = {"model": model_name, **tfidf_param, **phrase_param, **model_param}
            for split, truth, pred in (("val", val_label, val_pred), ("test", test_label, test_pred)):
                results[f"{split}_f1"] = f1_score(truth, pred, average="weighted")
                results[f"{split}_recall"] = recall_score(truth, pred, average="weighted")
                results[f"{split}_precision"] = precision_score(truth, pred, average="weighted")
                results[f"{split}_accuracy"] = accuracy_score(truth, pred)
            gridsearch_results.append(results)

# best validation F1 first (test F1 breaks ties), then persist
final_nb_results = pd.DataFrame.from_records(gridsearch_results).sort_values(
    by=["val_f1", "test_f1"], ascending=False
)
final_nb_results.to_csv("model_results/tfidf/nb.csv", index=False)

### Random Forest

In [None]:
model_name = "rf"
model_fn = RandomForestClassifier
model_paramgrid = rf_paramgrid

# Create the output folder up front: DataFrame.to_csv does not create missing
# directories, and failing at the very end would throw away the whole search.
os.makedirs("model_results/tfidf", exist_ok=True)

gridsearch_results = []
for tfidf_param in tfidf_paramgrid:
    # TF-IDF matrices for every (text variant, aspect) pair under this setting
    datasets = prepare_datasets(tfidf_param)

    for phrase_param in phrase_paramgrid:
        data_type = phrase_param["type"]
        data_aspect = phrase_param["aspect"]

        # features and labels are both ordered [train, val, test, trainval]
        train_set, val_set, test_set, trainval_set = datasets[data_type][data_aspect]
        train_label, val_label, test_label, trainval_label = labels[data_aspect]

        for model_param in model_paramgrid:
            # fit on train, predict on val
            model = model_fn(**model_param)
            model.fit(train_set, train_label)
            val_pred = model.predict(val_set)

            # refit a fresh model on train+val, predict on test
            model = model_fn(**model_param)
            model.fit(trainval_set, trainval_label)
            test_pred = model.predict(test_set)

            # one record per configuration: settings first, then val and test scores
            results = {"model": model_name, **tfidf_param, **phrase_param, **model_param}
            for split, truth, pred in (("val", val_label, val_pred), ("test", test_label, test_pred)):
                results[f"{split}_f1"] = f1_score(truth, pred, average="weighted")
                results[f"{split}_recall"] = recall_score(truth, pred, average="weighted")
                results[f"{split}_precision"] = precision_score(truth, pred, average="weighted")
                results[f"{split}_accuracy"] = accuracy_score(truth, pred)
            gridsearch_results.append(results)

# best validation F1 first (test F1 breaks ties), then persist
final_rf_results = pd.DataFrame.from_records(gridsearch_results).sort_values(
    by=["val_f1", "test_f1"], ascending=False
)
final_rf_results.to_csv("model_results/tfidf/rf.csv", index=False)

### SVM

In [None]:
model_name = "svm"
model_fn = SVC
model_paramgrid = svm_paramgrid

# Create the output folder up front: DataFrame.to_csv does not create missing
# directories, and failing at the very end would throw away the whole search.
os.makedirs("model_results/tfidf", exist_ok=True)

gridsearch_results = []
for tfidf_param in tfidf_paramgrid:
    # TF-IDF matrices for every (text variant, aspect) pair under this setting
    datasets = prepare_datasets(tfidf_param)

    for phrase_param in phrase_paramgrid:
        data_type = phrase_param["type"]
        data_aspect = phrase_param["aspect"]

        # features and labels are both ordered [train, val, test, trainval]
        train_set, val_set, test_set, trainval_set = datasets[data_type][data_aspect]
        train_label, val_label, test_label, trainval_label = labels[data_aspect]

        for model_param in model_paramgrid:
            # fit on train, predict on val
            model = model_fn(**model_param)
            model.fit(train_set, train_label)
            val_pred = model.predict(val_set)

            # refit a fresh model on train+val, predict on test
            model = model_fn(**model_param)
            model.fit(trainval_set, trainval_label)
            test_pred = model.predict(test_set)

            # one record per configuration: settings first, then val and test scores
            results = {"model": model_name, **tfidf_param, **phrase_param, **model_param}
            for split, truth, pred in (("val", val_label, val_pred), ("test", test_label, test_pred)):
                results[f"{split}_f1"] = f1_score(truth, pred, average="weighted")
                results[f"{split}_recall"] = recall_score(truth, pred, average="weighted")
                results[f"{split}_precision"] = precision_score(truth, pred, average="weighted")
                results[f"{split}_accuracy"] = accuracy_score(truth, pred)
            gridsearch_results.append(results)

# best validation F1 first (test F1 breaks ties), then persist
final_svm_results = pd.DataFrame.from_records(gridsearch_results).sort_values(
    by=["val_f1", "test_f1"], ascending=False
)
final_svm_results.to_csv("model_results/tfidf/svm.csv", index=False)

### Dummy Classifier

In [None]:
model_name = "dummy"
model_fn = DummyClassifier
model_paramgrid = dummy_paramgrid

# Create the output folder up front: DataFrame.to_csv does not create missing
# directories, and failing at the very end would throw away the whole search.
os.makedirs("model_results/tfidf", exist_ok=True)

gridsearch_results = []
for tfidf_param in tfidf_paramgrid:
    # TF-IDF matrices for every (text variant, aspect) pair under this setting
    datasets = prepare_datasets(tfidf_param)

    for phrase_param in phrase_paramgrid:
        data_type = phrase_param["type"]
        data_aspect = phrase_param["aspect"]

        # features and labels are both ordered [train, val, test, trainval]
        train_set, val_set, test_set, trainval_set = datasets[data_type][data_aspect]
        train_label, val_label, test_label, trainval_label = labels[data_aspect]

        for model_param in model_paramgrid:
            # fit on train, predict on val
            model = model_fn(**model_param)
            model.fit(train_set, train_label)
            val_pred = model.predict(val_set)

            # refit a fresh model on train+val, predict on test
            model = model_fn(**model_param)
            model.fit(trainval_set, trainval_label)
            test_pred = model.predict(test_set)

            # one record per configuration: settings first, then val and test scores
            results = {"model": model_name, **tfidf_param, **phrase_param, **model_param}
            for split, truth, pred in (("val", val_label, val_pred), ("test", test_label, test_pred)):
                results[f"{split}_f1"] = f1_score(truth, pred, average="weighted")
                results[f"{split}_recall"] = recall_score(truth, pred, average="weighted")
                results[f"{split}_precision"] = precision_score(truth, pred, average="weighted")
                results[f"{split}_accuracy"] = accuracy_score(truth, pred)
            gridsearch_results.append(results)

# best validation F1 first (test F1 breaks ties), then persist
final_dummy_results = pd.DataFrame.from_records(gridsearch_results).sort_values(
    by=["val_f1", "test_f1"], ascending=False
)
final_dummy_results.to_csv("model_results/tfidf/dummy.csv", index=False)

### Combine and Save All Results

In [None]:
# Stack the per-model result tables, rank every configuration by validation
# F1 (test F1 breaks ties) and write the combined leaderboard.
combined_df = pd.concat(
    [
        final_logreg_results,
        final_nb_results,
        final_svm_results,
        final_rf_results,
        final_dummy_results,
    ]
).sort_values(by=["val_f1", "test_f1"], ascending=False)
combined_df.to_csv("model_results/tfidf/combined.csv", index=False)