In [None]:
import pandas as pd
pd.set_option('display.max_rows', 100)
import numpy as np
import utils

from sklearn.feature_extraction.text import CountVectorizer 
import re
from tqdm import tqdm

from sklearn.model_selection import ParameterGrid

from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score

import warnings
warnings.filterwarnings("ignore")

# Import Datasets

In [None]:
# Splits are read from the `*_newpreproc_unagg.csv` files under new_labels/.
# Earlier iterations of this cell read these files instead (kept for reference):
#   old_labels/{train,val,test}_oldpreproc.csv
#   new_labels/{train,val,test}_newpreproc.csv
train, val, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "new_labels/train_newpreproc_unagg.csv",
        "new_labels/val_newpreproc_unagg.csv",
        "new_labels/test_newpreproc_unagg.csv",
    )
)

# Train and validation rows stacked into one frame (each frame keeps its own row index).
trainval = pd.concat([train, val])

In [None]:
# Preview the first rows of the training split.
train.head()

# Modelling

## Initialise Parameters

In [None]:
# Bag-of-words settings; each combination is passed to CountVectorizer(**params).
bow_params = {
    "analyzer": ["word"],
    "lowercase": [True],
    "max_df": [0.25, 0.5, 1.0],
    "min_df": [1, 10, 20],
    "ngram_range": [(1, 1), (1, 2), (1, 3)],
}
bow_paramgrid = list(ParameterGrid(bow_params))

# Which text column variant to use. The value is a key of the dict returned by
# prepare_datasets. (A second "processing" axis with values "old"/"new" was
# considered here previously and is currently disabled.)
type_proc_params = {
    "type": ["original", "lemma", "stem"],
}
type_proc_paramgrid = list(ParameterGrid(type_proc_params))

In [None]:
# Logistic regression search space (expanded into LogisticRegression(**params)).
# NOTE(review): recent scikit-learn releases spell the unpenalised option as
# None instead of the string "none" -- confirm against the installed version.
# NOTE(review): C presumably has no effect when no penalty is applied, so those
# combinations may be redundant fits -- verify before trimming the grid.
logreg_params = {
    "C": [0.1, 0.5, 1.0, 1.5, 5],
    "class_weight": ["balanced", None],
    "penalty": ["l2", "none"],
    "solver": ["lbfgs", "newton-cg"],
}
logreg_paramgrid = list(ParameterGrid(logreg_params))

In [None]:
# Multinomial naive Bayes search space: only the smoothing strength is varied.
# NOTE(review): alpha=0 means no smoothing; how that is handled depends on the
# installed scikit-learn version -- confirm it behaves as intended.
nb_params = {
    "alpha": [
        0,
        0.001,
        0.01,
        0.1,
        0.25,
        0.5,
        1,
    ],
}
nb_paramgrid = list(ParameterGrid(nb_params))

In [None]:
# Support vector classifier search space (expanded into SVC(**params)).
svm_params = {
    "C": [0.1, 0.5, 1.0, 1.5, 5],
    "class_weight": ["balanced", None],
    "gamma": ["scale", "auto"],
    "kernel": ["poly", "rbf", "sigmoid"],
}
svm_paramgrid = list(ParameterGrid(svm_params))

In [None]:
# Random forest search space (expanded into RandomForestClassifier(**params)).
rf_params = [
    {
        "criterion": ["gini", "entropy"],
        "min_samples_split": [2, 5, 10],
        "class_weight": ["balanced", "balanced_subsample", None],
        # For RandomForestClassifier "auto" was a deprecated alias of "sqrt"
        # (removed in newer scikit-learn), so listing both only duplicated
        # every configuration. Keep the supported spelling.
        "max_features": ["sqrt"],
        "min_samples_leaf": [1, 2, 4],
        # Fixed seed so re-running the search rebuilds the same forests and
        # the saved scores are reproducible.
        "random_state": [42],
    }
]

rf_paramgrid = list(ParameterGrid(rf_params))

In [None]:
# Baseline: DummyClassifier is only run with the "prior" strategy.
dummy_params = {
    "strategy": ["prior"],
}
dummy_paramgrid = list(ParameterGrid(dummy_params))

# Grid Search 

In [None]:
def prepare_datasets(bow_param):
    """Vectorise every split with a bag-of-words model, once per text variant.

    Reads the notebook-level frames ``train``, ``val``, ``test`` and
    ``trainval``. For each text column (``phrase``, ``phrase_lemma``,
    ``phrase_stem``) a fresh ``CountVectorizer(**bow_param)`` is fitted on the
    training split only and then applied to the other three frames, so the
    train+val matrix uses the vocabulary learned from train.

    Parameters
    ----------
    bow_param : dict
        Keyword arguments forwarded to ``CountVectorizer``.

    Returns
    -------
    dict
        Maps ``"original"``, ``"lemma"`` and ``"stem"`` to a list of four
        matrices ordered ``[train, val, test, trainval]``.
    """
    column_for_type = {
        "original": "phrase",
        "lemma": "phrase_lemma",
        "stem": "phrase_stem",
    }

    encoded = {}
    for text_type, column in column_for_type.items():
        vectorizer = CountVectorizer(**bow_param)
        # The vocabulary is learned from the training split alone.
        train_matrix = vectorizer.fit_transform(train[column])
        other_matrices = [
            vectorizer.transform(frame[column]) for frame in (val, test, trainval)
        ]
        encoded[text_type] = [train_matrix] + other_matrices

    return encoded

In [None]:
# Target columns, ordered train, val, test, train+val.
labels = [frame.label for frame in (train, val, test, trainval)]

## Logistic Regression

In [None]:
# Exhaustive search for logistic regression over
# (bag-of-words settings) x (text variant) x (model hyper-parameters).
# Each configuration is fitted twice: on train (scored on val) and on
# train+val (scored on test). Progress is reported by the tqdm bar only; the
# per-fit counter print was removed because it flooded the cell output.
model_name = "logreg"
model_fn = LogisticRegression
model_paramgrid = logreg_paramgrid

# The label vectors do not depend on the vectoriser settings: unpack once.
train_label, val_label, test_label, trainval_label = labels

ind = 0  # number of configurations evaluated so far
gridsearch_results = []
for bow_param in tqdm(bow_paramgrid):
    datasets = prepare_datasets(bow_param)

    for type_proc_param in type_proc_paramgrid:
        data_type = type_proc_param["type"]
        train_set, val_set, test_set, trainval_set = datasets[data_type]

        for model_param in model_paramgrid:
            # Fit on train, score on validation.
            model = model_fn(**model_param)
            model.fit(train_set, train_label)
            val_pred = model.predict(val_set)
            val_metrics = classification_report(val_label, val_pred, output_dict=True)

            # Refit a fresh model on train+val, score on test.
            model = model_fn(**model_param)
            model.fit(trainval_set, trainval_label)
            test_pred = model.predict(test_set)
            test_metrics = classification_report(test_label, test_pred, output_dict=True)

            # One flat record per configuration. The per-class report keys
            # "-1.0" / "0.0" / "1.0" are the string forms of the label values.
            results = {
                "model": model_name,
                **bow_param,
                **type_proc_param,
                **model_param,
                "val_f1_weighted": val_metrics["weighted avg"]["f1-score"],
                "val_f1_neg": val_metrics["-1.0"]["f1-score"],
                "val_f1_zero": val_metrics["0.0"]["f1-score"],
                "val_f1_pos": val_metrics["1.0"]["f1-score"],
                "val_accuracy": val_metrics["accuracy"],
                "test_f1_weighted": test_metrics["weighted avg"]["f1-score"],
                "test_f1_neg": test_metrics["-1.0"]["f1-score"],
                "test_f1_zero": test_metrics["0.0"]["f1-score"],
                "test_f1_pos": test_metrics["1.0"]["f1-score"],
                "test_accuracy": test_metrics["accuracy"],
            }
            gridsearch_results.append(results)
            ind += 1

# Best validation score first; the test score only breaks ties in the ordering.
final_logreg_results = pd.DataFrame.from_records(gridsearch_results).sort_values(
    by=["val_f1_weighted", "test_f1_weighted"], ascending=False
)
final_logreg_results.to_csv("model_results/bow/no_agg/logreg.csv", index=False)

## Naive Bayes

In [None]:
# Exhaustive search for multinomial naive Bayes over
# (bag-of-words settings) x (text variant) x (model hyper-parameters).
# Each configuration is fitted twice: on train (scored on val) and on
# train+val (scored on test).
model_name = "nb"
model_fn = MultinomialNB
model_paramgrid = nb_paramgrid

# The label vectors do not depend on the vectoriser settings: unpack once.
train_label, val_label, test_label, trainval_label = labels

ind = 0  # number of configurations evaluated so far
gridsearch_results = []
for bow_param in tqdm(bow_paramgrid):
    datasets = prepare_datasets(bow_param)

    for type_proc_param in type_proc_paramgrid:
        data_type = type_proc_param["type"]
        train_set, val_set, test_set, trainval_set = datasets[data_type]

        for model_param in model_paramgrid:
            # Fit on train, score on validation.
            model = model_fn(**model_param)
            model.fit(train_set, train_label)
            val_pred = model.predict(val_set)
            val_metrics = classification_report(val_label, val_pred, output_dict=True)

            # Refit a fresh model on train+val, score on test.
            model = model_fn(**model_param)
            model.fit(trainval_set, trainval_label)
            test_pred = model.predict(test_set)
            test_metrics = classification_report(test_label, test_pred, output_dict=True)

            # One flat record per configuration. The per-class report keys
            # "-1.0" / "0.0" / "1.0" are the string forms of the label values.
            results = {
                "model": model_name,
                **bow_param,
                **type_proc_param,
                **model_param,
                "val_f1_weighted": val_metrics["weighted avg"]["f1-score"],
                "val_f1_neg": val_metrics["-1.0"]["f1-score"],
                "val_f1_zero": val_metrics["0.0"]["f1-score"],
                "val_f1_pos": val_metrics["1.0"]["f1-score"],
                "val_accuracy": val_metrics["accuracy"],
                "test_f1_weighted": test_metrics["weighted avg"]["f1-score"],
                "test_f1_neg": test_metrics["-1.0"]["f1-score"],
                "test_f1_zero": test_metrics["0.0"]["f1-score"],
                "test_f1_pos": test_metrics["1.0"]["f1-score"],
                "test_accuracy": test_metrics["accuracy"],
            }
            gridsearch_results.append(results)
            ind += 1

# Best validation score first; the test score only breaks ties in the ordering.
final_nb_results = pd.DataFrame.from_records(gridsearch_results).sort_values(
    by=["val_f1_weighted", "test_f1_weighted"], ascending=False
)
final_nb_results.to_csv("model_results/bow/no_agg/nb.csv", index=False)

## Random Forest

In [None]:
# Exhaustive search for the random forest over
# (bag-of-words settings) x (text variant) x (model hyper-parameters).
# Each configuration is fitted twice: on train (scored on val) and on
# train+val (scored on test).
model_name = "rf"
model_fn = RandomForestClassifier
model_paramgrid = rf_paramgrid

# The label vectors do not depend on the vectoriser settings: unpack once.
train_label, val_label, test_label, trainval_label = labels

ind = 0  # number of configurations evaluated so far
gridsearch_results = []
for bow_param in tqdm(bow_paramgrid):
    datasets = prepare_datasets(bow_param)

    for type_proc_param in type_proc_paramgrid:
        data_type = type_proc_param["type"]
        train_set, val_set, test_set, trainval_set = datasets[data_type]

        for model_param in model_paramgrid:
            # Fit on train, score on validation.
            model = model_fn(**model_param)
            model.fit(train_set, train_label)
            val_pred = model.predict(val_set)
            val_metrics = classification_report(val_label, val_pred, output_dict=True)

            # Refit a fresh model on train+val, score on test.
            model = model_fn(**model_param)
            model.fit(trainval_set, trainval_label)
            test_pred = model.predict(test_set)
            test_metrics = classification_report(test_label, test_pred, output_dict=True)

            # One flat record per configuration. The per-class report keys
            # "-1.0" / "0.0" / "1.0" are the string forms of the label values.
            results = {
                "model": model_name,
                **bow_param,
                **type_proc_param,
                **model_param,
                "val_f1_weighted": val_metrics["weighted avg"]["f1-score"],
                "val_f1_neg": val_metrics["-1.0"]["f1-score"],
                "val_f1_zero": val_metrics["0.0"]["f1-score"],
                "val_f1_pos": val_metrics["1.0"]["f1-score"],
                "val_accuracy": val_metrics["accuracy"],
                "test_f1_weighted": test_metrics["weighted avg"]["f1-score"],
                "test_f1_neg": test_metrics["-1.0"]["f1-score"],
                "test_f1_zero": test_metrics["0.0"]["f1-score"],
                "test_f1_pos": test_metrics["1.0"]["f1-score"],
                "test_accuracy": test_metrics["accuracy"],
            }
            gridsearch_results.append(results)
            ind += 1

# Best validation score first; the test score only breaks ties in the ordering.
final_rf_results = pd.DataFrame.from_records(gridsearch_results).sort_values(
    by=["val_f1_weighted", "test_f1_weighted"], ascending=False
)
final_rf_results.to_csv("model_results/bow/no_agg/rf.csv", index=False)

##  SVM

In [None]:
# Exhaustive search for the support vector classifier over
# (bag-of-words settings) x (text variant) x (model hyper-parameters).
# Each configuration is fitted twice: on train (scored on val) and on
# train+val (scored on test).
model_name = "svm"
model_fn = SVC
model_paramgrid = svm_paramgrid

# The label vectors do not depend on the vectoriser settings: unpack once.
train_label, val_label, test_label, trainval_label = labels

ind = 0  # number of configurations evaluated so far
gridsearch_results = []
for bow_param in tqdm(bow_paramgrid):
    datasets = prepare_datasets(bow_param)

    for type_proc_param in type_proc_paramgrid:
        data_type = type_proc_param["type"]
        train_set, val_set, test_set, trainval_set = datasets[data_type]

        for model_param in model_paramgrid:
            # Fit on train, score on validation.
            model = model_fn(**model_param)
            model.fit(train_set, train_label)
            val_pred = model.predict(val_set)
            val_metrics = classification_report(val_label, val_pred, output_dict=True)

            # Refit a fresh model on train+val, score on test.
            model = model_fn(**model_param)
            model.fit(trainval_set, trainval_label)
            test_pred = model.predict(test_set)
            test_metrics = classification_report(test_label, test_pred, output_dict=True)

            # One flat record per configuration. The per-class report keys
            # "-1.0" / "0.0" / "1.0" are the string forms of the label values.
            results = {
                "model": model_name,
                **bow_param,
                **type_proc_param,
                **model_param,
                "val_f1_weighted": val_metrics["weighted avg"]["f1-score"],
                "val_f1_neg": val_metrics["-1.0"]["f1-score"],
                "val_f1_zero": val_metrics["0.0"]["f1-score"],
                "val_f1_pos": val_metrics["1.0"]["f1-score"],
                "val_accuracy": val_metrics["accuracy"],
                "test_f1_weighted": test_metrics["weighted avg"]["f1-score"],
                "test_f1_neg": test_metrics["-1.0"]["f1-score"],
                "test_f1_zero": test_metrics["0.0"]["f1-score"],
                "test_f1_pos": test_metrics["1.0"]["f1-score"],
                "test_accuracy": test_metrics["accuracy"],
            }
            gridsearch_results.append(results)
            ind += 1

# Best validation score first; the test score only breaks ties in the ordering.
final_svm_results = pd.DataFrame.from_records(gridsearch_results).sort_values(
    by=["val_f1_weighted", "test_f1_weighted"], ascending=False
)
final_svm_results.to_csv("model_results/bow/no_agg/svm.csv", index=False)

## Dummy Classifier 

In [None]:
# Baseline run: the dummy classifier goes through the same
# (bag-of-words settings) x (text variant) x (model hyper-parameters) loop as
# the real models so that its result rows have the same layout.
# Each configuration is fitted twice: on train (scored on val) and on
# train+val (scored on test).
model_name = "dummy"
model_fn = DummyClassifier
model_paramgrid = dummy_paramgrid

# The label vectors do not depend on the vectoriser settings: unpack once.
train_label, val_label, test_label, trainval_label = labels

ind = 0  # number of configurations evaluated so far
gridsearch_results = []
for bow_param in tqdm(bow_paramgrid):
    datasets = prepare_datasets(bow_param)

    for type_proc_param in type_proc_paramgrid:
        data_type = type_proc_param["type"]
        train_set, val_set, test_set, trainval_set = datasets[data_type]

        for model_param in model_paramgrid:
            # Fit on train, score on validation.
            model = model_fn(**model_param)
            model.fit(train_set, train_label)
            val_pred = model.predict(val_set)
            val_metrics = classification_report(val_label, val_pred, output_dict=True)

            # Refit a fresh model on train+val, score on test.
            model = model_fn(**model_param)
            model.fit(trainval_set, trainval_label)
            test_pred = model.predict(test_set)
            test_metrics = classification_report(test_label, test_pred, output_dict=True)

            # One flat record per configuration. The per-class report keys
            # "-1.0" / "0.0" / "1.0" are the string forms of the label values.
            results = {
                "model": model_name,
                **bow_param,
                **type_proc_param,
                **model_param,
                "val_f1_weighted": val_metrics["weighted avg"]["f1-score"],
                "val_f1_neg": val_metrics["-1.0"]["f1-score"],
                "val_f1_zero": val_metrics["0.0"]["f1-score"],
                "val_f1_pos": val_metrics["1.0"]["f1-score"],
                "val_accuracy": val_metrics["accuracy"],
                "test_f1_weighted": test_metrics["weighted avg"]["f1-score"],
                "test_f1_neg": test_metrics["-1.0"]["f1-score"],
                "test_f1_zero": test_metrics["0.0"]["f1-score"],
                "test_f1_pos": test_metrics["1.0"]["f1-score"],
                "test_accuracy": test_metrics["accuracy"],
            }
            gridsearch_results.append(results)
            ind += 1

# Best validation score first; the test score only breaks ties in the ordering.
final_dummy_results = pd.DataFrame.from_records(gridsearch_results).sort_values(
    by=["val_f1_weighted", "test_f1_weighted"], ascending=False
)
final_dummy_results.to_csv("model_results/bow/no_agg/dummy.csv", index=False)

In [None]:
# Stack the per-model result tables and rank all configurations together:
# validation weighted F1 first, test weighted F1 as the tie-breaker.
per_model_results = [
    final_logreg_results,
    final_nb_results,
    final_svm_results,
    final_rf_results,
    final_dummy_results,
]
combined_df = pd.concat(per_model_results).sort_values(
    by=["val_f1_weighted", "test_f1_weighted"], ascending=False
)
combined_df.to_csv("model_results/bow/no_agg/combined.csv", index=False)