In [1]:
import pickle
import pandas as pd
pd.set_option('display.max_rows', 100)
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer 
import re
from tqdm import tqdm

from sklearn.model_selection import ParameterGrid

from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Labelled splits produced with the old labelling rules (first CSV column is the index).
train_old, val_old, test_old = (
    pd.read_csv(f"old_labels/{split}_oldpreproc.csv", index_col=0)
    for split in ("train", "val", "test")
)
# Train and validation stacked together, used to refit before scoring on test.
trainval_old = pd.concat([train_old, val_old])

# Labelled splits produced with the new labelling rules (emoticon-aware preprocessing).
train_new, val_new, test_new = (
    pd.read_csv(f"preproc_emotion/{split}_newpreproc_emoticon.csv", index_col=0)
    for split in ("train", "val", "test")
)
trainval_new = pd.concat([train_new, val_new])

# Modelling

## Initialise Parameters

In [None]:
# Search space for the TF-IDF vectoriser; every combination is expanded below.
tfidf_params = dict(
    analyzer=["word"],
    lowercase=[True],
    ngram_range=[(1, 1), (1, 2), (1, 3)],
    max_df=[0.25, 0.5, 1.0],
    min_df=[1, 10, 20],
)
tfidf_paramgrid = list(ParameterGrid(tfidf_params))

# Text-variant search space. The values are used verbatim as lookup keys:
#   "type"       -> outer key of the dict returned by prepare_datasets()
#                   ("original", "lemma" or "stem")
#   "processing" -> inner key of that dict and key of `labels`
# The outer key for unprocessed text is "original"; the previous value "normal"
# does not exist in prepare_datasets() and raised KeyError on the first lookup.
type_proc_params = {
    "type": ["original", "stem", "lemma"],
    "processing": ["generic", "unique"] # new phrase splitting
}
type_proc_paramgrid = list(ParameterGrid(type_proc_params))

In [None]:
# Logistic regression search space.
# NOTE(review): newer scikit-learn releases appear to spell the unpenalised
# option as None rather than the string "none" - confirm against the installed version.
logreg_params = dict(
    C=[0.1, 0.5, 1.0, 1.5, 5],
    solver=["lbfgs", "newton-cg"],
    penalty=["l2", "none"],
    class_weight=["balanced", None],
)
logreg_paramgrid = list(ParameterGrid(logreg_params))

In [None]:
# Multinomial naive Bayes search space: only the smoothing strength is tuned.
nb_params = dict(alpha=[0, 0.001, 0.01, 0.1, 0.25, 0.5, 1])
nb_paramgrid = list(ParameterGrid(nb_params))

In [None]:
# Support vector classifier search space (non-linear kernels only).
svm_params = dict(
    C=[0.1, 0.5, 1.0, 1.5, 5],
    kernel=["poly", "rbf", "sigmoid"],
    gamma=["scale", "auto"],
    class_weight=["balanced", None],
)

svm_paramgrid = list(ParameterGrid(svm_params))

In [None]:
# Random forest search space (a one-element list of grids, as ParameterGrid accepts).
# "auto" was dropped from max_features: for RandomForestClassifier it is an alias
# of "sqrt", so it only duplicated every fit, and recent scikit-learn releases
# reject the value outright.
rf_params = [
    {
        "criterion": ["gini", "entropy"],
        "min_samples_split": [2, 5, 10],
        "class_weight": ["balanced", "balanced_subsample", None],
        "max_features": ["sqrt"],
        "min_samples_leaf": [1, 2, 4]
    }
]

rf_paramgrid = list(ParameterGrid(rf_params))

In [None]:
# Baseline: a single DummyClassifier configuration using the "prior" strategy.
dummy_params = dict(strategy=["prior"])
dummy_paramgrid = list(ParameterGrid(dummy_params))

## Grid Search

In [None]:
def _vectorise_splits(frames, column, tfidf_param):
    """Fit a fresh TF-IDF vectoriser on the first frame and transform all of them.

    Args:
        frames: sequence of DataFrames ordered (train, val, test, train+val).
        column: name of the text column to vectorise in every frame.
        tfidf_param: keyword arguments for ``TfidfVectorizer``.

    Returns:
        A list of matrices in the same order as ``frames``. The vectoriser is
        fitted on the first (train) frame only; the remaining frames, including
        train+val, are transformed with that fitted vectoriser.
    """
    fit_frame, *other_frames = frames
    tfidf = TfidfVectorizer(**tfidf_param)
    matrices = [tfidf.fit_transform(fit_frame[column])]
    matrices.extend(tfidf.transform(frame[column]) for frame in other_frames)
    return matrices


def prepare_datasets(tfidf_param):
    """Vectorise every text variant of the old- and new-rule datasets.

    Reads the notebook-level frames ``train_*``, ``val_*``, ``test_*`` and
    ``trainval_*`` (``old`` and ``new``). One independent vectoriser is fitted
    per text column, so vocabularies are not shared between variants.

    Args:
        tfidf_param: keyword arguments for ``TfidfVectorizer`` (one grid point).

    Returns:
        Nested dict ``result[text_type][processing]`` where ``text_type`` is
        ``"original"``, ``"lemma"`` or ``"stem"`` and ``processing`` is
        ``"old"``, ``"new"``, ``"generic"`` or ``"unique"``. Each leaf is the
        list ``[train, val, test, trainval]`` of TF-IDF matrices.
    """
    old_frames = (train_old, val_old, test_old, trainval_old)
    new_frames = (train_new, val_new, test_new, trainval_new)

    # Result key -> infix inserted after "phrase" in the column name.
    column_infix = {"original": "", "lemma": "_lemma", "stem": "_stem"}

    datasets = {}
    for text_type, infix in column_infix.items():
        base_column = "phrase" + infix
        datasets[text_type] = {
            "old": _vectorise_splits(old_frames, base_column, tfidf_param),
            "new": _vectorise_splits(new_frames, base_column, tfidf_param),
            "generic": _vectorise_splits(new_frames, base_column + "_emoticon_generic", tfidf_param),
            "unique": _vectorise_splits(new_frames, base_column + "_emoticon_unique", tfidf_param),
        }
    return datasets

In [None]:
# Target vectors per processing variant, ordered [train, val, test, train+val]
# to line up with the lists returned by prepare_datasets(). The three new-rule
# variants ("new", "generic", "unique") all share the new-rule labels.
labels = {
    "old": [frame.label for frame in (train_old, val_old, test_old, trainval_old)],
    **{
        variant: [frame.label for frame in (train_new, val_new, test_new, trainval_new)]
        for variant in ("new", "generic", "unique")
    },
}

### Logistic Regression

In [None]:
# Exhaustive search for logistic regression: every TF-IDF setting x text variant
# x model setting is scored on validation (fit on train) and on test (refit on
# train+val). One result row is collected per combination.
model_name = "logreg"
model_fn = LogisticRegression
model_paramgrid = logreg_paramgrid

gridsearch_results = []
for tfidf_param in tqdm(tfidf_paramgrid):
    datasets = prepare_datasets(tfidf_param)

    for type_proc_param in type_proc_paramgrid:
        data_type = type_proc_param["type"]
        data_proc = type_proc_param["processing"]

        # Both containers are ordered [train, val, test, train+val].
        train_set, val_set, test_set, trainval_set = datasets[data_type][data_proc]
        train_label, val_label, test_label, trainval_label = labels[data_proc]

        for model_param in model_paramgrid:
            results = {"model": model_name, **tfidf_param, **type_proc_param, **model_param}

            # (column prefix, fit features, fit labels, eval features, eval labels)
            phases = (
                ("val", train_set, train_label, val_set, val_label),
                ("test", trainval_set, trainval_label, test_set, test_label),
            )
            for prefix, fit_x, fit_y, eval_x, eval_y in phases:
                model = model_fn(**model_param)
                model.fit(fit_x, fit_y)
                metrics = classification_report(eval_y, model.predict(eval_x), output_dict=True)
                results.update({
                    prefix + "_f1_weighted": metrics["weighted avg"]["f1-score"],
                    prefix + "_f1_neg": metrics["-1.0"]["f1-score"],
                    prefix + "_f1_zero": metrics["0.0"]["f1-score"],
                    prefix + "_f1_pos": metrics["1.0"]["f1-score"],
                    prefix + "_accuracy": metrics["accuracy"],
                })
            gridsearch_results.append(results)

# Best validation score first; test score breaks ties.
final_logreg_results = (
    pd.DataFrame.from_records(gridsearch_results)
    .sort_values(by=["val_f1_weighted", "test_f1_weighted"], ascending=False)
)
final_logreg_results.to_csv("model_results/tfidf/logreg.csv", index=False)

### Naive Bayes

In [None]:
# Exhaustive search for multinomial naive Bayes: every TF-IDF setting x text
# variant x model setting is scored on validation (fit on train) and on test
# (refit on train+val). One result row is collected per combination.
model_name = "nb"
model_fn = MultinomialNB
model_paramgrid = nb_paramgrid

gridsearch_results = []
for tfidf_param in tqdm(tfidf_paramgrid):
    datasets = prepare_datasets(tfidf_param)

    for type_proc_param in type_proc_paramgrid:
        data_type = type_proc_param["type"]
        data_proc = type_proc_param["processing"]

        # Both containers are ordered [train, val, test, train+val].
        train_set, val_set, test_set, trainval_set = datasets[data_type][data_proc]
        train_label, val_label, test_label, trainval_label = labels[data_proc]

        for model_param in model_paramgrid:
            results = {"model": model_name, **tfidf_param, **type_proc_param, **model_param}

            # (column prefix, fit features, fit labels, eval features, eval labels)
            phases = (
                ("val", train_set, train_label, val_set, val_label),
                ("test", trainval_set, trainval_label, test_set, test_label),
            )
            for prefix, fit_x, fit_y, eval_x, eval_y in phases:
                model = model_fn(**model_param)
                model.fit(fit_x, fit_y)
                metrics = classification_report(eval_y, model.predict(eval_x), output_dict=True)
                results.update({
                    prefix + "_f1_weighted": metrics["weighted avg"]["f1-score"],
                    prefix + "_f1_neg": metrics["-1.0"]["f1-score"],
                    prefix + "_f1_zero": metrics["0.0"]["f1-score"],
                    prefix + "_f1_pos": metrics["1.0"]["f1-score"],
                    prefix + "_accuracy": metrics["accuracy"],
                })
            gridsearch_results.append(results)

# Best validation score first; test score breaks ties.
final_nb_results = (
    pd.DataFrame.from_records(gridsearch_results)
    .sort_values(by=["val_f1_weighted", "test_f1_weighted"], ascending=False)
)
final_nb_results.to_csv("model_results/tfidf/nb.csv", index=False)

### Random Forest

In [None]:
# Exhaustive search for random forest: every TF-IDF setting x text variant x
# model setting is scored on validation (fit on train) and on test (refit on
# train+val). One result row is collected per combination.
model_name = "rf"
model_fn = RandomForestClassifier
model_paramgrid = rf_paramgrid

gridsearch_results = []
for tfidf_param in tqdm(tfidf_paramgrid):
    datasets = prepare_datasets(tfidf_param)

    for type_proc_param in type_proc_paramgrid:
        data_type = type_proc_param["type"]
        data_proc = type_proc_param["processing"]

        # Both containers are ordered [train, val, test, train+val].
        train_set, val_set, test_set, trainval_set = datasets[data_type][data_proc]
        train_label, val_label, test_label, trainval_label = labels[data_proc]

        for model_param in model_paramgrid:
            results = {"model": model_name, **tfidf_param, **type_proc_param, **model_param}

            # (column prefix, fit features, fit labels, eval features, eval labels)
            phases = (
                ("val", train_set, train_label, val_set, val_label),
                ("test", trainval_set, trainval_label, test_set, test_label),
            )
            for prefix, fit_x, fit_y, eval_x, eval_y in phases:
                model = model_fn(**model_param)
                model.fit(fit_x, fit_y)
                metrics = classification_report(eval_y, model.predict(eval_x), output_dict=True)
                results.update({
                    prefix + "_f1_weighted": metrics["weighted avg"]["f1-score"],
                    prefix + "_f1_neg": metrics["-1.0"]["f1-score"],
                    prefix + "_f1_zero": metrics["0.0"]["f1-score"],
                    prefix + "_f1_pos": metrics["1.0"]["f1-score"],
                    prefix + "_accuracy": metrics["accuracy"],
                })
            gridsearch_results.append(results)

# Best validation score first; test score breaks ties.
final_rf_results = (
    pd.DataFrame.from_records(gridsearch_results)
    .sort_values(by=["val_f1_weighted", "test_f1_weighted"], ascending=False)
)
final_rf_results.to_csv("model_results/tfidf/rf.csv", index=False)

### SVM

In [None]:
# Exhaustive search for the support vector classifier: every TF-IDF setting x
# text variant x model setting is scored on validation (fit on train) and on
# test (refit on train+val). One result row is collected per combination.
model_name = "svm"
model_fn = SVC
model_paramgrid = svm_paramgrid

gridsearch_results = []
for tfidf_param in tqdm(tfidf_paramgrid):
    datasets = prepare_datasets(tfidf_param)

    for type_proc_param in type_proc_paramgrid:
        data_type = type_proc_param["type"]
        data_proc = type_proc_param["processing"]

        # Both containers are ordered [train, val, test, train+val].
        train_set, val_set, test_set, trainval_set = datasets[data_type][data_proc]
        train_label, val_label, test_label, trainval_label = labels[data_proc]

        for model_param in model_paramgrid:
            results = {"model": model_name, **tfidf_param, **type_proc_param, **model_param}

            # (column prefix, fit features, fit labels, eval features, eval labels)
            phases = (
                ("val", train_set, train_label, val_set, val_label),
                ("test", trainval_set, trainval_label, test_set, test_label),
            )
            for prefix, fit_x, fit_y, eval_x, eval_y in phases:
                model = model_fn(**model_param)
                model.fit(fit_x, fit_y)
                metrics = classification_report(eval_y, model.predict(eval_x), output_dict=True)
                results.update({
                    prefix + "_f1_weighted": metrics["weighted avg"]["f1-score"],
                    prefix + "_f1_neg": metrics["-1.0"]["f1-score"],
                    prefix + "_f1_zero": metrics["0.0"]["f1-score"],
                    prefix + "_f1_pos": metrics["1.0"]["f1-score"],
                    prefix + "_accuracy": metrics["accuracy"],
                })
            gridsearch_results.append(results)

# Best validation score first; test score breaks ties.
final_svm_results = (
    pd.DataFrame.from_records(gridsearch_results)
    .sort_values(by=["val_f1_weighted", "test_f1_weighted"], ascending=False)
)
final_svm_results.to_csv("model_results/tfidf/svm.csv", index=False)

### Dummy Classifier

In [None]:
# Baseline search with DummyClassifier, run through the same loop as the real
# models so its rows have the same columns: every TF-IDF setting x text variant
# x model setting is scored on validation (fit on train) and on test (refit on
# train+val).
model_name = "dummy"
model_fn = DummyClassifier
model_paramgrid = dummy_paramgrid

gridsearch_results = []
for tfidf_param in tqdm(tfidf_paramgrid):
    datasets = prepare_datasets(tfidf_param)

    for type_proc_param in type_proc_paramgrid:
        data_type = type_proc_param["type"]
        data_proc = type_proc_param["processing"]

        # Both containers are ordered [train, val, test, train+val].
        train_set, val_set, test_set, trainval_set = datasets[data_type][data_proc]
        train_label, val_label, test_label, trainval_label = labels[data_proc]

        for model_param in model_paramgrid:
            results = {"model": model_name, **tfidf_param, **type_proc_param, **model_param}

            # (column prefix, fit features, fit labels, eval features, eval labels)
            phases = (
                ("val", train_set, train_label, val_set, val_label),
                ("test", trainval_set, trainval_label, test_set, test_label),
            )
            for prefix, fit_x, fit_y, eval_x, eval_y in phases:
                model = model_fn(**model_param)
                model.fit(fit_x, fit_y)
                metrics = classification_report(eval_y, model.predict(eval_x), output_dict=True)

                # This cell used to look up "-1"/"0"/"1" while the other model
                # cells look up "-1.0"/"0.0"/"1.0" on the very same labels, so
                # one of the two spellings must raise KeyError. The per-class
                # keys are the stringified label values, so index them by
                # numeric value to work whether labels load as int or float.
                class_f1 = {
                    float(key): stats["f1-score"]
                    for key, stats in metrics.items()
                    if isinstance(stats, dict) and not key.endswith(" avg")
                }
                results.update({
                    prefix + "_f1_weighted": metrics["weighted avg"]["f1-score"],
                    prefix + "_f1_neg": class_f1[-1.0],
                    prefix + "_f1_zero": class_f1[0.0],
                    prefix + "_f1_pos": class_f1[1.0],
                    prefix + "_accuracy": metrics["accuracy"],
                })
            gridsearch_results.append(results)

# Best validation score first; test score breaks ties.
final_dummy_results = (
    pd.DataFrame.from_records(gridsearch_results)
    .sort_values(by=["val_f1_weighted", "test_f1_weighted"], ascending=False)
)
final_dummy_results.to_csv("model_results/tfidf/dummy.csv", index=False)

### Combine and Save All Results

In [None]:
# Leaderboard of the four trained model families (dummy baseline not included),
# best validation weighted F1 first with test weighted F1 as tie-breaker.
# NOTE(review): the next cell writes the same CSV path again with the dummy rows added.
combined_df = pd.concat(
    [final_logreg_results, final_nb_results, final_svm_results, final_rf_results]
).sort_values(by=["val_f1_weighted", "test_f1_weighted"], ascending=False)
combined_df.to_csv("model_results/tfidf/combined.csv", index=False)

In [None]:
# Leaderboard of all five result tables, dummy baseline included, best
# validation weighted F1 first with test weighted F1 as tie-breaker.
combined_df = pd.concat(
    [
        final_logreg_results,
        final_nb_results,
        final_svm_results,
        final_rf_results,
        final_dummy_results,
    ]
).sort_values(by=["val_f1_weighted", "test_f1_weighted"], ascending=False)
combined_df.to_csv("model_results/tfidf/combined.csv", index=False)

# K-Fold Cross-Validation

## Generate Predictions for Meta Model

In [None]:
def custom_k_fold(model_grid, vectorizer_grid, column, data, model_name,
                  estimator_cls=None, n_folds=5):
    """Write per-fold class probabilities for a downstream meta model.

    For each ``(train, predict_on)`` pair a fresh TF-IDF vectoriser and a fresh
    estimator are fitted on ``train`` and used to predict probabilities for
    ``predict_on``. The positive and negative probability columns are written
    to ``fold_labels/<model_name>_fold<k>.csv`` for the first ``n_folds`` pairs
    and to ``fold_labels/<model_name>_test.csv`` for any later pair.

    Args:
        model_grid: sequence of estimator keyword-argument dicts. Only the last
            entry is used (this matches the previous behaviour, where a loop
            overwrote the estimator on each iteration before fitting).
        vectorizer_grid: sequence of ``TfidfVectorizer`` keyword-argument
            dicts. Only the last entry is used, for the same reason.
        column: name of the text column to vectorise.
        data: iterable of ``(train_frame, predict_frame)`` pairs; each train
            frame must have a ``label`` column.
        model_name: prefix for output column names and file names.
        estimator_cls: estimator class to instantiate. Defaults to the
            notebook-level ``model_fn`` so existing calls keep working.
        n_folds: number of leading pairs treated as cross-validation folds.

    Raises:
        ValueError: if either grid is empty.
    """
    model_configs = list(model_grid)
    vectorizer_configs = list(vectorizer_grid)
    if not model_configs or not vectorizer_configs:
        raise ValueError("model_grid and vectorizer_grid must each contain at least one configuration")
    model_config = model_configs[-1]
    vectorizer_config = vectorizer_configs[-1]

    if estimator_cls is None:
        # Backward compatibility: fall back to the global set by the calling cell.
        estimator_cls = model_fn

    # predict_proba columns follow the estimator's sorted class order.
    # NOTE(review): the neg/neu/pos naming assumes exactly three classes
    # ordered -1, 0, 1 - confirm against the label column.
    probability_columns = [model_name+'_prob_neg', model_name+'_prob_neu', model_name+'_prob_pos']
    # Only these two are saved; the neutral column is dropped by this selection.
    ordered_cols = [model_name+'_prob_pos', model_name+'_prob_neg']

    for fold_num, tf_combi in enumerate(data, start=1):
        train = tf_combi[0]
        predict_on = tf_combi[1]

        # Vectorise: fit on this fold's training split only.
        vectorizer = TfidfVectorizer(**vectorizer_config)
        bow_train = vectorizer.fit_transform(train[column])
        bow_predict_on = vectorizer.transform(predict_on[column])

        # Fit a fresh estimator and predict class probabilities.
        model = estimator_cls(**model_config)
        model.fit(bow_train, train.label)
        predictions = model.predict_proba(bow_predict_on)

        df = pd.DataFrame(data=predictions, columns=probability_columns)[ordered_cols]

        if fold_num <= n_folds:
            path = "fold_labels/" + model_name + '_fold' + str(fold_num) +'.csv'
        else:
            path = "fold_labels/" + model_name + '_test.csv'

        df.to_csv(path, index=False)

In [None]:
# Import Data
fold1 = pd.read_csv('fold_labels/fold1.csv')
fold2 = pd.read_csv('fold_labels/fold2.csv')
fold3 = pd.read_csv('fold_labels/fold3.csv')
fold4 = pd.read_csv('fold_labels/fold4.csv')
fold5 = pd.read_csv('fold_labels/fold5.csv')

train1 = pd.read_csv('fold_labels/train1.csv')
train2 = pd.read_csv('fold_labels/train2.csv')
train3 = pd.read_csv('fold_labels/train3.csv')
train4 = pd.read_csv('fold_labels/train4.csv')
train5 = pd.read_csv('fold_labels/train5.csv')

train_all = pd.read_csv('fold_labels/train_all.csv')
test = pd.read_csv('fold_labels/test.csv')

# store in suitable data structure
data = [(train1, fold1), (train2, fold2),(train3, fold3), (train4, fold4), (train5, fold5), (train_all, test)]

In [3]:
# Single-point model grid: the logistic regression setting that gave the
# highest validation weighted F1.
logreg_params = dict(
    C=[5],
    solver=["lbfgs"],
    penalty=["l2"],
    class_weight=["balanced"],
)

logreg_paramgrid = list(ParameterGrid(logreg_params))

# Single-point TF-IDF grid: the vectoriser setting that gave the highest
# validation weighted F1.
logreg_tfidf_params = dict(
    analyzer=["word"],
    lowercase=[True],
    ngram_range=[(1, 2)],
    max_df=[0.25],
    min_df=[1],
)
logreg_tfidf_paramgrid = list(ParameterGrid(logreg_tfidf_params))

# Best-performing text column.
column = 'phrase_stem_emoticon_generic'

# Estimator class; custom_k_fold reads this notebook-level name.
model_fn = LogisticRegression

# Requires the cell defining custom_k_fold to have been run first.
custom_k_fold(logreg_paramgrid, logreg_tfidf_paramgrid, column, data, "logreg")

NameError: name 'custom_k_fold' is not defined

## Train Full Model

In [5]:
# Refit the chosen vectoriser and logistic regression on all labelled data,
# then persist both so they can be reloaded together for inference.
full_df = pd.read_csv("new_labels/ALL_LABELLED_DATA.csv")

tfidf = TfidfVectorizer(**logreg_tfidf_paramgrid[0])
tfidf_train_emoticon_generic_stem = tfidf.fit_transform(full_df["phrase_stem_emoticon_generic"])

final_model = LogisticRegression(**logreg_paramgrid[0]).fit(
    tfidf_train_emoticon_generic_stem, full_df["label"]
)

vect_pkl_filename = "saved_models/model_logreg_vectorizer.pkl"
model_pkl_filename = "saved_models/model_logreg.pkl"
# Model first, vectoriser second; each file is closed before the next is opened.
for pkl_filename, artefact in ((model_pkl_filename, final_model), (vect_pkl_filename, tfidf)):
    with open(pkl_filename, 'wb') as file:
        pickle.dump(artefact, file)