In [None]:
import pickle
import pandas as pd
pd.set_option('display.max_rows', 100)
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer 
import re
from tqdm import tqdm

from sklearn.model_selection import ParameterGrid

from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report

# text explainer
from sklearn.pipeline import make_pipeline
import eli5
from eli5.lime import TextExplainer
from lime import lime_text
from lime.lime_text import LimeTextExplainer
from collections import defaultdict

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt

In [None]:
# Read the three pre-split labelled tables; the first CSV column is used as the row index.
train, val, test = (
    pd.read_csv(f"data/labelled_data/{split}_newpreproc_emoticon.csv", index_col=0)
    for split in ("train", "val", "test")
)

# Train and validation rows stacked together (train rows first).
trainval = pd.concat([train, val])

# Modelling

## Initialise Parameters

In [None]:
# TF-IDF vectoriser settings to sweep: word n-grams up to trigrams crossed with
# document-frequency cut-offs.
tfidf_params = dict(
    analyzer=["word"],
    lowercase=[True],
    ngram_range=[(1, 1), (1, 2), (1, 3)],
    max_df=[0.25, 0.5, 1.0],
    min_df=[1, 10, 20],
)
tfidf_paramgrid = list(ParameterGrid(tfidf_params))

# Which pre-processed text column to vectorise: token form ("type": normal, stem, lemma)
# crossed with the phrase variant ("processing": original, generic, unique).
type_proc_params = dict(
    type=["normal", "stem", "lemma"],
    processing=["original", "generic", "unique"],
)
type_proc_paramgrid = list(ParameterGrid(type_proc_params))

In [None]:
# Logistic-regression hyper-parameters to sweep.
# NOTE(review): with penalty "none" the C value presumably has no effect, so those
# combinations would repeat the same fit — confirm against the installed scikit-learn.
logreg_params = dict(
    C=[0.1, 0.5, 1.0, 1.5, 5],
    solver=["lbfgs", "newton-cg"],
    penalty=["l2", "none"],
    class_weight=["balanced", None],
)
logreg_paramgrid = list(ParameterGrid(logreg_params))

In [None]:
# Naive Bayes: only the smoothing parameter alpha is swept.
nb_params = dict(alpha=[0, 0.001, 0.01, 0.1, 0.25, 0.5, 1])
nb_paramgrid = list(ParameterGrid(nb_params))

In [None]:
# SVM hyper-parameters to sweep (non-linear kernels only).
svm_params = dict(
    C=[0.1, 0.5, 1.0, 1.5, 5],
    kernel=["poly", "rbf", "sigmoid"],
    gamma=["scale", "auto"],
    class_weight=["balanced", None],
)

svm_paramgrid = list(ParameterGrid(svm_params))

In [None]:
# Random-forest hyper-parameters to sweep, given as a one-element list of grids.
# NOTE(review): "auto" and "sqrt" may select the same number of features for a
# classifier, which would duplicate half of this grid — confirm for the installed version.
rf_params = [
    dict(
        criterion=["gini", "entropy"],
        min_samples_split=[2, 5, 10],
        class_weight=["balanced", "balanced_subsample", None],
        max_features=["auto", "sqrt"],
        min_samples_leaf=[1, 2, 4],
    )
]

rf_paramgrid = list(ParameterGrid(rf_params))

In [None]:
# Baseline classifier: a single configuration using the "prior" strategy.
dummy_params = dict(strategy=["prior"])
dummy_paramgrid = list(ParameterGrid(dummy_params))

## Grid Search

In [None]:
def prepare_datasets(tfidf_param):
    """Vectorise every text variant of the labelled data with one TF-IDF setting.

    Reads the module-level ``train``, ``val``, ``test`` and ``trainval`` frames.
    For each of the nine text columns a fresh ``TfidfVectorizer(**tfidf_param)``
    is fitted on the train column only and then applied to the val, test and
    trainval columns.

    Args:
        tfidf_param: keyword arguments for ``TfidfVectorizer``.

    Returns:
        Nested dict ``result[type][processing]`` where ``type`` is "normal",
        "lemma" or "stem" and ``processing`` is "original", "generic" or
        "unique". Each value is the list
        ``[train_matrix, val_matrix, test_matrix, trainval_matrix]``.
    """
    # Column names are assembled as <token-form stem><phrase-variant suffix>.
    column_stems = {"normal": "phrase", "lemma": "phrase_lemma", "stem": "phrase_stem"}
    column_suffixes = {
        "original": "",
        "generic": "_emoticon_generic",
        "unique": "_emoticon_unique",
    }

    vectorised = {}
    for data_type, stem in column_stems.items():
        per_processing = {}
        for data_proc, suffix in column_suffixes.items():
            text_col = stem + suffix
            vectoriser = TfidfVectorizer(**tfidf_param)
            # Vocabulary and IDF weights come from the train split only.
            train_matrix = vectoriser.fit_transform(train[text_col])
            per_processing[data_proc] = [
                train_matrix,
                vectoriser.transform(val[text_col]),
                vectoriser.transform(test[text_col]),
                vectoriser.transform(trainval[text_col]),
            ]
        vectorised[data_type] = per_processing

    return vectorised

In [None]:
# Label series in [train, val, test, trainval] order, keyed by phrase variant.
# Every variant uses the same labels; only the text column differs.
labels = {
    data_proc: [train.label, val.label, test.label, trainval.label]
    for data_proc in ("original", "generic", "unique")
}

### Logistic Regression

In [None]:
# Logistic-regression grid search: every TF-IDF setting x text variant x model setting.
model_name = "logreg"
model_fn = LogisticRegression
model_paramgrid = logreg_paramgrid

gridsearch_results = []
for tfidf_param in tqdm(tfidf_paramgrid):
    # Re-vectorise all text variants once per TF-IDF setting.
    datasets = prepare_datasets(tfidf_param)

    for type_proc_param in type_proc_paramgrid:
        data_type, data_proc = type_proc_param["type"], type_proc_param["processing"]

        # Matrices and labels are stored in [train, val, test, trainval] order.
        train_set, val_set, test_set, trainval_set = datasets[data_type][data_proc]
        train_label, val_label, test_label, trainval_label = labels[data_proc]

        for model_param in model_paramgrid:
            results = {"model": model_name}
            for settings in (tfidf_param, type_proc_param, model_param):
                results.update(settings)

            # Pass 1: fit on train, score on val.
            # Pass 2: fresh model fit on train+val, score on test.
            passes = (
                ("val", train_set, train_label, val_set, val_label),
                ("test", trainval_set, trainval_label, test_set, test_label),
            )
            for split, fit_X, fit_y, eval_X, eval_y in passes:
                model = model_fn(**model_param)
                model.fit(fit_X, fit_y)
                report = classification_report(eval_y, model.predict(eval_X), output_dict=True)
                results.update({
                    split + "_f1_weighted": report["weighted avg"]["f1-score"],
                    split + "_f1_neg": report["-1.0"]["f1-score"],
                    split + "_f1_zero": report["0.0"]["f1-score"],
                    split + "_f1_pos": report["1.0"]["f1-score"],
                    split + "_accuracy": report["accuracy"],
                })
            gridsearch_results.append(results)

# Best validation score first; test score breaks ties.
final_logreg_results = (
    pd.DataFrame.from_records(gridsearch_results)
    .sort_values(by=["val_f1_weighted", "test_f1_weighted"], ascending=False)
)
# final_logreg_results.to_csv("model_results/tfidf/logreg.csv", index=False)

### Naive Bayes

In [None]:
# Naive Bayes grid search: every TF-IDF setting x text variant x model setting.
model_name = "nb"
model_fn = MultinomialNB
model_paramgrid = nb_paramgrid

gridsearch_results = []
for tfidf_param in tqdm(tfidf_paramgrid):
    # Re-vectorise all text variants once per TF-IDF setting.
    datasets = prepare_datasets(tfidf_param)

    for type_proc_param in type_proc_paramgrid:
        data_type, data_proc = type_proc_param["type"], type_proc_param["processing"]

        # Matrices and labels are stored in [train, val, test, trainval] order.
        train_set, val_set, test_set, trainval_set = datasets[data_type][data_proc]
        train_label, val_label, test_label, trainval_label = labels[data_proc]

        for model_param in model_paramgrid:
            results = {"model": model_name}
            for settings in (tfidf_param, type_proc_param, model_param):
                results.update(settings)

            # Pass 1: fit on train, score on val.
            # Pass 2: fresh model fit on train+val, score on test.
            passes = (
                ("val", train_set, train_label, val_set, val_label),
                ("test", trainval_set, trainval_label, test_set, test_label),
            )
            for split, fit_X, fit_y, eval_X, eval_y in passes:
                model = model_fn(**model_param)
                model.fit(fit_X, fit_y)
                report = classification_report(eval_y, model.predict(eval_X), output_dict=True)
                results.update({
                    split + "_f1_weighted": report["weighted avg"]["f1-score"],
                    split + "_f1_neg": report["-1.0"]["f1-score"],
                    split + "_f1_zero": report["0.0"]["f1-score"],
                    split + "_f1_pos": report["1.0"]["f1-score"],
                    split + "_accuracy": report["accuracy"],
                })
            gridsearch_results.append(results)

# Best validation score first; test score breaks ties.
final_nb_results = (
    pd.DataFrame.from_records(gridsearch_results)
    .sort_values(by=["val_f1_weighted", "test_f1_weighted"], ascending=False)
)
final_nb_results.to_csv("model_results/tfidf/nb.csv", index=False)

### Random Forest

In [None]:
# Random-forest grid search: every TF-IDF setting x text variant x model setting.
# No random_state is passed, so scores can vary between runs.
model_name = "rf"
model_fn = RandomForestClassifier
model_paramgrid = rf_paramgrid

gridsearch_results = []
for tfidf_param in tqdm(tfidf_paramgrid):
    # Re-vectorise all text variants once per TF-IDF setting.
    datasets = prepare_datasets(tfidf_param)

    for type_proc_param in type_proc_paramgrid:
        data_type, data_proc = type_proc_param["type"], type_proc_param["processing"]

        # Matrices and labels are stored in [train, val, test, trainval] order.
        train_set, val_set, test_set, trainval_set = datasets[data_type][data_proc]
        train_label, val_label, test_label, trainval_label = labels[data_proc]

        for model_param in model_paramgrid:
            results = {"model": model_name}
            for settings in (tfidf_param, type_proc_param, model_param):
                results.update(settings)

            # Pass 1: fit on train, score on val.
            # Pass 2: fresh model fit on train+val, score on test.
            passes = (
                ("val", train_set, train_label, val_set, val_label),
                ("test", trainval_set, trainval_label, test_set, test_label),
            )
            for split, fit_X, fit_y, eval_X, eval_y in passes:
                model = model_fn(**model_param)
                model.fit(fit_X, fit_y)
                report = classification_report(eval_y, model.predict(eval_X), output_dict=True)
                results.update({
                    split + "_f1_weighted": report["weighted avg"]["f1-score"],
                    split + "_f1_neg": report["-1.0"]["f1-score"],
                    split + "_f1_zero": report["0.0"]["f1-score"],
                    split + "_f1_pos": report["1.0"]["f1-score"],
                    split + "_accuracy": report["accuracy"],
                })
            gridsearch_results.append(results)

# Best validation score first; test score breaks ties.
final_rf_results = (
    pd.DataFrame.from_records(gridsearch_results)
    .sort_values(by=["val_f1_weighted", "test_f1_weighted"], ascending=False)
)
final_rf_results.to_csv("model_results/tfidf/rf.csv", index=False)

### SVM

In [None]:
# SVM grid search: every TF-IDF setting x text variant x model setting.
model_name = "svm"
model_fn = SVC
model_paramgrid = svm_paramgrid

gridsearch_results = []
for tfidf_param in tqdm(tfidf_paramgrid):
    # Re-vectorise all text variants once per TF-IDF setting.
    datasets = prepare_datasets(tfidf_param)

    for type_proc_param in type_proc_paramgrid:
        data_type, data_proc = type_proc_param["type"], type_proc_param["processing"]

        # Matrices and labels are stored in [train, val, test, trainval] order.
        train_set, val_set, test_set, trainval_set = datasets[data_type][data_proc]
        train_label, val_label, test_label, trainval_label = labels[data_proc]

        for model_param in model_paramgrid:
            results = {"model": model_name}
            for settings in (tfidf_param, type_proc_param, model_param):
                results.update(settings)

            # Pass 1: fit on train, score on val.
            # Pass 2: fresh model fit on train+val, score on test.
            passes = (
                ("val", train_set, train_label, val_set, val_label),
                ("test", trainval_set, trainval_label, test_set, test_label),
            )
            for split, fit_X, fit_y, eval_X, eval_y in passes:
                model = model_fn(**model_param)
                model.fit(fit_X, fit_y)
                report = classification_report(eval_y, model.predict(eval_X), output_dict=True)
                results.update({
                    split + "_f1_weighted": report["weighted avg"]["f1-score"],
                    split + "_f1_neg": report["-1.0"]["f1-score"],
                    split + "_f1_zero": report["0.0"]["f1-score"],
                    split + "_f1_pos": report["1.0"]["f1-score"],
                    split + "_accuracy": report["accuracy"],
                })
            gridsearch_results.append(results)

# Best validation score first; test score breaks ties.
final_svm_results = (
    pd.DataFrame.from_records(gridsearch_results)
    .sort_values(by=["val_f1_weighted", "test_f1_weighted"], ascending=False)
)
final_svm_results.to_csv("model_results/tfidf/svm.csv", index=False)

### Dummy Classifier

In [None]:
# Baseline (DummyClassifier) grid search, run over the same TF-IDF settings and text
# variants as the real models so its rows line up with theirs in the combined table.
model_name = "dummy"
model_fn = DummyClassifier
model_paramgrid = dummy_paramgrid

gridsearch_results = []
for tfidf_param in tqdm(tfidf_paramgrid):
    # Re-vectorise all text variants once per TF-IDF setting.
    datasets = prepare_datasets(tfidf_param)

    for type_proc_param in type_proc_paramgrid:
        data_type = type_proc_param["type"]
        data_proc = type_proc_param["processing"]

        # Matrices and labels are stored in [train, val, test, trainval] order.
        train_set, val_set, test_set, trainval_set = datasets[data_type][data_proc]
        train_label, val_label, test_label, trainval_label = labels[data_proc]

        for model_param in model_paramgrid:
            results = {"model": model_name}
            results.update(tfidf_param)
            results.update(type_proc_param)
            results.update(model_param)

            # Pass 1: fit on train, score on val.
            # Pass 2: fresh model fit on train+val, score on test.
            passes = (
                ("val", train_set, train_label, val_set, val_label),
                ("test", trainval_set, trainval_label, test_set, test_label),
            )
            for split, fit_X, fit_y, eval_X, eval_y in passes:
                model = model_fn(**model_param)
                model.fit(fit_X, fit_y)
                report = classification_report(eval_y, model.predict(eval_X), output_dict=True)

                # classification_report keys each class by str(label), so the key is
                # "-1" for integer labels but "-1.0" for float labels. This cell used
                # "-1"/"0"/"1" while the other grid-search cells use "-1.0"/"0.0"/"1.0"
                # on the same label series, so one spelling must raise KeyError.
                # Index the per-class F1 by numeric value so either spelling works.
                f1_by_class = {}
                for key, entry in report.items():
                    try:
                        class_value = float(key)
                    except ValueError:
                        # "accuracy", "macro avg" and "weighted avg" are not class rows.
                        continue
                    f1_by_class[class_value] = entry["f1-score"]

                results.update({
                    split + "_f1_weighted": report["weighted avg"]["f1-score"],
                    split + "_f1_neg": f1_by_class[-1.0],
                    split + "_f1_zero": f1_by_class[0.0],
                    split + "_f1_pos": f1_by_class[1.0],
                    split + "_accuracy": report["accuracy"],
                })
            gridsearch_results.append(results)

# Best validation score first; test score breaks ties.
final_dummy_results = pd.DataFrame.from_records(gridsearch_results)
final_dummy_results = final_dummy_results.sort_values(by=["val_f1_weighted", "test_f1_weighted"], ascending=False)
final_dummy_results.to_csv("model_results/tfidf/dummy.csv", index=False)

### Combine and Save All Results

In [None]:
# Stack the four real models' grid-search tables (no dummy baseline), best validation
# score first, and write them out.
model_result_frames = [final_logreg_results, final_nb_results, final_svm_results, final_rf_results]
combined_df = pd.concat(model_result_frames).sort_values(
    by=["val_f1_weighted", "test_f1_weighted"], ascending=False
)
combined_df.to_csv("model_results/tfidf/combined.csv", index=False)

In [None]:
# Same combination as the previous cell but with the dummy baseline appended.
# This writes to the same path, so it replaces the file written there.
all_result_frames = [
    final_logreg_results,
    final_nb_results,
    final_svm_results,
    final_rf_results,
    final_dummy_results,
]
combined_df = pd.concat(all_result_frames).sort_values(
    by=["val_f1_weighted", "test_f1_weighted"], ascending=False
)
combined_df.to_csv("model_results/tfidf/combined.csv", index=False)

# K-Fold Cross-Validation

## Generate Predictions for Meta Model

In [None]:
def custom_k_fold(model_grid, vectorizer_grid, column, data, model_name, estimator_cls=None):
    """Fit one model per (train, predict_on) pair and save its class probabilities.

    For every pair in ``data`` a fresh TF-IDF vectoriser is fitted on the training
    frame's ``column``, a fresh estimator is fitted on the resulting matrix and the
    training frame's ``label`` column, and ``predict_proba`` on the second frame is
    written to CSV with the columns ``<model_name>_prob_pos`` and
    ``<model_name>_prob_neg`` (the neutral probability is not saved).

    Args:
        model_grid: iterable of estimator keyword-argument dicts. Only the LAST
            entry is used, which is what the previous loop-based code did.
        vectorizer_grid: iterable of ``TfidfVectorizer`` keyword-argument dicts.
            Only the LAST entry is used.
        column: name of the text column to vectorise.
        data: iterable of ``(train_frame, predict_on_frame)`` pairs. Pairs 1-5 are
            saved as ``<model_name>_fold<k>.csv``; any later pair is saved as
            ``<model_name>_test.csv``.
        model_name: prefix for the output columns and part of the output path.
        estimator_cls: estimator class to instantiate. Defaults to the module-level
            ``model_fn``, which is what this function always used before.

    Raises:
        ValueError: if either grid is empty.
    """
    model_param_sets = list(model_grid)
    vectorizer_param_sets = list(vectorizer_grid)
    if not model_param_sets or not vectorizer_param_sets:
        raise ValueError("model_grid and vectorizer_grid must each contain at least one parameter set")

    if estimator_cls is None:
        # Backward compatible: callers set the global `model_fn` before calling.
        estimator_cls = model_fn

    # The old code looped over each grid but fitted outside the loop, so only the
    # final entry ever took effect. Select it explicitly.
    vectorizer_params = vectorizer_param_sets[-1]
    model_params = model_param_sets[-1]

    # predict_proba columns are labelled in this order.
    # NOTE(review): assumes the fitted classes are ordered (-1, 0, 1) — TODO confirm.
    prob_cols = [model_name+'_prob_neg', model_name+'_prob_neu', model_name+'_prob_pos']
    ordered_cols = [model_name+'_prob_pos',model_name+'_prob_neg']

    for fold_num, tf_combi in enumerate(data, start=1):
        train = tf_combi[0]
        predict_on = tf_combi[1]

        # Vectorise: vocabulary and IDF come from this fold's training frame only.
        vectorizer = TfidfVectorizer(**vectorizer_params)
        bow_train = vectorizer.fit_transform(train[column])
        bow_predict_on = vectorizer.transform(predict_on[column])

        # Fit and predict class probabilities.
        model = estimator_cls(**model_params)
        model.fit(bow_train, train.label)
        predictions = model.predict_proba(bow_predict_on)

        # Keep only the positive and negative probabilities, in that order. The old
        # `df.drop(...)` call discarded its result; this selection is what drops neutral.
        df = pd.DataFrame(data=predictions, columns=prob_cols)[ordered_cols]

        if fold_num <=5:
            path = "data/fold_predictions/" + model_name + "/" + model_name + '_fold' + str(fold_num) +'.csv'
        else:
            path = "data/fold_predictions/" + model_name + "/" + model_name + '_test.csv'

        df.to_csv(path, index=False)

In [None]:
# Load the stacking folds: prediction frames fold1..fold5, then their matching
# training frames train1..train5.
fold1, fold2, fold3, fold4, fold5 = (
    pd.read_csv(f'data/stacking_folds/fold{k}.csv') for k in range(1, 6)
)
train1, train2, train3, train4, train5 = (
    pd.read_csv(f'data/stacking_folds/train{k}.csv') for k in range(1, 6)
)

train_all = pd.read_csv('data/stacking_folds/train_all.csv')
# NOTE: this rebinds the module-level `test` that prepare_datasets reads, so the
# grid-search cells above use a different test frame if re-run after this cell.
test = pd.read_csv('data/stacking_folds/test.csv')

# (training frame, frame to predict on) pairs: five folds, then train_all -> test.
data = list(zip(
    (train1, train2, train3, train4, train5),
    (fold1, fold2, fold3, fold4, fold5),
)) + [(train_all, test)]

In [None]:
# Logistic-regression setting that gave the highest validation weighted F1, kept as a
# single-entry grid because custom_k_fold takes grids.
# NOTE: this rebinds logreg_params / logreg_paramgrid from the grid-search section.
logreg_params = dict(
    C=[5],
    solver=["lbfgs"],
    penalty=["l2"],
    class_weight=["balanced"],
)
logreg_paramgrid = list(ParameterGrid(logreg_params))

# TF-IDF vectoriser setting that gave the highest validation weighted F1.
logreg_tfidf_params = dict(
    analyzer=["word"],
    lowercase=[True],
    ngram_range=[(1, 2)],
    max_df=[0.25],
    min_df=[1],
)
logreg_tfidf_paramgrid = list(ParameterGrid(logreg_tfidf_params))

# Best-performing text column.
column = 'phrase_stem_emoticon_unique'

# custom_k_fold reads the estimator class from the module-level `model_fn`.
model_fn = LogisticRegression

custom_k_fold(logreg_paramgrid, logreg_tfidf_paramgrid, column, data, "logreg")

## Train Full Model

In [None]:
# Refit the chosen vectoriser and logistic regression on all labelled rows.
full_df = pd.read_csv("data/stacking_folds/all_labelled_data.csv")

tfidf = TfidfVectorizer(**logreg_tfidf_paramgrid[0])
tfidf_train_emoticon_unique_stem = tfidf.fit_transform(full_df.phrase_stem_emoticon_unique)

final_model = LogisticRegression(**logreg_paramgrid[0])
final_model.fit(tfidf_train_emoticon_unique_stem, full_df.label)

# Persist the model first, then the vectoriser.
vect_pkl_filename = "saved_models/model_logreg_vectorizer.pkl"
model_pkl_filename = "saved_models/model_logreg.pkl"
for pkl_path, artefact in ((model_pkl_filename, final_model), (vect_pkl_filename, tfidf)):
    with open(pkl_path, 'wb') as file:
        pickle.dump(artefact, file)

# Class probabilities for the same rows the model was fitted on.
transformed_text = tfidf.transform(full_df.phrase_stem_emoticon_unique)
model_pred = final_model.predict_proba(transformed_text)

model_name = "logreg"
# NOTE(review): column labels assume predict_proba orders classes as (-1, 0, 1) — confirm.
df = pd.DataFrame(
    data=model_pred,
    columns=[model_name+'_prob_neg', model_name+'_prob_neu', model_name+'_prob_pos'],
)
# Selecting these two columns is what removes the neutral probability.
ordered_cols = [model_name+'_prob_pos',model_name+'_prob_neg']
df = df[ordered_cols]

df.to_csv("data/fold_predictions/logreg/logreg_all.csv", index=False)

# Model Evaluation

## Stacked Model

In [None]:
meta_df = pd.read_csv("data/explain_results/meta_model_feature_importance.csv")

# One frame per value of the Class column: -1.0, 0.0 and 1.0.
meta_df_neg, meta_df_neu, meta_df_pos = (
    meta_df.loc[meta_df["Class"] == class_value] for class_value in (-1.0, 0.0, 1.0)
)

In [None]:
# Three side-by-side horizontal bar charts of the meta-model feature scores, one per class.
plt.figure(figsize=(20, 10))

panels = (
    (meta_df_neg, 'y=-1.0 top features'),
    (meta_df_neu, 'y=0.0 top features'),
    (meta_df_pos, 'y=1.0 top features'),
)
for slot, (frame, heading) in enumerate(panels, start=1):
    plt.subplot(1, 3, slot)
    # Negative scores in red, zero and positive scores in green.
    bar_colors = ["#E3242B" if score < 0 else "#00AB6B" for score in frame.Score]
    plt.barh(frame.Feature, frame.Score, height=0.8, color=bar_colors)
    plt.title(heading)
    plt.ylabel('Features')
    plt.xlabel('Weight')

plt.tight_layout(pad=3.0)
plt.show()

## Logistic Regression

In [None]:
# Load the stacking train and test frames and stack them under a fresh 0..n-1 index.
all_train = pd.read_csv('data/stacking_folds/train_all.csv', header = 0)
all_test = pd.read_csv('data/stacking_folds/test.csv', header = 0)
full_df = pd.concat([all_train, all_test], axis=0).reset_index().drop('index', axis=1)
full_df_subset = full_df[["new_aspect_1", "phrase_stem_emoticon_unique", "label"]]

# Load the saved vectoriser and model. Context managers close the file handles,
# which the previous `pickle.load(open(...))` form left open.
# NOTE(security): unpickling executes arbitrary code; only load files you produced yourself.
vect_pkl_filename = "saved_models/model_logreg_vectorizer.pkl"
model_pkl_filename = "saved_models/model_logreg.pkl"
with open(vect_pkl_filename, "rb") as vect_file:
    lr_vectorizer = pickle.load(vect_file)
with open(model_pkl_filename, "rb") as model_file:
    lr_model = pickle.load(model_file)

In [None]:
# Top-20 feature weights of the saved logistic regression, as a DataFrame.
lr_eval = eli5.explain_weights_df(lr_model, vec=lr_vectorizer, top=20)

# One frame per value of the target column: -1.0, 0.0 and 1.0.
lr_eval_neg, lr_eval_neu, lr_eval_pos = (
    lr_eval.loc[lr_eval["target"] == target_value] for target_value in (-1.0, 0.0, 1.0)
)

# Save the full weight table.
lr_eval.to_csv("data/explain_results/logreg_lime.csv")

In [None]:
# Three side-by-side horizontal bar charts of the logistic-regression weights, one per class.
plt.figure(figsize=(20, 10))

panels = (
    (lr_eval_neg, 'y=-1.0 top features'),
    (lr_eval_neu, 'y=0.0 top features'),
    (lr_eval_pos, 'y=1.0 top features'),
)
for slot, (frame, heading) in enumerate(panels, start=1):
    plt.subplot(1, 3, slot)
    # Negative weights in red, zero and positive weights in green.
    bar_colors = ["#E3242B" if w < 0 else "#00AB6B" for w in frame.weight]
    plt.barh(frame.feature, frame.weight, height=0.8, color=bar_colors)
    plt.title(heading)
    plt.ylabel('Features')
    plt.xlabel('Weight')

plt.show()

## Random Forest

In [None]:
rf_eval = pd.read_csv("data/explain_results/rf_lime.csv")

# Absolute impact per class, used only for ranking.
for sentiment in ("neg", "neu", "pos"):
    rf_eval["mag_" + sentiment] = np.abs(rf_eval["average_" + sentiment + "_impact"])

# The 20 rows with the largest absolute impact for each class.
rf_eval_neg, rf_eval_neu, rf_eval_pos = (
    rf_eval.nlargest(20, "mag_" + suffix) for suffix in ("neg", "neu", "pos")
)

In [None]:
# Three side-by-side horizontal bar charts of the random-forest token impacts, one per class.
plt.figure(figsize=(20, 10))

panels = (
    (rf_eval_neg, "average_neg_impact", 'y=-1.0 top features'),
    (rf_eval_neu, "average_neu_impact", 'y=0.0 top features'),
    (rf_eval_pos, "average_pos_impact", 'y=1.0 top features'),
)
for slot, (frame, impact_col, heading) in enumerate(panels, start=1):
    plt.subplot(1, 3, slot)
    impacts = frame[impact_col]
    # Negative impacts in red, zero and positive impacts in green.
    bar_colors = ["#E3242B" if value < 0 else "#00AB6B" for value in impacts]
    plt.barh(frame.token, impacts, height=0.8, color=bar_colors)
    plt.title(heading)
    plt.ylabel('Features')
    plt.xlabel('Weight')

plt.show()

## SVM

In [None]:
svm_eval = pd.read_csv("data/explain_results/svm_lime.csv")

# Absolute impact per class, used only for ranking.
for sentiment in ("neg", "neu", "pos"):
    svm_eval["mag_" + sentiment] = np.abs(svm_eval["average_" + sentiment + "_impact"])

# The 20 rows with the largest absolute impact for each class.
svm_eval_neg, svm_eval_neu, svm_eval_pos = (
    svm_eval.nlargest(20, "mag_" + suffix) for suffix in ("neg", "neu", "pos")
)

In [None]:
# Three side-by-side horizontal bar charts of the SVM token impacts, one per class.
plt.figure(figsize=(20, 10))

panels = (
    (svm_eval_neg, "average_neg_impact", 'y=-1.0 top features'),
    (svm_eval_neu, "average_neu_impact", 'y=0.0 top features'),
    (svm_eval_pos, "average_pos_impact", 'y=1.0 top features'),
)
for slot, (frame, impact_col, heading) in enumerate(panels, start=1):
    plt.subplot(1, 3, slot)
    impacts = frame[impact_col]
    # Negative impacts in red, zero and positive impacts in green.
    bar_colors = ["#E3242B" if value < 0 else "#00AB6B" for value in impacts]
    plt.barh(frame.token, impacts, height=0.8, color=bar_colors)
    plt.title(heading)
    plt.ylabel('Features')
    plt.xlabel('Weight')

plt.show()

## NB

In [None]:
nb_eval = pd.read_csv("data/explain_results/nb_lime.csv")

# Absolute impact per class, used only for ranking.
for sentiment in ("neg", "neu", "pos"):
    nb_eval["mag_" + sentiment] = np.abs(nb_eval["average_" + sentiment + "_impact"])

# The 20 rows with the largest absolute impact for each class.
nb_eval_neg, nb_eval_neu, nb_eval_pos = (
    nb_eval.nlargest(20, "mag_" + suffix) for suffix in ("neg", "neu", "pos")
)

In [None]:
# Three side-by-side horizontal bar charts of the naive Bayes token impacts, one per class.
plt.figure(figsize=(20, 10))

panels = (
    (nb_eval_neg, "average_neg_impact", 'y=-1.0 top features'),
    (nb_eval_neu, "average_neu_impact", 'y=0.0 top features'),
    (nb_eval_pos, "average_pos_impact", 'y=1.0 top features'),
)
for slot, (frame, impact_col, heading) in enumerate(panels, start=1):
    plt.subplot(1, 3, slot)
    impacts = frame[impact_col]
    # Negative impacts in red, zero and positive impacts in green.
    bar_colors = ["#E3242B" if value < 0 else "#00AB6B" for value in impacts]
    plt.barh(frame.token, impacts, height=0.8, color=bar_colors)
    plt.title(heading)
    plt.ylabel('Features')
    plt.xlabel('Weight')

plt.show()