### Read in predictions from all models

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns


In [None]:
# Base learners whose saved predictions become the meta-model's input columns.
models = ["BERT","fasttext", "logreg", "NB", "RF", "SVM","VADER"]
n_folds = 5

# One accumulator per fold plus the stacked-train and test accumulators; each
# one is widened by one model's columns on every pass of the loop below.
fold_frames = [pd.DataFrame() for _ in range(n_folds)]
meta_model_train = pd.DataFrame()
meta_model_test = pd.DataFrame()

for model in models:
    prediction_dir = f'data/fold_predictions/{model}'

    # Read folds 1..5 in order and attach each to the matching accumulator.
    fold_preds = []
    for fold_no in range(1, n_folds + 1):
        fold_pred = pd.read_csv(f'{prediction_dir}/{model}_fold{fold_no}.csv')
        fold_frames[fold_no - 1] = pd.concat([fold_frames[fold_no - 1], fold_pred], axis=1)
        fold_preds.append(fold_pred)

    # Stack this model's five folds row-wise, then add them as new columns.
    all_pred_train = pd.concat(fold_preds, axis=0)
    meta_model_train = pd.concat([meta_model_train, all_pred_train], axis=1)

    testpred = pd.read_csv(f'{prediction_dir}/{model}_test.csv')
    meta_model_test = pd.concat([meta_model_test, testpred], axis=1)

# Expose the per-fold frames under their individual names as well.
(meta_model_fold_1, meta_model_fold_2, meta_model_fold_3,
 meta_model_fold_4, meta_model_fold_5) = fold_frames

# Positions 0-4 hold the single folds, position 5 holds all folds stacked.
data = [*fold_frames, meta_model_train]
meta_model_train.head()


# Modelling

In [None]:
import pandas as pd
pd.set_option('display.max_rows', 100)
import numpy as np
import utils
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer 
import re
from tqdm import tqdm

from sklearn.model_selection import ParameterGrid

from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score

import warnings
warnings.filterwarnings("ignore")

In [None]:
# logistic regression
import re

import sklearn

# Unpenalised fitting is spelled differently across scikit-learn releases:
# the string "none" was deprecated in 1.2 in favour of None and is rejected
# from 1.4 on, while releases before 1.2 only accept the string. Pick the
# spelling the installed release understands so the grid runs on either.
_sklearn_release = tuple(
    int(part) for part in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups()
)
_no_penalty = None if _sklearn_release >= (1, 2) else "none"

# Every combination of these values is tried by the grid search.
logreg_params = {
    "C": [0.1, 0.5, 1.0, 1.5, 5],
    "solver": ["lbfgs", "newton-cg"],
    "penalty": ["l2", _no_penalty],
    "class_weight": ["balanced", None] 
}
logreg_paramgrid = list(ParameterGrid(logreg_params))

In [None]:
# naive bayes
# Candidate values for the `alpha` keyword passed to the model constructor.
nb_alphas = [0, 0.001, 0.01, 0.1, 0.25, 0.5, 1]
nb_params = dict(alpha=nb_alphas)
# Expand into a list with one keyword dict per candidate.
nb_paramgrid = [*ParameterGrid(nb_params)]

In [None]:
# svm
# Search space: every combination of the values below is expanded into one
# keyword dict for the model constructor.
svm_params = dict(
    C=[0.1, 0.5, 1.0, 1.5, 5],
    kernel=["poly", "rbf", "sigmoid"],
    gamma=["scale", "auto"],
    class_weight=["balanced", None],
)

svm_paramgrid = [*ParameterGrid(svm_params)]

In [None]:
# random forest
rf_params = [
    {
        "criterion": ["gini", "entropy"],
        "min_samples_split": [2, 5, 10],
        "class_weight": ["balanced", "balanced_subsample", None],
        # "auto" was dropped from this list: for RandomForestClassifier it was
        # an alias of "sqrt" (so it doubled the grid with identical models),
        # and scikit-learn 1.3 removed the value altogether.
        "max_features": ["sqrt"],
        "min_samples_leaf": [1, 2, 4]
    }
]

# One keyword dict per combination of the values above.
rf_paramgrid = list(ParameterGrid(rf_params))

In [None]:
# dummy baseline: a single setting, so the expanded grid has exactly one entry
dummy_params = dict(strategy=["prior"])
dummy_paramgrid = [*ParameterGrid(dummy_params)]

# Grid Search 

### Logistic Regression/NB/SVM/DummyClassifier

In [None]:
def _fit_and_score(model_fn, model_param, fit_frame, eval_frame):
    """Fit a fresh model on one frame and score its predictions on another.

    Both frames are read the same way: every column except the last is a
    feature and the ``label`` column is the target. The class keys looked up
    in the report ("-1.0", "0.0", "1.0") must be present, otherwise a
    ``KeyError`` is raised.

    Returns a dict with the weighted F1, the per-class F1 scores and the
    accuracy taken from ``classification_report``.
    """
    model = model_fn(**model_param)
    model.fit(fit_frame.iloc[:, :-1], fit_frame.label)
    predictions = model.predict(eval_frame.iloc[:, :-1])
    report = classification_report(eval_frame.label, predictions, output_dict=True)
    return {
        "f1_weighted": report["weighted avg"]["f1-score"],
        "f1_neg": report["-1.0"]["f1-score"],
        "f1_zero": report["0.0"]["f1-score"],
        "f1_pos": report["1.0"]["f1-score"],
        "accuracy": report["accuracy"],
    }


def models_grid_search(model_name, model_fn, model_paramgrid,data, test) : 
    """Score every parameter setting by 5-fold validation and on the test frame.

    Parameters
    ----------
    model_name
        Value stored under the "model" key of every result row.
    model_fn
        Callable invoked as ``model_fn(**params)``; the returned object is
        used through ``fit(X, y)`` and ``predict(X)``.
    model_paramgrid
        Iterable of keyword dicts, one per setting to evaluate.
    data
        Sequence of frames: ``data[0]`` .. ``data[4]`` are the folds and
        ``data[5]`` is the frame the model is fitted on before test scoring.
    test
        Frame scored after fitting on ``data[5]``.

    Returns
    -------
    list of dict
        One row per setting holding "model", the setting's parameters, the
        ``val_*`` scores averaged over the five folds and the ``test_*``
        scores. Each row is also printed.

    Notes
    -----
    The fit on ``data[5]`` does not depend on the fold, so it is done once
    per setting instead of once per fold. For estimators whose fit is
    deterministic this yields the same test scores as averaging five
    identical refits (up to floating-point rounding of that average).
    """
    n_folds = 5
    score_names = ("f1_weighted", "f1_neg", "f1_zero", "f1_pos", "accuracy")
    trainval_set = data[5]

    # The splits do not depend on the parameters either: build them once.
    fold_splits = []
    for i in range(n_folds):
        train_set = pd.concat([data[x] for x in range(n_folds) if x != i], axis=0)
        fold_splits.append((train_set, data[i]))

    gridsearch_results = []
    for model_param in model_paramgrid:
        # Train on four folds, validate on the held-out one.
        fold_scores = []
        for i, (train_set, val_set) in enumerate(fold_splits):
            print(f"fold {i}")
            fold_scores.append(_fit_and_score(model_fn, model_param, train_set, val_set))

        # Train on all folds, evaluate on the test frame.
        test_scores = _fit_and_score(model_fn, model_param, trainval_set, test)

        results = { "model": model_name }
        results.update(model_param)
        results.update({f"val_{name}": np.mean([scores[name] for scores in fold_scores])
                        for name in score_names})
        results.update({f"test_{name}": np.float64(test_scores[name])
                        for name in score_names})
        print(results)
        gridsearch_results.append(results)
    return gridsearch_results
        

In [None]:
# Run the grid search for each meta-model candidate in a fixed order:
# name -> (constructor, parameter grid).
search_specs = {
    "logreg": (LogisticRegression, logreg_paramgrid),
    "nb": (MultinomialNB, nb_paramgrid),
    "svm": (SVC, svm_paramgrid),
    "dummy": (DummyClassifier, dummy_paramgrid),
}

search_frames = {}
for model_name, (model_fn, model_paramgrid) in search_specs.items():
    search_rows = models_grid_search(model_name, model_fn, model_paramgrid, data, meta_model_test)
    search_frames[model_name] = pd.DataFrame.from_records(search_rows)

# One result frame per candidate, one row per parameter setting.
final_logreg_results = search_frames["logreg"]
final_nb_results = search_frames["nb"]
final_svm_results = search_frames["svm"]
final_dummy_results = search_frames["dummy"]

In [None]:
# Rank every result frame best-first: validation weighted F1 is the primary
# key and test weighted F1 breaks ties.
ranking_keys = ["val_f1_weighted", "test_f1_weighted"]
(final_logreg_results, final_nb_results,
 final_svm_results, final_dummy_results) = [
    frame.sort_values(by=ranking_keys, ascending=False)
    for frame in (final_logreg_results, final_nb_results,
                  final_svm_results, final_dummy_results)
]

In [None]:
# best model: first row of the logistic-regression result frame
final_logreg_results.iloc[0]

In [None]:
# first row of the naive-Bayes result frame
final_nb_results.iloc[0]

In [None]:
# first row of the SVM result frame
final_svm_results.iloc[0]

In [None]:
# first row of the dummy-baseline result frame
final_dummy_results.iloc[0]

### Train on all data & save model

In [None]:
# Stack the training rows on top of the test rows, then show the columns.
frames_to_stack = [meta_model_train, meta_model_test]
alldata = pd.concat(frames_to_stack, axis=0)
alldata.columns

In [None]:
# save model trained on all data
import os
import pickle

# Row 0 of the logistic-regression results holds the chosen setting; convert
# it to a dict once and keep only the constructor keywords.
best_row = final_logreg_results.iloc[0,:].to_dict()
best_param = {name: best_row[name] for name in ("C", "class_weight", "penalty", "solver")}
final_model = LogisticRegression(**best_param)
# Every column except the last is a feature; `label` is the target.
final_model.fit(alldata.iloc[:,:-1], alldata.label)

model_pkl_filename = "saved_models/model_meta.pkl"
# open(..., 'wb') does not create missing parent directories, so make sure
# the target folder exists before writing.
os.makedirs(os.path.dirname(model_pkl_filename), exist_ok=True)
with open(model_pkl_filename, 'wb') as file:
    pickle.dump(final_model, file)

### Get F1-Score by Aspect

In [None]:
# Distinct values of the `aspect` column, in order of first appearance.
aspects = meta_model_train["aspect"].unique().tolist()

In [None]:
def models_grid_search_aspect(model_name,train, test) : 
    """Fit the chosen logistic regression and report test scores per aspect.

    Parameters
    ----------
    model_name
        Accepted for symmetry with the other search helper; not used here.
    train
        Frame the model is fitted on. Must have ``aspect`` and ``label``
        columns; all columns except the last two are used as features.
    test
        Frame the model is evaluated on, laid out like ``train``.

    Returns
    -------
    pandas.DataFrame
        Columns "Aspects", "Labels" and "Predictions" for every test row.
        A classification report is printed for each aspect found in
        ``train``.
    """
    # train on train_val model test on test
    # best params determined previously
    # NOTE(review): features are sliced with [:-2] here but with [:-1] in the
    # other cells of this notebook - confirm which column layout is intended.
    model = LogisticRegression(C=0.1,class_weight="balanced",penalty="l2", solver="lbfgs")
    model.fit(train.iloc[:,:-2], train.label)
    test_pred = model.predict(test.iloc[:,:-2])
    df= pd.DataFrame({"Aspects":test.aspect,"Labels":test.label,"Predictions":test_pred})    

    print("Train on Training-Val (all folds) test on Test Data")
    # Take the aspect list from the training frame passed in rather than from
    # a notebook-level variable, so the function works on its own.
    for aspect in train.aspect.unique().tolist():
        print(f"Aspect = {aspect}")
        rows_for_aspect = df.Aspects == aspect
        test_label_aspect = df.loc[rows_for_aspect,"Labels"]
        test_pred_aspect = df.loc[rows_for_aspect,"Predictions"]
        print(classification_report(test_label_aspect, test_pred_aspect))

    return df
        

In [None]:
# Per-aspect report: fit on the stacked training folds, score on the test frame.
model_name = "logreg"
output = models_grid_search_aspect(
    model_name=model_name,
    train=meta_model_train,
    test=meta_model_test,
)

#### Correlation of predictions

In [None]:
plt.figure(figsize=(16, 6))
# Compute the correlation matrix once and reuse it for the mask and the plot.
# Only numeric/boolean columns are correlated: recent pandas releases raise on
# non-numeric columns instead of silently skipping them.
# NOTE(review): `alldata` has an `aspect` column whose dtype is not visible
# here - confirm it is meant to be left out of the heatmap.
correlations = alldata.select_dtypes(include=["number", "bool"]).corr()
# define the mask to set the values in the upper triangle to True
# (the builtin `bool` replaces the `np.bool` alias, which NumPy removed)
mask = np.triu(np.ones_like(correlations, dtype=bool))
heatmap = sns.heatmap(correlations, mask=mask, vmin=-1, vmax=1, annot=True)
heatmap.set_title('Correlation of Predictions of Base Models', fontdict={'fontsize':18}, pad=16);

### Feature Importance

In [None]:
import pickle
# Open the file in a context manager so the handle is closed after loading.
# NOTE: unpickling can execute code stored in the file - only load model
# files that come from a trusted source.
with open("saved_models/model_meta.pkl", "rb") as model_file:
    final_model = pickle.load(model_file)

In [None]:
# get importance
importance = final_model.coef_
classes = final_model.classes_
# Feature names only; the intercept is added per class below so this list
# keeps the same length as one row of `importance`.
features = alldata.columns.tolist()[:-1]
dictionary = {"Class":[],"Feature":[],"Score":[]}
# summarize feature importance
# Rows of coef_ / intercept_ follow the order of classes_, so they are
# addressed by position. Indexing with the class label itself would be wrong:
# a label of -1 would select the last row instead of the first.
for row, j in enumerate(classes):
    intercept = final_model.intercept_[row]
    print(f"Class = {j}")
    print(f"Intercept: {intercept}")
    coeff = list(importance[row]) + [intercept]
    for i,v in zip(features + ["intercept"],coeff):
        dictionary["Class"].append(j)
        dictionary["Feature"].append(i)
        dictionary["Score"].append(v)
        print(f'Feature: {i}, Score: {v}')

    print()
# Long-format table: one row per (class, feature-or-intercept) pair.
feature_importance_dictionary = pd.DataFrame(dictionary)

In [None]:
plt.figure(figsize=(5,10))
plt.subplot(1, 1, 1)

### CHANGE THIS PART ####
# Rows of coef_ are ordered like classes_, so look the row up by class label;
# a plain [-1] index would pick the last class, not the class labelled -1.
class_row = list(final_model.classes_).index(-1.0)
class_weights = final_model.coef_[class_row]
# `features` may end with an "intercept" entry that has no coefficient, so
# keep only as many names as there are weights.
plt.barh(features[:len(class_weights)], class_weights, height=0.8,
         color=["#E3242B" if x<0 else "#00AB6B" for x in class_weights])
plt.title('Class = -1.0 top features')
plt.ylabel('Features')
plt.xlabel('Weight')
########################## 


plt.show()

In [None]:
plt.figure(figsize=(5,10))
plt.subplot(1, 1, 1)

# Rows of coef_ are ordered like classes_, so look the row up by class label
# instead of assuming the label doubles as the row index.
class_row = list(final_model.classes_).index(0.0)
class_weights = final_model.coef_[class_row]
# `features` may end with an "intercept" entry that has no coefficient, so
# keep only as many names as there are weights. Bar colours follow the sign
# of the weights that are actually plotted.
plt.barh(features[:len(class_weights)], class_weights, height=0.8,
         color=["#E3242B" if x<0 else "#00AB6B" for x in class_weights])
plt.title('Class = 0.0 top features')
plt.ylabel('Features')
plt.xlabel('Weight')
 
plt.show()

In [None]:
plt.figure(figsize=(5,10))
plt.subplot(1, 1, 1)

# Rows of coef_ are ordered like classes_, so look the row up by class label
# instead of assuming the label doubles as the row index.
class_row = list(final_model.classes_).index(1.0)
class_weights = final_model.coef_[class_row]
# `features` may end with an "intercept" entry that has no coefficient, so
# keep only as many names as there are weights. Bar colours follow the sign
# of the weights that are actually plotted.
plt.barh(features[:len(class_weights)], class_weights, height=0.8,
         color=["#E3242B" if x<0 else "#00AB6B" for x in class_weights])
plt.title('Class = 1.0 top features')
plt.ylabel('Features')
plt.xlabel('Weight')

plt.show()