### Read in predictions from all models

In [13]:
import pandas as pd
import numpy as np

In [90]:
# Base models whose saved predictions become the meta-model's input columns.
models = ["fasttext", "logreg", "NB", "RF", "SVM","VADER"]
N_FOLDS = 5  # files are named <model>_fold1.csv ... <model>_fold5.csv

# Gather every frame first and concatenate once per target, instead of
# re-concatenating onto a growing DataFrame for each model.
fold_parts = [[] for _ in range(N_FOLDS)]  # fold_parts[k] holds one frame per base model
train_parts = []
test_parts = []

for model in models:
    model_folds = [
        pd.read_csv(f'fold_predictions/{model}/{model}_fold{fold}.csv')
        for fold in range(1, N_FOLDS + 1)
    ]
    for parts, fold_pred in zip(fold_parts, model_folds):
        parts.append(fold_pred)

    # All folds of one model stacked row-wise form that model's training columns.
    train_parts.append(pd.concat(model_folds, axis=0))
    test_parts.append(pd.read_csv(f'fold_predictions/{model}/{model}_test.csv'))

# Side-by-side (axis=1) concatenation: one block of columns per base model.
# NOTE(review): this relies on every model's CSV listing the same rows in the
# same order -- confirm against the code that wrote fold_predictions/.
(meta_model_fold_1, meta_model_fold_2, meta_model_fold_3,
 meta_model_fold_4, meta_model_fold_5) = [pd.concat(parts, axis=1) for parts in fold_parts]
meta_model_train = pd.concat(train_parts, axis=1)
meta_model_test = pd.concat(test_parts, axis=1)

# The five folds followed by the full training frame (last element).
data = [meta_model_fold_1,meta_model_fold_2,meta_model_fold_3,meta_model_fold_4,meta_model_fold_5,meta_model_train]
meta_model_train.head()


Unnamed: 0,fasttext_prob_pos,fasttext_prob_neg,logreg_prob_pos,logreg_prob_neg,NB_prob_pos,NB_prob_neg,RF_prob_pos,RF_prob_neg,SVM_prob_pos,SVM_prob_neg,VADER_prob_pos,VADER_prob_neg,label
0,0.563547,0.214631,0.939938,0.004414,0.833302,0.006866,0.95,0.0,0.761402,0.014256,0.592,0.0,1.0
1,0.573885,0.212415,0.99843,0.000544,0.823375,0.052906,0.940909,0.0,0.884794,0.020397,1.0,0.0,1.0
2,0.368451,0.287994,0.479545,0.127199,0.548471,0.098711,0.711548,0.251786,0.675172,0.081166,0.145,0.15,0.0
3,0.311945,0.353513,0.172376,0.229518,0.043481,0.038845,0.319444,0.416667,0.60038,0.136811,0.154,0.203,1.0
4,0.431019,0.244249,0.740021,0.065695,0.867578,0.029769,0.825,0.0,0.852314,0.023726,0.765,0.0,1.0


# Modelling

In [91]:
import pickle
import re
import warnings

import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import ParameterGrid
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

import utils

pd.set_option('display.max_rows', 100)
warnings.filterwarnings("ignore")

In [92]:
# Logistic regression: search space, expanded into one dict per combination.
# NOTE(review): the string "none" for `penalty` may not be accepted by every
# scikit-learn release -- confirm against the installed version.
logreg_params = dict(
    C=[0.1, 0.5, 1.0, 1.5, 5],
    solver=["lbfgs", "newton-cg"],
    penalty=["l2", "none"],
    class_weight=["balanced", None],
)
logreg_paramgrid = [*ParameterGrid(logreg_params)]

In [93]:
# Naive Bayes: only the smoothing strength `alpha` is searched.
nb_params = dict(alpha=[0, 0.001, 0.01, 0.1, 0.25, 0.5, 1])
nb_paramgrid = [*ParameterGrid(nb_params)]

In [94]:
# SVM: search space over regularisation, kernel, gamma and class weighting.
svm_params = dict(
    C=[0.1, 0.5, 1.0, 1.5, 5],
    kernel=["poly", "rbf", "sigmoid"],
    gamma=["scale", "auto"],
    class_weight=["balanced", None],
)

svm_paramgrid = [*ParameterGrid(svm_params)]

In [95]:
# Random forest: search space, expanded into one dict per combination.
# "auto" was dropped from `max_features`: for classifiers it selected the same
# behaviour as "sqrt" (so every configuration was searched twice), and recent
# scikit-learn releases reject it outright.
rf_params = [
    {
        "criterion": ["gini", "entropy"],
        "min_samples_split": [2, 5, 10],
        "class_weight": ["balanced", "balanced_subsample", None],
        "max_features": ["sqrt"],
        "min_samples_leaf": [1, 2, 4]
    }
]

rf_paramgrid = list(ParameterGrid(rf_params))

In [96]:
# Baseline: a DummyClassifier configuration using the "prior" strategy only.
dummy_params = dict(strategy=["prior"])
dummy_paramgrid = [*ParameterGrid(dummy_params)]

# Grid Search 

### Logistic Regression/NB/SVM/DummyClassifier

In [98]:
# Metric names in the order they appear in each result record.
_METRIC_KEYS = ("f1_weighted", "f1_neg", "f1_zero", "f1_pos", "accuracy")


def _score_predictions(y_true, y_pred):
    """Return accuracy, weighted F1 and per-class F1 for one set of predictions.

    The per-class entries are looked up under the report keys "-1.0", "0.0"
    and "1.0", so a KeyError is raised if the labels do not stringify to
    exactly those values.
    """
    report = classification_report(y_true, y_pred, output_dict=True)
    return {
        "f1_weighted": report["weighted avg"]["f1-score"],
        "f1_neg": report["-1.0"]["f1-score"],
        "f1_zero": report["0.0"]["f1-score"],
        "f1_pos": report["1.0"]["f1-score"],
        "accuracy": report["accuracy"],
    }


def models_grid_search(model_name, model_fn, model_paramgrid, data, test):
    """Evaluate every parameter setting by cross-validation and on the test set.

    Parameters
    ----------
    model_name : str
        Label stored under the "model" key of every result record.
    model_fn : callable
        Estimator factory; called as ``model_fn(**params)`` and expected to
        return an object with ``fit`` and ``predict``.
    model_paramgrid : iterable of dict
        One keyword-argument dict per configuration to try.
    data : sequence of DataFrame
        The cross-validation folds followed by the full training frame as the
        LAST element (for the 5-fold setup: ``data[0:5]`` folds, ``data[5]``
        train+val).
    test : DataFrame
        Held-out test frame.

    Every frame is split as "all columns but the last" = features and the
    ``label`` attribute = target, so ``label`` is assumed to be the last column.

    Returns
    -------
    list of dict
        One record per parameter setting: the model name, the parameters, the
        fold-averaged ``val_*`` metrics and the ``test_*`` metrics.

    Notes
    -----
    The train+val -> test model does not depend on the fold, so it is fitted
    once per parameter setting rather than once per fold. For a deterministic
    estimator this matches averaging identical refits; a randomised estimator
    is scored on a single fit.
    """
    folds, trainval_set = data[:-1], data[-1]
    gridsearch_results = []

    for model_param in model_paramgrid:
        # Cross-validation: hold out each fold in turn, train on the others.
        val_scores = []
        for i, val_set in enumerate(folds):
            print(f"fold {i}")
            train_set = pd.concat(
                [fold for x, fold in enumerate(folds) if x != i], axis=0
            )
            model = model_fn(**model_param)
            model.fit(train_set.iloc[:, :-1], train_set.label)
            val_pred = model.predict(val_set.iloc[:, :-1])
            val_scores.append(_score_predictions(val_set.label, val_pred))

        # Final check: fit on train+val and score on the held-out test set.
        model = model_fn(**model_param)
        model.fit(trainval_set.iloc[:, :-1], trainval_set.label)
        test_pred = model.predict(test.iloc[:, :-1])
        test_scores = _score_predictions(test.label, test_pred)

        results = {"model": model_name}
        results.update(model_param)
        results.update({
            f"val_{key}": np.mean([score[key] for score in val_scores])
            for key in _METRIC_KEYS
        })
        results.update({f"test_{key}": test_scores[key] for key in _METRIC_KEYS})
        print(results)
        gridsearch_results.append(results)

    return gridsearch_results
        

In [99]:
# Run the grid search for each meta-model candidate, in this order.
search_plan = [
    ("logreg", LogisticRegression, logreg_paramgrid),
    ("nb", MultinomialNB, nb_paramgrid),
    ("svm", SVC, svm_paramgrid),
    ("dummy", DummyClassifier, dummy_paramgrid),
]

search_frames = {}
for model_name, model_fn, model_paramgrid in search_plan:
    records = models_grid_search(model_name, model_fn, model_paramgrid, data, meta_model_test)
    search_frames[model_name] = pd.DataFrame.from_records(records)

# One results frame per candidate, one row per parameter setting.
final_logreg_results = search_frames["logreg"]
final_nb_results = search_frames["nb"]
final_svm_results = search_frames["svm"]
final_dummy_results = search_frames["dummy"]

fold 0
fold 1
fold 2
fold 3
fold 4
{'model': 'logreg', 'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'lbfgs', 'val_f1_weighted': 0.746695166870537, 'val_f1_neg': 0.6313157508765863, 'val_f1_zero': 0.7131100652928284, 'val_f1_pos': 0.8004077480119083, 'val_accuracy': 0.7441714190855625, 'test_f1_weighted': 0.7411064175083611, 'test_f1_neg': 0.6844444444444445, 'test_f1_zero': 0.6549707602339181, 'test_f1_pos': 0.795566502463054, 'test_accuracy': 0.7329032258064516}
fold 0
fold 1
fold 2
fold 3
fold 4
{'model': 'logreg', 'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'newton-cg', 'val_f1_weighted': 0.746695166870537, 'val_f1_neg': 0.6313157508765863, 'val_f1_zero': 0.7131100652928284, 'val_f1_pos': 0.8004077480119083, 'val_accuracy': 0.7441714190855625, 'test_f1_weighted': 0.7411064175083611, 'test_f1_neg': 0.6844444444444445, 'test_f1_zero': 0.6549707602339181, 'test_f1_pos': 0.795566502463054, 'test_accuracy': 0.7329032258064516}
fold 0
fold 1
fold 2

fold 1
fold 2
fold 3
fold 4
{'model': 'logreg', 'C': 1.0, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'newton-cg', 'val_f1_weighted': 0.7463427912918318, 'val_f1_neg': 0.6430514946584338, 'val_f1_zero': 0.7052704865663203, 'val_f1_pos': 0.8024644292037199, 'val_accuracy': 0.7437277556440902, 'test_f1_weighted': 0.7409234732884576, 'test_f1_neg': 0.6608695652173913, 'test_f1_zero': 0.6573705179282869, 'test_f1_pos': 0.7995110024449877, 'test_accuracy': 0.7329032258064516}
fold 0
fold 1
fold 2
fold 3
fold 4
{'model': 'logreg', 'C': 1.0, 'class_weight': 'balanced', 'penalty': 'none', 'solver': 'lbfgs', 'val_f1_weighted': 0.7483964671067317, 'val_f1_neg': 0.647764495439006, 'val_f1_zero': 0.706957506055032, 'val_f1_pos': 0.8034955934458685, 'val_accuracy': 0.7457579270220706, 'test_f1_weighted': 0.7365689349250536, 'test_f1_neg': 0.6440677966101696, 'test_f1_zero': 0.6560636182902584, 'test_f1_pos': 0.7965474722564736, 'test_accuracy': 0.727741935483871}
fold 0
fold 1
fold 2
fol

fold 1
fold 2
fold 3
fold 4
{'model': 'logreg', 'C': 5, 'class_weight': 'balanced', 'penalty': 'none', 'solver': 'lbfgs', 'val_f1_weighted': 0.7483964671067317, 'val_f1_neg': 0.647764495439006, 'val_f1_zero': 0.706957506055032, 'val_f1_pos': 0.8034955934458685, 'val_accuracy': 0.7457579270220706, 'test_f1_weighted': 0.7365689349250536, 'test_f1_neg': 0.6440677966101696, 'test_f1_zero': 0.6560636182902584, 'test_f1_pos': 0.7965474722564736, 'test_accuracy': 0.727741935483871}
fold 0
fold 1
fold 2
fold 3
fold 4
{'model': 'logreg', 'C': 5, 'class_weight': 'balanced', 'penalty': 'none', 'solver': 'newton-cg', 'val_f1_weighted': 0.7480132892103636, 'val_f1_neg': 0.6467452116924497, 'val_f1_zero': 0.706647000621187, 'val_f1_pos': 0.8031782650487547, 'val_accuracy': 0.7453611016252452, 'test_f1_weighted': 0.7376867930861163, 'test_f1_neg': 0.6468085106382978, 'test_f1_zero': 0.6587301587301587, 'test_f1_pos': 0.7965474722564736, 'test_accuracy': 0.7290322580645161}
fold 0
fold 1
fold 2
fold 3

fold 1
fold 2
fold 3
fold 4
{'model': 'svm', 'C': 0.1, 'class_weight': 'balanced', 'gamma': 'auto', 'kernel': 'sigmoid', 'val_f1_weighted': 0.7337033104585011, 'val_f1_neg': 0.5856589079955642, 'val_f1_zero': 0.7133276663059742, 'val_f1_pos': 0.7872283406139594, 'val_accuracy': 0.7339651267944097, 'test_f1_weighted': 0.7106641668241552, 'test_f1_neg': 0.5714285714285714, 'test_f1_zero': 0.6422338568935427, 'test_f1_pos': 0.7755610972568577, 'test_accuracy': 0.7032258064516129}
fold 0
fold 1
fold 2
fold 3
fold 4
{'model': 'svm', 'C': 0.1, 'class_weight': None, 'gamma': 'scale', 'kernel': 'poly', 'val_f1_weighted': 0.738561272253816, 'val_f1_neg': 0.5525540452322116, 'val_f1_zero': 0.7131756407268677, 'val_f1_pos': 0.80457585788937, 'val_accuracy': 0.7415904793524316, 'test_f1_weighted': 0.7204292109485354, 'test_f1_neg': 0.5696969696969697, 'test_f1_zero': 0.6514084507042254, 'test_f1_pos': 0.7882496940024479, 'test_accuracy': 0.7148387096774194}
fold 0
fold 1
fold 2
fold 3
fold 4
{'mod

fold 2
fold 3
fold 4
{'model': 'svm', 'C': 0.5, 'class_weight': None, 'gamma': 'auto', 'kernel': 'sigmoid', 'val_f1_weighted': 0.7375260137636245, 'val_f1_neg': 0.551244824016563, 'val_f1_zero': 0.7062294863140011, 'val_f1_pos': 0.8086811526620471, 'val_accuracy': 0.7420198507557074, 'test_f1_weighted': 0.7367112230132427, 'test_f1_neg': 0.5679012345679012, 'test_f1_zero': 0.6641929499072357, 'test_f1_pos': 0.8103651354534748, 'test_accuracy': 0.7341935483870968}
fold 0
fold 1
fold 2
fold 3
fold 4
{'model': 'svm', 'C': 1.0, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'poly', 'val_f1_weighted': 0.7483360094668317, 'val_f1_neg': 0.6292842681077975, 'val_f1_zero': 0.711743425590711, 'val_f1_pos': 0.8036926328815264, 'val_accuracy': 0.7461706507304117, 'test_f1_weighted': 0.7258111304146629, 'test_f1_neg': 0.6255506607929515, 'test_f1_zero': 0.6421663442940039, 'test_f1_pos': 0.7890818858560794, 'test_accuracy': 0.7161290322580646}
fold 0
fold 1
fold 2
fold 3
fold 4
{'model': '

fold 1
fold 2
fold 3
fold 4
{'model': 'svm', 'C': 1.5, 'class_weight': 'balanced', 'gamma': 'auto', 'kernel': 'rbf', 'val_f1_weighted': 0.7467141631514742, 'val_f1_neg': 0.6375903017096671, 'val_f1_zero': 0.7021935662912799, 'val_f1_pos': 0.8066878890652378, 'val_accuracy': 0.744496015936255, 'test_f1_weighted': 0.7382826045068706, 'test_f1_neg': 0.672566371681416, 'test_f1_zero': 0.6495049504950494, 'test_f1_pos': 0.796092796092796, 'test_accuracy': 0.7303225806451613}
fold 0
fold 1
fold 2
fold 3
fold 4
{'model': 'svm', 'C': 1.5, 'class_weight': 'balanced', 'gamma': 'auto', 'kernel': 'sigmoid', 'val_f1_weighted': 0.7485610175599126, 'val_f1_neg': 0.6469109849336462, 'val_f1_zero': 0.7074948359549403, 'val_f1_pos': 0.8042204797090193, 'val_accuracy': 0.7465182191867451, 'test_f1_weighted': 0.7422702177355501, 'test_f1_neg': 0.6788990825688075, 'test_f1_zero': 0.6576402321083171, 'test_f1_pos': 0.7975460122699386, 'test_accuracy': 0.7341935483870968}
fold 0
fold 1
fold 2
fold 3
fold 4
{

fold 2
fold 3
fold 4
{'model': 'svm', 'C': 5, 'class_weight': None, 'gamma': 'auto', 'kernel': 'poly', 'val_f1_weighted': 0.6944041993887443, 'val_f1_neg': 0.4631345939816304, 'val_f1_zero': 0.6927635748867876, 'val_f1_pos': 0.7585568654661322, 'val_accuracy': 0.7008699835578321, 'test_f1_weighted': 0.669657178119548, 'test_f1_neg': 0.4225352112676056, 'test_f1_zero': 0.6166134185303515, 'test_f1_pos': 0.751918158567775, 'test_accuracy': 0.6670967741935484}
fold 0
fold 1
fold 2
fold 3
fold 4
{'model': 'svm', 'C': 5, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'val_f1_weighted': 0.7422739156960168, 'val_f1_neg': 0.6036617775269489, 'val_f1_zero': 0.6917638480741116, 'val_f1_pos': 0.813303096488878, 'val_accuracy': 0.7451286188579018, 'test_f1_weighted': 0.7677776075147473, 'test_f1_neg': 0.6519337016574586, 'test_f1_zero': 0.692, 'test_f1_pos': 0.8308400460299195, 'test_accuracy': 0.7651612903225806}
fold 0
fold 1
fold 2
fold 3
fold 4
{'model': 'svm', 'C': 5, 'class_weight':

In [102]:
# Rank each results frame best-first: validation weighted F1, then test weighted F1.
rank_keys = ["val_f1_weighted", "test_f1_weighted"]
final_logreg_results, final_nb_results, final_svm_results, final_dummy_results = [
    frame.sort_values(by=rank_keys, ascending=False)
    for frame in (final_logreg_results, final_nb_results, final_svm_results, final_dummy_results)
]

In [105]:
# Top-ranked logistic-regression configuration (first row after sorting).
final_logreg_results.iloc[0]

model                  logreg
C                           5
class_weight             None
penalty                    l2
solver              newton-cg
val_f1_weighted      0.750722
val_f1_neg           0.608237
val_f1_zero          0.709539
val_f1_pos           0.815549
val_accuracy         0.753149
test_f1_weighted     0.752375
test_f1_neg          0.644444
test_f1_zero         0.679537
test_f1_pos          0.812207
test_accuracy        0.748387
Name: 37, dtype: object

In [106]:
# Top-ranked naive Bayes configuration (first row after sorting).
final_nb_results.iloc[0]

model                     nb
alpha                      0
val_f1_weighted     0.595045
val_f1_neg           0.57154
val_f1_zero         0.409293
val_f1_pos          0.738794
val_accuracy        0.626335
test_f1_weighted    0.637336
test_f1_neg         0.573171
test_f1_zero        0.357724
test_f1_pos         0.786627
test_accuracy       0.661935
Name: 0, dtype: object

In [107]:
# Top-ranked SVM configuration (first row after sorting).
final_svm_results.iloc[0]

model                    svm
C                          1
class_weight        balanced
gamma                   auto
kernel               sigmoid
val_f1_weighted     0.749395
val_f1_neg          0.649645
val_f1_zero         0.711167
val_f1_pos          0.802676
val_accuracy        0.747336
test_f1_weighted    0.740961
test_f1_neg         0.672811
test_f1_zero         0.65896
test_f1_pos         0.796069
test_accuracy       0.732903
Name: 29, dtype: object

In [108]:
# Baseline (DummyClassifier) scores, for comparison with the real models.
final_dummy_results.iloc[0]

model                  dummy
strategy               prior
val_f1_weighted     0.330713
val_f1_neg                 0
val_f1_zero                0
val_f1_pos           0.66364
val_accuracy        0.497176
test_f1_weighted    0.431247
test_f1_neg                0
test_f1_zero               0
test_f1_pos         0.737785
test_accuracy       0.584516
Name: 0, dtype: object

### Train on all data

In [None]:
# Read each base model's "_all" prediction file and place them side by side.
models = ["fasttext", "logreg", "NB", "RF", "SVM","VADER"]

all_pred_frames = []
for model in models:
    all_pred_frames.append(pd.read_csv(f'fold_predictions/{model}/{model}_all.csv'))

final_meta_model_train = pd.concat(all_pred_frames, axis=1)

In [None]:
# Fit the stacking meta-model with the configuration the grid search ranked
# first. `final_logreg_results` keeps the original grid position as its row
# label through sorting, so the label of its first row indexes straight into
# `logreg_paramgrid`. (Previously the first grid entry was used regardless of
# the search outcome.)
best_logreg_position = final_logreg_results.index[0]
final_model = LogisticRegression(**logreg_paramgrid[best_logreg_position])
# Features are every column but the last; the target is the `label` column.
final_model.fit(final_meta_model_train.iloc[:,:-1], final_meta_model_train.label)

# Only the fitted estimator is persisted: the meta-model consumes base-model
# predictions directly, so there is no vectorizer to save alongside it.
model_pkl_filename = "saved_models/model_stacking_metamodel.pkl"
with open(model_pkl_filename, 'wb') as file:
    pickle.dump(final_model, file)

#### Save Model

In [None]:
# Save a standalone TF-IDF + logistic-regression model trained on all labelled data.
# NOTE(review): `logreg_tfidf_paramgrid` is not defined anywhere in the visible
# part of this notebook, so this cell raises NameError on a fresh kernel. It
# looks like it was carried over from a different notebook -- confirm whether
# it belongs here.
# NOTE(review): this rebinds `final_model`, replacing the stacking meta-model
# fitted in the previous cell.
full_df = pd.read_csv("new_labels/ALL_LABELLED_DATA.csv")

# Vectorize the `phrase_stem_emoticon_generic` text column with the first
# TF-IDF configuration of the grid.
tfidf = TfidfVectorizer(**logreg_tfidf_paramgrid[0])
tfidf_train_emoticon_generic_stem = tfidf.fit_transform(full_df.phrase_stem_emoticon_generic)

# Uses the first entry of `logreg_paramgrid`, not a searched-for best setting.
final_model = LogisticRegression(**logreg_paramgrid[0])
final_model.fit(tfidf_train_emoticon_generic_stem, full_df.label)

# Persist the fitted vectorizer and classifier as two separate pickle files.
vect_pkl_filename = "saved_models/model_logreg_vectorizer.pkl"
model_pkl_filename = "saved_models/model_logreg.pkl"
with open(model_pkl_filename, 'wb') as file:
    pickle.dump(final_model, file)
with open(vect_pkl_filename, 'wb') as file:
    pickle.dump(tfidf, file)