In [None]:
import pandas as pd
pd.set_option('display.max_rows', 100)
import numpy as np
import utils

from sklearn.feature_extraction.text import CountVectorizer 
import re
from tqdm import tqdm

from sklearn.model_selection import ParameterGrid

from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score

import warnings
warnings.filterwarnings("ignore")

Import Datasets

In [None]:
# Load the pre-split train / validation / test sets (paths are relative to the
# notebook's working directory).
train = pd.read_csv("new_labels/train_newpreproc_emoticon.csv")
val = pd.read_csv("new_labels/val_newpreproc_emoticon.csv")
test = pd.read_csv("new_labels/test_newpreproc_emoticon.csv")

# Train and validation rows stacked together; the grid-search cells refit on this
# before scoring on `test`. The original row labels are kept (no ignore_index).
trainval =pd.concat([train, val])

In [None]:
# Preview the first rows of the training split as a sanity check on the load.
train.head()

# Modelling

Initialise Parameters

In [None]:
# CountVectorizer settings to search: 3 n-gram ranges x 3 max_df x 3 min_df = 27 combinations.
bow_params = {
    "analyzer": ["word"],
    "lowercase": [True],
    "ngram_range": [(1,1), (1,2), (1,3)],
    "max_df": [0.25, 0.5, 1.0],
    "min_df": [1, 10, 20]
}
bow_paramgrid = list(ParameterGrid(bow_params))

# Text variants searched by the SVM cell: raw and stemmed text, each with three emoticon treatments.
# NOTE(review): despite the `_lr` suffix, this dict only feeds `type_proc_paramgrid_svm`.
# Every (type, processing) pair listed here is looked up in the dict returned by the
# dataset-preparation function, so that function must provide all of them.
type_proc_params_lr = {
    "type": ["original","stem"],
    "processing": ["no_emoji", "emoji_generic", "emoji_unique"]
}
type_proc_paramgrid_svm = list(ParameterGrid(type_proc_params_lr))

# Text variants searched by the remaining model cells: stemmed text only.
type_proc_params_others = {
    "type": ["stem"],
    "processing": ["no_emoji", "emoji_generic", "emoji_unique"]
}
type_proc_paramgrid_others = list(ParameterGrid(type_proc_params_others))

In [None]:
# Logistic regression search space: 5 C values x 2 solvers x 2 penalties x 2 class weightings = 40 combinations.
# NOTE(review): the string "none" is the older scikit-learn spelling of an unpenalised fit;
# newer releases expect `None` instead - confirm against the installed version.
# NOTE(review): presumably `C` has no effect when the penalty is disabled, so those
# combinations would be duplicates - verify before trimming the grid.
logreg_params = {
    "C": [0.1, 0.5, 1.0, 1.5, 5],
    "solver": ["lbfgs", "newton-cg"],
    "penalty": ["l2", "none"],
    "class_weight": ["balanced", None] 
}
logreg_paramgrid = list(ParameterGrid(logreg_params))

In [None]:
# Multinomial naive Bayes search space: 7 smoothing strengths.
# NOTE(review): alpha=0 presumably disables smoothing entirely; warnings are globally
# silenced in this notebook, so any library warning about it would not be shown.
nb_params = {
    "alpha": [0, 0.001, 0.01, 0.1, 0.25, 0.5, 1]
}
nb_paramgrid = list(ParameterGrid(nb_params))

In [None]:
# SVM search space: 5 C values x 3 kernels x 2 gamma modes x 2 class weightings = 60 combinations.
svm_params = {
    "C": [0.1, 0.5, 1.0, 1.5, 5],
    "kernel": ["poly", "rbf", "sigmoid"],
    "gamma": ["scale", "auto"],
    "class_weight": ["balanced", None] 
}

svm_paramgrid = list(ParameterGrid(svm_params))

In [None]:
# Random forest search space: 2 criteria x 3 min_samples_split x 3 class weightings
# x 2 max_features x 3 min_samples_leaf = 108 combinations.
# NOTE(review): "auto" for max_features is a legacy scikit-learn value that, for
# classifiers, reportedly behaved like "sqrt" and was later removed - confirm against
# the installed version; if so, half of this grid is redundant.
rf_params = [
    {
        "criterion": ["gini", "entropy"],
        "min_samples_split": [2, 5, 10],
        "class_weight": ["balanced", "balanced_subsample", None],
        "max_features": ["auto","sqrt"],
        "min_samples_leaf": [1, 2, 4]
    }
]

rf_paramgrid = list(ParameterGrid(rf_params))

In [None]:
# Baseline classifier: a single configuration using the "prior" strategy.
dummy_params = { "strategy": ["prior"] }
dummy_paramgrid = list(ParameterGrid(dummy_params))

# Grid Search 

In [None]:
def prepare_datasets(bow_param, data_types=("original", "stem")):
    """Vectorise every text variant with a bag-of-words model.

    For each requested text type and each emoticon treatment, a fresh
    ``CountVectorizer`` is fitted on the training split only and then applied
    to the validation, test and train+validation splits.

    Previously only ``"original"`` with the two emoticon variants was
    returned: the plain ``phrase`` matrices were computed and then discarded,
    and no ``"stem"`` entry existed, so grids asking for
    ``["original"]["no_emoji"]`` or ``["stem"][...]`` failed with ``KeyError``.

    Parameters
    ----------
    bow_param : dict
        Keyword arguments forwarded to ``CountVectorizer``.
    data_types : iterable of str, optional
        Text types to build; any of ``"original"`` and ``"stem"``.
        Defaults to both.

    Returns
    -------
    dict
        ``{data_type: {processing: [train, val, test, trainval]}}`` where
        ``processing`` is ``"no_emoji"``, ``"emoji_generic"`` or
        ``"emoji_unique"`` and each list holds the document-term matrices in
        that fixed order.

    Raises
    ------
    KeyError
        If ``data_types`` contains an unknown text type.
    """
    # Data-frame column that holds each (text type, emoticon treatment) variant.
    columns_by_type = {
        "original": {
            "no_emoji": "phrase",
            "emoji_generic": "phrase_emoticon_generic",
            "emoji_unique": "phrase_emoticon_unique",
        },
        "stem": {
            "no_emoji": "phrase_stem",
            "emoji_generic": "phrase_stem_emoticon_generic",
            "emoji_unique": "phrase_stem_emoticon_unique",
        },
    }

    def _vectorise(column):
        # The vocabulary is learnt from the training split only; the other
        # splits are merely transformed with it.
        bow = CountVectorizer(**bow_param)
        bow_train = bow.fit_transform(train[column])
        bow_val = bow.transform(val[column])
        bow_test = bow.transform(test[column])
        bow_trainval = bow.transform(trainval[column])
        return [bow_train, bow_val, bow_test, bow_trainval]

    return {
        data_type: {
            processing: _vectorise(column)
            for processing, column in columns_by_type[data_type].items()
        }
        for data_type in data_types
    }

In [None]:
def prepare_datasets_stem(bow_param):
    """Build bag-of-words matrices for the three stemmed text variants.

    A separate ``CountVectorizer(**bow_param)`` is fitted on the training
    split of each variant and then applied to the validation, test and
    train+validation splits.

    Returns ``{"stem": {processing: [train, val, test, trainval]}}`` for the
    processing keys ``"no_emoji"``, ``"emoji_generic"`` and ``"emoji_unique"``.
    """
    def _fit_and_transform(column):
        vectorizer = CountVectorizer(**bow_param)
        # Fit on train first so the remaining splits share its vocabulary.
        matrices = [vectorizer.fit_transform(train[column])]
        for frame in (val, test, trainval):
            matrices.append(vectorizer.transform(frame[column]))
        return matrices

    stem_columns = (
        ("no_emoji", "phrase_stem"),
        ("emoji_generic", "phrase_stem_emoticon_generic"),
        ("emoji_unique", "phrase_stem_emoticon_unique"),
    )

    variants = {}
    for processing, column in stem_columns:
        variants[processing] = _fit_and_transform(column)
    return {"stem": variants}

In [None]:
# Target columns in the same order as the matrices returned by the dataset
# builders: train, validation, test, train+validation.
labels = [train.label, val.label, test.label, trainval.label]

## Logistic Regression

In [None]:
# Grid search for logistic regression: every BoW setting x stemmed text variant
# x model configuration is scored on validation (fit on train) and on test
# (refit on train+validation).
model_name = "logreg"
model_fn = LogisticRegression
model_paramgrid = logreg_paramgrid

ind = 0
gridsearch_results = []

# The targets do not depend on the text variant, so unpack them once.
train_label, val_label, test_label, trainval_label = labels

for bow_param in tqdm(bow_paramgrid):
    datasets = prepare_datasets_stem(bow_param)

    for type_proc_param in type_proc_paramgrid_others:
        data_type = type_proc_param["type"]
        emoji_proc = type_proc_param["processing"]
        train_set, val_set, test_set, trainval_set = datasets[data_type][emoji_proc]

        for model_param in model_paramgrid:
            # Fit on train, score on validation.
            model = model_fn(**model_param)
            model.fit(train_set, train_label)
            val_metrics = classification_report(val_label, model.predict(val_set), output_dict=True)
            val_scores = {
                "val_f1_weighted": val_metrics["weighted avg"]["f1-score"],
                "val_f1_neg": val_metrics["-1.0"]["f1-score"],
                "val_f1_zero": val_metrics["0.0"]["f1-score"],
                "val_f1_pos": val_metrics["1.0"]["f1-score"],
                "val_accuracy": val_metrics["accuracy"],
            }

            # Refit a fresh model on train+validation, score on test.
            model = model_fn(**model_param)
            model.fit(trainval_set, trainval_label)
            test_metrics = classification_report(test_label, model.predict(test_set), output_dict=True)
            test_scores = {
                "test_f1_weighted": test_metrics["weighted avg"]["f1-score"],
                "test_f1_neg": test_metrics["-1.0"]["f1-score"],
                "test_f1_zero": test_metrics["0.0"]["f1-score"],
                "test_f1_pos": test_metrics["1.0"]["f1-score"],
                "test_accuracy": test_metrics["accuracy"],
            }

            # One flat record per configuration: settings first, then scores.
            results = {"model": model_name, **bow_param, **type_proc_param,
                       **model_param, **val_scores, **test_scores}
            gridsearch_results.append(results)
            ind += 1

final_logreg_results = (
    pd.DataFrame.from_records(gridsearch_results)
    .sort_values(by=["val_f1_weighted", "test_f1_weighted"], ascending=False)
)
final_logreg_results.to_csv("model_results/bow/no_agg/emoji/logreg.csv", index=False)

## Naive Bayes

In [None]:
# Grid search for multinomial naive Bayes: every BoW setting x stemmed text
# variant x model configuration is scored on validation (fit on train) and on
# test (refit on train+validation).
model_name = "nb"
model_fn = MultinomialNB
model_paramgrid = nb_paramgrid

ind = 0
gridsearch_results = []

# The targets do not depend on the text variant, so unpack them once.
train_label, val_label, test_label, trainval_label = labels

for bow_param in tqdm(bow_paramgrid):
    datasets = prepare_datasets_stem(bow_param)

    for type_proc_param in type_proc_paramgrid_others:
        data_type = type_proc_param["type"]
        emoji_proc = type_proc_param["processing"]
        train_set, val_set, test_set, trainval_set = datasets[data_type][emoji_proc]

        for model_param in model_paramgrid:
            # Fit on train, score on validation.
            model = model_fn(**model_param)
            model.fit(train_set, train_label)
            val_metrics = classification_report(val_label, model.predict(val_set), output_dict=True)
            val_scores = {
                "val_f1_weighted": val_metrics["weighted avg"]["f1-score"],
                "val_f1_neg": val_metrics["-1.0"]["f1-score"],
                "val_f1_zero": val_metrics["0.0"]["f1-score"],
                "val_f1_pos": val_metrics["1.0"]["f1-score"],
                "val_accuracy": val_metrics["accuracy"],
            }

            # Refit a fresh model on train+validation, score on test.
            model = model_fn(**model_param)
            model.fit(trainval_set, trainval_label)
            test_metrics = classification_report(test_label, model.predict(test_set), output_dict=True)
            test_scores = {
                "test_f1_weighted": test_metrics["weighted avg"]["f1-score"],
                "test_f1_neg": test_metrics["-1.0"]["f1-score"],
                "test_f1_zero": test_metrics["0.0"]["f1-score"],
                "test_f1_pos": test_metrics["1.0"]["f1-score"],
                "test_accuracy": test_metrics["accuracy"],
            }

            # One flat record per configuration: settings first, then scores.
            results = {"model": model_name, **bow_param, **type_proc_param,
                       **model_param, **val_scores, **test_scores}
            gridsearch_results.append(results)
            ind += 1

final_nb_results = (
    pd.DataFrame.from_records(gridsearch_results)
    .sort_values(by=["val_f1_weighted", "test_f1_weighted"], ascending=False)
)
final_nb_results.to_csv("model_results/bow/no_agg/emoji/nb.csv", index=False)

## Random Forest

In [None]:
# Grid search for the random forest: every BoW setting x stemmed text variant
# x model configuration is scored on validation (fit on train) and on test
# (refit on train+validation).
model_name = "rf"
model_fn = RandomForestClassifier
model_paramgrid = rf_paramgrid

ind = 0
gridsearch_results = []

# The targets do not depend on the text variant, so unpack them once.
train_label, val_label, test_label, trainval_label = labels

for bow_param in tqdm(bow_paramgrid):
    datasets = prepare_datasets_stem(bow_param)

    for type_proc_param in type_proc_paramgrid_others:
        data_type = type_proc_param["type"]
        emoji_proc = type_proc_param["processing"]
        train_set, val_set, test_set, trainval_set = datasets[data_type][emoji_proc]

        for model_param in model_paramgrid:
            # Fit on train, score on validation.
            model = model_fn(**model_param)
            model.fit(train_set, train_label)
            val_metrics = classification_report(val_label, model.predict(val_set), output_dict=True)
            val_scores = {
                "val_f1_weighted": val_metrics["weighted avg"]["f1-score"],
                "val_f1_neg": val_metrics["-1.0"]["f1-score"],
                "val_f1_zero": val_metrics["0.0"]["f1-score"],
                "val_f1_pos": val_metrics["1.0"]["f1-score"],
                "val_accuracy": val_metrics["accuracy"],
            }

            # Refit a fresh model on train+validation, score on test.
            model = model_fn(**model_param)
            model.fit(trainval_set, trainval_label)
            test_metrics = classification_report(test_label, model.predict(test_set), output_dict=True)
            test_scores = {
                "test_f1_weighted": test_metrics["weighted avg"]["f1-score"],
                "test_f1_neg": test_metrics["-1.0"]["f1-score"],
                "test_f1_zero": test_metrics["0.0"]["f1-score"],
                "test_f1_pos": test_metrics["1.0"]["f1-score"],
                "test_accuracy": test_metrics["accuracy"],
            }

            # One flat record per configuration: settings first, then scores.
            results = {"model": model_name, **bow_param, **type_proc_param,
                       **model_param, **val_scores, **test_scores}
            gridsearch_results.append(results)
            ind += 1

final_rf_results = (
    pd.DataFrame.from_records(gridsearch_results)
    .sort_values(by=["val_f1_weighted", "test_f1_weighted"], ascending=False)
)
final_rf_results.to_csv("model_results/bow/no_agg/emoji/rf.csv", index=False)

##  SVM

In [None]:
# Grid search for the SVM: every BoW setting x text variant x model
# configuration is scored on validation (fit on train) and on test (refit on
# train+validation). The variant grid used here covers both "original" and
# "stem" text, so prepare_datasets must return an entry for each requested
# (type, processing) pair; a missing one surfaces as a KeyError on lookup.
model_name = "svm"
model_fn = SVC
model_paramgrid = svm_paramgrid

ind = 0
gridsearch_results = []

# The targets do not depend on the text variant, so unpack them once.
train_label, val_label, test_label, trainval_label = labels

for bow_param in tqdm(bow_paramgrid):
    datasets = prepare_datasets(bow_param)

    for type_proc_param in type_proc_paramgrid_svm:
        data_type = type_proc_param["type"]
        emoji_proc = type_proc_param["processing"]
        train_set, val_set, test_set, trainval_set = datasets[data_type][emoji_proc]

        for model_param in model_paramgrid:
            # Fit on train, score on validation.
            model = model_fn(**model_param)
            model.fit(train_set, train_label)
            val_metrics = classification_report(val_label, model.predict(val_set), output_dict=True)
            val_scores = {
                "val_f1_weighted": val_metrics["weighted avg"]["f1-score"],
                "val_f1_neg": val_metrics["-1.0"]["f1-score"],
                "val_f1_zero": val_metrics["0.0"]["f1-score"],
                "val_f1_pos": val_metrics["1.0"]["f1-score"],
                "val_accuracy": val_metrics["accuracy"],
            }

            # Refit a fresh model on train+validation, score on test.
            model = model_fn(**model_param)
            model.fit(trainval_set, trainval_label)
            test_metrics = classification_report(test_label, model.predict(test_set), output_dict=True)
            test_scores = {
                "test_f1_weighted": test_metrics["weighted avg"]["f1-score"],
                "test_f1_neg": test_metrics["-1.0"]["f1-score"],
                "test_f1_zero": test_metrics["0.0"]["f1-score"],
                "test_f1_pos": test_metrics["1.0"]["f1-score"],
                "test_accuracy": test_metrics["accuracy"],
            }

            # One flat record per configuration: settings first, then scores.
            results = {"model": model_name, **bow_param, **type_proc_param,
                       **model_param, **val_scores, **test_scores}
            gridsearch_results.append(results)
            ind += 1

final_svm_results = (
    pd.DataFrame.from_records(gridsearch_results)
    .sort_values(by=["val_f1_weighted", "test_f1_weighted"], ascending=False)
)
final_svm_results.to_csv("model_results/bow/no_agg/emoji/svm.csv", index=False)

## Dummy Classifier 

In [None]:
# Baseline run: the dummy classifier goes through the same search loop as the
# real models so its rows line up with theirs in the combined results table.
model_name = "dummy"
model_fn = DummyClassifier
model_paramgrid = dummy_paramgrid

gridsearch_results = []

# The targets do not depend on the text variant, so unpack them once.
train_label, val_label, test_label, trainval_label = labels

for bow_param in tqdm(bow_paramgrid):
    # `type_proc_paramgrid_others` only requests "stem" variants, so the stemmed
    # matrices are required here. The previous call to prepare_datasets returned
    # no "stem" entry, which made the lookup below raise KeyError.
    datasets = prepare_datasets_stem(bow_param)

    for type_proc_param in type_proc_paramgrid_others:
        data_type = type_proc_param["type"]
        emoji_proc = type_proc_param["processing"]
        train_set, val_set, test_set, trainval_set = datasets[data_type][emoji_proc]

        for model_param in model_paramgrid:
            # Fit on train, score on validation.
            model = model_fn(**model_param)
            model.fit(train_set, train_label)
            val_metrics = classification_report(val_label, model.predict(val_set), output_dict=True)
            val_scores = {
                "val_f1_weighted": val_metrics["weighted avg"]["f1-score"],
                "val_f1_neg": val_metrics["-1.0"]["f1-score"],
                "val_f1_zero": val_metrics["0.0"]["f1-score"],
                "val_f1_pos": val_metrics["1.0"]["f1-score"],
                "val_accuracy": val_metrics["accuracy"],
            }

            # Refit a fresh model on train+validation, score on test.
            model = model_fn(**model_param)
            model.fit(trainval_set, trainval_label)
            test_metrics = classification_report(test_label, model.predict(test_set), output_dict=True)
            test_scores = {
                "test_f1_weighted": test_metrics["weighted avg"]["f1-score"],
                "test_f1_neg": test_metrics["-1.0"]["f1-score"],
                "test_f1_zero": test_metrics["0.0"]["f1-score"],
                "test_f1_pos": test_metrics["1.0"]["f1-score"],
                "test_accuracy": test_metrics["accuracy"],
            }

            # One flat record per configuration: settings first, then scores.
            results = {"model": model_name, **bow_param, **type_proc_param,
                       **model_param, **val_scores, **test_scores}
            gridsearch_results.append(results)

final_dummy_results = (
    pd.DataFrame.from_records(gridsearch_results)
    .sort_values(by=["val_f1_weighted", "test_f1_weighted"], ascending=False)
)
# NOTE(review): every other model in this notebook writes to
# "model_results/bow/no_agg/emoji/"; this "no_agg/agg/" path looks like a
# copy-paste leftover - confirm the intended location before changing it.
final_dummy_results.to_csv("model_results/bow/no_agg/agg/dummy.csv", index=False)

In [None]:
# Stack the per-model result tables and rank all configurations together,
# best validation weighted F1 first (test weighted F1 breaks ties).
per_model_results = [
    final_logreg_results,
    final_nb_results,
    final_svm_results,
    final_rf_results,
    final_dummy_results,
]
combined_df = pd.concat(per_model_results).sort_values(
    by=["val_f1_weighted", "test_f1_weighted"], ascending=False
)
combined_df.to_csv("model_results/bow/no_agg/combined.csv", index=False)

# K-Fold Cross-Validation

In [None]:
def custom_k_fold(model_grid, vectorizer_grid, column, data, model_name, model_cls=None, n_folds=5):
    """Fit one model per (train, predict-on) pair and save its class probabilities.

    For every pair in ``data`` a fresh ``CountVectorizer`` is fitted on the
    training frame's ``column``, a fresh model is fitted on the resulting
    matrix and the training frame's ``label`` column, and ``predict_proba``
    is written for the second frame to ``kfold/<model_name>_fold<k>.csv``
    (pairs ``1..n_folds``) or ``kfold/<model_name>_test.csv`` (any later pair).

    Parameters
    ----------
    model_grid : sequence of dict
        Model keyword-argument sets. Only the last entry is used, which is
        what the earlier loop-and-overwrite code did implicitly.
    vectorizer_grid : sequence of dict
        ``CountVectorizer`` keyword-argument sets. Only the last entry is used.
    column : str
        Name of the text column to vectorise.
    data : iterable
        ``(train_frame, predict_on_frame)`` pairs, in fold order.
    model_name : str
        Prefix for the output column names and file names.
    model_cls : callable, optional
        Estimator class to instantiate. When omitted, the notebook-level
        ``model_fn`` is used, as before.
    n_folds : int, optional
        Number of leading pairs treated as folds; defaults to 5.

    Raises
    ------
    ValueError
        If either grid is empty.
    """
    import os

    vectorizer_params = list(vectorizer_grid)
    model_params = list(model_grid)
    if not vectorizer_params or not model_params:
        raise ValueError("model_grid and vectorizer_grid must each contain at least one parameter set")

    if model_cls is None:
        # Backward-compatible fallback: callers select the estimator by
        # assigning the notebook-level `model_fn` before calling.
        model_cls = model_fn

    # Make sure the output directory exists before the first write.
    os.makedirs("kfold", exist_ok=True)

    # Assumes predict_proba yields three columns ordered negative, neutral,
    # positive - TODO confirm against model.classes_ for the label encoding used.
    prob_neg = model_name+'_prob_neg'
    prob_neu = model_name+'_prob_neu'
    prob_pos = model_name+'_prob_pos'

    for fold_num, tf_combi in enumerate(data, start=1):
        train_frame = tf_combi[0]
        predict_on = tf_combi[1]

        # Vectorise: vocabulary comes from this pair's training frame only.
        vectorizer = CountVectorizer(**vectorizer_params[-1])
        bow_train = vectorizer.fit_transform(train_frame[column])
        bow_predict_on = vectorizer.transform(predict_on[column])

        # Fit a fresh model for this pair.
        model = model_cls(**model_params[-1])
        model.fit(bow_train, train_frame.label)
        predictions = model.predict_proba(bow_predict_on)

        # Keep only the positive and negative probabilities, in that order.
        df = pd.DataFrame(data=predictions, columns=[prob_neg, prob_neu, prob_pos])
        df = df[[prob_pos, prob_neg]]

        if fold_num <= n_folds:
            path = "kfold/" + model_name + '_fold' + str(fold_num) +'.csv'
        else:
            path = "kfold/" + model_name + '_test.csv'

        df.to_csv(path, index=False)

In [None]:
# Import the pre-built fold files: `foldK` is paired with `trainK` below as the
# frame to predict on and the frame to fit on, respectively.
fold1 = pd.read_csv('fold_labels/fold1.csv')
fold2 = pd.read_csv('fold_labels/fold2.csv')
fold3 = pd.read_csv('fold_labels/fold3.csv')
fold4 = pd.read_csv('fold_labels/fold4.csv')
fold5 = pd.read_csv('fold_labels/fold5.csv')

train1 = pd.read_csv('fold_labels/train1.csv')
train2 = pd.read_csv('fold_labels/train2.csv')
train3 = pd.read_csv('fold_labels/train3.csv')
train4 = pd.read_csv('fold_labels/train4.csv')
train5 = pd.read_csv('fold_labels/train5.csv')

train_all = pd.read_csv('fold_labels/train_all.csv')
# NOTE(review): this rebinds `test`, replacing the frame loaded from "new_labels/"
# at the top of the notebook; re-running the grid-search cells after this cell
# would silently use this file instead.
test = pd.read_csv('fold_labels/test.csv')

# (fit-on, predict-on) pairs: five folds followed by the final train_all/test pair.
data = [(train1, fold1), (train2, fold2),(train3, fold3), (train4, fold4), (train5, fold5), (train_all, test)]

### RF

In [None]:
# Instantiate the model grid that gives the highest validation weighted F1.
# NOTE: this rebinds `rf_params` / `rf_paramgrid`, replacing the full search
# grid defined earlier in the notebook with a single configuration.
rf_params = [
    {
        "criterion": ["gini"],
        "min_samples_split": [5],
        "class_weight": [None],
        "max_features": ["auto"],
        "min_samples_leaf": [1]
    }
]

rf_paramgrid = list(ParameterGrid(rf_params))

# Instantiate the CountVectorizer grid with the parameters giving the highest validation weighted F1
rf_bow_params = {
    "analyzer": ["word"],
    "lowercase": [True],
    "ngram_range": [(1,1)],
    "max_df": [1.0],
    "min_df": [1]
}
rf_bow_paramgrid = list(ParameterGrid(rf_bow_params))

# Best text processing
column = 'phrase_stem_emoticon_generic'

# Model function: custom_k_fold reads this notebook-level name to build the estimator.
model_fn = RandomForestClassifier

In [None]:
# Writes kfold/RF_fold1..5.csv and kfold/RF_test.csv. The estimator class is taken
# from the notebook-level `model_fn`, so the configuration cell above must run first.
custom_k_fold(rf_paramgrid, rf_bow_paramgrid, column, data, "RF")

### Naive Bayes

In [None]:
# Instantiate the model grid that gives the highest validation weighted F1.
# NOTE: this rebinds `nb_params` / `nb_paramgrid`, replacing the full search
# grid defined earlier in the notebook with a single configuration.
nb_params = {
    "alpha": [0.5]
}

nb_paramgrid = list(ParameterGrid(nb_params))

# Instantiate the CountVectorizer grid with the parameters giving the highest validation weighted F1
nb_bow_params = {
    "analyzer": ["word"],
    "lowercase": [True],
    "ngram_range": [(1,1)],
    "max_df": [0.25],
    "min_df": [10]
}
nb_bow_paramgrid = list(ParameterGrid(nb_bow_params))

# Best text processing
column = 'phrase_stem_emoticon_generic'

# Model function: custom_k_fold reads this notebook-level name to build the estimator.
model_fn = MultinomialNB

In [None]:
# Writes kfold/NB_fold1..5.csv and kfold/NB_test.csv. The estimator class is taken
# from the notebook-level `model_fn`, so the configuration cell above must run first.
custom_k_fold(nb_paramgrid, nb_bow_paramgrid, column, data, "NB")

### SVM

In [None]:
# Instantiate the model grid that gives the highest validation weighted F1.
# NOTE: this rebinds `svm_params` / `svm_paramgrid`, replacing the full search
# grid defined earlier in the notebook with a single configuration.
# "probability" is switched on here (it is absent from the search grid) because
# custom_k_fold calls predict_proba on the fitted model.
svm_params = {
    "C": [5],
    "kernel": ["rbf"],
    "gamma": ["scale"],
    "class_weight": [None],
    "probability": [True]
}

svm_paramgrid = list(ParameterGrid(svm_params))

# Instantiate the CountVectorizer grid with the parameters giving the highest validation weighted F1
svm_bow_params = {
    "analyzer": ["word"],
    "lowercase": [True],
    "ngram_range": [(1,1)],
    "max_df": [0.25],
    "min_df": [1]
}
svm_bow_paramgrid = list(ParameterGrid(svm_bow_params))

# Best text processing: the unstemmed text without emoticon handling.
column = 'phrase'

# Model function: custom_k_fold reads this notebook-level name to build the estimator.
model_fn = SVC

In [None]:
# Writes kfold/SVM_fold1..5.csv and kfold/SVM_test.csv. The estimator class is taken
# from the notebook-level `model_fn`, so the configuration cell above must run first.
custom_k_fold(svm_paramgrid, svm_bow_paramgrid, column, data, "SVM")