## Importacion de librerias

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold
from scipy.io import arff
import random
from collections import OrderedDict
import seaborn as sns
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from deap import base, creator, tools, algorithms
from sklearn.base import clone, is_classifier

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# sklearn-optimize es tomado de: https://github.com/senolakkas/sklearn-optimize
from sklearn_genetic.genetic import GeneticSearchCV

import joblib

import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.metrics import confusion_matrix
from tabulate import tabulate
import warnings
warnings.filterwarnings('ignore')



## Leer dataset imputado

In [2]:
median_imputed_df = pd.read_pickle(os.path.join('results','median_imputed_data.pkl'))

In [3]:
# Splitting features and labels
def split_features_labels(df):
    feature_dfs = df.iloc[:, 1:]
    label_dfs = df['flag']
    return feature_dfs, label_dfs

## Estrategia de validacion

In [4]:
# K-Fold Cross Validation
def kfold_cv(k, X, y, verbose=False):
    X = X.values  # Features
    y = y.values  # Labels
    kf = KFold(n_splits=k, shuffle=False, random_state=42)
    X_train = []
    y_train = []
    X_test = []
    y_test = []

    for train_index, test_index in kf.split(X):
        X_train.append(X[train_index])
        y_train.append(y[train_index])
        X_test.append(X[test_index])
        y_test.append(y[test_index])
    return X_train, y_train, X_test, y_test
seed = 7

In [6]:
# perform data modeling using a dict of models
def perform_data_modeling(_models_, imputed_df, verbose=False, k_folds=5):
    # 7 metrics, averaged over all the K-Folds
    model_results = OrderedDict()

    # Iterate over classifiers
    for model_name, clf in _models_.items():
        if verbose: print("-" * 120, "\n", "Model: " + '\033[1m' + model_name + '\033[0m' + " Classifier")
        imputer_results = OrderedDict()

        # Mean imputation is in a dictionary and iterating over that.In our case, only mean_imputation is implemented
        features_df, labels_df = split_features_labels(imputed_df)

        years = OrderedDict()
        df_index = 0
        if verbose: print('\t\tDataset: ' + '\033[1m' + str(df_index + 1) + 'year' + '\033[0m')
        # Running K-fold cross validation on train and test set
        X_train_list, y_train_list, X_test_list, y_test_list = kfold_cv(k_folds, features_df,
                                                                        labels_df, verbose)

        metrics = OrderedDict()

        # Calculating accuracy, precision, recall, and confusion matrix
        # Initializing these variables with a numpy array of 0

        accuracy_list = np.zeros([k_folds])
        precision_list = np.zeros([k_folds, 2])
        recall_list = np.zeros([k_folds, 2])
        true_negs = np.zeros([k_folds])
        false_pos = np.zeros([k_folds])
        false_negs = np.zeros([k_folds])
        true_pos = np.zeros([k_folds])

        # Iterate over all the k-folds and calculate accuracy, precision and confusion matrix
        for k in range(k_folds):
            X_train = X_train_list[k]
            y_train = y_train_list[k]
            X_test = X_test_list[k]
            y_test = y_test_list[k]

            # Fit the model and call predict function for test set
            clf = clf.fit(X_train, y_train)

            y_test_predicted = clf.predict(X_test)
            print(confusion_matrix(y_test_predicted, y_test))

            _accuracy_ = accuracy_score(y_test, y_test_predicted, normalize=True)
            accuracy_list[k] = _accuracy_
            _recalls_ = recall_score(y_test, y_test_predicted, average=None)
            recall_list[k] = _recalls_

            # code for calculating precision
            _precisions_ = precision_score(y_test, y_test_predicted, average=None)
            precision_list[k] = _precisions_

            # code for calculating confusion matrix
            _confusion_matrix_ = confusion_matrix(y_test, y_test_predicted)
            mlp_cm = confusion_matrix(y_test, y_test_predicted)

            true_negs[k] = _confusion_matrix_[0][0]
            false_pos[k] = _confusion_matrix_[0][1]
            false_negs[k] = _confusion_matrix_[1][0]
            true_pos[k] = _confusion_matrix_[1][1]

        metrics['Accuracy'] = np.mean(accuracy_list)
        metrics['Precisions'] = np.mean(precision_list, axis=0)
        metrics['Recalls'] = np.mean(recall_list, axis=0)
        metrics['TN'] = np.mean(true_negs)
        metrics['FP'] = np.mean(false_pos)
        metrics['FN'] = np.mean(false_negs)
        metrics['TP'] = np.mean(true_pos)

        if verbose:
            print('\t\t\tAccuracy:', metrics['Accuracy'])
            print('\t\t\tPrecision:', metrics['Precisions'])
            print('\t\t\tRecall:', metrics['Recalls'])

        model_results[model_name] = metrics

    sns.heatmap(mlp_cm, annot=True,
                xticklabels=['Non Bankrupt', 'Bankrupt'],
                yticklabels=['Non Bankrupt', 'Bankrupt'])
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

    return model_results


# perform ranking by metrics
def perform_model_ranking_acc(models, imputers, results):
    column_headers = ['-','Accuracy'] 
    rows = []
    for model_name, model_details in results.items():
        row = [model_name]
        row.append(model_details['Accuracy'])
        rows.append(row)
    results_df = pd.DataFrame(data=rows, columns = column_headers)
    return results_df

def perform_model_ranking_prec(models, imputers, results):
    column_headers = ['-','Precisions']
    rows = []
    for model_name, model_details in results.items():
        row = [model_name]
        row.append(model_details['Precisions'])
        rows.append(row)
    results_df = pd.DataFrame(data=rows, columns = column_headers)
    return results_df

def perform_model_ranking_rec(models, imputers, results):
    column_headers = ['-','Recalls']
    rows = []
    for model_name, model_details in results.items():
        row = [model_name]
        row.append(model_details['Recalls'])
        rows.append(row)
    results_df = pd.DataFrame(data=rows, columns = column_headers)
    return results_df

def perform_model_ranking_tn(models, imputers, results):
    column_headers = ['-','TN']
    rows = []
    for model_name, model_details in results.items():
        row = [model_name]
        row.append(model_details['TN'])
        rows.append(row)
    results_df = pd.DataFrame(data=rows, columns = column_headers)
    return results_df

def perform_model_ranking_fp(models, imputers, results):
    column_headers = ['-','FP']
    rows = []
    for model_name, model_details in results.items():
        row = [model_name]
        row.append(model_details['FP'])
        rows.append(row)
    results_df = pd.DataFrame(data=rows, columns = column_headers)
    return results_df

def perform_model_ranking_fn(models, imputers, results):
    column_headers = ['-','FN']
    rows = []
    for model_name, model_details in results.items():
        row = [model_name]
        row.append(model_details['FN'])
        rows.append(row)
    results_df = pd.DataFrame(data=rows, columns = column_headers)
    return results_df

def perform_model_ranking_tp(models, imputers, results):
    column_headers = ['-','FP']
    rows = []
    for model_name, model_details in results.items():
        row = [model_name]
        row.append(model_details['TP'])
        rows.append(row)
    results_df = pd.DataFrame(data=rows, columns = column_headers)
    return results_df

In [7]:
param = {'kernel': ['rbf'],
          'C': [0, 100], 
          'gamma': [0.01, 0.001]}
param_ga = {"kernel": ["rbf", "sigmoid", "linear"],
             "C": np.logspace(-9, 9, num=25, base=10),
             "gamma": np.logspace(-9, 9, num=25, base=10)}

In [8]:
X, y = split_features_labels(median_imputed_df)

**Don't want to wait?, put load to 1!**

In [9]:
load = 1

In [10]:
# svm simple search
cv_svm = GridSearchCV(estimator=SVC(), 
                      param_grid=param,
                      scoring="accuracy",
                      cv=5)
if load==0:
  cv_svm.fit(X, y)

In [11]:
# GeneticSearchCV modified
class GeneticSearchCVMod(GeneticSearchCV):
     def __init__(self, estimator, params, scoring=None, cv=4,
                 refit=True, verbose=False, population_size=50,
                 gene_mutation_prob=0.1, gene_crossover_prob=0.5,
                 tournament_size=3, generations_number=10, gene_type=None,
                 n_jobs=1, iid=True, error_score='raise',
                 fit_params={}):
        super(GeneticSearchCV, self).__init__(
            estimator=estimator, scoring=scoring,
            iid=iid, refit=refit, cv=cv, verbose=verbose,
            error_score=error_score)
        self.fit_params=fit_params
        self.params = params
        self.population_size = population_size
        self.generations_number = generations_number
        self._individual_evals = {}
        self.gene_mutation_prob = gene_mutation_prob
        self.gene_crossover_prob = gene_crossover_prob
        self.tournament_size = tournament_size
        self.gene_type = gene_type
        self.all_history_, self.all_logbooks_ = [], []
        self._cv_results = None
        self.best_score_ = None
        self.best_params_ = None
        self.score_cache = {}
        self.n_jobs = n_jobs
        creator.create("FitnessMax", base.Fitness, weights=(1.0,))
        creator.create("Individual", list, est=clone(self.estimator), fitness=creator.FitnessMax)

In [12]:
# svm + ga
cv_svm_ga = GeneticSearchCVMod(estimator=SVC(),
                     params=param_ga,
                     scoring="accuracy",
                     cv=StratifiedKFold(n_splits=4),
                     verbose=1,
                     population_size=50,
                     gene_mutation_prob=0.10,
                     gene_crossover_prob=0.5,
                     tournament_size=3,
                     generations_number=5,
                     n_jobs=4)
if load == 0:
  cv_svm_ga.fit(X, y)

In [13]:
# save grid-searchs
if load == 0:
  joblib.dump(cv_svm, os.path.join('modelos','Modelo_SVM.pkl'))
  joblib.dump(cv_svm_ga, os.path.join('modelos','Modelo_GA_SVM.pkl'))

In [16]:
if load:
  cv_svm = joblib.load(os.path.join('modelos','Modelo_SVM.pkl'))
  cs_svm_ga = joblib.load(os.path.join('modelos','Modelo_GA_SVM.pkl'))

In [17]:
svm = cv_svm.best_estimator_
svm_ga = cv_svm_ga.
models_dictionary2 = OrderedDict()
models_dictionary2['SVM'] = svm
models_dictionary2['SVM+GA'] = svm_ga

AttributeError: 'GeneticSearchCVMod' object has no attribute 'best_estimator_'

In [None]:
# ideally 5 fold cross validation yielded better results
results2 = perform_data_modeling(models_dictionary2, mean_imputed_df, verbose=True, k_folds=5)

In [None]:
print(perform_model_ranking_acc(models_dictionary2, imputed_dict, results2))
print(perform_model_ranking_prec(models_dictionary2, imputed_dict, results2))
print(perform_model_ranking_rec(models_dictionary2, imputed_dict, results2))
print(perform_model_ranking_tn(models_dictionary2, imputed_dict, results2))
print(perform_model_ranking_fp(models_dictionary2, imputed_dict, results2))
print(perform_model_ranking_fn(models_dictionary2, imputed_dict, results2))
print(perform_model_ranking_tp(models_dictionary2, imputed_dict, results2))