In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
import pandas as pd
import numpy as np
import itertools
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import os
import matplotlib.colors as clrs
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
import numpy as np
import seaborn as sns
import sklearn.metrics as metrics
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
fp_ds1 = "./DS2_normalized_not_balanced.csv"

original = pd.read_csv(fp_ds1, parse_dates=False, sep=',',index_col=0)

my_palette = {'yellow': '#ECD474', 'pale orange': '#E9AE4E', 'salmon': '#E2A36B', 'orange': '#F79522', 'dark orange': '#D7725E',
              'pale acqua': '#92C4AF', 'acqua': '#64B29E', 'marine': '#3D9EA9', 'green': '#10A48A', 'olive': '#99C244',
              'pale blue': '#BDDDE0', 'blue2': '#199ED5', 'blue3': '#1DAFE5', 'dark blue': '#0C70B2',
              'pale pink': '#D077AC', 'pink': '#EA4799', 'lavender': '#E09FD5', 'lilac': '#B081B9', 'purple': '#923E97',
              'white': '#FFFFFF', 'light grey': '#D2D3D4', 'grey': '#939598', 'black': '#000000'}

In [2]:
import datetime as dt

NR_COLUMNS: int = 3
HEIGHT: int = 4


def choose_grid(nr):
    if nr < NR_COLUMNS:
        return 1, nr
    else:
        return (nr // NR_COLUMNS, NR_COLUMNS) if nr % NR_COLUMNS == 0 else (nr // NR_COLUMNS + 1, NR_COLUMNS)


def set_axes(xvalues: list, ax: plt.Axes = None, title: str = '', xlabel: str = '', ylabel: str = '', percentage=False):
    if ax is None:
        ax = plt.gca()
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if percentage:
        ax.set_ylim(0.0, 1.0)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_xticklabels(xvalues, fontsize='small', ha='center')

    return ax


def set_locators(xvalues: list, ax: plt.Axes = None):
    if isinstance(xvalues[0], dt.datetime):
        locator = mdates.AutoDateLocator()
        ax.xaxis.set_major_locator(locator)
        ax.xaxis.set_major_formatter(mdates.AutoDateFormatter(locator, defaultfmt='%Y-%m-%d'))
    else:
        ax.set_xticks(xvalues)
        ax.set_xlim(xvalues[0], xvalues[-1])

    return ax


def plot_line(xvalues: list, yvalues: list, ax: plt.Axes = None, title: str = '', xlabel: str = '',
              ylabel: str = '', percentage=False):
    ax = set_axes(xvalues, ax=ax, title=title, xlabel=xlabel, ylabel=ylabel, percentage=percentage)
    ax = set_locators(xvalues, ax=ax)
    ax.plot(xvalues,  yvalues, c=cfg.LINE_COLOR)


def multiple_line_chart(xvalues: list, yvalues: dict, ax: plt.Axes = None, title: str = '',
                        xlabel: str = '', ylabel: str = '', percentage=False, miny=0, maxy=1):
    ax = set_axes(xvalues, ax=ax, title=title, xlabel=xlabel, ylabel=ylabel, percentage=percentage)
    ax = set_locators(xvalues, ax=ax)

    legend: list = []
    if percentage:
        ax.set_ylim(miny-0.1,maxy+0.1)
    for name, y in yvalues.items():
        ax.plot(xvalues, y)
        legend.append(name)
    ax.legend(legend)


def bar_chart(xvalues: list, yvalues: list, ax: plt.Axes = None, title: str = '',
              xlabel: str = '', ylabel: str = '', percentage=False):
    ax = set_axes(xvalues, ax=ax, title=title, xlabel=xlabel, ylabel=ylabel, percentage=percentage)
    ax.bar(xvalues, yvalues, edgecolor=my_palette['dark blue'], color=my_palette['pale blue'])


def multiple_bar_chart(xvalues: list, yvalues: dict, ax: plt.Axes = None, title: str = '',
                       xlabel: str = '', ylabel: str = '', percentage=False):
    ax = set_axes(xvalues, ax=ax, title=title, xlabel=xlabel, ylabel=ylabel, percentage=percentage)

    x = np.arange(len(xvalues))  # the label locations

    width = 0.8 / (len(xvalues)*len(yvalues))
    # the width of the bars
    step = width / len(xvalues)
    i: int = 0
    for metric in yvalues:
        ax.bar(x + i*width, yvalues[metric], width=width, align='center', label=metric)
        i += 1
    ax.set_xticks(x + width/len(xvalues) - step/2)
    ax.legend(fontsize='x-small', title_fontsize='small')


def plot_confusion_matrix(cnf_matrix: np.ndarray, classes_names: np.ndarray,
                          ax: plt.Axes = None, normalize: bool = False):
    if ax is None:
        ax = plt.gca()
    if normalize:
        total = cnf_matrix.sum(axis=1)[:, np.newaxis]
        cm = cnf_matrix.astype('float') / total
        title = "Normalized confusion matrix"
    else:
        cm = cnf_matrix
        title = 'Confusion matrix'
    np.set_printoptions(precision=2)
    tick_marks = np.arange(0, len(classes_names), 1)
    ax.set_title(title)
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    ax.set_xticks(tick_marks)
    ax.set_yticks(tick_marks)
    ax.set_xticklabels(classes_names)
    ax.set_yticklabels(classes_names)
    ax.imshow(cm, interpolation='nearest', cmap=clrs.LinearSegmentedColormap.from_list("myCMPBlues", [my_palette['pale blue'], my_palette['blue2'], my_palette['blue3'], my_palette['dark blue']]))

    fmt = '.2f' if normalize else 'd'
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        ax.text(j, i, format(cm[i, j], fmt), color='w', horizontalalignment="center")


def plot_evaluation_results(labels: np.ndarray, trn_y, prd_trn, tst_y, prd_tst):
    cnf_mtx_trn = metrics.confusion_matrix(trn_y, prd_trn, labels)
    tn_trn, fp_trn, fn_trn, tp_trn = cnf_mtx_trn.ravel()
    cnf_mtx_tst = metrics.confusion_matrix(tst_y, prd_tst, labels)
    tn_tst, fp_tst, fn_tst, tp_tst = cnf_mtx_tst.ravel()

    evaluation = {'Accuracy': [(tn_trn + tp_trn) / (tn_trn + tp_trn + fp_trn + fn_trn),
                               (tn_tst + tp_tst) / (tn_tst + tp_tst + fp_tst + fn_tst)],
                  'Recall': [tp_trn / (tp_trn + fn_trn), tp_tst / (tp_tst + fn_tst)],
                  'Specificity': [tn_trn / (tn_trn + fp_trn), tn_tst / (tn_tst + fp_tst)],
                  'Precision': [tp_trn / (tp_trn + fp_trn), tp_tst / (tp_tst + fp_tst)]}

    fig, axs = plt.subplots(1, 2, figsize=(2 * HEIGHT, HEIGHT))
    multiple_bar_chart(['Train', 'Test'], evaluation, ax=axs[0], title="Model's performance over Train and Test sets")
    plot_confusion_matrix(cnf_mtx_tst, labels, ax=axs[1])


def plot_roc_chart(models: dict, tstX: np.ndarray, tstY: np.ndarray, ax: plt.Axes = None, target: str = 'class'):
    if ax is None:
        ax = plt.gca()
    ax.set_xlim(0.0, 1.0)
    ax.set_ylim(0.0, 1.0)
    ax.set_xlabel('FP rate')
    ax.set_ylabel('TP rate')
    ax.set_title('ROC chart for %s' % target)

    ax.plot([0, 1], [0, 1], color='navy', label='random', linewidth=1, linestyle='--',  marker='')
    for clf in models.keys():
        metrics.plot_roc_curve(models[clf], tstX, tstY, ax=ax, marker='', linewidth=1)
    ax.legend(loc="lower right")

In [3]:
original.describe()

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,...,0.962,0.963,0.964,0.965,0.966,0.967,0.968,0.969,0.970,new_negative
count,8991.0,8991.0,8991.0,8991.0,8991.0,8991.0,8991.0,8991.0,8991.0,8991.0,...,8991.0,8991.0,8991.0,8991.0,8991.0,8991.0,8991.0,8991.0,8991.0,8991.0
mean,0.212991,0.035702,0.068735,0.033367,0.018685,0.144811,0.123123,0.193749,0.013792,0.017351,...,0.061284,0.016683,0.164387,0.239017,0.022467,0.022022,0.0218,0.022356,0.018463,0.917584
std,0.409444,0.185557,0.253018,0.179602,0.135419,0.35193,0.328597,0.395257,0.116631,0.130582,...,0.239863,0.128089,0.370647,0.426507,0.148205,0.146763,0.146037,0.147846,0.134626,0.275012
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [4]:
print(original.shape)
y: np.ndarray = original.pop('new_negative').values
X: np.ndarray = original.values
print(X.shape)

copy2 = original.copy()
copy2['new_negative']=y
copy2.head()

labels = pd.unique(y)
trnX, tstX, trnY, tstY = train_test_split(X, y, train_size=0.7, stratify=y)

(8991, 1025)
(8991, 1024)


In [5]:
sm = SMOTE(sampling_strategy='minority', random_state=42)
trnX_min, trnY_min = sm.fit_sample(trnX, trnY)
trnX_min.shape
print(trnX_min.shape)


(11548, 1024)


In [6]:
sm = SMOTE(sampling_strategy=0.6, random_state=42)
trnX_06, trnY_06 = sm.fit_sample(trnX, trnY)
trnX_06.shape
#Experimentar também com undersampling

(9238, 1024)

In [7]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler()
X_rus, y_rus = rus.fit_sample(trnX_06, trnY_06)
X_rus.shape

(6928, 1024)

In [8]:
from sklearn.ensemble import GradientBoostingClassifier
n_estimators = [5, 10, 25, 50, 75, 100, 150, 200, 250]
maxestimator = 0
maxlearnfraction = 0
maxscore = 0
rates = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]

for i in n_estimators:
    for j in rates:
        gb_clf = GradientBoostingClassifier(n_estimators = i, learning_rate = j, random_state = 42)
        gb_clf.fit(trnX_min, trnY_min)
        print("Learning rate: ", j)
        score_t = gb_clf.score(trnX, trnY)
        print("Accuracy score (training): {0:.3f}".format(score_t))
        score = gb_clf.score(tstX, tstY)
        print("Accuracy score (validation): {0:.3f}".format(score))
        maxscore = max(maxscore, score)
        if (maxscore == score):
            maxestimator = i
            maxlearnfraction = j
            
print("Best maximum estimator of", maxestimator)
print("Best learning rate of", maxlearnfraction)
print("Best cv score of", maxscore)

Learning rate:  0.1
Accuracy score (training): 0.835
Accuracy score (validation): 0.823
Learning rate:  0.2
Accuracy score (training): 0.893
Accuracy score (validation): 0.893
Learning rate:  0.3
Accuracy score (training): 0.884
Accuracy score (validation): 0.884
Learning rate:  0.4
Accuracy score (training): 0.882
Accuracy score (validation): 0.878
Learning rate:  0.5
Accuracy score (training): 0.886
Accuracy score (validation): 0.891
Learning rate:  0.6
Accuracy score (training): 0.878
Accuracy score (validation): 0.872
Learning rate:  0.7
Accuracy score (training): 0.880
Accuracy score (validation): 0.874
Learning rate:  0.8
Accuracy score (training): 0.871
Accuracy score (validation): 0.859
Learning rate:  0.9
Accuracy score (training): 0.891
Accuracy score (validation): 0.881
Learning rate:  1
Accuracy score (training): 0.882
Accuracy score (validation): 0.877
Learning rate:  0.1
Accuracy score (training): 0.868
Accuracy score (validation): 0.862
Learning rate:  0.2
Accuracy score

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

log = LogisticRegression(random_state = 42)
log.fit(trnX_min, trnY_min)
predY = log.predict(tstX)

print('Accuracy of logistic regression classifier on test set: {}'.format(log.score(tstX, tstY)))
print('Logistic Regression f1-score  : {:.4f}'.format(f1_score(tstY, predY)))
print('Logistic Regression precision : {:.4f}'.format(precision_score(tstY, predY)))
print('Logistic Regression recall    : {:.4f}'.format(recall_score(tstY, predY)))
print("\n",classification_report(tstY, predY))

Accuracy of logistic regression classifier on test set: 0.8773165307635286
Logistic Regression f1-score  : 0.9309
Logistic Regression precision : 0.9641
Logistic Regression recall    : 0.8998

               precision    recall  f1-score   support

           0       0.36      0.63      0.46       222
           1       0.96      0.90      0.93      2476

    accuracy                           0.88      2698
   macro avg       0.66      0.76      0.69      2698
weighted avg       0.91      0.88      0.89      2698



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
from sklearn.ensemble import GradientBoostingClassifier
n_estimators = [5, 10, 25, 50, 75, 100, 150, 200, 250]
maxestimator = 0
maxlearnfraction = 0
maxscore = 0
rates = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]

for i in n_estimators:
    for j in rates:
        gb_clf = GradientBoostingClassifier(n_estimators = i, learning_rate = j, random_state = 42)
        gb_clf.fit(X_rus, y_rus)
        print("Learning rate: ", j)
        score_t = gb_clf.score(X_rus, y_rus)
        print("Accuracy score (training): {0:.3f}".format(score_t))
        score = gb_clf.score(tstX, tstY)
        print("Accuracy score (validation): {0:.3f}".format(score))
        maxscore = max(maxscore, score)
        if (maxscore == score):
            maxestimator = i
            maxlearnfraction = j
            
print("Best maximum estimator of", maxestimator)
print("Best learning rate of", maxlearnfraction)
print("Best cv score of", maxscore)

Learning rate:  0.1
Accuracy score (training): 0.810
Accuracy score (validation): 0.812
Learning rate:  0.2
Accuracy score (training): 0.849
Accuracy score (validation): 0.874
Learning rate:  0.3
Accuracy score (training): 0.854
Accuracy score (validation): 0.887
Learning rate:  0.4
Accuracy score (training): 0.887
Accuracy score (validation): 0.880
Learning rate:  0.5
Accuracy score (training): 0.875
Accuracy score (validation): 0.872
Learning rate:  0.6
Accuracy score (training): 0.877
Accuracy score (validation): 0.865
Learning rate:  0.7
Accuracy score (training): 0.877
Accuracy score (validation): 0.866
Learning rate:  0.8
Accuracy score (training): 0.883
Accuracy score (validation): 0.867
Learning rate:  0.9
Accuracy score (training): 0.887
Accuracy score (validation): 0.884
Learning rate:  1
Accuracy score (training): 0.884
Accuracy score (validation): 0.875
Learning rate:  0.1
Accuracy score (training): 0.845
Accuracy score (validation): 0.871
Learning rate:  0.2
Accuracy score

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

log = LogisticRegression(random_state = 42)
log.fit(X_rus, y_rus)
predY = log.predict(tstX)

print('Accuracy of logistic regression classifier on test set: {}'.format(log.score(tstX, tstY)))
print('Logistic Regression f1-score  : {:.4f}'.format(f1_score(tstY, predY)))
print('Logistic Regression precision : {:.4f}'.format(precision_score(tstY, predY)))
print('Logistic Regression recall    : {:.4f}'.format(recall_score(tstY, predY)))
print("\n",classification_report(tstY, predY))

Accuracy of logistic regression classifier on test set: 0.8713862120088954
Logistic Regression f1-score  : 0.9269
Logistic Regression precision : 0.9683
Logistic Regression recall    : 0.8889

               precision    recall  f1-score   support

           0       0.35      0.68      0.46       222
           1       0.97      0.89      0.93      2476

    accuracy                           0.87      2698
   macro avg       0.66      0.78      0.70      2698
weighted avg       0.92      0.87      0.89      2698



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
