In [106]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
import pandas as pd
import numpy as np
import itertools
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import os
import matplotlib.colors as clrs
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
import numpy as np
import seaborn as sns
import sklearn.metrics as metrics
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
fp_ds1 = "./DS1_not_balanced_just_scaled.csv"

original = pd.read_csv(fp_ds1, parse_dates=False, sep=',',index_col=0)

my_palette = {'yellow': '#ECD474', 'pale orange': '#E9AE4E', 'salmon': '#E2A36B', 'orange': '#F79522', 'dark orange': '#D7725E',
              'pale acqua': '#92C4AF', 'acqua': '#64B29E', 'marine': '#3D9EA9', 'green': '#10A48A', 'olive': '#99C244',
              'pale blue': '#BDDDE0', 'blue2': '#199ED5', 'blue3': '#1DAFE5', 'dark blue': '#0C70B2',
              'pale pink': '#D077AC', 'pink': '#EA4799', 'lavender': '#E09FD5', 'lilac': '#B081B9', 'purple': '#923E97',
              'white': '#FFFFFF', 'light grey': '#D2D3D4', 'grey': '#939598', 'black': '#000000'}

In [107]:
import datetime as dt

NR_COLUMNS: int = 3
HEIGHT: int = 4


def choose_grid(nr):
    if nr < NR_COLUMNS:
        return 1, nr
    else:
        return (nr // NR_COLUMNS, NR_COLUMNS) if nr % NR_COLUMNS == 0 else (nr // NR_COLUMNS + 1, NR_COLUMNS)


def set_axes(xvalues: list, ax: plt.Axes = None, title: str = '', xlabel: str = '', ylabel: str = '', percentage=False):
    if ax is None:
        ax = plt.gca()
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if percentage:
        ax.set_ylim(0.0, 1.0)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_xticklabels(xvalues, fontsize='small', ha='center')

    return ax


def set_locators(xvalues: list, ax: plt.Axes = None):
    if isinstance(xvalues[0], dt.datetime):
        locator = mdates.AutoDateLocator()
        ax.xaxis.set_major_locator(locator)
        ax.xaxis.set_major_formatter(mdates.AutoDateFormatter(locator, defaultfmt='%Y-%m-%d'))
    else:
        ax.set_xticks(xvalues)
        ax.set_xlim(xvalues[0], xvalues[-1])

    return ax


def plot_line(xvalues: list, yvalues: list, ax: plt.Axes = None, title: str = '', xlabel: str = '',
              ylabel: str = '', percentage=False):
    ax = set_axes(xvalues, ax=ax, title=title, xlabel=xlabel, ylabel=ylabel, percentage=percentage)
    ax = set_locators(xvalues, ax=ax)
    ax.plot(xvalues,  yvalues, c=cfg.LINE_COLOR)


def multiple_line_chart(xvalues: list, yvalues: dict, ax: plt.Axes = None, title: str = '',
                        xlabel: str = '', ylabel: str = '', percentage=False, miny=0, maxy=1):
    ax = set_axes(xvalues, ax=ax, title=title, xlabel=xlabel, ylabel=ylabel, percentage=percentage)
    ax = set_locators(xvalues, ax=ax)

    legend: list = []
    if percentage:
        ax.set_ylim(miny-0.1,maxy+0.1)
    for name, y in yvalues.items():
        ax.plot(xvalues, y)
        legend.append(name)
    ax.legend(legend)


def bar_chart(xvalues: list, yvalues: list, ax: plt.Axes = None, title: str = '',
              xlabel: str = '', ylabel: str = '', percentage=False):
    ax = set_axes(xvalues, ax=ax, title=title, xlabel=xlabel, ylabel=ylabel, percentage=percentage)
    ax.bar(xvalues, yvalues, edgecolor=my_palette['dark blue'], color=my_palette['pale blue'])


def multiple_bar_chart(xvalues: list, yvalues: dict, ax: plt.Axes = None, title: str = '',
                       xlabel: str = '', ylabel: str = '', percentage=False):
    ax = set_axes(xvalues, ax=ax, title=title, xlabel=xlabel, ylabel=ylabel, percentage=percentage)

    x = np.arange(len(xvalues))  # the label locations

    width = 0.8 / (len(xvalues)*len(yvalues))
    # the width of the bars
    step = width / len(xvalues)
    i: int = 0
    for metric in yvalues:
        ax.bar(x + i*width, yvalues[metric], width=width, align='center', label=metric)
        i += 1
    ax.set_xticks(x + width/len(xvalues) - step/2)
    ax.legend(fontsize='x-small', title_fontsize='small')


def plot_confusion_matrix(cnf_matrix: np.ndarray, classes_names: np.ndarray,
                          ax: plt.Axes = None, normalize: bool = False):
    if ax is None:
        ax = plt.gca()
    if normalize:
        total = cnf_matrix.sum(axis=1)[:, np.newaxis]
        cm = cnf_matrix.astype('float') / total
        title = "Normalized confusion matrix"
    else:
        cm = cnf_matrix
        title = 'Confusion matrix'
    np.set_printoptions(precision=2)
    tick_marks = np.arange(0, len(classes_names), 1)
    ax.set_title(title)
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    ax.set_xticks(tick_marks)
    ax.set_yticks(tick_marks)
    ax.set_xticklabels(classes_names)
    ax.set_yticklabels(classes_names)
    ax.imshow(cm, interpolation='nearest', cmap=clrs.LinearSegmentedColormap.from_list("myCMPBlues", [my_palette['pale blue'], my_palette['blue2'], my_palette['blue3'], my_palette['dark blue']]))

    fmt = '.2f' if normalize else 'd'
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        ax.text(j, i, format(cm[i, j], fmt), color='w', horizontalalignment="center")


def plot_evaluation_results(labels: np.ndarray, trn_y, prd_trn, tst_y, prd_tst):
    cnf_mtx_trn = metrics.confusion_matrix(trn_y, prd_trn, labels)
    tn_trn, fp_trn, fn_trn, tp_trn = cnf_mtx_trn.ravel()
    cnf_mtx_tst = metrics.confusion_matrix(tst_y, prd_tst, labels)
    tn_tst, fp_tst, fn_tst, tp_tst = cnf_mtx_tst.ravel()

    evaluation = {'Accuracy': [(tn_trn + tp_trn) / (tn_trn + tp_trn + fp_trn + fn_trn),
                               (tn_tst + tp_tst) / (tn_tst + tp_tst + fp_tst + fn_tst)],
                  'Recall': [tp_trn / (tp_trn + fn_trn), tp_tst / (tp_tst + fn_tst)],
                  'Specificity': [tn_trn / (tn_trn + fp_trn), tn_tst / (tn_tst + fp_tst)],
                  'Precision': [tp_trn / (tp_trn + fp_trn), tp_tst / (tp_tst + fp_tst)]}

    fig, axs = plt.subplots(1, 2, figsize=(2 * HEIGHT, HEIGHT))
    multiple_bar_chart(['Train', 'Test'], evaluation, ax=axs[0], title="Model's performance over Train and Test sets")
    plot_confusion_matrix(cnf_mtx_tst, labels, ax=axs[1])


def plot_roc_chart(models: dict, tstX: np.ndarray, tstY: np.ndarray, ax: plt.Axes = None, target: str = 'class'):
    if ax is None:
        ax = plt.gca()
    ax.set_xlim(0.0, 1.0)
    ax.set_ylim(0.0, 1.0)
    ax.set_xlabel('FP rate')
    ax.set_ylabel('TP rate')
    ax.set_title('ROC chart for %s' % target)

    ax.plot([0, 1], [0, 1], color='navy', label='random', linewidth=1, linestyle='--',  marker='')
    for clf in models.keys():
        metrics.plot_roc_curve(models[clf], tstX, tstY, ax=ax, marker='', linewidth=1)
    ax.legend(loc="lower right")

In [108]:
original.head()
original_v2 = original.copy()
print(original.shape)
y: np.ndarray = original.pop('DEATH_EVENT').values
X: np.ndarray = original.values
print(X.shape)

labels = pd.unique(y)

(224, 13)
(224, 12)


In [None]:
#eliminar var's com variancias pequenas. USar select Kbest com chi2 pq é var categorica

In [109]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
def gb_cross_validate(estimators, learning): #VARIAR A MAX_DEPTHS AS WELL, E A LOSS FUNCTION, O CRITÉRIO...
    learnfraction = learning/10
    
    kf = KFold(n_splits=5) #fazer com mais

    for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
        trnX = X[train_index]
        trnY = y[train_index]
        tstX = X[test_index]
        tstY = y[test_index]  
        sm = SMOTE()
        trnX_oversampled, trnY_oversampled = sm.fit_sample(trnX , trnY)
        model = GradientBoostingClassifier(n_estimators = estimators, learning_rate = learnfraction, random_state = 42)
        model.fit(trnX_oversampled, trnY_oversampled)  
        y_pred = model.predict(tstX)
        #print(f'#############################\n')
        #print(f'For fold {fold}:')
        acc = model.score(tstX, tstY)
        #print(f'Accuracy: {acc}')
        #print(f'f-score: {f1_score(tstY, y_pred)}')
        #print("Maximum estimators of", estimators, "and learning rate of", learnfraction)
    return estimators, learnfraction, acc, fold

n_estimators = [5, 10, 25, 50, 75, 100, 150, 200, 250, 300,500,1000]
maxestimator = 0;
maxlearnfraction = 0;
maxcvscore = 0;
maxfold = 1;
for i in n_estimators:
    for j in range(1, 11, 1):
        iterestim, iterlearn, itercv, fold = gb_cross_validate(i, j)
        maxcvscore = max(maxcvscore, itercv)
        if (maxcvscore == itercv):
            maxestimator = iterestim
            maxlearnfraction = iterlearn
            maxfold = fold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

            
print("Best maximum estimator of", maxestimator)
print("Best learning rate of", maxlearnfraction)
print("Best acc score of", maxcvscore)
print("With", fold, "folds")

#USAR A MÉDIA DOS FOLDS

Best maximum estimator of 1000
Best learning rate of 1.0
Best acc score of 0.9772727272727273
With 5 folds


In [110]:
#https://stackoverflow.com/questions/55591063/how-to-perform-smote-with-cross-validation-in-sklearn-in-python
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

#X = original[['age','ejection_fraction','serum_creatinine','creatinine_phosphokinase','time']]
kf = KFold(n_splits=5)
best_acc = 0
best_model = None
for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
    trnX = X[train_index]
    trnY = y[train_index]
    tstX = X[test_index]
    tstY = y[test_index]  # See comment on ravel and  y_train
    sm = SMOTE(sampling_strategy='minority', random_state=42)
    trnX_oversampled, trnY_oversampled = sm.fit_sample(trnX , trnY)
    model = LogisticRegression(random_state = 42)
    model.fit(trnX_oversampled, trnY_oversampled)  
    y_pred = model.predict(tstX)
    print(f'#############################\n')
    print(f'For fold {fold}:')
    acc = model.score(tstX, tstY)
    print(f'Accuracy: {acc}')
    print(f'f1-score: {f1_score(tstY, y_pred)}')
    if(best_acc < acc):
        best_acc = acc
        best_model = model

#############################

For fold 1:
Accuracy: 0.6222222222222222
f-score: 0.746268656716418
#############################

For fold 2:
Accuracy: 0.4666666666666667
f-score: 0.3333333333333333
#############################

For fold 3:
Accuracy: 0.6666666666666666
f-score: 0.2857142857142857
#############################

For fold 4:
Accuracy: 0.8222222222222222
f-score: 0.0
#############################

For fold 5:
Accuracy: 0.9545454545454546
f-score: 0.0


In [111]:
original.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time
0,0.636364,0.0,0.071319,0.0,0.090909,1.0,0.290823,0.157303,0.485714,1.0,0.0,0.0
2,0.454545,0.0,0.015693,0.0,0.090909,0.0,0.16596,0.089888,0.457143,1.0,1.0,0.010676
3,0.181818,1.0,0.011227,0.0,0.090909,0.0,0.224148,0.157303,0.685714,1.0,0.0,0.010676
5,0.909091,1.0,0.003062,0.0,0.393939,1.0,0.216875,0.179775,0.542857,1.0,1.0,0.014235
6,0.636364,1.0,0.028451,0.0,0.015152,0.0,0.12353,0.078652,0.685714,1.0,0.0,0.021352


In [112]:
X = original[['age','ejection_fraction','serum_creatinine','creatinine_phosphokinase','time']]
X.head()

Unnamed: 0,age,ejection_fraction,serum_creatinine,creatinine_phosphokinase,time
0,0.636364,0.090909,0.157303,0.071319,0.0
2,0.454545,0.090909,0.089888,0.015693,0.010676
3,0.181818,0.090909,0.157303,0.011227,0.010676
5,0.909091,0.393939,0.179775,0.003062,0.014235
6,0.636364,0.015152,0.078652,0.028451,0.021352


In [113]:
'''from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
def gb_cross_validate(estimators, learning):
    learnfraction = learning/10
    gboost = GradientBoostingClassifier(n_estimators = estimators, learning_rate = learnfraction, random_state = 42)
    cvscore = cross_val_score(gboost, trnX, trnY, cv = 5)
    print("Maximum estimators of", estimators, "and learning rate of", learnfraction,":", cvscore.mean())
    return estimators, learnfraction, cvscore.mean()

n_estimators = [5, 10, 25, 50, 75, 100, 150, 200, 250, 300,500,1000]
maxestimator = 0;
maxlearnfraction = 0;
maxcvscore = 0;
for i in n_estimators:
    for j in range(1, 11, 1):
        iterestim, iterlearn, itercv = gb_cross_validate(i, j)
        maxcvscore = max(maxcvscore, itercv)
        if (maxcvscore == itercv):
            maxestimator = iterestim
            maxlearnfraction = iterlearn
            
print("Best maximum estimator of", maxestimator)
print("Best learning rate of", maxlearnfraction)
print("Best cv score of", maxcvscore)'''

'from sklearn.ensemble import GradientBoostingClassifier\nfrom sklearn.model_selection import cross_val_score\ndef gb_cross_validate(estimators, learning):\n    learnfraction = learning/10\n    gboost = GradientBoostingClassifier(n_estimators = estimators, learning_rate = learnfraction, random_state = 42)\n    cvscore = cross_val_score(gboost, trnX, trnY, cv = 5)\n    print("Maximum estimators of", estimators, "and learning rate of", learnfraction,":", cvscore.mean())\n    return estimators, learnfraction, cvscore.mean()\n\nn_estimators = [5, 10, 25, 50, 75, 100, 150, 200, 250, 300,500,1000]\nmaxestimator = 0;\nmaxlearnfraction = 0;\nmaxcvscore = 0;\nfor i in n_estimators:\n    for j in range(1, 11, 1):\n        iterestim, iterlearn, itercv = gb_cross_validate(i, j)\n        maxcvscore = max(maxcvscore, itercv)\n        if (maxcvscore == itercv):\n            maxestimator = iterestim\n            maxlearnfraction = iterlearn\n            \nprint("Best maximum estimator of", maxestimato

In [116]:
#fazer sem scalling também
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
def gb_cross_validate(estimators, learning):
    learnfraction = learning/10
    
    kf = KFold(n_splits=5)

    for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
        trnX = X.iloc[train_index]
        trnY = y[train_index]
        tstX = X.iloc[test_index]
        tstY = y[test_index]  # See comment on ravel and  y_train
        sm = SMOTE()
        trnX_oversampled, trnY_oversampled = sm.fit_sample(trnX , trnY)
        model = GradientBoostingClassifier(n_estimators = estimators, learning_rate = learnfraction, random_state = 42)
        model.fit(trnX_oversampled, trnY_oversampled)  
        y_pred = model.predict(tstX)
        print(f'#############################\n')
        print(f'For fold {fold}:')
        acc = model.score(tstX, tstY)
        print(f'Accuracy: {acc}')
        print(f'f-score: {f1_score(tstY, y_pred)}')
        print("Maximum estimators of", estimators, "and learning rate of", learnfraction)
    return estimators, learnfraction, acc, fold

n_estimators = [5, 10, 25, 50, 75, 100, 150, 200, 250, 300,500,1000]
maxestimator = 0;
maxlearnfraction = 0;
maxcvscore = 0;
maxfold = 1;
for i in n_estimators:
    for j in range(1, 11, 1):
        iterestim, iterlearn, itercv, fold = gb_cross_validate(i, j)
        maxcvscore = max(maxcvscore, itercv)
        if (maxcvscore == itercv):
            maxestimator = iterestim
            maxlearnfraction = iterlearn
            maxfold = fold
            
print("Best maximum estimator of", maxestimator)
print("Best learning rate of", maxlearnfraction)
print("Best acc score of", maxcvscore)
print("With", fold, "folds")

#############################

For fold 1:
Accuracy: 0.5555555555555556
f-score: 0.6551724137931034
Maximum estimators of 5 and learning rate of 0.1
#############################

For fold 2:
Accuracy: 0.5777777777777777
f-score: 0.2962962962962963
Maximum estimators of 5 and learning rate of 0.1
#############################

For fold 3:
Accuracy: 0.8444444444444444
f-score: 0.36363636363636365
Maximum estimators of 5 and learning rate of 0.1
#############################

For fold 4:
Accuracy: 0.8
f-score: 0.1818181818181818
Maximum estimators of 5 and learning rate of 0.1
#############################

For fold 5:
Accuracy: 0.8409090909090909
f-score: 0.22222222222222224
Maximum estimators of 5 and learning rate of 0.1
#############################

For fold 1:
Accuracy: 0.6222222222222222
f-score: 0.711864406779661
Maximum estimators of 5 and learning rate of 0.2
#############################

For fold 2:
Accuracy: 0.5777777777777777
f-score: 0.2962962962962963
Maximum estimators o

f-score: 0.36363636363636365
Maximum estimators of 10 and learning rate of 0.2
#############################

For fold 4:
Accuracy: 0.8222222222222222
f-score: 0.3333333333333333
Maximum estimators of 10 and learning rate of 0.2
#############################

For fold 5:
Accuracy: 0.8409090909090909
f-score: 0.22222222222222224
Maximum estimators of 10 and learning rate of 0.2
#############################

For fold 1:
Accuracy: 0.6
f-score: 0.7000000000000001
Maximum estimators of 10 and learning rate of 0.3
#############################

For fold 2:
Accuracy: 0.6222222222222222
f-score: 0.37037037037037035
Maximum estimators of 10 and learning rate of 0.3
#############################

For fold 3:
Accuracy: 0.8222222222222222
f-score: 0.3333333333333333
Maximum estimators of 10 and learning rate of 0.3
#############################

For fold 4:
Accuracy: 0.7555555555555555
f-score: 0.3529411764705882
Maximum estimators of 10 and learning rate of 0.3
#############################

For

Accuracy: 0.8
f-score: 0.39999999999999997
Maximum estimators of 25 and learning rate of 0.4
#############################

For fold 4:
Accuracy: 0.7777777777777778
f-score: 0.28571428571428575
Maximum estimators of 25 and learning rate of 0.4
#############################

For fold 5:
Accuracy: 0.8409090909090909
f-score: 0.22222222222222224
Maximum estimators of 25 and learning rate of 0.4
#############################

For fold 1:
Accuracy: 0.5111111111111111
f-score: 0.6071428571428572
Maximum estimators of 25 and learning rate of 0.5
#############################

For fold 2:
Accuracy: 0.6
f-score: 0.35714285714285715
Maximum estimators of 25 and learning rate of 0.5
#############################

For fold 3:
Accuracy: 0.5777777777777777
f-score: 0.24
Maximum estimators of 25 and learning rate of 0.5
#############################

For fold 4:
Accuracy: 0.8222222222222222
f-score: 0.3333333333333333
Maximum estimators of 25 and learning rate of 0.5
#############################

Fo

#############################

For fold 1:
Accuracy: 0.5111111111111111
f-score: 0.5925925925925926
Maximum estimators of 50 and learning rate of 0.6
#############################

For fold 2:
Accuracy: 0.5555555555555556
f-score: 0.3333333333333333
Maximum estimators of 50 and learning rate of 0.6
#############################

For fold 3:
Accuracy: 0.8666666666666667
f-score: 0.4
Maximum estimators of 50 and learning rate of 0.6
#############################

For fold 4:
Accuracy: 0.7777777777777778
f-score: 0.28571428571428575
Maximum estimators of 50 and learning rate of 0.6
#############################

For fold 5:
Accuracy: 0.9772727272727273
f-score: 0.6666666666666666
Maximum estimators of 50 and learning rate of 0.6
#############################

For fold 1:
Accuracy: 0.4888888888888889
f-score: 0.5660377358490566
Maximum estimators of 50 and learning rate of 0.7
#############################

For fold 2:
Accuracy: 0.28888888888888886
f-score: 0.30434782608695654
Maximum esti

#############################

For fold 3:
Accuracy: 0.8222222222222222
f-score: 0.2
Maximum estimators of 75 and learning rate of 0.7
#############################

For fold 4:
Accuracy: 0.8
f-score: 0.30769230769230765
Maximum estimators of 75 and learning rate of 0.7
#############################

For fold 5:
Accuracy: 0.9545454545454546
f-score: 0.5
Maximum estimators of 75 and learning rate of 0.7
#############################

For fold 1:
Accuracy: 0.5333333333333333
f-score: 0.631578947368421
Maximum estimators of 75 and learning rate of 0.8
#############################

For fold 2:
Accuracy: 0.5777777777777777
f-score: 0.3448275862068965
Maximum estimators of 75 and learning rate of 0.8
#############################

For fold 3:
Accuracy: 0.8666666666666667
f-score: 0.4
Maximum estimators of 75 and learning rate of 0.8
#############################

For fold 4:
Accuracy: 0.8
f-score: 0.30769230769230765
Maximum estimators of 75 and learning rate of 0.8
########################

#############################

For fold 1:
Accuracy: 0.5333333333333333
f-score: 0.631578947368421
Maximum estimators of 100 and learning rate of 0.9
#############################

For fold 2:
Accuracy: 0.5777777777777777
f-score: 0.2962962962962963
Maximum estimators of 100 and learning rate of 0.9
#############################

For fold 3:
Accuracy: 0.8666666666666667
f-score: 0.4
Maximum estimators of 100 and learning rate of 0.9
#############################

For fold 4:
Accuracy: 0.8222222222222222
f-score: 0.3333333333333333
Maximum estimators of 100 and learning rate of 0.9
#############################

For fold 5:
Accuracy: 0.9318181818181818
f-score: 0.4
Maximum estimators of 100 and learning rate of 0.9
#############################

For fold 1:
Accuracy: 0.5555555555555556
f-score: 0.6551724137931034
Maximum estimators of 100 and learning rate of 1.0
#############################

For fold 2:
Accuracy: 0.6666666666666666
f-score: 0.3478260869565218
Maximum estimators of 100

#############################

For fold 3:
Accuracy: 0.4666666666666667
f-score: 0.14285714285714285
Maximum estimators of 150 and learning rate of 1.0
#############################

For fold 4:
Accuracy: 0.8
f-score: 0.30769230769230765
Maximum estimators of 150 and learning rate of 1.0
#############################

For fold 5:
Accuracy: 0.9318181818181818
f-score: 0.4
Maximum estimators of 150 and learning rate of 1.0
#############################

For fold 1:
Accuracy: 0.4666666666666667
f-score: 0.5555555555555556
Maximum estimators of 200 and learning rate of 0.1
#############################

For fold 2:
Accuracy: 0.6444444444444445
f-score: 0.3846153846153846
Maximum estimators of 200 and learning rate of 0.1
#############################

For fold 3:
Accuracy: 0.8222222222222222
f-score: 0.3333333333333333
Maximum estimators of 200 and learning rate of 0.1
#############################

For fold 4:
Accuracy: 0.8444444444444444
f-score: 0.3636363636363636
Maximum estimators of 

#############################

For fold 4:
Accuracy: 0.8
f-score: 0.30769230769230765
Maximum estimators of 250 and learning rate of 0.1
#############################

For fold 5:
Accuracy: 0.8409090909090909
f-score: 0.22222222222222224
Maximum estimators of 250 and learning rate of 0.1
#############################

For fold 1:
Accuracy: 0.5333333333333333
f-score: 0.6181818181818182
Maximum estimators of 250 and learning rate of 0.2
#############################

For fold 2:
Accuracy: 0.6888888888888889
f-score: 0.36363636363636365
Maximum estimators of 250 and learning rate of 0.2
#############################

For fold 3:
Accuracy: 0.8222222222222222
f-score: 0.2
Maximum estimators of 250 and learning rate of 0.2
#############################

For fold 4:
Accuracy: 0.8222222222222222
f-score: 0.3333333333333333
Maximum estimators of 250 and learning rate of 0.2
#############################

For fold 5:
Accuracy: 0.8863636363636364
f-score: 0.28571428571428575
Maximum estimators o

#############################

For fold 5:
Accuracy: 0.9318181818181818
f-score: 0.4
Maximum estimators of 300 and learning rate of 0.2
#############################

For fold 1:
Accuracy: 0.5555555555555556
f-score: 0.6551724137931034
Maximum estimators of 300 and learning rate of 0.3
#############################

For fold 2:
Accuracy: 0.6888888888888889
f-score: 0.4166666666666667
Maximum estimators of 300 and learning rate of 0.3
#############################

For fold 3:
Accuracy: 0.8222222222222222
f-score: 0.2
Maximum estimators of 300 and learning rate of 0.3
#############################

For fold 4:
Accuracy: 0.7777777777777778
f-score: 0.37499999999999994
Maximum estimators of 300 and learning rate of 0.3
#############################

For fold 5:
Accuracy: 0.9545454545454546
f-score: 0.5
Maximum estimators of 300 and learning rate of 0.3
#############################

For fold 1:
Accuracy: 0.6
f-score: 0.6896551724137931
Maximum estimators of 300 and learning rate of 0.4
##

#############################

For fold 2:
Accuracy: 0.5333333333333333
f-score: 0.3225806451612903
Maximum estimators of 500 and learning rate of 0.4
#############################

For fold 3:
Accuracy: 0.8
f-score: 0.39999999999999997
Maximum estimators of 500 and learning rate of 0.4
#############################

For fold 4:
Accuracy: 0.8222222222222222
f-score: 0.3333333333333333
Maximum estimators of 500 and learning rate of 0.4
#############################

For fold 5:
Accuracy: 0.9318181818181818
f-score: 0.4
Maximum estimators of 500 and learning rate of 0.4
#############################

For fold 1:
Accuracy: 0.5333333333333333
f-score: 0.6181818181818182
Maximum estimators of 500 and learning rate of 0.5
#############################

For fold 2:
Accuracy: 0.6666666666666666
f-score: 0.3478260869565218
Maximum estimators of 500 and learning rate of 0.5
#############################

For fold 3:
Accuracy: 0.8444444444444444
f-score: 0.22222222222222224
Maximum estimators of 

#############################

For fold 4:
Accuracy: 0.8222222222222222
f-score: 0.3333333333333333
Maximum estimators of 1000 and learning rate of 0.5
#############################

For fold 5:
Accuracy: 0.9545454545454546
f-score: 0.5
Maximum estimators of 1000 and learning rate of 0.5
#############################

For fold 1:
Accuracy: 0.4666666666666667
f-score: 0.5384615384615384
Maximum estimators of 1000 and learning rate of 0.6
#############################

For fold 2:
Accuracy: 0.6888888888888889
f-score: 0.4166666666666667
Maximum estimators of 1000 and learning rate of 0.6
#############################

For fold 3:
Accuracy: 0.8444444444444444
f-score: 0.22222222222222224
Maximum estimators of 1000 and learning rate of 0.6
#############################

For fold 4:
Accuracy: 0.8444444444444444
f-score: 0.2222222222222222
Maximum estimators of 1000 and learning rate of 0.6
#############################

For fold 5:
Accuracy: 0.9318181818181818
f-score: 0.4
Maximum estimator

In [117]:
#fgboost = GradientBoostingClassifier(n_estimators = 1000, learning_rate = 0.6, random_state = 42)
#fgboost.fit(trnX, trnY)
#prd_trn = fgboost.predict(trnX)
#prd_tst = fgboost.predict(tstX)
#plot_evaluation_results(pd.unique(y), trnY, prd_trn, tstY, prd_tst)


In [118]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

#X = original[['age','ejection_fraction','serum_creatinine','creatinine_phosphokinase','time']]
kf = KFold(n_splits=5)
best_acc = 0
best_model = None
for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
    trnX = X.iloc[train_index]
    trnY = y[train_index]
    tstX = X.iloc[test_index]
    tstY = y[test_index]  # See comment on ravel and  y_train
    sm = SMOTE(sampling_strategy='minority', random_state=42)
    trnX_oversampled, trnY_oversampled = sm.fit_sample(trnX , trnY)
    model = LogisticRegression(random_state = 42) #VARIAR OUTROS PARAMETROS
    model.fit(trnX_oversampled, trnY_oversampled)  
    y_pred = model.predict(tstX)
    print(f'#############################\n')
    print(f'For fold {fold}:')
    acc = model.score(tstX, tstY)
    print(f'Accuracy: {acc}')
    print(f'f-score: {f1_score(tstY, y_pred)}')
    if(best_acc < acc):
        best_acc = acc
        best_model = model
        #print('Accuracy of logistic regression classifier on test set: {}'.format(log.score(tstX, tstY)))
        #print('Logistic Regression f1-score  : {:.4f}'.format(f1_score(tstY, predY)))
        #print('Logistic Regression precision : {:.4f}'.format(precision_score(tstY, predY)))
        #print('Logistic Regression recall    : {:.4f}'.format(recall_score(tstY, predY)))
        #print("\n",classification_report(tstY, predY))



#############################

For fold 1:
Accuracy: 0.7555555555555555
f-score: 0.8450704225352113
#############################

For fold 2:
Accuracy: 0.4444444444444444
f-score: 0.358974358974359
#############################

For fold 3:
Accuracy: 0.7333333333333333
f-score: 0.4
#############################

For fold 4:
Accuracy: 0.8666666666666667
f-score: 0.25
#############################

For fold 5:
Accuracy: 0.9545454545454546
f-score: 0.0


In [104]:
original_v2.shape

########################################
# FROM HERE ON OUT, IT'S ALL WORTHLESS
########################################

(224, 13)

In [105]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

#X = original[['age','ejection_fraction','serum_creatinine','creatinine_phosphokinase','time']]
kf = KFold(n_splits=5)
best_acc = 0
best_model = None
for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
    trnX = original_v2[train_index]
    trnY = y[train_index]
    tstX = original_v2[test_index]
    tstY = y[test_index]  # See comment on ravel and  y_train
    sm = SMOTE(sampling_strategy='minority', random_state=42)
    trnX_oversampled, trnY_oversampled = sm.fit_sample(trnX , trnY)
    model = LogisticRegression(random_state = 42)
    model.fit(trnX_oversampled, trnY_oversampled)  
    y_pred = model.predict(tstX)
    print(f'#############################\n')
    print(f'For fold {fold}:')
    acc = model.score(tstX, tstY)
    print(f'Accuracy: {acc}')
    print(f'f-score: {f1_score(tstY, y_pred)}')
    if(best_acc < acc):
        best_acc = acc
        best_model = model

KeyError: "None of [Int64Index([ 45,  46,  47,  48,  49,  50,  51,  52,  53,  54,\n            ...\n            214, 215, 216, 217, 218, 219, 220, 221, 222, 223],\n           dtype='int64', length=179)] are in the [columns]"

In [56]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

#X = original[['age','ejection_fraction','serum_creatinine','creatinine_phosphokinase','time']]
trnX, tstX, trnY, tstY = train_test_split(X, y, train_size=0.7, stratify=y)

log = LogisticRegression(random_state = 42)
#sm = SMOTE(sampling_strategy='minority', random_state=42)
#trnX, trnY = sm.fit_sample(trnX, trnY)

log.fit(trnX, trnY)
predY= log.predict(tstX)

print('Accuracy of logistic regression classifier on test set: {}'.format(log.score(tstX, tstY)))
print('Logistic Regression f1-score  : {:.4f}'.format(f1_score(tstY, predY)))
print('Logistic Regression precision : {:.4f}'.format(precision_score(tstY, predY)))
print('Logistic Regression recall    : {:.4f}'.format(recall_score(tstY, predY)))
print("\n",classification_report(tstY, predY))

Accuracy of logistic regression classifier on test set: 0.8088235294117647
Logistic Regression f1-score  : 0.4800
Logistic Regression precision : 1.0000
Logistic Regression recall    : 0.3158

               precision    recall  f1-score   support

         0.0       0.79      1.00      0.88        49
         1.0       1.00      0.32      0.48        19

    accuracy                           0.81        68
   macro avg       0.90      0.66      0.68        68
weighted avg       0.85      0.81      0.77        68



In [59]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

X = original[['age','ejection_fraction','serum_creatinine','creatinine_phosphokinase','time']]
#trnX, tstX, trnY, tstY = train_test_split(X, y, train_size=0.7, stratify=y)


from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score

kf = KFold(n_splits=5)

for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
    trnX = X[train_index]
    trnY = y[train_index]
    tstX = X[test_index]
    tstY = y[test_index]  # See comment on ravel and  y_train
    sm = SMOTE()
    trnX_oversampled, trnY_oversampled = sm.fit_sample(trnX , trnY)
    model = LogisticRegression(random_state = 42)
    model.fit(X_train_oversampled, y_train_oversampled )  
    y_pred = model.predict(tstX)
    print(f'For fold {fold}:')
    print(f'Accuracy: {model.score(tstX, tstY)}')
    print(f'f-score: {f1_score(tstY, y_pred)}')
    
    
#log = LogisticRegression(random_state = 42)
#sm = SMOTE(sampling_strategy='minority', random_state=42)
#trnX, trnY = sm.fit_sample(trnX, trnY)

#log.fit(trnX, trnY)
#predY= log.predict(tstX)

#print('Accuracy of logistic regression classifier on test set: {}'.format(log.score(tstX, tstY)))
#print('Logistic Regression f1-score  : {:.4f}'.format(f1_score(tstY, predY)))
#print('Logistic Regression precision : {:.4f}'.format(precision_score(tstY, predY)))
#print('Logistic Regression recall    : {:.4f}'.format(recall_score(tstY, predY)))
#print("\n",classification_report(tstY, predY))

KeyError: "None of [Int64Index([ 45,  46,  47,  48,  49,  50,  51,  52,  53,  54,\n            ...\n            214, 215, 216, 217, 218, 219, 220, 221, 222, 223],\n           dtype='int64', length=179)] are in the [columns]"