In [1]:
import pandas as pd

In [2]:
from google.colab import data_table
data_table.enable_dataframe_formatter()

In [3]:
dataset_url = "https://raw.githubusercontent.com/Scherpinski-R/Brain-Stroke-Prediction/main/BrainStrokeData/full_data.csv"

In [4]:
df = pd.read_csv(dataset_url)

### Como sugerido pelo autor do Dataset, e como temos muitos mais dados para Stroke=0 do que Stroke=1, droparemos instancias com idade < 38

In [5]:
df = df.drop( df[df.age < 38].index )

In [6]:
y = df["stroke"]
X = df.loc[:, df.columns!="stroke"]

In [7]:
from imblearn.under_sampling import OneSidedSelection

In [8]:
categorical_features = ["gender", "ever_married", "work_type", "Residence_type", "smoking_status"]

In [9]:
numerical_features = ["age", "avg_glucose_level", "bmi"]

In [10]:
from sklearn.compose import ColumnTransformer

In [11]:
from sklearn.preprocessing import OneHotEncoder

In [12]:
from sklearn.preprocessing import StandardScaler

In [13]:
columns_trans = ColumnTransformer(
    transformers=[
    ("cat", OneHotEncoder(), categorical_features),
    ("num", StandardScaler(), numerical_features)], 
    remainder='passthrough'
)

In [14]:
from imblearn.pipeline import Pipeline 

In [15]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier

Undersample com OneSidedSelection = TomekLink + Condensed Nearest Neighbor

https://machinelearningmastery.com/undersampling-algorithms-for-imbalanced-classification/

https://imbalanced-learn.org/stable/references/generated/imblearn.pipeline.Pipeline.html

https://tiaplagata.medium.com/how-scikit-learn-pipelines-make-your-life-so-much-easier-3cfbfa1d9da6

Foi necessario mudar pipeline pro imblearn pois nao ha suporte no pipeline do sklearn com OneSidedSelection()

In [16]:
pipe = Pipeline( [ ('columns_trans',columns_trans), ('oss', OneSidedSelection(n_neighbors=1, n_seeds_S=200)), ('knn', KNeighborsClassifier(n_neighbors=11)) ])

In [17]:
from sklearn.model_selection import StratifiedKFold

In [18]:
cv = StratifiedKFold(n_splits=5)

In [19]:
from sklearn.model_selection import cross_validate

Defnição da função da Matriz Confusão:

In [20]:
import numpy as np

def confusion_matrix(real, predicted):
    
    #Define quais são as classes
    negative = 0
    positive = 1
    
    #Inicializa valores
    tp, tn, fp, fn = 0, 0, 0, 0
    
    for (i,real_value) in enumerate(real):
        if real_value == positive:
            if real_value == predicted[i]:
                tp = tp + 1
            else:
                fp = fp + 1
        if real_value == negative:
            if real_value == predicted[i]:
                tn = tn + 1
            else: 
                fn = fn + 1
                
    matrix = np.array([[tp,fn],[fp,tn]])
    
    return matrix

In [21]:
def confusion_matrix_scorer(clf, X, y):
  y_pred = clf.predict(X)
  cm = confusion_matrix(y, y_pred)
  return {'tn': cm[0, 0], 'fp': cm[0, 1],
          'fn': cm[1, 0], 'tp': cm[1, 1]}

cv = StratifiedKFold(n_splits=7)

cv_result = cross_validate(pipe, X, y, scoring=confusion_matrix_scorer, cv=cv, n_jobs=-1)
cv_result['test_fp']

array([ 1,  2,  5,  3,  5,  1, 11])

Depois de validado os melhores hiperparametros usamos todos os dados

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y) 

In [24]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('columns_trans',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('cat', OneHotEncoder(),
                                                  ['gender', 'ever_married',
                                                   'work_type',
                                                   'Residence_type',
                                                   'smoking_status']),
                                                 ('num', StandardScaler(),
                                                  ['age', 'avg_glucose_level',
                                                   'bmi'])])),
                ('oss', OneSidedSelection(n_neighbors=1, n_seeds_S=200)),
                ('knn', KNeighborsClassifier(n_neighbors=11))])

Executando a Matriz Confusão e Métricas resultantes:

In [25]:
real = y_test.to_numpy()
predicted = pipe.predict(X_test)
matrix = confusion_matrix(real, predicted)

print('Matriz Confusão: \n{}'.format(matrix))

Matriz Confusão: 
[[  1   3]
 [ 70 680]]


In [26]:
acc = (matrix[0][0] + matrix[1][1])/(matrix[0][0] + matrix[0][1] + matrix[1][0] + matrix[1][1])
recall = (matrix[0][0])/(matrix[0][0] + matrix[0][1])
precision = (matrix[0][0])/(matrix[0][0] + matrix[1][0])
f_score = 2*((precision*recall)/(precision+recall))

print('Acurácia: {:.2f}%'.format(acc*100))
print('Recall: {:.2f}%'.format(recall*100))
print('Precisão: {:.2f}%'.format(precision*100))
print('F-Score: {:.2f}%'.format(f_score*100))

Acurácia: 90.32%
Recall: 25.00%
Precisão: 1.41%
F-Score: 2.67%
