In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#from IPython.display import display
#import sweetviz as sv
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder

from sklearn.model_selection import train_test_split, cross_val_score,  RepeatedStratifiedKFold,RandomizedSearchCV, GridSearchCV


from sklearn.metrics import classification_report, roc_curve, confusion_matrix

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter

from sklearn.impute import KNNImputer

# modelos
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, \
VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Exploratory Data Analysis (EDA)

In [None]:
# Load the stroke dataset, discard the 'id' column, then print the schema summary.
df = pd.read_csv(
    "../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
).drop(columns=['id'])
df.info()

In [None]:
# Count of missing values per column.
df.isna().sum()

In [None]:
## You can download this notebook and install SweetViz. Then uncomment these lines
## (and the `sweetviz` import in the imports cell) for some cool data exploration.

#analise = sv.analyze(df)
#analise.show_html('analise.html', layout = 'vertical', scale =1.0);

In [None]:
# Preview the first two rows.
df.iloc[:2]

In [None]:
# Frequency of each work_type category.
df['work_type'].value_counts()

In [None]:
# Display the 'bmi' column.
df.loc[:, 'bmi']

In [None]:
# Box plots of the three continuous features, drawn side by side.
numeric_cols = ['age', 'avg_glucose_level', 'bmi']
sns.boxplot(data=df[numeric_cols])

Preprocessing steps:
- Impute the missing values in 'bmi' using a KNN imputer
- Do the same for smoking status 'Unknown', keeping the 'stroke' column (TODO: open question)
- Gender = 'Other': replace with 'Female'
- Scale the columns ['age', 'avg_glucose_level', 'bmi']


# Modeling

## bmi - KNNImputer

In [None]:
# Impute missing 'bmi' values with a KNN imputer.
#
# The imputer needs features other than 'bmi' to find neighbours. Fitted on
# 'bmi' alone, a row with a missing bmi has no observed feature to measure
# distance on, so every gap is filled with the column mean. 'age' and
# 'avg_glucose_level' are supplied here only as neighbour features.
# NOTE(review): distances are computed on the raw (unscaled) values, so
# 'avg_glucose_level' weighs more than 'age' — consider scaling first.
impute_cols = ['age', 'avg_glucose_level', 'bmi']
imputer = KNNImputer(n_neighbors = 6)
imputed = imputer.fit_transform(df[impute_cols])

# Write back only the imputed 'bmi' column, as a 1-D array.
df['bmi'] = imputed[:, impute_cols.index('bmi')]
df.isnull().sum()

In [None]:
# Distribution of 'bmi' after imputation: density histogram with a KDE overlay.
# `sns.distplot` is deprecated; `histplot` with kde=True and stat='density'
# draws the equivalent figure.
sns.histplot(df['bmi'], kde=True, stat='density')

## bmi groups

In [None]:
# Bin 'bmi' into seven ordinal groups (0..6) and replace the raw column.
#
# Half-open intervals [low, high) leave no gaps between groups:
#   0: <17 | 1: [17, 18.5) | 2: [18.5, 25) | 3: [25, 30)
#   4: [30, 35) | 5: [35, 40) | 6: >= 40
# The earlier closed ranges (e.g. <= 18.49 then >= 18.50) skipped any value that
# fell between two bounds, which imputed BMIs with more than two decimals can do.
# A skipped value made the group list shorter than the frame.
bmi_edges = [-np.inf, 17.0, 18.5, 25.0, 30.0, 35.0, 40.0, np.inf]

# labels=False returns the integer bin index. astype(int) fails loudly if a NaN
# BMI is still present instead of silently producing a misaligned column.
bmi_group = pd.cut(df['bmi'], bins=bmi_edges, labels=False, right=False).astype(int)

df['bmi_group'] = bmi_group
df = df.drop(['bmi'], axis=1)

In [None]:
# Show the column labels currently present in df.
df.columns

## replace gender == 'other'

In [None]:
# Frequency of each gender category before any replacement.
df.gender.value_counts()

In [None]:
# Fold the 'Other' gender category into 'Female', then re-check the counts.
df['gender'] = df['gender'].replace({'Other': 'Female'})
df['gender'].value_counts()

## Rounding Age

In [None]:
# Round every age to the nearest integer using Python's built-in round().
df['age'] = df['age'].map(round)
df['age']

## Scaling

In [None]:
# Standardise the continuous features (zero mean, unit variance).
# NOTE(review): the scaler is fitted on all rows, before any train/test split,
# so test-set statistics leak into the scaling — consider fitting on train only.
scale_cols = ['age', 'avg_glucose_level']

scaler = StandardScaler()
# Name the scaled columns and reuse df's index explicitly, so the concat below
# aligns row-for-row even if df does not have a default RangeIndex.
df_scale = pd.DataFrame(
    scaler.fit_transform(df[scale_cols]),
    columns=scale_cols,
    index=df.index,
)

# Replace the raw columns with the scaled ones. As before, the scaled columns
# end up last in the frame.
df = pd.concat([df.drop(columns=scale_cols), df_scale], axis=1)
sns.boxplot(data = df[scale_cols])

## Correlations

In [None]:
# Temporary dummy-encoded copy of df (first level of each category dropped),
# used only for the correlation heatmap.
df_temp = pd.get_dummies(data=df, drop_first=True)

In [None]:
# Annotated correlation heatmap of the dummy-encoded features.
fig, ax = plt.subplots(figsize=(14,8))
corr_matrix = df_temp.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)

## OneHot

In [None]:
# One-hot encode every object-dtype column, dropping the first level of each.
# `sparse=` and `get_feature_names()` were removed from scikit-learn; the
# current equivalents are `sparse_output=` and `get_feature_names_out()`
# (requires scikit-learn >= 1.2).
ohe = OneHotEncoder(drop = 'first', sparse_output=False, handle_unknown = 'error')
categorical = df.select_dtypes('object')
encoded = ohe.fit_transform(categorical)

# Keep df's index so a later column-wise concat with the numeric part aligns.
# Column names take the form '<column>_<category>'.
df_t = pd.DataFrame(encoded, columns=ohe.get_feature_names_out(), index=df.index)
df_t.shape

In [None]:
# Numeric part of df: everything except the object-dtype columns.
object_cols = df.select_dtypes('object').columns
df_n = df.drop(columns=object_cols)

# Encoded categorical columns first, then the numeric columns.
df = pd.concat([df_t, df_n], axis=1)
df.head(2)

# Testing k_neighbors for SMOTE

In [None]:
# Compare SMOTE `k_neighbors` values by cross-validated ROC AUC of a
# SMOTE -> RandomForest pipeline.
k_values = [1, 2, 3, 4, 5, 6, 7]
X = df.drop(['stroke'], axis=1)
y = df.stroke

# The CV scheme does not depend on k, so build it once. The fixed seed gives
# every k the same folds.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

k_scores = []
for k in k_values:
    # Seed both steps so differences between k values come from k itself and
    # not from run-to-run randomness.
    rf = RandomForestClassifier(random_state=1)
    over = SMOTE(sampling_strategy=0.1, k_neighbors=k, random_state=1)
    pipeline = Pipeline(steps=[('over', over), ('model', rf)])

    # Score the *pipeline*, not the bare forest. Scoring `rf` skipped SMOTE
    # entirely, so k had no influence on the result. Inside the imblearn
    # pipeline the oversampling is applied to the training folds only.
    scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
    score = np.mean(scores)
    k_scores.append((k, score))
    print('> k=%d, Mean ROC AUC: %.3f' % (k, score))

In [None]:
# (k, mean ROC AUC) pairs ordered from worst to best score.
sorted(k_scores, key=lambda pair: pair[1])

In [None]:
# The pair with the highest score sorts last; keep its k.
ranked_scores = sorted(k_scores, key=lambda pair: pair[1])
best_k = ranked_scores[-1][0]
best_k

In [None]:
# Oversample the minority class with SMOTE using the selected k, then print the
# resulting class counts.
# NOTE(review): X and y are overwritten with the resampled data, so from here
# on they contain synthetic rows. Any hold-out set split from them would include
# synthetic samples.
over = SMOTE(k_neighbors=best_k)
steps = [('over', over)]
pipeline = Pipeline(steps)

X, y = pipeline.fit_resample(X, y)

counter = Counter(y)
print(counter)

# Models + voting classifier

In [None]:
# Hold out a test set made of real observations only, then oversample just the
# training part.
#
# The split starts again from `df` (the encoded, un-resampled frame) rather than
# from the SMOTE-resampled X / y. Splitting resampled data puts synthetic rows
# in the test set and lets training rows be interpolated from test rows, which
# inflates every metric reported below.
features = df.drop(columns=['stroke'])
target = df['stroke']

X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    features, target, test_size=0.20, stratify=target, random_state=42
)

# Balance the classes in the training data only. The test set keeps the
# original class distribution.
X_train, y_train = SMOTE(k_neighbors=best_k, random_state=42).fit_resample(
    X_train_raw, y_train_raw
)

In [None]:
# Column labels of the training feature matrix.
X_train.columns

## random forest

In [None]:
# Randomised hyper-parameter search for a random forest (5-fold CV, default
# number of iterations and default scoring).
rf = RandomForestClassifier()

# Candidate values for each hyper-parameter.
n_estimators = [100, 500, 700, 1000]
max_depth = list(range(10, 111, 10)) + [None]  # 10, 20, ..., 110, plus unlimited depth
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

# `grid` is also reused as the search space for the extra-trees model.
grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

search = RandomizedSearchCV(rf, grid, cv=5, verbose=1, n_jobs=-1)


In [None]:
# Run the random-forest search, predict on the held-out set and report the
# per-class metrics.
preds = search.fit(X_train, y_train).predict(X_test)
rf_report = classification_report(y_test, preds)
print(rf_report)

## Gradient Boost

In [None]:
# Randomised hyper-parameter search for gradient boosting, scored by ROC AUC
# with 5-fold CV.
gbr = GradientBoostingClassifier()

# "loss" is deliberately not listed. The old name "deviance" was removed from
# scikit-learn (it is now "log_loss"), and that loss is the default under either
# name, so omitting the key selects it on every version.
# NOTE(review): "warm_start" probably adds nothing to the search, because each
# candidate is fitted once from scratch — confirm and consider dropping it.
parameters = {
    "learning_rate": [0.01],#, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    #"min_samples_split": [1,2,3],
    "min_samples_leaf": [1,2,3],
    'max_depth' : [None, 2,3,5,10],
    #"subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "n_estimators":[100, 300, 500, 1000],
    "warm_start" : [False, True]
    }

search_gbr = RandomizedSearchCV(estimator = gbr,param_distributions = parameters, scoring = 'roc_auc',cv = 5, verbose = 2, n_jobs = -1)

## Extra trees


In [None]:
# Randomised search for extra-trees over the same `grid` as the random forest,
# scored by ROC AUC with 5-fold CV.
extra_trees = ExtraTreesClassifier()
search_extra = RandomizedSearchCV(
    extra_trees,
    grid,
    scoring='roc_auc',
    cv=5,
    verbose=2,
    n_jobs=-1,
)

## SVC

In [None]:
# Search over the SVC kernel. probability=True is fixed so the fitted model
# exposes predict_proba. The space holds only two combinations
# (linear / rbf).
svc = SVC()
svc_grid = dict(kernel=['linear', 'rbf'], probability=[True])
search_svc = RandomizedSearchCV(
    svc,
    svc_grid,
    scoring='roc_auc',
    cv=5,
    verbose=2,
    n_jobs=-1,
)

## Knn Classifier

In [None]:
# Randomised hyper-parameter search for k-nearest neighbours, scored by ROC AUC
# with 5-fold CV.
knn = KNeighborsClassifier()

# n_neighbors and leaf_size are real ranges here. Written as tuples, (1, 30, 1)
# and (20, 40, 1) meant "one of these three values", not "1 to 29" / "20 to 39".
parameters_KNN = {
    'n_neighbors': list(range(1, 30)),
    'leaf_size': list(range(20, 40)),
    'p': (1,2),
    'weights': ('uniform', 'distance'),
    'metric': ('minkowski', 'chebyshev')
}
search_knn = RandomizedSearchCV(estimator = knn, param_distributions = parameters_KNN, scoring = 'roc_auc',cv = 5, verbose = 2, n_jobs = -1)

## Voting Classifier

In [None]:
# Column labels of the test feature matrix.
X_test.columns

In [None]:
# Soft-voting ensemble over the five hyper-parameter searches. Each search is
# refitted inside the ensemble, so this cell runs all of them.
# The KNN member is `search_knn`, consistent with the other members; the bare
# `knn` estimator would ignore the search space defined for it.
voting_clf = VotingClassifier(
    estimators = [
        ('rfr', search),
        ('gbr', search_gbr),
        ('extra', search_extra),
        ('svc', search_svc),
        ('knn', search_knn),
    ],
    voting = 'soft',
)
voting_clf.fit(X_train, y_train)
preds = voting_clf.predict(X_test)
print(classification_report(y_test, preds))

# Moving Threshold

In [None]:
# Pick the decision threshold that maximises the G-mean on the ROC curve.
# NOTE(review): the threshold is chosen on the test set, so metrics computed
# with it on that same set are optimistic — consider a separate validation split.

# Probability of the positive class (column 1) for each test sample.
probas = voting_clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, probas)

# G-mean = sqrt(TPR * TNR), with TNR = 1 - FPR, computed per threshold.
gmeans = (tpr * (1-fpr)) ** (1/2)
# Index of the threshold with the highest G-mean.
ix = np.argmax(gmeans)
best_threshold = thresholds[ix]
print('Melhor Threshold = %f, G-Mean = %.3f' % (thresholds[ix], gmeans[ix]))

plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
# The curve belongs to the voting ensemble, so it is labelled as such (it was
# previously labelled 'RFC').
plt.plot(fpr, tpr, marker='.', label='Voting Classifier')
plt.scatter(fpr[ix], tpr[ix], marker = 'o', color = 'black', label = 'Best')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Label a sample positive (1) when its probability reaches the chosen threshold.
decisions = np.where(probas >= best_threshold, 1, 0)
threshold_report = classification_report(y_test, decisions)
print(threshold_report)

In [None]:
# Confusion matrix for the thresholded decisions (rows: true class, columns: predicted class).
threshold_cm = confusion_matrix(y_test, decisions)
print(threshold_cm)

## Saving the Model and Preprocessors

In [None]:
import pickle

# Persist the fitted ensemble, the scaler and the one-hot encoder.
artifacts = {
    'modelo_voting_classifier.pkl': voting_clf,  # the model
    'scaler.pkl': scaler,                        # the scaler
    'ohe.pkl': ohe,                              # the one-hot encoder
}

# `with` closes each file handle (and flushes it to disk) even if pickling
# fails. A bare open() inside pickle.dump() never closes the handle.
for path, obj in artifacts.items():
    with open(path, 'wb') as fh:
        pickle.dump(obj, fh)