In [1]:
# Kütüphanelerin dahili

import pandas as pd
import pickle

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,roc_auc_score,precision_score,recall_score

from imblearn.combine import SMOTEENN
from imblearn.pipeline import Pipeline
from collections import Counter

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
# Verinin alınması

mainDataset = pd.read_csv("dataset.csv")

In [3]:
# Verinin görüntülenmesi

mainDataset.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
# Verinin dengesinin gözlenmesi

print(mainDataset["stroke"].value_counts()," \nDengesiz bir veri seti !")

0    4861
1     249
Name: stroke, dtype: int64  
Dengesiz bir veri seti !


In [5]:
# Dataframedeki verilerin türlerinin ve null değer içeren sütunların gözlenmesi

mainDataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [6]:
# Null değere sahip sütunların gözlenmesi

mainDataset.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [7]:
# Veri manipülasyonunun yapılması ve verisetinin uygun hale getirilmesi

le = LabelEncoder()

dataset = mainDataset.drop("id",axis=1)

dataset[['age']] = dataset[['age']].astype(int)

dataset["ever_married"] = le.fit_transform(dataset["ever_married"])
dataset["gender"] = le.fit_transform(dataset["gender"])
dataset["Residence_type"] = le.fit_transform(dataset["Residence_type"])
dataset["smoking_status"] = le.fit_transform(dataset["smoking_status"])

dataset = pd.get_dummies(dataset, columns=['work_type'])

In [8]:
# Veri setinibn son halinin gösterilmesi

dataset

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children
0,1,67,0,1,1,1,228.69,36.6,1,1,0,0,1,0,0
1,0,61,0,0,1,0,202.21,,2,1,0,0,0,1,0
2,1,80,0,1,1,0,105.92,32.5,2,1,0,0,1,0,0
3,0,49,0,0,1,1,171.23,34.4,3,1,0,0,1,0,0
4,0,79,1,0,1,0,174.12,24.0,2,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,0,80,1,0,1,1,83.75,,2,0,0,0,1,0,0
5106,0,81,0,0,1,1,125.20,40.0,2,0,0,0,0,1,0
5107,0,35,0,0,1,0,82.99,30.6,2,0,0,0,0,1,0
5108,1,51,0,0,1,0,166.29,25.6,1,0,0,0,1,0,0


In [9]:
# BMI değerinin stroke durumuna ve kadın/erkek olmasına göre ortalamalarının alınması

print("STROKE \nMale mean of bmi=",dataset[(dataset.stroke == 1) & (dataset.bmi.notnull()) & (dataset.gender == 1)].bmi.mean(),"\nFemale mean of bmi=",dataset[(dataset.stroke == 1) & (dataset.bmi.notnull()) & (dataset.gender == 0)].bmi.mean())
print("NOT STROKE \nMale mean of bmi=",dataset[(dataset.stroke == 0) & (dataset.bmi.notnull()) & (dataset.gender == 1)].bmi.mean(),"\nFemale mean of bmi=",dataset[(dataset.stroke == 0) & (dataset.bmi.notnull()) & (dataset.gender == 0)].bmi.mean())

STROKE 
Male mean of bmi= 30.8123595505618 
Female mean of bmi= 30.21833333333333
NOT STROKE 
Male mean of bmi= 28.54771071800208 
Female mean of bmi= 29.015952466690674


In [10]:
# BMI değerinin stroke durumuna ve kadın/erkek olmasına göre ayrılması

dfStroke = dataset.iloc[:249]
dfStrokeMale = dfStroke[(dfStroke.gender == 1)]
dfStrokeFemale = dfStroke[(dfStroke.gender == 0)]

dfNotStroke = dataset.iloc[249:]
dfNotStrokeMale = dfNotStroke[(dfNotStroke.gender == 1)]
dfNotStrokeFemale = dfNotStroke[(dfNotStroke.gender == 0)]

In [11]:
# Null BMI değerlerinin stroke durumuna ve kadın/erkek olmasına göre ortalamalar ile doldurulması

dfStrokeMale = dfStrokeMale.fillna(dfStrokeMale.bmi.mean())
dfStrokeFemale = dfStrokeFemale.fillna(dfStrokeFemale.bmi.mean())

dfStroke = pd.concat([dfStrokeMale,dfStrokeFemale])

dfNotStrokeMale = dfNotStrokeMale.fillna(dfNotStrokeMale.bmi.mean())
dfNotStrokeFemale = dfNotStrokeFemale.fillna(dfNotStrokeFemale.bmi.mean())

In [12]:
# Ayrıştırılan verisetlerinin tekrar birleştirilmesi

dfNotStroke = pd.concat([dfNotStrokeMale,dfNotStrokeFemale])
dataset = pd.concat([dfStroke,dfNotStroke])

In [14]:
# X ve Y dataframelerinin alınması

Y = pd.Series(le.fit_transform(dataset["stroke"]))

X = dataset.drop("stroke",axis=1)

In [24]:
# SMOTEEN metodu ile resample edilmesi

print("Before Resample Count: ",Counter(Y))

steps = [('s', SMOTEENN(random_state=0))]
pipeline = Pipeline(steps=steps)
X_resampled, Y_resampled = pipeline.fit_resample(X, Y)

print("After Resample Count: ",Counter(Y_resampled))

Before Resample Count:  Counter({0: 4860, 1: 249})
After Resample Count:  Counter({1: 4567, 0: 3667})


In [25]:
# Test ve Train verisetinin ayrılması
X_train, X_test, Y_train, Y_test = train_test_split(X_resampled, Y_resampled, test_size=0.33, random_state=42)

In [26]:
# Makine Öğrenmesi algoritmalarının oluşturulup, başarı metriklerinin alınması

models = []
models.append(['SVM', SVC(random_state=0)])
models.append(['Decision Tree', tree.DecisionTreeClassifier(random_state=0)])
models.append(['Random Forest', RandomForestClassifier(random_state=0)])
models.append(['XGBoost', XGBClassifier(eval_metric='error',use_label_encoder=False)])

modelDetails = []

for m in range(len(models)):
    model = models[m][1]
    modelName = models[m][0]
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    
    normalAcc = (accuracy_score(Y_test, Y_pred))*100
    kfAcc = cross_val_score(estimator = model, X = X_train, y = Y_train, cv = 10)
    kfAccMean = kfAcc.mean()*100
    kfAccStd = kfAcc.std()*100
    cm = confusion_matrix(Y_test, Y_pred)
    roc = roc_auc_score(Y_test, Y_pred) 
    precision = precision_score(Y_test, Y_pred)  
    recall = recall_score(Y_test, Y_pred) 
    f1 = f1_score(Y_test, Y_pred) 
    modelDetails.append([modelName,normalAcc,kfAccMean,kfAccStd,roc,precision,recall,f1])

In [27]:
# Başarı metriklerinin kıyaslanması

df = pd.DataFrame(modelDetails, columns= ['Model', 'Accuracy', 'K-Fold Mean Accuracy', 'Std. Deviation', 'ROC AUC', 'Precision', 'Recall', 'F1'])
df.sort_values(by= ['Accuracy', 'K-Fold Mean Accuracy'], inplace= True, ascending= False)
df

Unnamed: 0,Model,Accuracy,K-Fold Mean Accuracy,Std. Deviation,ROC AUC,Precision,Recall,F1
3,XGBoost,96.578366,96.80972,1.171373,0.964496,0.96121,0.977273,0.969175
2,Random Forest,96.504783,96.156889,0.779492,0.964052,0.962987,0.97393,0.968428
1,Decision Tree,95.290655,94.815816,1.183699,0.951449,0.949409,0.965909,0.957588
0,SVM,83.922001,85.043728,1.091777,0.83326,0.828678,0.89238,0.85935


In [28]:
# GridSearchCV ile modellerin en iyi hiperparametrelerinin bulunması

grid_models = [(XGBClassifier(use_label_encoder=False), [{"subsample":[0.5,0.75,1],'max_delta_step':[0,1,2],'min_child_weight':[1,2,3],'learning_rate': [None,0.01, 0.1], 'eval_metric': ['error'],'random_state':[0]}]),
               (tree.DecisionTreeClassifier(),[{'criterion':['gini','entropy'],'random_state':[0]}]), 
               (RandomForestClassifier(),[{'n_estimators':[100,150,200],'criterion':['gini','entropy'],'random_state':[0]}])]
for i,j in grid_models:
    grid = GridSearchCV(estimator=i,param_grid = j, scoring = 'f1')
    grid.fit(X_train, Y_train)
    best_accuracy = grid.best_score_
    best_param = grid.best_params_
    print('{}:\nBest Accuracy : {:.2f}%'.format(i,best_accuracy*100))
    print('Best Parameters : ',best_param)
    print('----------------')

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, gamma=None, gpu_id=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=None, subsample=None,
              tree_method=None, use_label_encoder=False,
              validate_parameters=None, verbosity=None):
Best Accuracy : 97.07%
Best Parameters :  {'eval_metric': 'error', 'learning_rate': None, 'max_delta_step': 0, 'min_child_weight': 1, 'random_state': 0, 'subsample': 0.75}
----------------
DecisionTreeClassifier():
Best Accuracy : 95.33%
Best Parameters :  {'criterion': 'en

In [29]:
# Modelin kaydedilmesi

file_name = "Models/best_xgb.pkl"
best_model = XGBClassifier(eval_metric="error",use_label_encoder=False,subsample=0.75,random_state=0)
best_model.fit(X_train,Y_train)
pickle.dump(best_model, open(file_name, "wb"))