In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
#%matplotlib notebook
plt.rcParams["figure.figsize"] = (10,6)
import warnings
warnings.filterwarnings("ignore")
warnings.warn("this will not show")
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, plot_confusion_matrix
from sklearn.metrics import plot_roc_curve, plot_precision_recall_curve, roc_auc_score, auc, roc_curve
from sklearn.impute import SimpleImputer

In [3]:
# Read the stroke dataset once (the file was previously parsed twice).
# `data` and `df` start out as two names for the same DataFrame object; later
# cells rebind `df` to new frames, so `data` keeps pointing at the frame as loaded.
df = pd.read_csv("/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv")
data = df

In [4]:
df.head()

In [5]:
df.shape

In [6]:
df.info()

In [7]:
df.describe()

In [8]:
# The `id` column is removed from the working frame before analysis.
df = df.drop(columns=['id'])

In [9]:
df.stroke.value_counts(normalize=True)

In [10]:
df.age.sort_values()

In [11]:
round(df.age.sort_values())[:100]

In [12]:
df[df['stroke'] == 1]['age'].nsmallest(10)

Veriyi incelediğimizde, yaş sütununun tahminlerimizi saptırabilecek ölçüde anlamsız görünen
değerler içerdiğini görüyoruz. Örneğin inme geçiren (`stroke` = 1) kişilerden yaşı en küçük olan
10 kişiyi listelediğimizde 1.320 ve 14.000 gibi yaşlarla karşılaşıyoruz; bunlardan sonraki en küçük yaş ise 32.000'dir.
Bu veriyle daha anlamlı tahminler yapabilmek için 14 yaş ve altındaki herkesi veri setinden çıkarmanın (drop etmenin)
yerinde olacağını düşünüyorum.

In [13]:
# Remove the row(s) whose gender is 'Other' (original note, translated: "we drop
# the 'other' value found in the gender variable").
# Filtering by value instead of the hard-coded row label 3116 keeps the cell
# correct if the row order changes and lets it be re-run without a KeyError.
df = df[df['gender'].str.lower() != 'other']

In [14]:
# Keep only people older than 14. The notebook's note for this step (and the
# message printed in the next cell) says everyone aged 14 and under is dropped,
# so the comparison must be strict: `>= 14` would keep the 14-year-olds.
df = df[df['age'] > 14]

In [15]:
# Report how many rows are left and how many were removed so far.
# The baseline comes from the raw `data` frame instead of a hard-coded 5110, so
# the message stays correct if the input file changes.
# NOTE(review): the difference also includes rows removed by earlier cleaning
# cells, not only the age filter.
rows_before = len(data)
rows_after = df.shape[0]
print(f"14 yaş ve altı kişiler düşünce {rows_after} satır kaldı. {rows_before-rows_after} tane satır düşmüş  olduk.")

In [16]:
df.info()

In [17]:
df.describe()

In [18]:
data.smoking_status.value_counts()

In [19]:
df['stroke'].value_counts(normalize=True)*100

In [20]:
df['stroke'].value_counts().plot(kind = 'bar');

In [21]:
# Correlation heatmap of the numeric columns only. At this point the frame has
# not been one-hot encoded yet, and pandas >= 2.0 raises if `corr()` is called
# on a frame containing non-numeric columns; selecting numeric dtypes explicitly
# works on old and new pandas versions alike.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True);

In [22]:
sns.boxplot(data = df, x = 'stroke', y = 'age')

In [23]:
df.head()

In [24]:
sns.pairplot(df,hue='stroke');

Kategorik değişkenleri modelde kullanabilmek ve ölçekleyebilmek (scale) için
bunları 0 ve 1 değerleri alan dummy sütunlara dönüştürüyoruz.

In [25]:
# One-hot encode the categorical columns; drop_first=True omits the first level
# of each encoded column.
df = pd.get_dummies(data=df, drop_first=True)
df

## Train | Test Split and Imputing Missing Values

In [26]:
# Target vector and feature matrix (every column except the target).
y = df['stroke']
X = df.drop(columns=['stroke'])

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [28]:
 # Hold out 30% of the rows for testing with a fixed seed.
 # `stratify=y` keeps the proportion of stroke cases the same in both parts; the
 # target appears imbalanced (the model below uses class_weight='balanced'), so an
 # unstratified split could leave the test set with very few positive cases.
 X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.30, random_state=42, stratify=y
 )

In [29]:
import missingno as msno

In [30]:
msno.bar(X_train)

In [31]:
msno.matrix(X_train)

In [32]:
msno.bar(X_test)

In [33]:
X_train.describe()


In [34]:
# Box plot of bmi in the training set. The series is passed as `x=` explicitly:
# a bare positional argument is interpreted as `x` by older seaborn releases but
# as `data` by newer ones, which changes the orientation of the plot.
sns.boxplot(x=X_train['bmi'])

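Yukarıdaki kutu grafiğinden de anlaşılacağı üzere `bmi` sütununda aykırı değerler bulunuyor; bu nedenle eksik değerleri ortalama yerine, aykırı değerlerden daha az etkilenen medyan (median) ile doldurmak daha mantıklı olacaktır.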

In [35]:
# Display the training features before imputation.
X_train

In [36]:
from sklearn.impute import SimpleImputer

In [37]:
# Imputer that replaces NaN with the column median.
imputer = SimpleImputer(strategy="median", missing_values=np.nan)

# Learn the median from the training bmi values (passed as an (n, 1) array) and
# write the filled column back as a flat vector.
X_train['bmi'] = imputer.fit_transform(X_train[['bmi']].to_numpy()).ravel()



In [38]:
# Fill missing test bmi values with the median learned from the TRAINING set.
# Calling fit_transform here would re-fit the imputer on the test data, which
# leaks test-set information into preprocessing; `transform` reuses the fitted value.
X_test['bmi'] = imputer.transform(X_test['bmi'].values.reshape(-1,1))[:,0]


In [39]:
print(X_train.isnull().sum(), X_test.isnull().sum())

In [40]:
X_train['bmi']

## Scaling

In [41]:
scaler = MinMaxScaler()

In [42]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Modelling 

In [43]:
from sklearn.linear_model import LogisticRegression

In [44]:
log_model = LogisticRegression(class_weight='balanced')

In [45]:
log_model.fit(X_train_scaled,y_train)

In [46]:
y_pred = log_model.predict(X_test_scaled)

In [47]:
y_pred_proba = log_model.predict_proba(X_test_scaled)

In [48]:
# Test features and true labels side by side, plus the predicted class and the
# predicted probability of the positive class (second probability column).
test_data = pd.concat([X_test, y_test], axis=1).assign(
    pred=y_pred,
    pred_proba=y_pred_proba[:, 1],
)


In [49]:
# Show ten random test rows. The seed is fixed so that Restart & Run All displays
# the same rows every time.
test_data.sample(10, random_state=42)

In [50]:
from sklearn.metrics import confusion_matrix, classification_report

In [51]:
# Test-set performance of the default predictions from log_model.predict.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

In [52]:
from sklearn.model_selection import cross_validate

In [53]:
# Fresh, unfitted classifier used only for cross-validation.
model = LogisticRegression(class_weight="balanced")

# 10-fold cross-validation on the scaled training data with four metrics.
scores = cross_validate(
    model,
    X_train_scaled,
    y_train,
    cv=10,
    scoring=['accuracy', 'precision', 'recall', 'f1'],
)
# One row per fold, labelled 1..10.
df_scores = pd.DataFrame(scores, index=range(1, 11))
# Mean per column, skipping the first two columns of the result
# (presumably the fit/score timings — verify against the cross_validate output).
df_scores.mean().iloc[2:]

In [54]:
# Class probabilities for the test rows; column 1 is the positive class.
y_pred_proba = log_model.predict_proba(X_test_scaled)

# ROC curve on the test set (kept under the original names).
fp_rate, tp_rate, thresholds = roc_curve(y_test, y_pred_proba[:,1])

# Choose the decision threshold on the TRAINING data. Tuning it on the test set
# and then scoring that same test set gives an optimistically biased report.
train_proba = log_model.predict_proba(X_train_scaled)[:, 1]
train_fp_rate, train_tp_rate, train_thresholds = roc_curve(y_train, train_proba)

# Pick the threshold that maximizes (true-positive rate - false-positive rate).
optimal_idx = np.argmax(train_tp_rate - train_fp_rate)
optimal_threshold = train_thresholds[optimal_idx]
# A bare expression in the middle of a cell is not displayed, so print it.
print(f"optimal threshold: {optimal_threshold:.3f}")

# Test features, true labels, default predictions and thresholded predictions.
test_data = pd.concat([X_test, y_test], axis=1)
test_data["pred_proba"] = y_pred_proba[:,1]
test_data["pred"] = y_pred
# Vectorized comparison instead of a row-wise apply/lambda.
test_data["pred2"] = (test_data["pred_proba"] >= optimal_threshold).astype(int)

y_pred2 = test_data["pred2"]

print(classification_report(y_test,y_pred2))


In [55]:
# Confusion matrix and classification report for the SAME predictions: the
# thresholded `y_pred2`. The matrix previously used `y_pred` (default threshold)
# while the report used `y_pred2`, so the two outputs described different models.
print(confusion_matrix(y_test, y_pred2))
print(classification_report(y_test, y_pred2))