In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file below the Kaggle input mount and print its full path.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install seaborn --upgrade

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's default theme to all subsequent plots.
sns.set()
# Show the installed seaborn version as the cell output.
sns.__version__

In [None]:
# Read the stroke data set from the Kaggle input directory and preview the first rows.
stroke_csv_path = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(stroke_csv_path)
df.head()

In [None]:
# Use the 'id' column as the row index.
df = df.set_index('id')

# check out some data

In [None]:
# Plot the distribution of every column except the last one, coloured by stroke outcome.
# NOTE(review): assumes 'stroke' is the last column of df — confirm.
feature_columns = df.columns[:-1]
for col in feature_columns:
    sns.displot(data=df, x=col, hue='stroke')
    plt.show()

nan check

In [None]:
# Count the missing values per column once, then report only the columns that have any.
missing_counts = df.isnull().sum()
for col, n_missing in missing_counts[missing_counts > 0].items():
    print(col, ':', n_missing, 'nan values')

look into bmi

In [None]:
# Correlation of bmi with each of the other two continuous measurements.
for col in ('age', 'avg_glucose_level'):
    print(f'bmi corr with {col}: ', df['bmi'].corr(df[col]))

In [None]:
# Summary statistics of the bmi column.
df['bmi'].describe()

In [None]:
# Stroke outcome distribution among the rows whose bmi is missing.
bmi_missing_rows = df[df['bmi'].isnull()]
sns.displot(bmi_missing_rows, x='stroke')

In [None]:
# Stroke rate (in percent) among rows without a bmi value versus the whole data set.
bmi_missing_rows = df[df['bmi'].isnull()]
p1 = bmi_missing_rows['stroke'].sum() / len(bmi_missing_rows) * 100
p2 = df['stroke'].sum() / len(df) * 100
print(f'{p1.round()}% of subjects with unreported bmi had a stroke')
print(f'{p2.round()}% of subjects in data set had a stroke')

in this sample, subjects with an unreported bmi had a higher stroke rate than the data set as a whole

# data processing

split data for processing

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the predictors, then hold out 20% of the rows for testing.
# stratify=y keeps the stroke / no-stroke proportions the same in both parts.
y = df['stroke']
X = df.drop(columns='stroke')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

In [None]:
# Re-attach the target column to each split so features and label are processed together.
train = X_train.join(y_train)
test = X_test.join(y_test)
# NOTE: this list stores references to the two frames created above; if `train` / `test`
# are later rebound to new objects, the list still points at the old ones.
dfs = [train, test]

further outside research found that these are all risk factors for stroke : 

* high blood pressure
* obesity : bmi >= 30
* diabetic
* smoking history
* heart disease
* gender : female
* age : over 65 (70% strokes over age 65)
* prior stroke
* stress
* hyperglycemia : >= 108 mg/dL (observed in 2/3 of ischemic strokes)

* unreported bmi? could just be sample specific 

In [None]:
# Flag rows whose bmi was not reported (1 = missing, 0 = present).
for frame in (train, test):
    frame['bmi_unreported'] = frame['bmi'].isnull().astype(int)

In [None]:
# Show the bmi median of each split.
for label, frame in (('train', train), ('test', test)):
    print(f'{label} bmi median : ', frame['bmi'].median())

by replacing the missing bmi with the median, we won't be pumping up the obese numbers ( bmi >= 30 )

In [None]:
# Impute missing bmi values with the median learned from the training split only,
# so no statistic computed from the held-out rows is used during preprocessing.
# The result is assigned back instead of calling fillna(..., inplace=True) on a column
# selection: chained in-place modification triggers warnings in newer pandas and has
# no effect on the parent frame under copy-on-write.
bmi_train_median = train['bmi'].median()
train['bmi'] = train['bmi'].fillna(bmi_train_median)
test['bmi'] = test['bmi'].fillna(bmi_train_median)
train.head()

nan check 2

In [None]:
# Report any column of either split that still contains missing values.
# The loop variable is deliberately not called `df`, so the raw DataFrame loaded at the
# top of the notebook is not overwritten by this check.
for frame in (train, test):
    missing_counts = frame.isnull().sum()
    for col, n_missing in missing_counts[missing_counts > 0].items():
        print(col, ':', n_missing, 'nan values')

In [None]:
# Encode the two yes/no style columns as 0/1 integers in both splits.
# Each column is reassigned (rather than modified through a chained inplace=True call),
# which keeps working under pandas copy-on-write. replace() leaves already-encoded
# values untouched, so re-running the cell is harmless.
binary_encodings = {
    'ever_married': {'Yes': 1, 'No': 0},
    'Residence_type': {'Urban': 1, 'Rural': 0},
}
for frame in (train, test):
    for col, mapping in binary_encodings.items():
        frame[col] = frame[col].replace(mapping).astype(int)

In [None]:
def dummies(df):
    """Return a copy of ``df`` with the categorical columns one-hot encoded.

    'work_type', 'smoking_status' and 'gender' are replaced by indicator columns
    prefixed with 'work', 'smoking' and 'gender' respectively; the indicator
    columns are appended after the remaining columns, in that order.
    The input frame is not modified.
    """
    prefix_by_column = {
        'work_type': 'work',
        'smoking_status': 'smoking',
        'gender': 'gender',
    }
    return pd.get_dummies(
        df,
        columns=list(prefix_by_column.keys()),
        prefix=list(prefix_by_column.values()),
    )

In [None]:
# One-hot encode the categorical columns of both splits; the names are rebound to the
# new frames returned by dummies().
train, test = dummies(train), dummies(test)
train.head()

In [None]:
# Normalise all column labels to lower case in both splits.
for frame in (train, test):
    frame.columns = frame.columns.str.lower()

only 1 sample of gender_other so let's delete it

In [None]:
# Number of test rows flagged as gender 'other'.
(test['gender_other'] == 1).sum()

In [None]:
# Remove the 'gender_other' indicator column from whichever split contains it.
# errors='ignore' makes this independent of which side of the train/test split the
# 'Other' row landed on, so both frames end up with the same gender columns.
train = train.drop(columns='gender_other', errors='ignore')
test = test.drop(columns='gender_other', errors='ignore')

In [None]:
# Summary statistics of avg_glucose_level for each split.
# Iterates over the current train/test frames directly (not a list captured before they
# were re-encoded) and avoids reusing the name `df` as the loop variable.
for frame in (train, test):
    print(frame['avg_glucose_level'].describe(), '\n')

add risk factors according to research

In [None]:
def add_features(df, obese_bmi=30, hyperglycemic_glucose=108, senior_age=65):
    """Add binary risk-factor indicator columns to ``df`` in place and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'bmi', 'avg_glucose_level' and 'age'.
    obese_bmi : float, default 30
        Rows with bmi >= this value get obese = 1.
    hyperglycemic_glucose : float, default 108
        Rows with avg_glucose_level >= this value get hyperglycemic = 1.
    senior_age : float, default 65
        Rows with age >= this value get over_65 = 1.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the columns 'obese',
        'hyperglycemic' and 'over_65' added (or overwritten).
    """
    # A comparison against NaN is False, so rows with a missing value are encoded as 0.
    df['obese'] = (df['bmi'] >= obese_bmi).astype(int)
    df['hyperglycemic'] = (df['avg_glucose_level'] >= hyperglycemic_glucose).astype(int)
    df['over_65'] = (df['age'] >= senior_age).astype(int)
    return df

In [None]:
# add_features modifies the frame it receives and returns that same frame.
train = add_features(train)
test = add_features(test)
test

add a total risk factors feature

In [None]:
# Indicator columns that each count as one risk factor.
risk_factors = [
    'obese',
    'hyperglycemic',
    'over_65',
    'gender_female',
    'heart_disease',
    'hypertension',
    'smoking_formerly smoked',
    'smoking_smokes',
]
# Per-row total of the indicators above, stored in a new 'risk_factors' column.
for frame in (train, test):
    frame['risk_factors'] = frame[risk_factors].sum(axis=1)
train.head()

PCA analysis of some features

In [None]:
from sklearn.decomposition import PCA

def pca(X):
    """Standardise ``X``, fit a PCA on it and return the model, scores and loadings.

    Parameters
    ----------
    X : pandas.DataFrame
        Columns to decompose.

    Returns
    -------
    tuple
        ``(fitted PCA object, scores, loadings)`` where ``scores`` is a DataFrame
        indexed like ``X`` with columns 'PC1', 'PC2', ... and ``loadings`` is a
        DataFrame with one row per column of ``X`` and one column per component.
    """
    # Centre each column on its own mean and scale it by its own standard deviation.
    standardised = X.sub(X.mean(axis=0)).div(X.std(axis=0))

    decomposition = PCA(random_state=0)
    scores = decomposition.fit_transform(standardised)

    component_names = [f'PC{k}' for k in range(1, scores.shape[1] + 1)]
    X_pca = pd.DataFrame(scores, index=X.index, columns=component_names)

    # Transposed so that rows correspond to the original columns of X.
    loadings = pd.DataFrame(
        decomposition.components_.T,
        index=X.columns,
        columns=component_names,
    )

    return decomposition, X_pca, loadings

In [None]:
# Columns fed into the PCA.
pca_features = ['age', 'avg_glucose_level', 'bmi', 'risk_factors']
# NOTE(review): train and test each get their own standardisation and PCA fit here,
# so the two sets of components are not guaranteed to share axes or signs — consider
# fitting on train only and projecting test with the fitted model.
pca_train, X_pca_train, loadings_train = pca(train[pca_features])
pca_test, X_pca_test, loadings_test = pca(test[pca_features])
# Display the training loadings as the cell output.
loadings_train

In [None]:
# Display the PCA loadings obtained from the test split.
loadings_test

looks like age * risk_factors may be a promising feature 

In [None]:
# Interaction feature: age multiplied by the number of risk factors.
for frame in (train, test):
    frame['age*risk_factors'] = frame['age'].mul(frame['risk_factors'])

add pca features to the data sets

multiply test pc2 and pc4 by -1 to get into same orientation as training data

In [None]:
# Append the principal-component scores to each split (matched on the row index).
train = train.join(X_pca_train)
test = test.join(X_pca_test)
# Flip the sign of the test PC2 and PC4 scores.
for component in ('PC2', 'PC4'):
    test[component] = -test[component]

get original features just for reference

In [None]:
features = train.columns
# Columns created in this notebook (everything else came from the raw data or its encoding).
added_features = [
    'obese', 'hyperglycemic', 'over_65', 'risk_factors', 'bmi_unreported',
    'PC1', 'PC2', 'PC3', 'PC4', 'age*risk_factors',
]
# Keep the remaining column names, in their original order.
og_features = list(features[~features.isin(added_features)])

In [None]:
# train = train[og_features]
# test = test[og_features]

In [None]:
# Split each processed frame back into predictors (X) and target (y).
X_train, y_train = train.drop(columns='stroke'), train['stroke']
X_test, y_test = test.drop(columns='stroke'), test['stroke']

make sure the columns match between train and test sets

In [None]:
# Both printed lists should be empty when train and test expose the same columns.
only_in_test = [name for name in X_test.columns if name not in X_train.columns]
only_in_train = [name for name in X_train.columns if name not in X_test.columns]
print(only_in_test)
print(only_in_train)

# classifier

heavily skewed towards non stroke subjects

In [None]:
# Mean of the target in each split.
for label, target in (('y_train', y_train), ('y_test', y_test)):
    print(f'{label} mean : ', target.mean())

scale our data

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training predictors only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

use smote to oversample the stroke data since it is heavily skewed

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the scaled training data with SMOTE (the test data is not resampled).
# A fixed random_state makes the synthetic samples, and therefore every score computed
# downstream, reproducible from run to run.
X_train_smote, y_train_smote = SMOTE(random_state=0).fit_resample(X_train_scaled, y_train)

In [None]:
# Mean of the resampled target (the fraction of positive labels if the labels are 0/1).
print('y_train_smote mean : ', y_train_smote.mean())

time to train and analyze some models

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, auc, RocCurveDisplay
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# Seed shared by every estimator below.
rs = 0
# Candidate classifiers, keyed by the label used in the printed reports and plots.
models = {
    'xgb clf': XGBClassifier(n_estimators=300, learning_rate=0.05, random_state=rs),
    'gbd tree': GradientBoostingClassifier(random_state=rs),
    'random forests': RandomForestClassifier(random_state=rs),
    'log reg': LogisticRegression(random_state=rs),
}

In [None]:
# Fit every candidate on the SMOTE-resampled training data and tick it off when done.
for name, estimator in models.items():
    estimator.fit(X_train_smote, y_train_smote)
    print(f'{name} : ✔')

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
def model_test(model):
    """Print ROC AUC, confusion matrix and classification report for one model.

    Parameters
    ----------
    model : str or fitted estimator
        Either a key of the notebook-level ``models`` dict, or an already fitted
        classifier that provides ``predict`` and ``predict_proba``.

    Notes
    -----
    Reads the notebook globals ``models``, ``X_test_scaled`` and ``y_test``.
    Prints its report and returns None.
    """
    # Resolve the argument itself (not a global loop variable), so that an estimator
    # passed in directly is the one that actually gets evaluated.
    if isinstance(model, str):
        label, estimator = model, models[model]
    else:
        label, estimator = type(model).__name__, model

    pred = estimator.predict(X_test_scaled)
    # Column 1 of predict_proba is taken as the positive-class score for the ROC AUC.
    pred_proba = estimator.predict_proba(X_test_scaled)[:, 1]
    # Named roc_auc so the `auc` function imported from sklearn.metrics is not shadowed.
    roc_auc = roc_auc_score(y_test, pred_proba)
    cf = confusion_matrix(y_test, pred)
    cr = classification_report(y_test, pred)

    print('- - - - - -\n', label, '\n')
    print('roc auc: ', roc_auc)
    print(cf)
    print(cr, '\n')

In [None]:
# Call model_test once for every key of the models dict.
for m in models:
   model_test(m)

In [None]:
# Overlay the test-set ROC curve of every fitted model on a single figure.
fig = plt.figure(figsize=(8, 8))
for m, model in models.items():
    positive_scores = model.predict_proba(X_test_scaled)[:, 1]
    fpr, tpr, roc_thresholds = roc_curve(y_test, positive_scores)
    plt.plot(fpr, tpr, label=m)
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.legend()
plt.show()

the recall score is important in this analysis since we want to catch all the possible strokes even if we misclassify some

we'll continue with the logistic regressor since it has the highest recall score by far

In [None]:
# Two sub-grids, one per solver, each listing the penalties tried with that solver.
# NOTE(review): newer scikit-learn releases expect penalty=None rather than the string
# 'none' — confirm against the installed version.
liblinear_grid = {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': [1.0, 10.0, 100.0]}
lbfgs_grid = {'solver': ['lbfgs'], 'penalty': ['none', 'l2'], 'C': [1.0, 10.0, 100.0]}
log_params = [liblinear_grid, lbfgs_grid]

# Cross-validated search on the resampled training data, ranked by recall.
log_grid = GridSearchCV(models['log reg'], param_grid=log_params, scoring='recall')
log_grid.fit(X_train_smote, y_train_smote)

for search_result in (log_grid.best_score_, log_grid.best_params_, log_grid.best_estimator_):
    print(search_result)

In [None]:
# Pass the best estimator found by the grid search to model_test.
model_test(log_grid.best_estimator_)

In [None]:
# Predict the test set with the tuned model and write id / prediction pairs to disk.
best_model = log_grid.best_estimator_
final_predicted = best_model.predict(X_test_scaled)
sub = pd.DataFrame({'id': X_test.index, 'stroke': final_predicted})
sub.to_csv('stroke_sub.csv', index=False)

Thanks to samsatp's notebook for guidance on the modeling structure. Their notebook can be found here: [https://www.kaggle.com/sathianpong/stroke-eda-visualization-prediction](https://www.kaggle.com/sathianpong/stroke-eda-visualization-prediction)