In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, name) for name in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Problem Overview: Can you predict the danger of stroke based on the evaluation of certain tested parameters?
### Context: According to the World Health Organization (WHO), stroke is the 2nd leading cause of death globally, responsible for approximately 11% of total deaths. This dataset is used to predict whether a patient is likely to get a stroke based on input parameters like gender, age, various diseases, and smoking status. Each row in the data provides relevant information about the patient.
### Data Source: License -Data files © Original Authors: fedesoriano

# Import Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn import tree
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# Load Dataset

In [None]:
# Read the stroke-prediction CSV into the working DataFrame.
df = pd.read_csv(
    '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv',
)

In [None]:
# Show the loaded DataFrame as the cell output.
df

In [None]:
# Shape of the DataFrame as (rows, columns).
df.shape

In [None]:
# Print the per-column non-null counts and dtypes.
df.info()

In [None]:
# Call describe() so the cell renders the summary-statistics table; the bare
# attribute `df.describe` only displays the bound method's repr.
df.describe()

In [None]:
# Data type of each column.
df.dtypes


In [None]:
# Column names as a plain Python list.
df.columns.tolist()

# Missing Values

In [None]:
# One-column heatmap of how many values are missing in each DataFrame column.
ax = sns.heatmap(
    df.isna().sum().to_frame(),
    annot=True,
    fmt='d',
    cmap='vlag',
)
ax.set_title('Missing Value Status', fontweight='bold')
ax.set_xlabel('Amount Missing')
plt.show()

In [None]:
# Impute missing values in numeric columns with that column's mean.
# numeric_only=True is required because the frame still contains string columns
# (e.g. gender), and DataFrame.mean() raises on those under pandas >= 2.0.
# Reassigning instead of inplace=True keeps the statement a plain expression of
# "new frame from old frame".
df = df.fillna(df.mean(numeric_only=True))

In [None]:
# Per-column flag: True if the column still contains any missing value.
df.isna().any()

# Exploratory data analysis

In [None]:
# Number of rows per gender value.
c = (
    df
    .groupby('gender')['gender']
    .count()
)

In [None]:
# Show the per-gender row counts computed in the previous cell.
c

In [None]:
# Bar chart of the counts in `c`: categories on x, their counts on y.
sns.barplot(x=c.index, y=c.to_numpy())

In [None]:
# Heatmap of pairwise correlations between the numeric columns.
# Restrict to numeric dtypes first: the frame still holds string columns at this
# point (e.g. gender), and DataFrame.corr() raises on those under pandas >= 2.0.
s = sns.heatmap(df.select_dtypes(include='number').corr())

In [None]:
# Pairwise scatter/histogram grid over the DataFrame, with no hue grouping.
sns.pairplot(df)

In [None]:
# Number of rows per value of the stroke target.
c = (
    df
    .groupby('stroke')['stroke']
    .count()
)

In [None]:
# Ring-shaped pie chart of the counts in `c`, labelled with the index values
# and annotated with each slice's percentage.
plt.pie(
    c,
    labels=c.index,
    autopct="%.2f%%",
    wedgeprops={'width': 0.3},
)
plt.show()

In [None]:
# BMI against average glucose level, one point per row, coloured by stroke outcome.
plt.title('Stroke Sample Distribution Based On Bmi And Glucose Level')
sns.scatterplot(data=df, x='avg_glucose_level', y='bmi', hue='stroke')
plt.show()

In [None]:
# BMI against age, one point per row, coloured by stroke outcome.
plt.title('Stroke Sample Distribution Based On Bmi And Age')
sns.scatterplot(data=df, x='age', y='bmi', hue='stroke')
plt.tight_layout()
plt.show()

### What is visible straight away is that, in both scatterplots, the individuals who had a stroke are concentrated in the lower-BMI regions, at high glucose levels, and at older ages.

In [None]:
# Number of rows per smoking-status category.
c = (
    df
    .groupby('smoking_status')['smoking_status']
    .count()
)

In [None]:
# Ring-shaped pie chart of the counts in `c`, labelled with the index values
# and annotated with each slice's percentage.
plt.pie(
    c,
    labels=c.index,
    autopct="%.2f%%",
    wedgeprops={'width': 0.3},
)
plt.show()

In [None]:
# split a dataset into train and test sets
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
# create dataset
# NOTE(review): this is synthetic data generated by make_blobs, not the stroke
# DataFrame `df` loaded above.
# Both random_state values are fixed so the generated points and the split are
# identical on every Restart & Run All.
X, y = make_blobs(n_samples=1000, random_state=0)
# split into train test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=0
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# Feature columns fed to the model, and the name of the target column.
parameters = [
    'gender',
    'age',
    'hypertension',
    'heart_disease',
    'bmi',
    'avg_glucose_level',
]
result = "stroke"

In [None]:
def binarize_gender(val):
    """Encode a gender value as 1 for male and 0 for anything else.

    Args:
        val: Raw label (``'Male'`` maps to 1) or an already encoded 0/1 value.

    Returns:
        int: 1 when ``val`` is ``'Male'`` or already 1, otherwise 0.

    Already encoded values pass through unchanged, so applying this function a
    second time to a column it has already encoded does not reset every row to 0.
    """
    # Accept the encoded 1 as well as the raw label to keep the mapping idempotent.
    return 1 if val in ('Male', 1) else 0

In [None]:
# Replace the gender column with its 0/1 encoding, element by element.
df['gender'] = df['gender'].map(binarize_gender)

In [None]:
# Depth-limited decision tree fitted on the selected feature columns of the
# stroke data. random_state is fixed (matching the `clf` tree in this notebook)
# so the fitted tree and its feature importances are the same on every run.
model = DecisionTreeClassifier(max_depth=5, random_state=0)
model.fit(df[parameters], df[result])

In [None]:
# Mean accuracy of the fitted tree.
# NOTE(review): this scores the same rows the model was fitted on, so it is
# training accuracy, not an estimate of performance on unseen data.
model.score(df[parameters], df[result])

In [None]:
# Split the stroke data itself, so the tree below is trained and queried on
# patient rows rather than on the synthetic make_blobs arrays.
# Stratify on the target so both splits keep the same stroke / no-stroke ratio;
# random_state fixes the split across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df[parameters],
    df[result],
    test_size=0.33,
    random_state=0,
    stratify=df[result],
)

# Step 3: Train the model on the data
clf = DecisionTreeClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)

# Step 4: Predict labels of unseen (test) data
clf.predict(X_test)

In [None]:
# Draw the fitted `clf` tree; the trailing semicolon hides the returned artist list.
tree.plot_tree(decision_tree=clf);

In [None]:
# Pair each feature name with the importance the fitted `model` assigned to it.
[(feature, weight) for feature, weight in zip(parameters, model.feature_importances_)]