In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))
# Apply the 'fivethirtyeight' matplotlib style to the figures drawn below
plt.style.use('fivethirtyeight')

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the stroke dataset from the Kaggle input bundle and preview the first rows
csv_path = '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Print each column's dtype and non-null count
data.info()

In [None]:
# Check the options for the categorical variables
class cate:
    """Inspect the distinct values of a set of categorical columns.

    Parameters
    ----------
    categories_list : list of str
        Names of the columns to inspect.
    frame : pandas.DataFrame, optional
        Frame the columns are read from. When omitted, the notebook-level
        ``data`` frame is looked up at call time, which is what the class
        did before this argument existed.
    """

    def __init__(self, categories_list, frame=None):
        self.categories_list = categories_list
        self.frame = frame

    def _source(self):
        """Return the frame to read from (explicit frame, else global ``data``)."""
        return data if self.frame is None else self.frame

    def options(self):
        """Return ``{column: [distinct values in order of first appearance]}``."""
        source = self._source()
        return {col: list(source[col].unique()) for col in self.categories_list}

    def numberic_cate(self):
        """Print, for every inspected column, how many distinct values it has."""
        for key, values in self.options().items():
            print("Number of unique values of the {} ：{} ".format(key, len(values)))
            
# Columns inspected as categorical: the text columns, the two 0/1 flags and the target
categories_list = [
    'gender', 'ever_married', 'work_type', 'Residence_type',
    'smoking_status', 'stroke', 'hypertension', 'heart_disease',
]
ca = cate(categories_list)
ca_options = ca.options()
ca_options

In [None]:
# Print how many distinct values each inspected column has
ca.numberic_cate()

In [None]:
# Drop the row identifier. errors='ignore' keeps this cell re-runnable:
# without it a second execution raises KeyError because 'id' is already gone.
data = data.drop(columns=['id'], errors='ignore')
data.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
data.describe()

1. **The `bmi` column contains missing values.**
2. **I fill them in with the column mean.**

In [None]:
# pd.read_csv already turns missing markers into real NaN, so replacing the
# string 'NaN' did nothing. Coerce to numeric defensively (any stray text
# becomes NaN), then impute the gaps with the column mean via fillna.
bmi_numeric = pd.to_numeric(data["bmi"], errors="coerce")
data["bmi"] = bmi_numeric.fillna(bmi_numeric.mean())
data.info()

In [None]:
# Row counts per gender, split by stroke outcome
sns.countplot(data=data, x='gender', hue='stroke')

In [None]:
# Number of rows whose gender is 'Other'
int((data['gender'] == 'Other').sum())

# Deleting this row of data has little effect

In [None]:
# Keep every row whose gender is not 'Other', then confirm that none remain
data = data[data['gender'] != 'Other'].copy()
int((data['gender'] == 'Other').sum())

In [None]:
# Row counts per marital status, split by stroke outcome
sns.countplot(data=data, x='ever_married', hue='stroke')

In [None]:
# Wider figure for the work_type breakdown, split by stroke outcome
plt.figure(figsize=(14, 6))
sns.countplot(data=data, x='work_type', hue='stroke')

In [None]:
# Row counts per residence type, split by stroke outcome
sns.countplot(data=data, x='Residence_type', hue='stroke')

In [None]:
# Wider figure for the smoking_status breakdown, split by stroke outcome
plt.figure(figsize=(14, 6))
sns.countplot(data=data, x='smoking_status', hue='stroke')

In [None]:
# Row counts for each value of the target column
sns.countplot(data=data, x='stroke')

# Calculate the ratio of stroke equal to 0 and 1
# Unbalanced data classification

In [None]:
# Fraction of rows belonging to each target class
stroke_total = len(data["stroke"])
stroke_0 = int((data["stroke"] == 0).sum()) / stroke_total
stroke_1 = int((data["stroke"] == 1).sum()) / stroke_total
print('stroke_0 / stroke_total: {}'.format(stroke_0))
print('stroke_1 / stroke_total: {}'.format(stroke_1))

In [None]:
# Correlate numeric columns only. At this point the frame still holds
# string-valued columns (gender, work_type, ...), and DataFrame.corr()
# raises on those in pandas >= 2.0 instead of silently skipping them.
numeric_cols = data.select_dtypes(include='number')
sns.heatmap(numeric_cols.corr(), cbar=False, cmap='BuGn', annot=True)

In [None]:
# Draw one histogram per numeric column on a 10x10 inch figure;
# the value returned by hist() is kept in p.
p=data.hist(figsize = (10,10))

# data processing
* About the Data:
* id: unique identifier
* gender: "Male", "Female" or "Other"
* age: age of the patient
* hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension
* heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease
* ever_married: "No" or "Yes"
* work_type: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"
* Residence_type: "Rural" or "Urban"
* avg_glucose_level: average glucose level in blood
* bmi: body mass index
* smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"
* stroke: 1 if the patient had a stroke or 0 if not
* Note: "Unknown" in smoking_status means that the information is unavailable for this patient

In [None]:
# Preview the first rows of the frame before encoding
data.head()

**We need to one-hot encode 7 categorical columns:**
*  gender	
*  hypertension	
* heart_disease	
* ever_married	
*  work_type	
* Residence_type	
*  smoking_status


In [None]:
# Turn the 0/1 flags into descriptive labels so that each value can later
# serve as the name of its own indicator column.
flag_labels = {
    'hypertension': {0: 'no_hypertension', 1: 'yes_hypertension'},
    'heart_disease': {0: 'no_heart_disease', 1: 'yes_heart_disease'},
}
for flag, labels in flag_labels.items():
    data[flag] = data[flag].replace(labels)
data.head()

In [None]:
#sample
# gender = data.pop('gender')
# data['Male'] = (gender == 'Male') * 1
# data['Female'] = (gender == 'Female') * 1

class one_hot:
    """One-hot encode categorical columns of a frame in place.

    Each listed column is removed from the frame and replaced by one 0/1
    indicator column per distinct value, named after the value itself.

    Parameters
    ----------
    col_ : list of str
        Names of the columns to encode.
    """

    def __init__(self, col_):
        self.col_ = col_

    def pop_trans_data(self, frame=None):
        """Encode the configured columns, mutating the frame.

        Parameters
        ----------
        frame : pandas.DataFrame, optional
            Frame to encode. When omitted, the notebook-level ``data`` frame
            is used, as before this argument existed.

        Raises
        ------
        KeyError
            If a configured column is not present in the frame.
        ValueError
            If a category value would become a column name that is already
            taken (by an untouched column, a column still waiting to be
            encoded, or an indicator produced for another value). Previously
            such a clash silently overwrote the existing column.
        """
        target = data if frame is None else frame

        # Collect every category first so that all validation happens before
        # the frame is touched; a failure then leaves the frame unchanged.
        levels_by_col = {col: list(target[col].unique()) for col in self.col_}

        taken = set(target.columns)
        for col, levels in levels_by_col.items():
            # The source column is popped before its indicators are written,
            # so its own name becomes free again at this point.
            taken.discard(col)
            for level in levels:
                if level in taken:
                    raise ValueError(
                        "category {!r} of column {!r} clashes with an existing "
                        "column name".format(level, col)
                    )
                taken.add(level)

        for col, levels in levels_by_col.items():
            values = target.pop(col)
            for level in levels:
                target[level] = (values == level).astype('int64')
# Columns to replace with 0/1 indicator columns
col_ = [
    'gender', 'hypertension', 'heart_disease', 'ever_married',
    'work_type', 'Residence_type', 'smoking_status',
]
hot = one_hot(col_)
hot.pop_trans_data()
data.head()

In [None]:
#standardization-----age avg_glucose_level  bmi
class standard:
    """Z-score standardise numeric columns of a frame in place.

    Parameters
    ----------
    data_v : list of str
        Names of the columns to standardise.
    """

    def __init__(self, data_v):
        self.data_v = data_v

    def trans(self, frame=None):
        """Replace each configured column with ``(x - mean) / std``.

        ``std`` is whatever ``Series.std()`` returns (pandas' default sample
        standard deviation). A column whose std is zero or undefined is only
        centred; dividing would turn the whole column into NaN.

        Parameters
        ----------
        frame : pandas.DataFrame, optional
            Frame to transform. When omitted, the notebook-level ``data``
            frame is used, as before this argument existed.
        """
        target = data if frame is None else frame
        for col in self.data_v:
            centred = target[col] - target[col].mean()
            spread = target[col].std()
            # `spread > 0` is False for both 0 and NaN, which are exactly
            # the cases where the division must be skipped.
            target[col] = centred / spread if spread > 0 else centred
# Continuous columns to put on a common scale
data_v = ['age', 'avg_glucose_level', 'bmi']
sta = standard(data_v)
sta.trans()
data.head()

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from collections import Counter
from sklearn.model_selection import train_test_split

In [None]:
# Separate the target from the features, then let SMOTE resample the whole dataset
Y = data['stroke']
X = data.drop(columns=['stroke'])
smo = SMOTE(random_state=10)
X_smo, Y_smo = smo.fit_resample(X, Y)

In [None]:
# Split the ORIGINAL rows first, so the test set contains only real patients.
# Splitting the SMOTE output instead puts synthetic rows (interpolated from
# training neighbours) into the test set, which leaks information and
# inflates the scores. stratify=Y keeps the rare positive class represented
# in both partitions.
xtrain_raw, xtest, ytrain_raw, ytest = train_test_split(
    X, Y, test_size=0.3, random_state=0, stratify=Y
)
# Oversample the minority class in the training partition only
xtrain, ytrain = smo.fit_resample(xtrain_raw, ytrain_raw)

In [None]:
# Fit a logistic-regression model and score it on the held-out split
LR = LogisticRegression(C=1.0, tol=0.01)
LR.fit(xtrain, ytrain)
lr_pred = LR.predict(xtest)  # predicted once, reused for both metrics
LR_a = accuracy_score(ytest, lr_pred)
print(LR_a)
f1_a = f1_score(ytest, lr_pred)
print(f1_a)

In [None]:
# Fit a small gradient-boosted model and score it on the held-out split
xgb = XGBClassifier(learning_rate=0.05, n_estimators=10, random_state=20)
xgb.fit(xtrain, ytrain)
xgb_pred = xgb.predict(xtest)  # predicted once, reused for both metrics
xgb_score = accuracy_score(ytest, xgb_pred)
xgb_f1_score = f1_score(ytest, xgb_pred)
print(xgb_score)
print(xgb_f1_score)

In [None]:
# Accuracy of the two models, one line each
print(f'LogisticRegression--accuracy_score:{LR_a}')
print(f'XGBClassifier--accuracy_score:{xgb_score}')