In [2]:
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score

In [3]:
raw_df = df = pd.read_csv('./Dataset/train.csv')
raw_df.drop(columns=['id'],inplace=True)

### Delete data with unknown 'age'

In [4]:
df = raw_df.copy()
df = df.loc[df.age.notnull() & df.hypertension.notnull()]
df = df.fillna(np.mean(df['bmi']))
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,blood,height,stroke
1,Male,55.0,0.0,0.0,Yes,Private,Urban,89.17,28.91703,never smoked,B,1.59000,0
2,Female,42.0,0.0,0.0,No,Private,Unknown,98.53,18.50000,never smoked,B,28.91703,0
3,Female,31.0,0.0,0.0,No,Private,Urban,108.89,52.30000,Unknown,O,1.24000,0
4,Female,38.0,0.0,0.0,Yes,Private,Unknown,91.44,28.91703,Unknown,O,28.91703,0
5,Female,24.0,0.0,0.0,No,Private,Urban,97.55,26.20000,never smoked,B,1.75000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,Male,16.0,0.0,0.0,No,Private,Unknown,99.49,22.00000,Unknown,B,28.91703,0
3996,Male,47.0,0.0,0.0,Yes,Private,Rural,75.30,25.00000,formerly smoked,B,1.79000,0
3997,Other,57.0,1.0,0.0,Yes,Private,Rural,129.54,60.90000,smokes,AB,1.15000,0
3998,Female,45.0,0.0,0.0,Yes,Private,Unknown,172.33,45.30000,formerly smoked,O,1.33000,0


In [5]:
df['gender'].value_counts()

Female    2024
Male      1482
Other      157
Name: gender, dtype: int64

In [6]:
res = smf.ols('stroke ~ avg_glucose_level + hypertension', data=df).fit()
res.summary()

0,1,2,3
Dep. Variable:,stroke,R-squared:,0.02
Model:,OLS,Adj. R-squared:,0.019
Method:,Least Squares,F-statistic:,37.07
Date:,"Tue, 18 May 2021",Prob (F-statistic):,1.15e-16
Time:,12:46:54,Log-Likelihood:,1614.2
No. Observations:,3663,AIC:,-3222.0
Df Residuals:,3660,BIC:,-3204.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0179,0.006,-2.898,0.004,-0.030,-0.006
avg_glucose_level,0.0004,5.56e-05,7.033,0.000,0.000,0.000
hypertension,0.0333,0.009,3.792,0.000,0.016,0.051

0,1,2,3
Omnibus:,4013.89,Durbin-Watson:,1.998
Prob(Omnibus):,0.0,Jarque-Bera (JB):,188750.784
Skew:,5.859,Prob(JB):,0.0
Kurtosis:,36.157,Cond. No.,385.0


In [7]:
col = df.columns
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3663 entries, 1 to 3999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             3663 non-null   object 
 1   age                3663 non-null   float64
 2   hypertension       3663 non-null   float64
 3   heart_disease      3663 non-null   float64
 4   ever_married       3663 non-null   object 
 5   work_type          3663 non-null   object 
 6   Residence_type     3663 non-null   object 
 7   avg_glucose_level  3663 non-null   float64
 8   bmi                3663 non-null   float64
 9   smoking_status     3663 non-null   object 
 10  blood              3663 non-null   object 
 11  height             3663 non-null   float64
 12  stroke             3663 non-null   int64  
dtypes: float64(6), int64(1), object(6)
memory usage: 400.6+ KB


In [8]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,blood,height,stroke
1,Male,55.0,0.0,0.0,Yes,Private,Urban,89.17,28.91703,never smoked,B,1.59,0
2,Female,42.0,0.0,0.0,No,Private,Unknown,98.53,18.5,never smoked,B,28.91703,0
3,Female,31.0,0.0,0.0,No,Private,Urban,108.89,52.3,Unknown,O,1.24,0
4,Female,38.0,0.0,0.0,Yes,Private,Unknown,91.44,28.91703,Unknown,O,28.91703,0
5,Female,24.0,0.0,0.0,No,Private,Urban,97.55,26.2,never smoked,B,1.75,0


In [9]:
df.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,height,stroke
count,3663.0,3663.0,3663.0,3663.0,3663.0,3663.0,3663.0
mean,42.927808,0.097734,0.68533,102.554699,28.91703,8.931781,0.025389
std,22.3942,0.296995,4.225102,46.93789,6.626813,12.097632,0.157325
min,0.08,0.0,0.0,28.91703,11.3,0.91,0.0
25%,25.0,0.0,0.0,74.795,25.6,1.56,0.0
50%,45.0,0.0,0.0,90.55,28.91703,1.7,0.0
75%,60.0,0.0,0.0,112.58,30.7,28.91703,0.0
max,82.0,1.0,28.91703,267.76,97.6,28.91703,1.0


In [10]:
def draw_missing_data_table(data):
    total = data.isnull().sum().sort_values(ascending=False)
    percent = (data.isnull().sum()/data.isnull().count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    return missing_data
draw_missing_data_table(df)

Unnamed: 0,Total,Percent
gender,0,0.0
age,0,0.0
hypertension,0,0.0
heart_disease,0,0.0
ever_married,0,0.0
work_type,0,0.0
Residence_type,0,0.0
avg_glucose_level,0,0.0
bmi,0,0.0
smoking_status,0,0.0


In [11]:
missing_data = ['bmi','height','hypertension','avg_glucose_level','age','heart_disease','blood']
data_numerical = df[['age','avg_glucose_level','bmi']]
data_categorical = df[['gender', 'hypertension', 'heart_disease', 'ever_married','work_type', 'Residence_type',
                       'smoking_status', 'stroke']]