In [2]:
import numpy as np # linear algebra
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings

In [3]:
# Read the stroke dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='healthcare-dataset-stroke-data.csv')

# Preview the first three rows as the cell's rich output.
df.head(n=3)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1


In [4]:
# Show the dtype pandas inferred for every column.
column_dtypes = df.dtypes
print(column_dtypes)

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object


# Calculate the initial memory usage

In [43]:
# Per-column memory footprint in bytes; deep=True also counts the contents of
# object (string) columns rather than just the pointers to them.
bytes_per_column = df.memory_usage(deep=True)
initial_memory = bytes_per_column.sum()
print(f"Initial memory usage: {initial_memory}")

Initial memory usage: 206230


In [6]:
# Text columns to store as pandas categoricals instead of object dtype.
categorical_columns = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]
converted_categoricals = df[categorical_columns].astype('category')
df[categorical_columns] = converted_categoricals


In [7]:
# 0/1 indicator columns to store as bool instead of int64.
boolean_columns = [
    'hypertension',
    'heart_disease',
    'stroke',
]
converted_flags = df[boolean_columns].astype('bool')
df[boolean_columns] = converted_flags


# Calculate the memory usage differences

In [8]:
# Re-measure the frame after the dtype conversions and report the difference
# against the value captured in `initial_memory`.
bytes_per_column_after = df.memory_usage(deep=True)
final_memory = bytes_per_column_after.sum()
print(f"Final memory usage: {final_memory}")
print(f"Memory saved: {initial_memory - final_memory}")


Final memory usage: 206230
Memory saved: 1695623


# Explore statistical facts

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) as produced by
# DataFrame.describe() with its default column selection.
summary_statistics = df.describe()
print(summary_statistics)


                 id          age  avg_glucose_level          bmi
count   5110.000000  5110.000000        5110.000000  4909.000000
mean   36517.829354    43.226614         106.147677    28.893237
std    21161.721625    22.612647          45.283560     7.854067
min       67.000000     0.080000          55.120000    10.300000
25%    17741.250000    25.000000          77.245000    23.500000
50%    36932.000000    45.000000          91.885000    28.100000
75%    54682.000000    61.000000         114.090000    33.100000
max    72940.000000    82.000000         271.740000    97.600000


### Percentiles

In [47]:

# Percentile levels reported for each numeric column.
percentile_levels = [10, 20, 30, 40, 60, 70, 80, 90]

# Use np.nanpercentile so missing values are ignored. 'bmi' still contains NaNs
# at this point when the notebook is run top to bottom (it is only imputed in a
# later cell), and np.percentile would return NaN for every level in that case.
# For columns without NaNs the result is identical to np.percentile.
percentiles_custom_age = np.nanpercentile(df['age'], percentile_levels)
percentiles_custom_glucose = np.nanpercentile(df['avg_glucose_level'], percentile_levels)
percentiles_custom_bmi = np.nanpercentile(df['bmi'], percentile_levels)

# Share of rows flagged True/1, expressed as a percentage of all rows.
percent_hypertension_true = (df['hypertension'].sum() / len(df['hypertension'])) * 100
percent_heart_disease_true = (df['heart_disease'].sum() / len(df['heart_disease'])) * 100

# Print the calculated percentiles
print("Percentiles for Age:", percentiles_custom_age)
print("Percentiles for Average Glucose Level:", percentiles_custom_glucose)
print("Percentiles for BMI:", percentiles_custom_bmi)
print("Percentage of people with hypertension:", percent_hypertension_true)
print("Percentage of people with heart disease:", percent_heart_disease_true)



Percentiles for Age: [11. 20. 30. 38. 51. 57. 65. 75.]
Percentiles for Average Glucose Level: [ 65.789  73.76   80.038  85.6    98.914 108.516 124.16  192.181]
Percentiles for BMI: [19.8 22.6 24.7 26.6 29.9 31.8 34.3 38.7]
Percentage of people with hypertension: 9.74559686888454
Percentage of people with heart disease: 5.401174168297456


## Median (central tendency)

In [48]:
# Median of each column of interest, computed and reported one column at a time.
# Series.median() skips missing values by default.
median_age = df['age'].median()
print(f"Median Age: {median_age}")

median_glucose_level = df['avg_glucose_level'].median()
print(f"Median Avg. Glucose Level: {median_glucose_level}")

median_bmi = df['bmi'].median()
print(f"Median BMI: {median_bmi}")

median_hypertension = df['hypertension'].median()
print(f"Median Hypertension: {median_hypertension}")

median_heart_disease = df['heart_disease'].median()
print(f"Median Heart Disease: {median_heart_disease}")


Median Age: 45.0
Median Avg. Glucose Level: 91.88499999999999
Median BMI: 28.3
Median Hypertension: 0.0
Median Heart Disease: 0.0


## Mean 

In [49]:
# Mean of each column of interest, computed and reported one column at a time.
# Series.mean() skips missing values by default.
mean_age = df['age'].mean()
print(f"Mean Age: {mean_age}")

mean_avg_glucose_level = df['avg_glucose_level'].mean()
print(f"Mean Avg. Glucose Level: {mean_avg_glucose_level}")

mean_bmi = df['bmi'].mean()
print(f"Mean BMI: {mean_bmi}")

mean_hypertension = df['hypertension'].mean()
print(f"Mean Hypertension: {mean_hypertension}")

mean_heart_disease = df['heart_disease'].mean()
print(f"Mean Heart Disease: {mean_heart_disease}")


Mean Age: 43.226614481409
Mean Avg. Glucose Level: 106.1476771037182
Mean BMI: 28.91841033735874
Mean Hypertension: 0.0974559686888454
Mean Heart Disease: 0.05401174168297456


# Missing values

In [10]:
# Number of missing entries in each column (isna is the same check as isnull).
df.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

# Impute missing BMI values: fit a decision tree regressor on the rows where
# BMI is known (features: age and gender) and predict BMI for the rows where
# it is missing.

# Integer codes for the gender labels; any label not listed is encoded as -1.
gender_codes = {'Male': 0, 'Female': 1, 'Other': -1}


def _bmi_features(frame):
    """Return a numeric copy of the age/gender features used by the BMI model.

    The same encoding must be applied to the training rows and to the rows
    being predicted: StandardScaler cannot fit on string labels, and the model
    would be meaningless if the two sets were encoded differently.
    """
    features = frame[['age', 'gender']].copy()
    # Go through str so the lookup behaves the same for object and category
    # dtypes; missing or unknown labels fall back to -1.
    features['gender'] = (
        features['gender']
        .astype(str)
        .map(lambda label: gender_codes.get(label, -1))
        .astype('int8')
    )
    return features


# Define the Decision Tree regressor pipeline.
# The step name 'lr' is kept as-is so existing named_steps lookups keep working,
# even though the estimator is a decision tree.
DT_bmi_pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('lr', DecisionTreeRegressor(random_state=42)),
])

# Identify rows with missing BMI values
Missing = df[df['bmi'].isna()]

# Only fit and predict when there is something to impute.
if not Missing.empty:
    known_bmi_rows = df[df['bmi'].notna()]

    # Features for the rows to impute and for training, encoded identically.
    X_missing = _bmi_features(Missing)
    X_train = _bmi_features(known_bmi_rows)
    y_train = known_bmi_rows['bmi']

    # Fit the pipeline on the rows with a known BMI
    DT_bmi_pipe.fit(X_train, y_train)

    # Predict missing BMI values
    predicted_bmi = DT_bmi_pipe.predict(X_missing)

    # Write the predictions back into the original DataFrame, aligned by index
    df.loc[Missing.index, 'bmi'] = predicted_bmi
else:
    print("No missing BMI values found.")


No missing BMI values found.
