In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('healthcare-dataset-stroke-data.csv')

In [4]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [6]:
df.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

BMI missing rows filled by median value from rest of column

In [7]:
# Fill missing 'bmi' values with the median (rounded to 2 decimal places)
df['bmi'] = df['bmi'].fillna(round(df['bmi'].median(), 2))

# Check total missing values in each column
print(df.isnull().sum())


id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                5110 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [9]:
import plotly.express as px
fig = px.histogram(df, x="age")
fig.show()


In [10]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Create subplot grid (4 rows, 2 cols)
fig = make_subplots(
    rows=4, cols=2,
    subplot_titles=("Age", "Gender", "Hypertension", "Heart Disease",
                    "Ever Married", "Work type", "Residence Type", "Smoking Status")
)

# Add histograms to subplots
fig.add_trace(go.Histogram(x=df['age']), row=1, col=1)
fig.add_trace(go.Histogram(x=df['gender']), row=1, col=2)
fig.add_trace(go.Histogram(x=df['hypertension']), row=2, col=1)
fig.add_trace(go.Histogram(x=df['heart_disease']), row=2, col=2)
fig.add_trace(go.Histogram(x=df['ever_married']), row=3, col=1)
fig.add_trace(go.Histogram(x=df['work_type']), row=3, col=2)
fig.add_trace(go.Histogram(x=df['Residence_type']), row=4, col=1)
fig.add_trace(go.Histogram(x=df['smoking_status']), row=4, col=2)

# Layout update
fig.update_layout(
    height=1200, width=800,
    title_text="Data Counts",
    showlegend=False
)

# Show figure
fig.show()


In [11]:
df['gender'].value_counts()

gender
Female    2994
Male      2115
Other        1
Name: count, dtype: int64

In [12]:
# Drop rows where gender == 'Other'
df.drop(df[df['gender'] == 'Other'].index, inplace=True)

# Check remaining unique values in gender column
df['gender'].unique()


array(['Male', 'Female'], dtype=object)

In [13]:
fig = px.histogram(df, x="bmi")
fig.show()

Get rid of outliers

In [14]:
df.drop(df[df['bmi']>50].index, inplace=True)

Plots

In [17]:
fig = px.histogram(df, x="bmi")
fig.show()

In [19]:
import plotly.express as px
fig = px.histogram(df, x="stroke")
fig.show()



In [21]:
# Group by gender and sum stroke cases
gender = df.groupby(df['gender'])['stroke'].sum()

# Create a new DataFrame for plotting
data_gender = pd.DataFrame({
    'labels': gender.index,
    'values': gender.values
})

# Plot pie chart
fig = px.pie(
    data_gender,
    values='values',
    names='labels',
    title='Stroke by Gender',
    color=['Female', 'Male'],
    color_discrete_map={'Female': 'lightpink', 'Male': 'lightskyblue'}
)

# Show figure
fig.show()

In [22]:
work = df.groupby(df['work_type'])['stroke'].sum()
data_work = pd.DataFrame({'labels': work.index, 'values': work.values})

fig = px.pie(data_work, values='values', names='labels', title='Stroke by Work Type')
fig.show()

In [23]:
smoke = df.groupby(df['smoking_status'])['stroke'].sum()
data_smoke = pd.DataFrame({'labels': smoke.index, 'values': smoke.values})

fig = px.pie(data_smoke, values='values', names='labels', title='Stroke by Smoking Status')
fig.show()


In [24]:
residence = df.groupby(df['Residence_type'])['stroke'].sum()
data_res = pd.DataFrame({'labels': residence.index, 'values': residence.values})

fig = px.pie(data_res, values='values', names='labels', title='Stroke by Residence Type')
fig.show()


In [25]:
married = df.groupby(df['ever_married'])['stroke'].sum()
data_marriage = pd.DataFrame({'labels': married.index, 'values': married.values})

fig = px.pie(data_marriage, values='values', names='labels', title='Stroke by Marital Status')
fig.show()


In [27]:
df = df.drop(['Id', 'Residence_type', 'work_type'], axis=1, errors='ignore')



In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5030 entries, 0 to 5109
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5030 non-null   int64  
 1   gender             5030 non-null   object 
 2   age                5030 non-null   float64
 3   hypertension       5030 non-null   int64  
 4   heart_disease      5030 non-null   int64  
 5   ever_married       5030 non-null   object 
 6   avg_glucose_level  5030 non-null   float64
 7   bmi                5030 non-null   float64
 8   smoking_status     5030 non-null   object 
 9   stroke             5030 non-null   int64  
dtypes: float64(3), int64(4), object(3)
memory usage: 432.3+ KB


In [32]:
df['ever_married'].unique()
df['ever_married'].value_counts()




ever_married
1    3289
0    1741
Name: count, dtype: int64

In [33]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()
df['ever_married'] = label_encoder.fit_transform(df['ever_married'])

df['ever_married'].unique()
df['ever_married'].value_counts()


ever_married
1    3289
0    1741
Name: count, dtype: int64