#  Data Importing Or Loading
# ----------------------------------------

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv(r"C:\Users\Srinivas\Downloads\Data Analytics\Python\healthcare-dataset-stroke-data.csv")
print(df)

         id  gender   age  hypertension  heart_disease ever_married  \
0      9046    Male  67.0             0              1          Yes   
1     51676  Female  61.0             0              0          Yes   
2     31112    Male  80.0             0              1          Yes   
3     60182  Female  49.0             0              0          Yes   
4      1665  Female  79.0             1              0          Yes   
...     ...     ...   ...           ...            ...          ...   
5105  18234  Female  80.0             1              0          Yes   
5106  44873  Female  81.0             0              0          Yes   
5107  19723  Female  35.0             0              0          Yes   
5108  37544    Male  51.0             0              0          Yes   
5109  44679  Female  44.0             0              0          Yes   

          work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
0           Private          Urban             228.69  36.6  former

#  Basic Data Inspection 
# ----------------------------------------

In [4]:
print(df.head())

      id  gender   age  hypertension  heart_disease ever_married  \
0   9046    Male  67.0             0              1          Yes   
1  51676  Female  61.0             0              0          Yes   
2  31112    Male  80.0             0              1          Yes   
3  60182  Female  49.0             0              0          Yes   
4   1665  Female  79.0             1              0          Yes   

       work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
0        Private          Urban             228.69  36.6  formerly smoked   
1  Self-employed          Rural             202.21   NaN     never smoked   
2        Private          Rural             105.92  32.5     never smoked   
3        Private          Urban             171.23  34.4           smokes   
4  Self-employed          Rural             174.12  24.0     never smoked   

   stroke  
0       1  
1       1  
2       1  
3       1  
4       1  


In [5]:
print(df.tail())

         id  gender   age  hypertension  heart_disease ever_married  \
5105  18234  Female  80.0             1              0          Yes   
5106  44873  Female  81.0             0              0          Yes   
5107  19723  Female  35.0             0              0          Yes   
5108  37544    Male  51.0             0              0          Yes   
5109  44679  Female  44.0             0              0          Yes   

          work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
5105        Private          Urban              83.75   NaN     never smoked   
5106  Self-employed          Urban             125.20  40.0     never smoked   
5107  Self-employed          Rural              82.99  30.6     never smoked   
5108        Private          Rural             166.29  25.6  formerly smoked   
5109       Govt_job          Urban              85.28  26.2          Unknown   

      stroke  
5105       0  
5106       0  
5107       0  
5108       0  
5109       0  


In [6]:
print(df.shape)

(5110, 12)


In [7]:
print(df.columns)

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')


In [8]:
print(df.dtypes)

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object


In [9]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB
None


In [10]:
print(df.describe())

                 id          age  hypertension  heart_disease  \
count   5110.000000  5110.000000   5110.000000    5110.000000   
mean   36517.829354    43.226614      0.097456       0.054012   
std    21161.721625    22.612647      0.296607       0.226063   
min       67.000000     0.080000      0.000000       0.000000   
25%    17741.250000    25.000000      0.000000       0.000000   
50%    36932.000000    45.000000      0.000000       0.000000   
75%    54682.000000    61.000000      0.000000       0.000000   
max    72940.000000    82.000000      1.000000       1.000000   

       avg_glucose_level          bmi       stroke  
count        5110.000000  4909.000000  5110.000000  
mean          106.147677    28.893237     0.048728  
std            45.283560     7.854067     0.215320  
min            55.120000    10.300000     0.000000  
25%            77.245000    23.500000     0.000000  
50%            91.885000    28.100000     0.000000  
75%           114.090000    33.100000     0

#  Cleaning Data 
# ----------------------------------------

In [11]:
print(df.isnull().sum())

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64


In [12]:
df.drop_duplicates(inplace=True)

In [13]:
df = df[df['gender'] != 'Other']
print(df['gender'])

0         Male
1       Female
2         Male
3       Female
4       Female
         ...  
5105    Female
5106    Female
5107    Female
5108      Male
5109    Female
Name: gender, Length: 5109, dtype: object


In [14]:
df = df[df['age'] >= 2]
print(df)

         id  gender   age  hypertension  heart_disease ever_married  \
0      9046    Male  67.0             0              1          Yes   
1     51676  Female  61.0             0              0          Yes   
2     31112    Male  80.0             0              1          Yes   
3     60182  Female  49.0             0              0          Yes   
4      1665  Female  79.0             1              0          Yes   
...     ...     ...   ...           ...            ...          ...   
5105  18234  Female  80.0             1              0          Yes   
5106  44873  Female  81.0             0              0          Yes   
5107  19723  Female  35.0             0              0          Yes   
5108  37544    Male  51.0             0              0          Yes   
5109  44679  Female  44.0             0              0          Yes   

          work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
0           Private          Urban             228.69  36.6  former

In [15]:
df = df[df['smoking_status'] != 'Unknown']
print(df)

         id  gender   age  hypertension  heart_disease ever_married  \
0      9046    Male  67.0             0              1          Yes   
1     51676  Female  61.0             0              0          Yes   
2     31112    Male  80.0             0              1          Yes   
3     60182  Female  49.0             0              0          Yes   
4      1665  Female  79.0             1              0          Yes   
...     ...     ...   ...           ...            ...          ...   
5102  45010  Female  57.0             0              0          Yes   
5105  18234  Female  80.0             1              0          Yes   
5106  44873  Female  81.0             0              0          Yes   
5107  19723  Female  35.0             0              0          Yes   
5108  37544    Male  51.0             0              0          Yes   

          work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
0           Private          Urban             228.69  36.6  former

In [16]:
df = df[df['bmi'] != 'N/A']
print(df)

         id  gender   age  hypertension  heart_disease ever_married  \
0      9046    Male  67.0             0              1          Yes   
1     51676  Female  61.0             0              0          Yes   
2     31112    Male  80.0             0              1          Yes   
3     60182  Female  49.0             0              0          Yes   
4      1665  Female  79.0             1              0          Yes   
...     ...     ...   ...           ...            ...          ...   
5102  45010  Female  57.0             0              0          Yes   
5105  18234  Female  80.0             1              0          Yes   
5106  44873  Female  81.0             0              0          Yes   
5107  19723  Female  35.0             0              0          Yes   
5108  37544    Male  51.0             0              0          Yes   

          work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
0           Private          Urban             228.69  36.6  former

In [17]:
print(df['bmi'].unique())

[36.6  nan 32.5 34.4 24.  29.  27.4 22.8 29.7 36.8 27.3 28.2 30.9 37.5
 25.8 37.8 22.4 48.9 26.6 27.2 23.5 28.3 44.2 22.2 30.5 26.5 33.7 23.1
 29.9 23.9 28.5 26.4 20.2 33.6 38.6 39.2 27.7 31.4 36.5 33.2 32.8 40.4
 25.3 30.2 47.5 30.  28.9 28.1 31.1 21.7 27.  24.1 22.9 29.1 32.3 41.1
 29.8 26.3 29.4 24.4 28.  34.6 30.3 24.2 41.5 22.6 56.6 31.3 31.  31.7
 35.8 28.4 20.1 26.7 38.7 25.  23.8 21.8 27.5 24.6 32.9 26.1 31.9 34.1
 25.6 36.9 37.3 45.7 34.2 23.6 22.3 37.1 45.  25.5 30.8 32.  37.4 34.5
 27.9 29.5 46.  42.5 35.5 26.9 45.5 31.5 33.  23.4 30.7 20.5 21.5 27.1
 40.  28.6 42.2 29.6 35.4 16.9 32.6 35.9 21.2 42.4 40.5 29.3 17.7 54.6
 22.  39.4 19.7 22.5 25.2 60.9 23.7 24.5 31.2 25.1 36.  26.8 34.9 35.3
 36.7 34.3 27.6 24.3 40.1 21.9 38.4 25.9 54.7 24.9 19.4 48.2 20.7 39.5
 23.3 64.8 43.6 21.  47.3 31.6 21.6 35.6 41.8 41.9 29.2 37.9 44.6 39.6
 40.3 24.8 36.1 36.3 25.4 46.5 46.6 26.2 31.8 15.3 38.2 23.2 45.2 49.8
 27.8 60.2 23.  51.  39.7 34.7 21.3 41.2 34.8 35.7 21.4 40.8 41.6 32.4
 34.  

# EDA (Exploratory Data Analysis)
# KPIs (Key Performance Indicators)

In [None]:
# Total Stroke Cases
total_stroke_cases = df['stroke'].sum()
print(total_stroke_cases)

202


In [None]:
# Stroke rate
stroke_rate = df['stroke'].mean() * 100
print(stroke_rate)

5.666199158485274


In [22]:
# Average age
Average_age = df['age'].mean()
print(Average_age)

48.86030855539972


In [23]:
# Median age of stroke patients
Avg_age_stroke_patient = df[df['stroke'] == 1]['age'].median()
print(Avg_age_stroke_patient)

71.0


In [31]:
# Stroke rate among heart disease patients
Stroke_rate_heart_disease = df[df['heart_disease'] == 1]['stroke'].mean() * 100
print(Stroke_rate_heart_disease)

17.105263157894736


In [32]:
# Stroke rate among hypertensives
Stroke_rate_hypertension = df[df['hypertension'] == 1]['stroke'].mean() * 100
print(Stroke_rate_hypertension)

13.901345291479823


In [29]:
# Avg glucose in stroke patients
Avg_glucose = df[df['stroke'] == 1]['avg_glucose_level'].mean()
print(Avg_glucose)

134.34465346534654


In [26]:
# Stroke rate by smoking status
Stroke_rate_smoking = df.groupby('smoking_status')['stroke'].mean() * 100
print(Stroke_rate_smoking)

smoking_status
formerly smoked    7.918552
never smoked       4.756871
smokes             5.323194
Name: stroke, dtype: float64


In [28]:
# Current smokers %
Current_smokers = (df['smoking_status'] == 'smokes').mean() * 100
print(Current_smokers)

22.131837307152875


In [30]:
# Stroke rate by marital status
Stroke_rate_marital_status = df.groupby('ever_married')['stroke'].mean() * 100
print(Stroke_rate_marital_status)

ever_married
No     2.456140
Yes    6.678967
Name: stroke, dtype: float64


In [24]:
# Stroke rate by work type
Stroke_rate_work_type = df.groupby('work_type')['stroke'].mean() * 100
print(Stroke_rate_work_type)

work_type
Govt_job         4.672897
Never_worked     0.000000
Private          5.253940
Self-employed    8.597285
children         0.000000
Name: stroke, dtype: float64


In [25]:
# Unemployment rate
Unemployment_rate = df['work_type'].isin(['children', 'Never_worked']).mean() * 100
print(Unemployment_rate)

2.3281907433380082
