In [1]:
# First we install the pandas library

!pip install pandas




In [3]:
# Now it's time to import our library

import pandas as pd

# Location of the stroke dataset CSV. This is a hard-coded absolute path
# (presumably the Colab upload directory — adjust it when running elsewhere).
file_path = "/content/healthcare-dataset-stroke-data.csv"
# Read the CSV into the DataFrame used by the rest of the notebook.
df = pd.read_csv(file_path)




In [44]:
# Let's see our Data

df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,1,67.0,0,1,1,2,1,228.69,36.600000,1,1
1,51676,0,61.0,0,0,1,3,0,202.21,28.893237,2,1
2,31112,1,80.0,0,1,1,2,0,105.92,32.500000,2,1
3,60182,0,49.0,0,0,1,2,1,171.23,34.400000,3,1
4,1665,0,79.0,1,0,1,3,0,174.12,24.000000,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,0,80.0,1,0,1,2,1,83.75,28.893237,2,0
5106,44873,0,81.0,0,0,1,3,1,125.20,40.000000,2,0
5107,19723,0,35.0,0,0,1,3,0,82.99,30.600000,2,0
5108,37544,1,51.0,0,0,1,2,0,166.29,25.600000,1,0


In [8]:
# Now we define a function that reports the data type of each column

def data_types(df):
    """Return a dictionary mapping each column label of ``df`` to its dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame whose column types should be reported.

    Returns
    -------
    dict
        ``{column label: dtype}`` in the frame's column order.
    """
    return {label: dtype for label, dtype in df.dtypes.items()}


In [12]:
# See how the function works

data_types(df)

{'id': dtype('int64'),
 'gender': dtype('O'),
 'age': dtype('float64'),
 'hypertension': dtype('int64'),
 'heart_disease': dtype('int64'),
 'ever_married': dtype('O'),
 'work_type': dtype('O'),
 'Residence_type': dtype('O'),
 'avg_glucose_level': dtype('float64'),
 'bmi': dtype('float64'),
 'smoking_status': dtype('O'),
 'stroke': dtype('int64')}

In [18]:
# Now we build a function that gives us summary statistics for the numerical columns

def statistics_df(df):
    """Summarise the ``int64`` and ``float64`` columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per selected column, with the ``describe()`` statistics
        (count, mean, std, min, quartiles, max) as columns.
    """
    # Only columns whose dtype is exactly int64 or float64 are described.
    numeric_part = df.select_dtypes(include=['int64', 'float64'])
    # Transpose so each original column becomes one row of the summary.
    return numeric_part.describe().T



In [20]:
# We test the function and use the most common statistical summaries

stats_df = statistics_df(df)
print("The statistical summaries of my data are:")
print(stats_df)

The statistical summaries of my data are:
                    count          mean           std    min        25%  \
id                 5110.0  36517.829354  21161.721625  67.00  17741.250   
age                5110.0     43.226614     22.612647   0.08     25.000   
hypertension       5110.0      0.097456      0.296607   0.00      0.000   
heart_disease      5110.0      0.054012      0.226063   0.00      0.000   
avg_glucose_level  5110.0    106.147677     45.283560  55.12     77.245   
bmi                4909.0     28.893237      7.854067  10.30     23.500   
stroke             5110.0      0.048728      0.215320   0.00      0.000   

                         50%       75%       max  
id                 36932.000  54682.00  72940.00  
age                   45.000     61.00     82.00  
hypertension           0.000      0.00      1.00  
heart_disease          0.000      0.00      1.00  
avg_glucose_level     91.885    114.09    271.74  
bmi                   28.100     33.10     97.60  


In [21]:
# We build a function that finds the most frequent value of each categorical column

def most_frequent_statistics(df):
    """Return the most frequent value of every object-dtype column in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to inspect.

    Returns
    -------
    dict
        ``{column label: most frequent value}``. When several values tie, the
        first one returned by ``Series.mode()`` is used. A column with no
        non-missing values maps to ``None`` instead of raising ``KeyError``.
    """
    categorical_cols = df.select_dtypes(include=['object'])
    frequent_values = {}
    for column, values in categorical_cols.items():
        modes = values.mode()
        # mode() is empty when the column holds only missing values, so a
        # blind ``[0]`` lookup would fail; report "no mode" as None instead.
        frequent_values[column] = modes.iloc[0] if not modes.empty else None

    return frequent_values



In [22]:
# See the results of our Function

frequent_values_dict = most_frequent_statistics(df)
print("Most Frequent Values in my data are:")
print(frequent_values_dict)


Most Frequent Values in my data are:
{'gender': 'Female', 'ever_married': 'Yes', 'work_type': 'Private', 'Residence_type': 'Urban', 'smoking_status': 'never smoked'}


In [23]:
# Check whether there are any null values in our data

df.isna()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,False,False,False,False,False,False,False,False,False,True,False,False
5106,False,False,False,False,False,False,False,False,False,False,False,False
5107,False,False,False,False,False,False,False,False,False,False,False,False
5108,False,False,False,False,False,False,False,False,False,False,False,False


In [25]:
# We need to know how many null values each column has

df.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [46]:
# We build a function that helps us remove any missing values

def remove_missing_values(df, axis='column'):
    """Drop the rows or the columns of ``df`` that contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to clean; it is not modified.
    axis : str, default 'column'
        ``'rows'`` / ``'row'`` drops every row holding at least one missing
        value; ``'columns'`` / ``'column'`` drops every column holding at
        least one missing value.

    Returns
    -------
    pandas.DataFrame
        A new frame without the offending rows or columns.

    Raises
    ------
    ValueError
        If ``axis`` is not one of the accepted names.
    """
    # The default value is the singular 'column', so the singular spellings
    # must be accepted too; otherwise the default call could never succeed.
    if axis in ('rows', 'row'):
        cleaned_data_df = df.dropna(axis=0)
    elif axis in ('columns', 'column'):
        cleaned_data_df = df.dropna(axis=1)
    else:
        raise ValueError(
            f"Invalid value for 'axis': {axis!r}. Specify 'rows' or 'columns'."
        )
    return cleaned_data_df


In [44]:
# The function works when axis is passed explicitly; here we drop every column that contains missing values

cleaned_data_df = remove_missing_values(df, axis='columns')
print("My dataset after removing all missing values:")
print(cleaned_data_df)

My dataset after removing all missing values:
         id  gender   age  hypertension  heart_disease ever_married  \
0      9046    Male  67.0             0              1          Yes   
1     51676  Female  61.0             0              0          Yes   
2     31112    Male  80.0             0              1          Yes   
3     60182  Female  49.0             0              0          Yes   
4      1665  Female  79.0             1              0          Yes   
...     ...     ...   ...           ...            ...          ...   
5105  18234  Female  80.0             1              0          Yes   
5106  44873  Female  81.0             0              0          Yes   
5107  19723  Female  35.0             0              0          Yes   
5108  37544    Male  51.0             0              0          Yes   
5109  44679  Female  44.0             0              0          Yes   

          work_type Residence_type  avg_glucose_level   smoking_status  stroke  
0           Private 

In [69]:
# Here we want the function to target one specific column, since only column No. 10 (bmi) has missing values

def remove_all_missing_values(df, axis=None, column=None):
    """Drop missing data from ``df`` by rows of one column or by whole columns.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to clean; it is not modified.
    axis : str
        ``'rows'`` drops the rows whose value in ``column`` is missing;
        ``'columns'`` drops every column that contains a missing value.
    column : label, optional
        Column to inspect when ``axis == 'rows'``. Required in that case.

    Returns
    -------
    pandas.DataFrame
        A new frame without the offending rows or columns.

    Raises
    ------
    ValueError
        If ``axis`` is neither ``'rows'`` nor ``'columns'``, or if
        ``axis == 'rows'`` and no ``column`` is given.
    """
    if axis == 'rows':
        # ``column`` is now a real parameter; it used to be read as an
        # undefined name, which raised NameError on this branch.
        if column is None:
            raise ValueError("Invalid value for 'column'. Specify a column name when axis is 'rows'.")
        cleaned_data_df = df.dropna(subset=[column])
    elif axis == 'columns':
        cleaned_data_df = df.dropna(axis=1)
    else:
        raise ValueError("Invalid value for 'axis'. Specify 'rows' or 'columns'.")
    return cleaned_data_df


In [71]:
# Test whether the function works.
# NOTE(review): axis=10 is neither 'rows' nor 'columns', so the function defined
# above takes its final `else` branch and raises ValueError — TODO confirm the intended call.

cleaned_data_df = remove_all_missing_values(df, axis=10)


In [45]:
# The new Data worked

cleaned_data_df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [73]:
# The original data is unchanged: it still contains the missing values

df.isnull()


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,False,False,False,False,False,False,False,False,False,True,False,False
5106,False,False,False,False,False,False,False,False,False,False,False,False
5107,False,False,False,False,False,False,False,False,False,False,False,False
5108,False,False,False,False,False,False,False,False,False,False,False,False


In [74]:
df.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [6]:
# Discover the shape of our data

df.shape



(5110, 12)

In [7]:
# How many null values we have in total

df.isnull().sum().sum()



201

In [8]:
# Another function to remove the missing values by specifying the column name

def remove_all_missing_values(df, column):
    """Return ``df`` without the rows whose value in ``column`` is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to clean; it is not modified.
    column : label
        The only column inspected for missing values.

    Returns
    -------
    pandas.DataFrame
        A new frame keeping just the rows where ``column`` is present.
    """
    # NOTE: this redefines the earlier remove_all_missing_values(df, axis=...)
    # from this notebook; whichever cell ran last wins.
    inspected_columns = [column]
    return df.dropna(subset=inspected_columns)


In [9]:
# See if this will work or not

cleaned_data_df = remove_all_missing_values(df, column='bmi')


In [12]:
cleaned_data_df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [13]:
df.isnull()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,False,False,False,False,False,False,False,False,False,True,False,False
5106,False,False,False,False,False,False,False,False,False,False,False,False
5107,False,False,False,False,False,False,False,False,False,False,False,False
5108,False,False,False,False,False,False,False,False,False,False,False,False


In [14]:
# Worked for our new data

cleaned_data_df.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [15]:
# So now let's build a function that fills all the missing values in our data using a chosen strategy

def fill_the_missing_values(df, strategy='mean'):
    """Return a copy of ``df`` with missing values filled by ``strategy``.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to impute; it is not modified.
    strategy : str, default 'mean'
        ``'mean'`` / ``'median'`` fill each numeric column with its own mean
        or median (non-numeric columns are left as they are);
        ``'mode'`` fills each column with its most frequent value;
        ``'ffill'`` / ``'bfill'`` propagate the previous / next valid value.

    Returns
    -------
    pandas.DataFrame
        The imputed frame.

    Raises
    ------
    ValueError
        If ``strategy`` is not one of the names listed above.
    """
    if strategy == 'mean':
        # numeric_only=True: without it, recent pandas raises TypeError as
        # soon as the frame contains a text column.
        new_data = df.fillna(df.mean(numeric_only=True))
    elif strategy == 'median':
        new_data = df.fillna(df.median(numeric_only=True))
    elif strategy == 'mode':
        modes = df.mode()
        # mode() may return several rows (ties), so take the first one; it is
        # empty when the frame has no non-missing values at all.
        new_data = df.fillna(modes.iloc[0]) if not modes.empty else df.copy()
    elif strategy == 'ffill':
        # DataFrame.ffill()/bfill() replace the deprecated fillna(method=...).
        new_data = df.ffill()
    elif strategy == 'bfill':
        new_data = df.bfill()
    else:
        raise ValueError("Invalid filling strategy. Specify 'mean', 'median', 'mode', 'ffill', or 'bfill'.")
    return new_data


In [16]:
# Example of the mode fill strategy

new_data = fill_the_missing_values(df[['bmi']], strategy='mode')

# Display the first 5 rows of the imputed DataFrame

print(new_data.head())


    bmi
0  36.6
1  28.7
2  32.5
3  34.4
4  24.0


In [19]:
# Example of the ffill (forward fill) strategy

new_data = fill_the_missing_values(df[['bmi']], strategy='ffill')

# Display the first 5 rows of the imputed DataFrame

print(new_data.head())


    bmi
0  36.6
1  36.6
2  32.5
3  34.4
4  24.0


In [46]:
# Build a function that uses only the mean fill strategy

def fill_missing_values(df, strategy='mean'):
    """Return a copy of ``df`` with missing numeric values replaced by column means.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to impute; it is not modified.
    strategy : str, default 'mean'
        Only ``'mean'`` is supported by this function.

    Returns
    -------
    pandas.DataFrame
        The imputed frame; non-numeric columns are left as they are.

    Raises
    ------
    ValueError
        If ``strategy`` is anything other than ``'mean'``.
    """
    # Reject other strategies explicitly; falling through used to end in an
    # UnboundLocalError because no result had been assigned.
    if strategy != 'mean':
        raise ValueError("Invalid filling strategy. Only 'mean' is supported by this function.")
    # numeric_only=True keeps text columns from raising TypeError in recent pandas.
    new_data = df.fillna(df.mean(numeric_only=True))
    return new_data


In [50]:
df['bmi']

0       36.600000
1       28.893237
2       32.500000
3       34.400000
4       24.000000
          ...    
5105    28.893237
5106    40.000000
5107    30.600000
5108    25.600000
5109    26.200000
Name: bmi, Length: 5110, dtype: float64

In [21]:
new_data.head(10)

Unnamed: 0,bmi
0,36.6
1,36.6
2,32.5
3,34.4
4,24.0
5,29.0
6,27.4
7,22.8
8,22.8
9,24.2


In [34]:
# Another way to fill the null values in our original data.
# Assign the result back to the column: calling fillna(..., inplace=True) on
# df['bmi'] is a chained in-place update, which recent pandas warns about and
# which does not modify df when copy-on-write is enabled.

df['bmi'] = df['bmi'].fillna(df['bmi'].mean())
df.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [35]:
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.600000,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.500000,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.400000,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.000000,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,28.893237,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.000000,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.600000,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.600000,formerly smoked,0


In [30]:
# We build a function that encodes our data using one-hot encoding

def encode_data(df, columns):
    """One-hot encode the given columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to encode; it is not modified.
    columns : list-like
        Labels of the columns to replace with indicator columns.

    Returns
    -------
    pandas.DataFrame
        A new frame in which every listed column is replaced by one indicator
        column per distinct value (no level is dropped).
    """
    return pd.get_dummies(data=df, columns=columns, drop_first=False)


In [31]:
# Encode one column; this splits it into two columns holding True and False

encode_column = ['ever_married']
data_after_encode = encode_data(df, columns=encode_column)


In [32]:
data_after_encode

Unnamed: 0,id,gender,age,hypertension,heart_disease,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,ever_married_No,ever_married_Yes
0,9046,Male,67.0,0,1,Private,Urban,228.69,36.6,formerly smoked,1,False,True
1,51676,Female,61.0,0,0,Self-employed,Rural,202.21,,never smoked,1,False,True
2,31112,Male,80.0,0,1,Private,Rural,105.92,32.5,never smoked,1,False,True
3,60182,Female,49.0,0,0,Private,Urban,171.23,34.4,smokes,1,False,True
4,1665,Female,79.0,1,0,Self-employed,Rural,174.12,24.0,never smoked,1,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Private,Urban,83.75,,never smoked,0,False,True
5106,44873,Female,81.0,0,0,Self-employed,Urban,125.20,40.0,never smoked,0,False,True
5107,19723,Female,35.0,0,0,Self-employed,Rural,82.99,30.6,never smoked,0,False,True
5108,37544,Male,51.0,0,0,Private,Rural,166.29,25.6,formerly smoked,0,False,True


In [36]:
# Install sklearn for another way to encod

!pip install sklearn

Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [38]:
# Import our library

from sklearn import preprocessing



In [39]:
# We use another way to encode, called label encoding

encoder = preprocessing.LabelEncoder()
df['smoking_status'] = encoder.fit_transform(df['smoking_status'])

In [40]:
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.600000,1,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,2,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.500000,2,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.400000,3,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.000000,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,28.893237,2,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.000000,2,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.600000,2,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.600000,1,0


In [42]:
# Label-encode the remaining text columns, refitting the same encoder on each
# column in turn (so afterwards it only remembers the last column's classes).

for column_name in ('Residence_type', 'work_type', 'ever_married', 'gender'):
    df[column_name] = encoder.fit_transform(df[column_name])

In [43]:
# Everything worked

df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,1,67.0,0,1,1,2,1,228.69,36.600000,1,1
1,51676,0,61.0,0,0,1,3,0,202.21,28.893237,2,1
2,31112,1,80.0,0,1,1,2,0,105.92,32.500000,2,1
3,60182,0,49.0,0,0,1,2,1,171.23,34.400000,3,1
4,1665,0,79.0,1,0,1,3,0,174.12,24.000000,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,0,80.0,1,0,1,2,1,83.75,28.893237,2,0
5106,44873,0,81.0,0,0,1,3,1,125.20,40.000000,2,0
5107,19723,0,35.0,0,0,1,3,0,82.99,30.600000,2,0
5108,37544,1,51.0,0,0,1,2,0,166.29,25.600000,1,0
