# Stroke analysis in Nigeria - dataset from Hugging face
- Dataset name: electricsheepafrica/Africa-stroke-prediction-dataset

In [1]:
# Loading the dataset
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

DATASET_ID = "electricsheepafrica/Africa-stroke-prediction-dataset"

#load_dataset 
dataset = load_dataset(DATASET_ID)

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status', 'stroke'],
        num_rows: 5110
    })
})


In [3]:
# 1. Accessing the 'train' split
train_data = dataset['train']

In [4]:
import pandas as pd

# Converting the 'train' split to a Pandas DataFrame
df = train_data.to_pandas()

# Now you can use standard pandas methods
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,59,1,1,Yes,Self-employed,Rural,256.668558,38.667789,never smoked,1
1,51676,Female,54,0,0,Yes,Self-employed,Urban,225.346253,,never smoked,0
2,31112,Male,71,1,1,Yes,Self-employed,Rural,140.156039,34.732833,formerly smoked,1
3,60182,Female,38,0,0,Yes,Private,Rural,188.481167,36.776307,formerly smoked,0
4,1665,Female,72,1,1,Yes,Private,Urban,205.026178,27.02619,never smoked,1


# Inspecting the dataframe
    - looking for null values
    - summary
    - type

In [5]:
df.dtypes


id                     int64
gender                object
age                    int64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [6]:
df.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [7]:
#df['stroke'].value_counts()
df.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,36.053425,0.3,0.149902,121.102664,30.871673,0.089824
std,21161.721625,22.117873,0.458302,0.35701,48.931095,8.032751,0.285957
min,67.0,0.0,0.0,0.0,44.300789,16.0,0.0
25%,17741.25,18.0,0.0,0.0,89.938684,25.483577,0.0
50%,36932.0,37.0,0.0,0.0,107.534753,30.102617,0.0
75%,54682.0,53.0,1.0,0.0,131.002721,35.281196,0.0
max,72940.0,80.0,1.0,1.0,305.638208,97.846962,1.0


In [8]:
df.tail(10)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
5100,68398,Male,72,1,1,Yes,Private,Urban,111.166101,30.148317,never smoked,1
5101,36901,Female,36,0,0,Yes,Private,Rural,106.610467,26.793638,never smoked,0
5102,45010,Female,49,0,0,Yes,Govt_job,Urban,95.892627,24.172601,never smoked,0
5103,22127,Female,10,0,0,No,Self-employed,Rural,85.609541,48.81429,never smoked,0
5104,14180,Female,5,0,0,No,Self-employed,Urban,100.280748,20.549064,never smoked,0
5105,18234,Female,72,1,1,Yes,Self-employed,Urban,115.032868,,never smoked,0
5106,44873,Female,76,1,1,Yes,Self-employed,Urban,159.858271,43.481184,never smoked,1
5107,19723,Female,27,0,0,Yes,Private,Rural,96.319159,32.632114,never smoked,0
5108,37544,Male,46,0,0,Yes,Self-employed,Urban,193.317911,27.131628,never smoked,0
5109,44679,Female,36,0,0,Yes,Self-employed,Rural,93.738483,28.847973,never smoked,0


# Data cleaning and feature engineering
    - Handling the missing bmi values- use bmi mean to fill the missing values.
    - Removing 'other' values in the gender column  because it represents a very tiny fraction(less then 0.1% of entire dataset)
    - Feature engineering - creating new age groups to better analyze the risk factors in tableau
    - Cleaning binary columns for better readability in tableau (0/1 - Yes/No)

In [9]:
# 1. Missing values in BMI before filling
print(f"Missing BMI values before cleaning: {df['bmi'].isnull().sum()}")

# 2. fill the missing BMI values with the mean of the existing values
bmi_mean = df['bmi'].mean()
df['bmi'].fillna(bmi_mean, inplace=True)

# 3. Verifying that the missing values are now zero
print(f"Missing BMI values after cleaning: {df['bmi'].isnull().sum()}")

Missing BMI values before cleaning: 201
Missing BMI values after cleaning: 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bmi'].fillna(bmi_mean, inplace=True)


In [10]:
# df.tail(15)

In [11]:
# Removing the rows where gender is 'Other'
df = df[df['gender'] != 'Other']

# Checking the count of remaining genders to confirm
print("\nGender Distribution after cleaning 'Other':")
print(df['gender'].value_counts())


Gender Distribution after cleaning 'Other':
gender
Female    2994
Male      2115
Name: count, dtype: int64


In [12]:
# Feature Engineering: Creating Age Groups
# Defining the bins (age ranges) and labels for risk groups
# This partitioning helps highlight when stroke risk starts to accelerate.
bins = [0, 18, 45, 65, df['age'].max() + 1] # Using max age + 1 to ensure all records are covered
labels = ['Child (0-17)', 'Young Adult (18-44)', 'Middle Age (45-64)', 'Senior (65+)']

# new categorical column using pd.cut()
df['Age_Group'] = pd.cut(df['age'], bins=bins, labels=labels, right=False)

#  distribution of the new groups
print("\nAge Group Distribution:")
print(df['Age_Group'].value_counts())


Age Group Distribution:
Age_Group
Young Adult (18-44)    1845
Middle Age (45-64)     1370
Child (0-17)           1268
Senior (65+)            626
Name: count, dtype: int64


In [13]:
# Cleaning the binary columns for better readability
# Convert 0/1 to Yes/No for clarity
df['hypertension'] = df['hypertension'].replace({'Yes': 1, 'No': 0})
df['heart_disease'] = df['heart_disease'].replace({'Yes': 1, 'No': 0})
df['stroke'] = df['stroke'].replace({'Yes': 1, 'No': 0}) # Target variable clarity

In [14]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,Age_Group
0,9046,Male,59,1,1,Yes,Self-employed,Rural,256.668558,38.667789,never smoked,1,Middle Age (45-64)
1,51676,Female,54,0,0,Yes,Self-employed,Urban,225.346253,30.871673,never smoked,0,Middle Age (45-64)
2,31112,Male,71,1,1,Yes,Self-employed,Rural,140.156039,34.732833,formerly smoked,1,Senior (65+)
3,60182,Female,38,0,0,Yes,Private,Rural,188.481167,36.776307,formerly smoked,0,Young Adult (18-44)
4,1665,Female,72,1,1,Yes,Private,Urban,205.026178,27.02619,never smoked,1,Senior (65+)


In [15]:

REPLACEMENT_LABEL = 'Not in Labor Force' 

# 2. Replacing the misclassified label 'children'
df['work_type'] = df['work_type'].replace('children', REPLACEMENT_LABEL)


# Exporting the data to csv for tableau viusalization

In [16]:
# Dropping the unique identifier (id) as it's not needed for analysis or visualization
df.drop('id', axis=1, inplace=True)

# Converting to csv
df.to_csv('Africa_Stroke_Data_Cleaned.csv', index=False)

print("\nCleaning complete. Data exported to 'Africa_Stroke_Data_Cleaned.csv'.")


Cleaning complete. Data exported to 'Africa_Stroke_Data_Cleaned.csv'.
