### Load the Dataset and Initial Exploration

In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

# Load the dataset
data = pd.read_csv("C:/project/Brain-Stroke-Prediction/data/raw/brain_stroke_train.csv")

# Display the first few rows of the dataset
data.head()


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,Male,28.0,0,0,Yes,Private,Urban,79.53,31.1,never smoked,0
1,1,Male,33.0,0,0,Yes,Private,Rural,78.44,23.9,formerly smoked,0
2,2,Female,42.0,0,0,Yes,Private,Rural,103.0,40.3,Unknown,0
3,3,Male,56.0,0,0,Yes,Private,Urban,64.87,28.8,never smoked,0
4,4,Female,24.0,0,0,No,Private,Rural,73.36,28.8,never smoked,0


### Handling Missing Values

In [2]:
# Checking for missing values
missing_values = data.isnull().sum()
print("Missing Values:\n", missing_values)

# Impute missing values based on the column type
data['bmi'] = data['bmi'].fillna(data['bmi'].median())  # Impute BMI with median


Missing Values:
 id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64


- dentified missing values only in the bmi column.
- Imputed the missing values using the median because it is robust to outliers.


### Encoding Categorical Variables

In [4]:
# Encoding categorical variables
data = pd.get_dummies(data, columns=['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'], drop_first=True)


- Created dummy variables for categorical features.
- Reduced multicollinearity by dropping the first level in each category.


### Creating New Features


In [5]:
# Creating age bins
data['age_group'] = pd.cut(data['age'], bins=[0, 18, 35, 60, 120], labels=['Child', 'Young Adult', 'Adult', 'Senior'])

# BMI categorization
data['bmi_category'] = pd.cut(data['bmi'], bins=[0, 18.5, 24.9, 29.9, 50], labels=['Underweight', 'Normal', 'Overweight', 'Obese'])


- Added age_group and bmi_category to enhance feature interpretability.
- Age and BMI are now grouped into meaningful categories for better analysis.


### Feature Scaling

In [6]:
from sklearn.preprocessing import StandardScaler

# Standardizing numerical features
scaler = StandardScaler()
numerical_columns = ['age', 'avg_glucose_level', 'bmi']
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])


- Standardized numerical features for consistent scaling.
- Ensured features are ready for machine learning algorithms sensitive to feature magnitude.


###  Interaction Features

In [7]:
# Creating interaction terms
data['age_glucose_interaction'] = data['age'] * data['avg_glucose_level']
data['bmi_glucose_interaction'] = data['bmi'] * data['avg_glucose_level']


- Added interaction features to capture complex relationships between variables.


### Removing Irrelevant or Redundant Features

In [8]:
# Dropping redundant columns
data = data.drop(columns=['id'])


- Dropped the id column as it does not contribute to prediction.


### Feature Selection

In [10]:
# Convert all categorical columns to numeric using one-hot encoding or label encoding
from sklearn.preprocessing import LabelEncoder

# Identifying categorical columns
categorical_columns = data.select_dtypes(include=['object', 'category']).columns

# Apply LabelEncoder to categorical columns
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le  # Save encoders if needed later

# Proceed with feature selection
X = data.drop(columns=['stroke'])
y = data['stroke']

selected_features = SelectKBest(score_func=f_classif, k=10).fit(X, y)

top_features = X.columns[selected_features.get_support()]
print("Top Features:\n", top_features)


Top Features:
 Index(['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi',
       'ever_married_Yes', 'work_type_Self-employed', 'work_type_children',
       'age_glucose_interaction', 'bmi_glucose_interaction'],
      dtype='object')


-  All categorical columns are now numeric.
-  Ready for feature selection or modeling.


In [14]:
# Saving the processed dataset to a new CSV file
processed_data_path = "C:/project/Brain-Stroke-Prediction/data/processed/processed_data.csv"
data.to_csv(processed_data_path, index=False)

print(f"Processed dataset saved at {processed_data_path}")


Processed dataset saved at C:/project/Brain-Stroke-Prediction/data/processed/processed_data.csv


In [15]:
data.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Male,gender_Other,ever_married_Yes,work_type_Never_worked,...,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,age_group,bmi_category,age_glucose_interaction,bmi_glucose_interaction
0,-0.62571,0,0,-0.373297,0.444397,0,True,False,True,False,...,False,False,True,False,True,False,3,1,0.233576,-0.165892
1,-0.392544,0,0,-0.416084,-0.626698,0,True,False,True,False,...,False,False,False,True,False,False,3,0,0.163331,0.260759
2,0.027154,0,0,0.547988,1.813018,0,False,False,True,False,...,False,False,False,False,False,False,0,1,0.01488,0.993512
3,0.680018,0,0,-0.948757,0.102242,0,True,False,True,False,...,False,False,True,False,True,False,0,2,-0.645172,-0.097003
4,-0.812243,0,0,-0.615493,0.102242,0,False,False,False,False,...,False,False,False,False,True,False,3,2,0.49993,-0.062929
