In [1]:
import pandas as pd

df = pd.read_csv("diabeto_cleaned.csv")
print(df.head())

   gender   age  hypertension  heart_disease smoking_history    bmi  \
0  Female  80.0             0              1           never  25.19   
1  Female  54.0             0              0         No Info  27.32   
2    Male  28.0             0              0           never  27.32   
3  Female  36.0             0              0         current  23.45   
4    Male  76.0             1              1         current  20.14   

   HbA1c_level  blood_glucose_level  diabetes  
0          6.6                  140         0  
1          6.6                   80         0  
2          5.7                  158         0  
3          5.0                  155         0  
4          4.8                  155         0  


In [2]:
# Define age bins and labels
bins = [0, 17, 64, float('inf')]  # Define the ranges
labels = ['Child', 'Adult', 'Old']  # Define the labels

# Create a new column for age groups
df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels, right=True)

# Display the first few rows to verify
print(df[['age', 'age_group']].head())


    age age_group
0  80.0       Old
1  54.0     Adult
2  28.0     Adult
3  36.0     Adult
4  76.0       Old


In [3]:
print(df['age_group'].value_counts())

age_group
Adult    61802
Old      17656
Child    16688
Name: count, dtype: int64


In [4]:
# Step 1: One-Hot Encode Gender
df = pd.get_dummies(df, columns=['gender'], prefix='gender')

# Step 2: Combine Gender with Age Groups
age_groups = ['Child', 'Adult', 'Old']
genders = ['Male', 'Female', 'Other']

# Create interaction features for all gender-age combinations
for gender in genders:
    for age_group in age_groups:
        # Create a new column for each gender-age group combination
        df[f'{gender}_{age_group}'] = (df[f'gender_{gender}'] & (df['age_group'] == age_group)).astype(int)

# Display the first few rows to verify
print(df[['age', 'age_group', 'gender_Male', 'gender_Female', 'gender_Other', 
          'Male_Child', 'Male_Adult', 'Male_Old', 'Female_Child', 'Female_Adult', 
          'Female_Old', 'Other_Child', 'Other_Adult', 'Other_Old']].head())


    age age_group  gender_Male  gender_Female  gender_Other  Male_Child  \
0  80.0       Old        False           True         False           0   
1  54.0     Adult        False           True         False           0   
2  28.0     Adult         True          False         False           0   
3  36.0     Adult        False           True         False           0   
4  76.0       Old         True          False         False           0   

   Male_Adult  Male_Old  Female_Child  Female_Adult  Female_Old  Other_Child  \
0           0         0             0             0           1            0   
1           0         0             0             1           0            0   
2           1         0             0             0           0            0   
3           0         0             0             1           0            0   
4           0         1             0             0           0            0   

   Other_Adult  Other_Old  
0            0          0  
1           

In [5]:
print(df.columns)

Index(['age', 'hypertension', 'heart_disease', 'smoking_history', 'bmi',
       'HbA1c_level', 'blood_glucose_level', 'diabetes', 'age_group',
       'gender_Female', 'gender_Male', 'gender_Other', 'Male_Child',
       'Male_Adult', 'Male_Old', 'Female_Child', 'Female_Adult', 'Female_Old',
       'Other_Child', 'Other_Adult', 'Other_Old'],
      dtype='object')


In [6]:
# Step 4: Create a new column for age squared (age^2)
df['age_squared'] = df['age'] ** 2

# Display the first few rows to verify
print(df[['age', 'age_squared']].head())


    age  age_squared
0  80.0       6400.0
1  54.0       2916.0
2  28.0        784.0
3  36.0       1296.0
4  76.0       5776.0


In [7]:
print(df.columns)

Index(['age', 'hypertension', 'heart_disease', 'smoking_history', 'bmi',
       'HbA1c_level', 'blood_glucose_level', 'diabetes', 'age_group',
       'gender_Female', 'gender_Male', 'gender_Other', 'Male_Child',
       'Male_Adult', 'Male_Old', 'Female_Child', 'Female_Adult', 'Female_Old',
       'Other_Child', 'Other_Adult', 'Other_Old', 'age_squared'],
      dtype='object')


In [8]:
def classify_bmi(bmi):
    if bmi < 18.5:
        return 'Underweight'
    elif 18.5 <= bmi < 24.9:
        return 'Normal weight'
    elif 25 <= bmi < 29.9:
        return 'Overweight'
    else:
        return 'Obese'

df['bmi_category'] = df['bmi'].apply(classify_bmi)

print(df[['bmi', 'bmi_category']].head())

     bmi   bmi_category
0  25.19     Overweight
1  27.32     Overweight
2  27.32     Overweight
3  23.45  Normal weight
4  20.14  Normal weight


In [9]:
# Threshold for HbA1c level (indicating diabetes risk)
df['HbA1c_above_6_5'] = (df['HbA1c_level'] >= 6.5).astype(int)

# Threshold for blood glucose level
df['blood_glucose_above_126'] = (df['blood_glucose_level'] >= 126).astype(int)

print(df[['HbA1c_level', 'HbA1c_above_6_5', 'blood_glucose_level', 'blood_glucose_above_126']].head())

   HbA1c_level  HbA1c_above_6_5  blood_glucose_level  blood_glucose_above_126
0          6.6                1                  140                        1
1          6.6                1                   80                        0
2          5.7                0                  158                        1
3          5.0                0                  155                        1
4          4.8                0                  155                        1


In [10]:
print(df.columns)

Index(['age', 'hypertension', 'heart_disease', 'smoking_history', 'bmi',
       'HbA1c_level', 'blood_glucose_level', 'diabetes', 'age_group',
       'gender_Female', 'gender_Male', 'gender_Other', 'Male_Child',
       'Male_Adult', 'Male_Old', 'Female_Child', 'Female_Adult', 'Female_Old',
       'Other_Child', 'Other_Adult', 'Other_Old', 'age_squared',
       'bmi_category', 'HbA1c_above_6_5', 'blood_glucose_above_126'],
      dtype='object')


In [11]:
df.to_csv('diabetes_engineered.csv', index=False)

In [12]:
from sklearn.preprocessing import StandardScaler

# Step 10: Normalize/Standardize features (excluding target and categorical columns)
scaler = StandardScaler()

# Selecting the columns to scale (continuous features)
columns_to_scale = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']

# Apply StandardScaler
df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

# Display the first few rows to verify
print(df[columns_to_scale].head())


        age       bmi  HbA1c_level  blood_glucose_level
0  1.700840 -0.314947     0.994563             0.043554
1  0.543372 -0.000216     0.994563            -1.423096
2 -0.614096 -0.000216     0.155970             0.483549
3 -0.257952 -0.572051    -0.496269             0.410216
4  1.522768 -1.061141    -0.682623             0.410216


In [13]:
df.to_csv('diabetes_engineered_normalized.csv', index=False)