In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset
df = pd.read_csv("Diabetes Missing Data.csv")
df.head()

Unnamed: 0,Pregnant,Glucose,Diastolic_BP,Skin_Fold,Serum_Insulin,BMI,Diabetes_Pedigree,Age,Class
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [3]:
# Check column names and their index to catch hidden spaces or characters
for i, col in enumerate(df.columns):
    print(f"{i}: '{col}'")

0: 'Pregnant'
1: 'Glucose'
2: 'Diastolic_BP'
3: 'Skin_Fold'
4: 'Serum_Insulin'
5: 'BMI'
6: 'Diabetes_Pedigree'
7: 'Age'
8: 'Class'


In [4]:
# Clean column names by removing spaces and hidden characters
df.columns = df.columns.str.strip().str.replace('\xa0', '').str.replace(' ', '')
print(df.columns.tolist())

['Pregnant', 'Glucose', 'Diastolic_BP', 'Skin_Fold', 'Serum_Insulin', 'BMI', 'Diabetes_Pedigree', 'Age', 'Class']


In [5]:
# Fill missing values
df['Glucose'] = df['Glucose'].fillna(df['Glucose'].mean())
df['BloodPressure'] = df['BloodPressure'].fillna(df['BloodPressure'].median())
df['SkinThickness'] = df['SkinThickness'].fillna(df['SkinThickness'].median())
df['Insulin'] = df['Insulin'].fillna(df['Insulin'].median())
df['BMI'] = df['BMI'].fillna(df['BMI'].mean())

# Check if all missing values are handled
df.isnull().sum()

KeyError: 'BloodPressure'

In [6]:
# Create BMI category
def bmi_category(bmi):
    if bmi < 18.5:
        return 'Underweight'
    elif 18.5 <= bmi < 25:
        return 'Normal'
    elif 25 <= bmi < 30:
        return 'Overweight'
    else:
        return 'Obese'

df['BMI_Category'] = df['BMI'].apply(bmi_category)
df[['BMI', 'BMI_Category']].head()

Unnamed: 0,BMI,BMI_Category
0,33.6,Obese
1,26.6,Overweight
2,23.3,Normal
3,28.1,Overweight
4,43.1,Obese


In [7]:
# Create Age Group
def age_group(age):
    if age < 30:
        return 'Young'
    elif 30 <= age < 50:
        return 'Middle-aged'
    else:
        return 'Senior'

df['Age_Group'] = df['Age'].apply(age_group)
df[['Age', 'Age_Group']].head()

Unnamed: 0,Age,Age_Group
0,50,Senior
1,31,Middle-aged
2,32,Middle-aged
3,21,Young
4,33,Middle-aged


In [8]:
# Save the cleaned and modified dataset
df.to_csv("Cleaned_Diabetes_Data.csv", index=False)
print("Cleaned data saved successfully.")

Cleaned data saved successfully.
