In [9]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Sample healthcare data
data = {
    "Patient_ID": [101, 102, 103, 104, 105],
    "Age": [25, 40, np.nan, 35, 29],
    "Gender": ["M", "F", "Male", "F", np.nan],
    "Blood_Pressure": [120, 140, 150, np.nan, 130],
    "Cholesterol_Level": [200, 240, 300, 150, np.nan],
    "Disease_Present": [0, 1, 1, 0, 1],  # 0: No, 1: Yes
}

# Create DataFrame
df = pd.DataFrame(data)

# Display original dataset
print("Original Dataset:\n", df, "\n")

# Handle missing values
# Replace missing numerical values with the mean
imputer = SimpleImputer(strategy="mean")
numerical_columns = df.select_dtypes(include=[np.number]).columns
df[numerical_columns] = imputer.fit_transform(df[numerical_columns])

# Replace missing categorical values with the most frequent value
imputer_cat = SimpleImputer(strategy="most_frequent")
categorical_columns = df.select_dtypes(include=["object"]).columns
df[categorical_columns] = imputer_cat.fit_transform(df[categorical_columns])

# Standardize inconsistent categorical data
df["Gender"] = df["Gender"].replace({"M": "Male", "F": "Female"})

# Remove noisy data
# Example: Removing extreme outliers in numerical columns using IQR
for col in numerical_columns:
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

# Standardize numerical columns for scaling
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

# Save cleaned dataset as CSV file
csv_filename = "cleaned_healthcare_data.csv"
df.to_csv(csv_filename, index=False)
print(f"Cleaned Dataset:\n{df}\n")
print(f"Cleaned data saved to '{csv_filename}'.")


Original Dataset:
    Patient_ID   Age Gender  Blood_Pressure  Cholesterol_Level  Disease_Present
0         101  25.0      M           120.0              200.0                0
1         102  40.0      F           140.0              240.0                1
2         103   NaN   Male           150.0              300.0                1
3         104  35.0      F             NaN              150.0                0
4         105  29.0    NaN           130.0                NaN                1 

Cleaned Dataset:
   Patient_ID       Age  Gender  Blood_Pressure  Cholesterol_Level  \
0   -1.414214 -1.417758    Male            -1.5          -0.457851   
1   -0.707107  1.515535  Female             0.5           0.356106   
2    0.000000  0.000000    Male             1.5           1.577042   
3    0.707107  0.537770  Female             0.0          -1.475297   
4    1.414214 -0.635547  Female            -0.5           0.000000   

   Disease_Present  
0        -1.224745  
1         0.816497  
2   