In [2]:
#!pip install fancyimpute



In [None]:
import pandas as pd
from fancyimpute import IterativeImputer
from sklearn.preprocessing import LabelEncoder

# Impute missing IMD values with MICE, using ethnicity, BMI and age as
# auxiliary predictors, then write the completed dataset back to Excel.

# Load dataset
file_path = "updated_patient_data.xlsx"
df = pd.read_excel(file_path)

# Convert 'Unknown' or non-numeric IMD values to NaN so they are treated as
# missing and imputed below.
df["IMD"] = pd.to_numeric(df["IMD"], errors="coerce")

# Rows whose IMD has to be filled in. Computed once, before imputation, so the
# same mask selects both the target rows and the matching imputed values.
imd_missing = df["IMD"].isna()
if imd_missing.all():
    # Without a single observed IMD the column would be dropped as all-NaN
    # further down and the final assignment would fail with a KeyError.
    raise ValueError("IMD has no observed numeric values; cannot impute it.")

# Select relevant predictors
features = ["IMD", "ethnicgroup", "bmi", "age"]
df_subset = df[features].copy()

# Convert categorical ethnicity to numerical labels.
# NOTE(review): missing ethnicgroup values are not left as NaN by this step,
# so MICE does not impute them. Depending on the scikit-learn version,
# LabelEncoder may encode NaN as a class of its own or raise on mixed
# str/float input -- confirm against the installed version.
label_encoder = LabelEncoder()
df_subset["ethnicgroup_encoded"] = label_encoder.fit_transform(df_subset["ethnicgroup"])

# Keep only numeric columns for the imputer, and drop columns where every
# value is NaN so the imputer output lines up with df_subset.columns.
df_subset = df_subset.drop(columns=["ethnicgroup"]).dropna(axis=1, how="all")

# Apply MICE for missing IMD values. The result is rebuilt with the original
# row labels so the label-based selection below cannot misalign rows.
imputer = IterativeImputer(max_iter=10, random_state=42)
df_subset_imputed = pd.DataFrame(
    imputer.fit_transform(df_subset),
    columns=df_subset.columns,
    index=df_subset.index,
)

# Replace missing IMD values only. The imputer can extrapolate, so the rounded
# estimates are clipped to the range of the IMD values actually observed.
imd_lower = df["IMD"].min()
imd_upper = df["IMD"].max()
imputed_imd = (
    df_subset_imputed.loc[imd_missing, "IMD"]
    .round()
    .clip(lower=imd_lower, upper=imd_upper)
)
df.loc[imd_missing, "IMD"] = imputed_imd

# Save the updated dataset
df.to_excel("updated_imd_data.xlsx", index=False)

# Display first few rows
print(df.head())
