In [4]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.experimental import enable_iterative_imputer  # must precede the IterativeImputer import
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Set random seed for reproducibility
np.random.seed(42)

# Create sample data
n_samples = 1000
MISSING_RATE = 0.2  # probability that any single cell is blanked out

# NOTE: the order of the random draws (four normals, one choice, then one
# uniform vector per column) fixes the random stream. Keep it unchanged so the
# generated dataset stays reproducible.
data = {
    'Age': np.random.normal(50, 15, n_samples),
    'BloodPressure': np.random.normal(120, 20, n_samples),
    'Cholesterol': np.random.normal(200, 40, n_samples),
    'BMI': np.random.normal(25, 5, n_samples),
    'Diabetes': np.random.choice([0, 1], n_samples, p=[0.8, 0.2])
}

# Cast to float before injecting NaN: 'Diabetes' is generated as integers, and
# assigning NaN into an integer column relies on silent upcasting, which pandas
# has deprecated for setitem-like operations (FutureWarning in 2.1+, slated to
# become an error).
df_complete = pd.DataFrame(data).astype(float)

# Introduce missing values (about 20% in each column).
# One uniform vector is drawn per column, in column order; a cell is blanked
# when its draw falls below MISSING_RATE.
missing_mask = pd.DataFrame(
    {column: np.random.rand(n_samples) < MISSING_RATE for column in df_complete.columns},
    index=df_complete.index,
)

# Apply the mask out of place so the complete frame stays available and
# re-running this step never compounds the missingness.
df = df_complete.mask(missing_mask)

# Persist the dataset in its incomplete (pre-imputation) state; the row index
# is deliberately left out of the file.
missing_csv_path = 'patient_data_with_missing.csv'
df.to_csv(missing_csv_path, index=False)
print("CSV file 'patient_data_with_missing.csv' has been created.")

# Preview the first rows, then report how many cells are missing per column.
preview_rows = df.head()
print("\nOriginal data with missing values:")
print(preview_rows)

missing_per_column = df.isna().sum()
print("\nMissing values per column:")
print(missing_per_column)

# Work on a copy so the frame with missing values is left untouched
X = df.copy()

# Hold out 20% of the rows; the imputer is fitted on the training rows only
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test = train_test_split(X, **split_options)

# Configure the MICE-style imputer: every feature with gaps is regressed on the
# others with a Bayesian ridge model, for at most 10 rounds.
# NOTE(review): BayesianRidge is a regressor, so imputed 'Diabetes' cells will
# presumably be continuous rather than 0/1 - confirm this is acceptable.
imputer_options = {
    'estimator': BayesianRidge(),
    'n_nearest_features': None,
    'imputation_order': 'ascending',
    'random_state': 42,
    'max_iter': 10,
}
imputer = IterativeImputer(**imputer_options)
X_train_imputed = imputer.fit_transform(X_train)

# fit_transform returns a bare array; restore the column labels and row index
columns = X_train.columns
X_train_imputed_df = pd.DataFrame(
    data=X_train_imputed,
    index=X_train.index,
    columns=columns,
)

print("\nImputed data:")
print(X_train_imputed_df.head())

# Validate imputation
#
# IterativeImputer only fills missing cells; observed cells are returned
# unchanged. Comparing observed values with their counterparts in the imputed
# frame therefore always yields an MSE of exactly 0 and says nothing about
# imputation quality. Instead, hide a random subset of the *observed* training
# cells, impute them with an unfitted clone of the imputer, and score the
# imputations against the hidden ground truth.
VALIDATION_HOLDOUT_FRACTION = 0.1  # share of observed cells hidden for scoring
validation_rng = np.random.default_rng(42)  # local RNG; global stream untouched

observed_cells = X_train.notna()
holdout_mask = observed_cells & (
    validation_rng.random(observed_cells.shape) < VALIDATION_HOLDOUT_FRACTION
)
X_train_holdout = X_train.mask(holdout_mask)

# An entirely empty column would be dropped by the imputer and break the
# column alignment below, so fail loudly if the hold-out ever causes that.
assert X_train_holdout.notna().any().all(), "hold-out left a column with no observed values"

# clone() gives an unfitted imputer with identical settings, so the imputer
# used for the real train/test imputation is not refitted or altered.
validation_imputer = clone(imputer)
X_train_holdout_imputed_df = pd.DataFrame(
    validation_imputer.fit_transform(X_train_holdout),
    columns=columns,
    index=X_train.index,
)

mse = {}
for column in columns:
    held_out_rows = holdout_mask[column]
    if not held_out_rows.any():
        # Nothing was hidden in this column, so there is nothing to score.
        mse[column] = np.nan
        continue
    mse[column] = mean_squared_error(
        X_train.loc[held_out_rows, column],
        X_train_holdout_imputed_df.loc[held_out_rows, column],
    )

print("\nMean Squared Error for each imputed column:")
for column, error in mse.items():
    print(f"{column}: {error:.2f}")

# Show summary statistics before and after imputation so any shift in the
# distributions can be compared
stat_reports = (
    ("\nOriginal data statistics:", X_train),
    ("\nImputed data statistics:", X_train_imputed_df),
)
for heading, frame in stat_reports:
    print(heading)
    print(frame.describe())

# Write the imputed training rows to disk (the row index is written too)
train_output_path = 'patient_data_imputed_train.csv'
X_train_imputed_df.to_csv(train_output_path)
print("\nImputed training data saved to 'patient_data_imputed_train.csv'")

# Fill the held-out rows with the imputer that was fitted on the training rows
# (transform only - the imputer is not refitted here)
X_test_imputed = imputer.transform(X_test)
X_test_imputed_df = pd.DataFrame(
    data=X_test_imputed,
    index=X_test.index,
    columns=columns,
)

# Write the imputed test rows to disk (the row index is written too)
test_output_path = 'patient_data_imputed_test.csv'
X_test_imputed_df.to_csv(test_output_path)
print("Imputed test data saved to 'patient_data_imputed_test.csv'")

CSV file 'patient_data_with_missing.csv' has been created.

Original data with missing values:
         Age  BloodPressure  Cholesterol        BMI  Diabetes
0        NaN     147.987109   172.992869  15.460962       0.0
1  47.926035            NaN   194.219253  20.698075       0.0
2  59.715328     121.192607   168.303203  22.931972       1.0
3        NaN            NaN   187.681539  34.438438       0.0
4  46.487699     133.964466   124.255413  27.782766       0.0

Missing values per column:
Age              209
BloodPressure    211
Cholesterol      202
BMI              192
Diabetes         187
dtype: int64

Imputed data:
           Age  BloodPressure  Cholesterol        BMI  Diabetes
29   45.624594      99.544149   180.334553  25.172205       0.0
535  50.710980      88.105939   188.196403  32.972212       1.0
695  45.356803     158.778580   200.655116  23.424566       0.0
557  43.511627     103.936421   200.135020  25.615391       0.0
836  73.257507      97.125476   276.165464  25.26910