In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

# Build a small synthetic patient table. Cholesterol is deliberately missing
# (NaN) for four of the ten patients so that there is something to impute.
data = {
    'Age': [45, 50, 35, 60, 55, 40, 52, 48, 58, 62],
    'BloodPressure': [120, 130, 115, 140, 125, 118, 132, 128, 135, 145],
    'Cholesterol': [200, np.nan, 180, np.nan, 220, 190, np.nan, 210, 230, np.nan],
    'Diabetes': [0, 1, 0, 1, 1, 0, 1, 0, 1, 1]
}

# Number of donor rows averaged by the KNN imputer (used for both the final
# imputation and its evaluation so the two stay in sync).
N_NEIGHBORS = 5


def leave_one_out_knn_impute(features, column, n_neighbors):
    """Predict every observed value of ``column`` as if it had been missing.

    For each row where ``column`` is observed, that single value is hidden
    (set to NaN), the standardize -> KNN-impute -> un-standardize pipeline is
    re-fitted on the masked table, and the value imputed for the hidden cell
    is recorded. Comparing the result with the true values gives an honest
    estimate of imputation error.

    Parameters
    ----------
    features : pandas.DataFrame
        Numeric feature table; may contain NaN.
    column : str
        Name of the column whose observed values are held out one at a time.
    n_neighbors : int
        ``n_neighbors`` passed to ``KNNImputer``.

    Returns
    -------
    pandas.Series
        Held-out predictions, indexed like the observed rows of ``column``.

    Raises
    ------
    ValueError
        If ``column`` has fewer than two observed values (hiding the only
        observed value would leave no donors to impute from).
    """
    col_pos = features.columns.get_loc(column)
    values = features.to_numpy(dtype=float)
    observed_rows = np.flatnonzero(~np.isnan(values[:, col_pos]))
    if len(observed_rows) < 2:
        raise ValueError(
            f"Need at least two observed values in '{column}' for leave-one-out evaluation."
        )

    predictions = np.empty(len(observed_rows))
    for i, row in enumerate(observed_rows):
        masked = values.copy()
        masked[row, col_pos] = np.nan  # hide exactly one known value
        # Re-fit scaler and imputer per fold so the hidden value cannot leak
        # into the scaling statistics or act as its own donor.
        fold_scaler = StandardScaler()
        fold_imputer = KNNImputer(n_neighbors=n_neighbors)
        filled_scaled = fold_imputer.fit_transform(fold_scaler.fit_transform(masked))
        predictions[i] = fold_scaler.inverse_transform(filled_scaled)[row, col_pos]

    return pd.Series(predictions, index=features.index[observed_rows], name=column)


# Create DataFrame
df = pd.DataFrame(data)

# Save to CSV
df.to_csv('patient_data.csv', index=False)
print("CSV file 'patient_data.csv' has been created.")

# Display the original data (including the NaN cells)
print("\nOriginal data:")
print(df)

# Separate features and target; the target is never used as an imputation input
X = df.drop('Diabetes', axis=1)
y = df['Diabetes']

# Standardize the features so that no single column dominates the
# nearest-neighbour distance purely because of its units.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply KNN imputation in the standardized space
imputer = KNNImputer(n_neighbors=N_NEIGHBORS)
X_imputed = imputer.fit_transform(X_scaled)

# Map back to the original units and rebuild the DataFrame once, keeping X's
# index so the column-wise concat with y below aligns row-for-row.
X_imputed_df = pd.DataFrame(
    scaler.inverse_transform(X_imputed), columns=X.columns, index=X.index
)

# Combine imputed features with the target variable
df_imputed = pd.concat([X_imputed_df, y], axis=1)

print("\nImputed data:")
print(df_imputed)

# Estimate imputation accuracy on the known Cholesterol values.
# The imputer leaves observed entries untouched, so comparing df_imputed with
# df on those rows would always give an error of ~0. Instead, each known value
# is held out in turn and predicted from the remaining rows (leave-one-out).
original_cholesterol = df['Cholesterol'].dropna()
imputed_cholesterol = leave_one_out_knn_impute(X, 'Cholesterol', N_NEIGHBORS)
mse = np.mean((original_cholesterol - imputed_cholesterol) ** 2)
print(f"\nMean Squared Error for known Cholesterol values (leave-one-out): {mse:.2f}")

CSV file 'patient_data.csv' has been created.

Original data:
   Age  BloodPressure  Cholesterol  Diabetes
0   45            120        200.0         0
1   50            130          NaN         1
2   35            115        180.0         0
3   60            140          NaN         1
4   55            125        220.0         1
5   40            118        190.0         0
6   52            132          NaN         1
7   48            128        210.0         0
8   58            135        230.0         1
9   62            145          NaN         1

Imputed data:
    Age  BloodPressure  Cholesterol  Diabetes
0  45.0          120.0        200.0         0
1  50.0          130.0        210.0         1
2  35.0          115.0        180.0         0
3  60.0          140.0        210.0         1
4  55.0          125.0        220.0         1
5  40.0          118.0        190.0         0
6  52.0          132.0        210.0         1
7  48.0          128.0        210.0         0
8  58.0       