Because the original dataset is not publicly available, please substitute your own data when running this code: point `data_path` at a CSV file that contains the columns listed below.

In [21]:
import pandas as pd

In [33]:
# Location of the input table; replace it with the path to your own CSV file.
data_path = 'data/your_own_data.csv'

# Read the raw table into a DataFrame.
df = pd.read_csv(filepath_or_buffer=data_path)

In [23]:
# Show the column labels of the raw table, one header line followed by the list.
print("variable names:", df.columns.tolist(), sep="\n")

variable names:
['ID', 'Diagnosis', 'Age', 'Alb', 'eGFR', 'T-chol', 'IgG', 'C3', 'UP/day', 'Urine RBC']


In [24]:
# Dimensions of the raw table as a (rows, columns) tuple.
df.shape

(248, 10)

In [25]:
# Per-column count of missing cells in the raw table.
missing_data = df.isna().sum()

# The same counts expressed as a percentage of the number of rows.
missing_percentage = missing_data.div(len(df)).mul(100)

# Put both measures side by side, one row per variable.
missing_info = pd.DataFrame(
    {
        'Number of missing values': missing_data,
        'Percentage (%)': missing_percentage,
    }
)
print(
    "Number and percentage of missing values for each variable:",
    missing_info,
    sep="\n",
)

Number and percentage of missing values for each variable:
           Number of missing values  Percentage (%)
ID                                0        0.000000
Diagnosis                         0        0.000000
Age                               0        0.000000
Alb                              11        4.435484
eGFR                              0        0.000000
T-chol                           20        8.064516
IgG                              12        4.838710
C3                               19        7.661290
UP/day                           41       16.532258
Urine RBC                         0        0.000000


In [26]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Variables handed to the imputer. 'ID' and 'Diagnosis' are deliberately left
# out here; they are re-attached to the imputed table in a later cell.
columns_to_keep = [
    'Age',  'Alb', 'eGFR', 'T-chol', 'IgG',
    'C3', 'UP/day', 'Urine RBC'
]
filtered_df = df[columns_to_keep]

# random_state is fixed so that re-running the cell reproduces the same
# imputed values.
imputer = IterativeImputer(max_iter=10, random_state=0)

# NOTE: with scikit-learn's default output setting, fit_transform returns a
# plain NumPy array, so from here on `filtered_df` no longer carries column
# labels or a row index.
filtered_df = imputer.fit_transform(filtered_df)

# Restore the column labels *and* df's row index. The later axis=1 concat with
# df[['ID', 'Diagnosis']] aligns on the index, so without `index=df.index` any
# non-default index on df would misalign rows and introduce NaNs.
imputed_df = pd.DataFrame(filtered_df, columns=columns_to_keep, index=df.index)

In [27]:
# Dimensions of the imputed table as a (rows, columns) tuple.
imputed_df.shape

(248, 8)

In [28]:
# Repeat the missing-value summary on the imputed table to check what is left.
n_rows = len(imputed_df)
missing_data = imputed_df.isna().sum()
missing_percentage = missing_data / n_rows * 100

missing_info = pd.DataFrame(
    {
        'Number of missing values': missing_data,
        'Percentage (%)': missing_percentage,
    }
)
print(
    "Number and percentage of missing values for each variable:",
    missing_info,
    sep="\n",
)

Number and percentage of missing values for each variable:
           Number of missing values  Percentage (%)
Age                               0             0.0
Alb                               0             0.0
eGFR                              0             0.0
T-chol                            0             0.0
IgG                               0             0.0
C3                                0             0.0
UP/day                            0             0.0
Urine RBC                         0             0.0


In [31]:
# Re-attach the identifier and diagnosis columns that were left out of the
# imputation step.
# Any 'ID'/'Diagnosis' columns already present are dropped first so the cell is
# safe to re-run: otherwise a second execution would prepend duplicate columns.
imputed_df = pd.concat(
    [
        df[['ID', 'Diagnosis']],
        imputed_df.drop(columns=['ID', 'Diagnosis'], errors='ignore'),
    ],
    axis=1,
)

In [30]:
# Write the imputed table to disk; index=False keeps the row index out of the CSV.
imputed_df.to_csv(path_or_buf='data/MICE_data.csv', index=False)