This notebook imputes missing values in a dataset using Multiple Imputation by Chained Equations (MICE), as implemented by scikit-learn's `IterativeImputer`. The imputed dataset is saved as a CSV file.


The original dataset is not publicly available. To run this notebook, point `data_path` at your own CSV file and adjust the column names to match your data.

In [None]:
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
# Location of the input CSV, relative to the working directory.
# Replace with the path to your own data file.
data_path = 'data/your_own_data.csv'
df = pd.read_csv(filepath_or_buffer=data_path)

In [None]:
# List every column name of the loaded frame under a header line
print("変数名:", df.columns.tolist(), sep="\n")

変数名:
['ID', 'Diagnosis', 'Age', 'Alb', 'eGFR', 'T-chol', 'IgG', 'C3', 'UP/day', 'Urine RBC']


In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple
df.shape

(248, 10)

In [None]:
# Summarise missingness per column of the raw frame:
# absolute count of missing cells and that count as a share of all rows.
missing_data = df.isna().sum()
missing_percentage = missing_data.div(len(df)).mul(100)

# One row per column of df, two summary columns (count, percentage)
missing_info = missing_data.to_frame(name='欠損値の数').assign(
    **{'割合 (%)': missing_percentage}
)
print("各変数の欠損値の数と割合:", missing_info, sep="\n")

各変数の欠損値の数と割合:
            欠損値の数     割合 (%)
Unnamed: 0      0   0.000000
ID              0   0.000000
Diagnosis       0   0.000000
Age             0   0.000000
Alb            11   4.435484
eGFR            0   0.000000
T-chol         20   8.064516
IgG            12   4.838710
C3             19   7.661290
UP/day         41  16.532258
Urine RBC       0   0.000000


In [None]:
# Variables that enter the imputation model. 'ID' and 'Diagnosis' are left
# out here and are re-attached to the imputed values in a later cell.
columns_to_keep = [
    'Age', 'Alb', 'eGFR', 'T-chol', 'IgG',
    'C3', 'UP/day', 'Urine RBC'
]
filtered_df = df[columns_to_keep]

# Chained-equations imputer: up to 10 imputation rounds, fixed seed for
# reproducibility.
# NOTE(review): with the default sample_posterior=False this produces a single
# completed dataset rather than several imputed datasets; confirm that a
# single imputation is what the downstream analysis expects.
imputer = IterativeImputer(max_iter=10, random_state=0)

# fit_transform returns an array by default, so keep it under its own name
# instead of rebinding filtered_df to a different kind of object.
imputed_values = imputer.fit_transform(filtered_df)

# Rebuild a DataFrame with the original column names AND the original row
# index, so that a later column-wise concat with df[['ID', 'Diagnosis']]
# aligns row by row even if df does not carry a default 0..n-1 index.
imputed_df = pd.DataFrame(
    imputed_values,
    columns=columns_to_keep,
    index=filtered_df.index,
)

In [None]:
# Dimensions of the imputed frame as a (rows, columns) tuple
imputed_df.shape

(248, 8)

In [None]:
# Re-check missingness on the imputed frame:
# absolute count of missing cells per column and its share of all rows.
missing_data = imputed_df.isna().sum()
missing_percentage = missing_data.div(len(imputed_df)).mul(100)

# One row per column of imputed_df, two summary columns (count, percentage)
missing_info = missing_data.to_frame(name='欠損値の数').assign(
    **{'割合 (%)': missing_percentage}
)
print("各変数の欠損値の数と割合:", missing_info, sep="\n")

各変数の欠損値の数と割合:
           欠損値の数  割合 (%)
Age            0     0.0
Alb            0     0.0
eGFR           0     0.0
T-chol         0     0.0
IgG            0     0.0
C3             0     0.0
UP/day         0     0.0
Urine RBC      0     0.0


In [None]:
# Re-attach the 'ID' and 'Diagnosis' columns from the raw frame in front of
# the imputed variables (column-wise concat, aligned on the row index).
# Selecting columns_to_keep from imputed_df keeps this cell idempotent:
# because the result is bound back to imputed_df, re-running the cell would
# otherwise prepend a second copy of 'ID' and 'Diagnosis'.
imputed_df = pd.concat(
    [df[['ID', 'Diagnosis']], imputed_df[columns_to_keep]],
    axis=1,
)

imputed_df.head()

Unnamed: 0,ID,Diagnosis,Age,Alb,eGFR,T-chol,IgG,C3,UP/day,Urine RBC
0,1,1,31.0,1.9,63.964117,497.0,345.0,120.0,6.0,0.0
1,2,0,52.0,1.7,86.983225,307.0,526.0,129.0,6.55,15.0
2,3,1,26.0,1.9,62.384988,630.0,352.0,178.0,6.2,2.5
3,4,0,26.0,2.6,94.945046,182.0,615.0,37.0,5.06,7.5
4,5,0,45.0,2.5,77.027706,231.0,1075.0,43.0,3.0,2.5


In [None]:
imputed_df.shape

(248, 10)

In [None]:
# Write the imputed table to disk; index=False leaves the row index out of the file
imputed_df.to_csv(path_or_buf='data/MICE_data.csv', index=False)