In [1]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error


In [2]:
# Read in previously merged cat/quant data
test_merged = pd.read_csv('test_merged.csv')
train_merged = pd.read_csv('train_merged.csv')

# Read in connectomes
test_connectome = pd.read_csv('TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv')
train_connectome = pd.read_csv('TRAIN_FUNCTIONAL_CONNECTOME_MATRICES.csv')

# Read in training solutions
y_df_raw = pd.read_excel('TRAINING_SOLUTIONS.xlsx')

# Impute the missing in test set
- merge the training and test with connectome matrices
- Use MICE imputation to impute missing age in the training set, and missing survey responses in the test set

In [3]:
# Merge set for imputation 
impute_df = pd.concat([train_merged, test_merged], ignore_index=True)

# Change participant_id to index
impute_df = impute_df.set_index('participant_id')


In [4]:
# MICE

# Get column names 
colnames_imputed = impute_df.columns

# Initiate imputer 
imputer = IterativeImputer(random_state=100, max_iter=10)

imputer.fit(impute_df)

df_imputed = imputer.transform(impute_df)

df_imputed_final = pd.DataFrame(df_imputed, columns=colnames_imputed)

In [5]:
df_imputed_final

Unnamed: 0,Barratt_Barratt_P1_Edu_college_edu,Barratt_Barratt_P1_Edu_graduate_edu,Barratt_Barratt_P1_Edu_high_school_grad,Barratt_Barratt_P1_Edu_junior_middle_school,Barratt_Barratt_P1_Edu_less_than_7th,Barratt_Barratt_P1_Edu_partial_college,Barratt_Barratt_P1_Edu_partial_high_school,Barratt_Barratt_P1_Edu_unknown,Barratt_Barratt_P1_Occ_0,Barratt_Barratt_P1_Occ_0.0,...,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,6.0,1.0,5.0,0.0,5.0,1.0,0.0,10.0,9.317908
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,18.0,6.0,8.0,7.0,8.0,10.0,4.0,5.0,14.503093
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,14.0,2.0,8.0,5.0,7.0,6.0,4.0,9.0,8.239904
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,6.0,24.0,4.0,16.0,9.0,10.0,8.0,4.0,6.0,10.319820
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,18.0,4.0,11.0,4.0,10.0,7.0,3.0,9.0,8.940679
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1512,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,18.0,7.0,7.0,5.0,6.0,11.0,4.0,7.0,7.546999
1513,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,16.0,2.0,11.0,5.0,9.0,5.0,3.0,8.0,10.531143
1514,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,11.0,4.0,4.0,4.0,3.0,7.0,3.0,10.0,7.210586
1515,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,5.0,21.0,2.0,10.0,6.0,5.0,11.0,9.0,0.0,12.212183


## Multiple rounds of imputation testing by MICE, age accuracy check

In [None]:
mask = np.random.rand(len(df_imputed_final)) < 0.2
second_df_imputed = df_imputed_final.copy()
second_df_imputed.loc[mask, 'MRI_Track_Age_at_Scan'] = np.nan  

# Re-impute the missing values (Second Imputation)
df_reimputed = second_df_imputed.copy()
df_reimputed = pd.DataFrame(imputer.transform(second_df_imputed), columns=colnames_imputed)

# MAE
mae = mean_absolute_error(
    df_imputed_final.loc[mask, 'MRI_Track_Age_at_Scan'], 
    df_reimputed.loc[mask, 'MRI_Track_Age_at_Scan']
)

# Percentage
epsilon = 1e-5 
percentage_errors = np.abs((df_imputed_final.loc[mask, 'MRI_Track_Age_at_Scan'] - 
                             df_reimputed.loc[mask, 'MRI_Track_Age_at_Scan']) / 
                            (df_imputed_final.loc[mask, 'MRI_Track_Age_at_Scan'] + epsilon)) * 100



In [None]:
print("MAE:", mae)
print(f"Mean percentage error: {percentage_errors.mean()}")

MAE: 1.3802268744190336
MAPE: 160434484179127.3


## Multiple round imputation by mean, age check


In [None]:
# Mean imputation for Age
mean_age = train_merged['MRI_Track_Age_at_Scan'].mean()

# Impute missing 
train_imputed_age_one = train_merged.copy()

train_imputed_age_one['MRI_Track_Age_at_Scan'] = train_merged['MRI_Track_Age_at_Scan'].fillna(mean_age)


In [None]:
# Mean imputation for Age
mean_age = train_merged['MRI_Track_Age_at_Scan'].mean()

# Impute missing 'MRI_Track_Age_at_Scan' with the mean
train_imputed_age_one = train_merged.copy()
train_imputed_age_one['MRI_Track_Age_at_Scan'] = train_merged['MRI_Track_Age_at_Scan'].fillna(mean_age)

# Create a mask to randomly remove 20% of values for testing
mask = np.random.rand(len(train_imputed_age_one)) < 0.2
train_imputed_age_two = train_imputed_age_one.copy()
train_imputed_age_two.loc[mask, 'MRI_Track_Age_at_Scan'] = np.nan
train_imputed_age_two['MRI_Track_Age_at_Scan'] = train_imputed_age_two['MRI_Track_Age_at_Scan'].fillna(mean_age)


# Calculate MAE for the imputation accuracy
mae = mean_absolute_error(
    train_imputed_age_one.loc[mask, 'MRI_Track_Age_at_Scan'], 
    train_imputed_age_two.loc[mask, 'MRI_Track_Age_at_Scan']
)

print(f"MAE: {mae}")


## Split the dataset back to test and train 

In [None]:
df_imputed_final['participant_id'] = impute_df.index

In [8]:
# Split
train_imputed = df_imputed_final.loc[train_merged.index]
test_imputed = df_imputed_final.loc[test_merged.index]


In [9]:
# Write to CSV
train_imputed.to_csv("train_cat_quant_imputed.csv")
test_imputed.to_csv("test_cat_quant_imputed.csv")