In [None]:
# https://www.machinelearningplus.com/machine-learning/mice-imputation/  -> The MICE Algorithm (Step-by-step) paragraph
# https://towardsdatascience.com/imputing-missing-data-with-simple-and-advanced-techniques-f5c7b157fb87#33d5

In [1]:
from IPython.core.interactiveshell import InteractiveShell; InteractiveShell.ast_node_interactivity = "all"

In [2]:
import numpy as np
import pandas as pd

from sklearnex import patch_sklearn
patch_sklearn()

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [15]:
# Six random observations in [1, 101): a random integer from 1 to 100 plus a random fraction.
temp = [np.random.randint(1, 101) + np.random.random() for _ in range(6)]
# Insert NaN at flat positions 0, 4 and 8 so that, after the 3x3 reshape,
# the missing values lie on the main diagonal (one per row and one per column).
for nan_position in (0, 4, 8):
    temp.insert(nan_position, np.nan)
df = pd.DataFrame(
    data=np.array(temp).reshape(3, 3),
    columns=['Charlie', 'Sophie', 'Steven'],
)
df

Unnamed: 0,Charlie,Sophie,Steven
0,,9.965812,28.806376
1,18.293272,,14.664171
2,53.494715,17.207427,


## fit_transform()

In [195]:
# Fit the imputer on df and fill its missing values in a single call.
imputer = IterativeImputer(max_iter=100)
imputed_values = imputer.fit_transform(df)
# The column labels come from the fitted imputer, so they are requested only after fitting.
transformed_df = pd.DataFrame(imputed_values, columns=imputer.get_feature_names_out())
transformed_df

Unnamed: 0,Charlie,Sophie,Steven
0,68.339763,9.965812,28.806376
1,18.293272,34.379374,14.664171
2,53.494715,17.207427,24.611449


In [196]:
imputer.n_iter_

4

In [198]:
# One _ImputerTriplet (target feature index, neighbour feature indices, fitted estimator) is stored per feature per iteration.
len(imputer.imputation_sequence_)   # number of features * number of iterations = number of triplets -> 3 * 4 = 12
imputer.imputation_sequence_

12

[_ImputerTriplet(feat_idx=0, neighbor_feat_idx=array([1, 2]), estimator=BayesianRidge()),
 _ImputerTriplet(feat_idx=1, neighbor_feat_idx=array([0, 2]), estimator=BayesianRidge()),
 _ImputerTriplet(feat_idx=2, neighbor_feat_idx=array([0, 1]), estimator=BayesianRidge()),
 _ImputerTriplet(feat_idx=0, neighbor_feat_idx=array([1, 2]), estimator=BayesianRidge()),
 _ImputerTriplet(feat_idx=1, neighbor_feat_idx=array([0, 2]), estimator=BayesianRidge()),
 _ImputerTriplet(feat_idx=2, neighbor_feat_idx=array([0, 1]), estimator=BayesianRidge()),
 _ImputerTriplet(feat_idx=0, neighbor_feat_idx=array([1, 2]), estimator=BayesianRidge()),
 _ImputerTriplet(feat_idx=1, neighbor_feat_idx=array([0, 2]), estimator=BayesianRidge()),
 _ImputerTriplet(feat_idx=2, neighbor_feat_idx=array([0, 1]), estimator=BayesianRidge()),
 _ImputerTriplet(feat_idx=0, neighbor_feat_idx=array([1, 2]), estimator=BayesianRidge()),
 _ImputerTriplet(feat_idx=1, neighbor_feat_idx=array([0, 2]), estimator=BayesianRidge()),
 _ImputerT

In [199]:
imputer.imputation_sequence_[0][2].coef_

array([2.01958746, 3.9440674 ])

## fit() and then transform()

In [200]:
imputer = IterativeImputer(max_iter=100)
imputer.fit(df)

In [203]:
imputer.n_iter_
len(imputer.imputation_sequence_)

4

12

In [204]:
imputer.imputation_sequence_[0][2].coef_

array([2.01958746, 3.9440674 ])

In [205]:
transformed_df = pd.DataFrame(imputer.transform(df), columns=imputer.get_feature_names_out())
transformed_df

Unnamed: 0,Charlie,Sophie,Steven
0,68.339763,9.965812,28.806376
1,18.293272,34.379374,14.664171
2,53.494715,17.207427,24.611449


# Simulation of IterativeImputer() executed on seen data

In [27]:
# Per-column means of the observed values (NaNs are skipped by .mean()).
# These are the values used to initialize the missing entries.
charlie_mean = df['Charlie'].mean()
sophie_mean = df['Sophie'].mean()
steven_mean = df['Steven'].mean()
(charlie_mean, sophie_mean, steven_mean)

(35.89399350207449, 13.586619483083217, 21.735273746717958)

## first iteration, first ImputerTriplet()

In [89]:
imputer.imputation_sequence_[0]

_ImputerTriplet(feat_idx=0, neighbor_feat_idx=array([1, 2]), estimator=BayesianRidge())

In [124]:
# Charlie is the target; Sophie and Steven are the features.
# Only rows 1 and 2 are used for fitting, because those are the rows where Charlie is observed (not NaN).
df

Unnamed: 0,Charlie,Sophie,Steven
0,,9.965812,28.806376
1,18.293272,,14.664171
2,53.494715,17.207427,


In [147]:
target = df.iloc[1:, 0].to_numpy()
target

array([18.29327174, 53.49471527])

In [148]:
# Training features: the Sophie and Steven columns of rows 1 and 2.
# Nothing has been predicted yet, so gaps are filled with the column means.
fill_values = {'Sophie': sophie_mean, 'Steven': steven_mean}
features = (
    df.iloc[1:, 1:]
    .fillna(fill_values)
    .to_numpy()
)
features

array([[13.58661948, 14.66417114],
       [17.20742733, 21.73527375]])

In [149]:
model = BayesianRidge()
model.fit(features, target)

In [150]:
# coefficients of our model are same as coefficients of IterativeImputer()
model.coef_
imputer.imputation_sequence_[0][2].coef_

array([2.01958746, 3.9440674 ])

array([2.01958746, 3.9440674 ])

In [151]:
# take features from row with missing value for Charlie and use them for prediction of this missing value
df
values_for_pred = df.iloc[0, 1:].to_numpy()
values_for_pred

Unnamed: 0,Charlie,Sophie,Steven
0,,9.965812,28.806376
1,18.293272,,14.664171
2,53.494715,17.207427,


array([ 9.96581163, 28.80637636])

In [152]:
# prediction
iter_1_charlie_pred = model.predict(values_for_pred.reshape(1, -1))
iter_1_charlie_pred

array([66.75854419])

## first iteration, second ImputerTriplet()

In [90]:
imputer.imputation_sequence_[1]

_ImputerTriplet(feat_idx=1, neighbor_feat_idx=array([0, 2]), estimator=BayesianRidge())

In [82]:
# Sophie is the target; Charlie and Steven are the features.
# Only rows 0 and 2 are used for fitting, because those are the rows where Sophie is observed (not NaN).
df

Unnamed: 0,Charlie,Sophie,Steven
0,,9.965812,28.806376
1,18.293272,,14.664171
2,53.494715,17.207427,


In [153]:
target = df.iloc[[0, 2], 1].to_numpy()
target

array([ 9.96581163, 17.20742733])

In [154]:
# Training features: the Charlie and Steven columns of rows 0 and 2.
# Charlie's gap now receives the prediction made earlier in this iteration instead of the mean;
# Steven has not been predicted yet, so its gap still receives the column mean.
fill_values = {'Charlie': iter_1_charlie_pred[0], 'Steven': steven_mean}
features = (
    df.iloc[[0, 2], [0, 2]]
    .fillna(fill_values)
    .to_numpy()
)
features

array([[66.75854419, 28.80637636],
       [53.49471527, 21.73527375]])

In [155]:
model = BayesianRidge()
model.fit(features, target)

In [156]:
# coefficients of our model are same as coefficients of IterativeImputer()
model.coef_
imputer.imputation_sequence_[1][2].coef_

array([-0.42513917, -0.22664667])

array([-0.42513917, -0.22664667])

In [157]:
# take features from row with missing value for Sophie and use them for prediction of this missing value
df
values_for_pred = df.iloc[1, [0, 2]].to_numpy()
values_for_pred

Unnamed: 0,Charlie,Sophie,Steven
0,,9.965812,28.806376
1,18.293272,,14.664171
2,53.494715,17.207427,


array([18.29327174, 14.66417114])

In [158]:
# prediction
iter_1_sophie_pred = model.predict(values_for_pred.reshape(1, -1))
iter_1_sophie_pred

array([33.77558153])

## first iteration, third ImputerTriplet()

In [113]:
imputer.imputation_sequence_[2]

_ImputerTriplet(feat_idx=2, neighbor_feat_idx=array([0, 1]), estimator=BayesianRidge())

In [114]:
# Steven is the target; Charlie and Sophie are the features.
# Only rows 0 and 1 are used for fitting, because those are the rows where Steven is observed (not NaN).
df

Unnamed: 0,Charlie,Sophie,Steven
0,,9.965812,28.806376
1,18.293272,,14.664171
2,53.494715,17.207427,


In [159]:
target = df.iloc[0:2, 2].to_numpy()
target

array([28.80637636, 14.66417114])

In [160]:
# Training features: the Charlie and Sophie columns of rows 0 and 1.
# Both gaps are filled with the predictions already made in this iteration,
# not with the column means.
fill_values = {'Charlie': iter_1_charlie_pred[0], 'Sophie': iter_1_sophie_pred[0]}
features = (
    df.iloc[0:2, 0:2]
    .fillna(fill_values)
    .to_numpy()
)
features

array([[66.75854419,  9.96581163],
       [18.29327174, 33.77558153]])

In [161]:
model = BayesianRidge()
model.fit(features, target)

In [163]:
# coefficients of our model are same as coefficients of IterativeImputer()
model.coef_
imputer.imputation_sequence_[2][2].coef_

array([ 0.23506711, -0.11548256])

array([ 0.23506711, -0.11548256])

In [164]:
# take features from row with missing value for Steven and use them for prediction of this missing value
df
values_for_pred = df.iloc[2, 0:2].to_numpy()
values_for_pred

Unnamed: 0,Charlie,Sophie,Steven
0,,9.965812,28.806376
1,18.293272,,14.664171
2,53.494715,17.207427,


array([53.49471527, 17.20742733])

In [165]:
# prediction
iter_1_steven_pred = model.predict(values_for_pred.reshape(1, -1))
iter_1_steven_pred

array([24.8522059])

## second iteration, first ImputerTriplet()

In [122]:
imputer.imputation_sequence_[3]

_ImputerTriplet(feat_idx=0, neighbor_feat_idx=array([1, 2]), estimator=BayesianRidge())

In [114]:
# Charlie is the target again; Sophie and Steven are the features.
# Only rows 1 and 2 are used for fitting, because those are the rows where Charlie is observed (not NaN).
df

Unnamed: 0,Charlie,Sophie,Steven
0,,9.965812,28.806376
1,18.293272,,14.664171
2,53.494715,17.207427,


In [167]:
target = df.iloc[1:, 0].to_numpy()
target

array([18.29327174, 53.49471527])

In [169]:
# Training features: the Sophie and Steven columns of rows 1 and 2.
# Gaps are filled with the predictions from the first iteration.
fill_values = {'Sophie': iter_1_sophie_pred[0], 'Steven': iter_1_steven_pred[0]}
features = (
    df.iloc[1:, 1:]
    .fillna(fill_values)
    .to_numpy()
)
features

array([[33.77558153, 14.66417114],
       [17.20742733, 24.8522059 ]])

In [170]:
model = BayesianRidge()
model.fit(features, target)

In [172]:
# coefficients of our model are same as coefficients of IterativeImputer()
model.coef_
imputer.imputation_sequence_[3][2].coef_

array([-1.54169514,  0.9480141 ])

array([-1.54169514,  0.9480141 ])

In [173]:
# take features from row with missing value for Charlie and use them for prediction of this missing value
df
values_for_pred = df.iloc[0, 1:].to_numpy()
values_for_pred

Unnamed: 0,Charlie,Sophie,Steven
0,,9.965812,28.806376
1,18.293272,,14.664171
2,53.494715,17.207427,


array([ 9.96581163, 28.80637636])

In [175]:
# Prediction of Charlie's missing value with the model fitted in the second iteration
iter_2_charlie_pred = model.predict(values_for_pred.reshape(1, -1))   # reshape(1, -1) turns the 1-D values into a single-row 2-D array
iter_2_charlie_pred

# Show the first-iteration prediction below the separator to see how the estimate for Charlie changed
print('-------------')
iter_1_charlie_pred

array([68.4076883])

-------------


array([66.75854419])

## second iteration, second ImputerTriplet()

In [176]:
imputer.imputation_sequence_[4]

_ImputerTriplet(feat_idx=1, neighbor_feat_idx=array([0, 2]), estimator=BayesianRidge())

In [114]:
# Sophie is the target again; Charlie and Steven are the features.
# Only rows 0 and 2 are used for fitting, because those are the rows where Sophie is observed (not NaN).
df

Unnamed: 0,Charlie,Sophie,Steven
0,,9.965812,28.806376
1,18.293272,,14.664171
2,53.494715,17.207427,


In [178]:
target = df.iloc[[0, 2], 1].to_numpy()
target

array([ 9.96581163, 17.20742733])

In [179]:
# Training features: the Charlie and Steven columns of rows 0 and 2.
# Charlie's gap takes the fresh second-iteration prediction; Steven's gap still takes
# the first-iteration prediction, because Steven has not been re-predicted yet.
fill_values = {'Charlie': iter_2_charlie_pred[0], 'Steven': iter_1_steven_pred[0]}
features = (
    df.iloc[[0, 2], [0, 2]]
    .fillna(fill_values)
    .to_numpy()
)
features

array([[68.4076883 , 28.80637636],
       [53.49471527, 24.8522059 ]])

In [180]:
model = BayesianRidge()
model.fit(features, target)

In [184]:
# coefficients of our model are same as coefficients of IterativeImputer()
model.coef_
imputer.imputation_sequence_[4][2].coef_

array([-0.45369491, -0.12029707])

array([-0.45369491, -0.12029707])

In [185]:
# take features from row with missing value for Sophie and use them for prediction of this missing value
df
values_for_pred = df.iloc[1, [0, 2]].to_numpy()
values_for_pred

Unnamed: 0,Charlie,Sophie,Steven
0,,9.965812,28.806376
1,18.293272,,14.664171
2,53.494715,17.207427,


array([18.29327174, 14.66417114])

In [186]:
# Prediction of Sophie's missing value with the model fitted in the second iteration
iter_2_sophie_pred = model.predict(values_for_pred.reshape(1, -1))   # reshape(1, -1) turns the 1-D values into a single-row 2-D array
iter_2_sophie_pred

# Show the first-iteration prediction below the separator to see how the estimate for Sophie changed
print('-------------')
iter_1_sophie_pred

array([34.40373372])

-------------


array([33.77558153])

## The next steps are analogous

# Simulation of IterativeImputer() executed on UNSEEN data

In [206]:
# Unseen data with the same layout as the training frame: six random values in [1, 101) ...
temp_2 = [np.random.randint(1, 101) + np.random.random() for _ in range(6)]
# ... and NaN at flat positions 0, 4 and 8, i.e. on the main diagonal after the 3x3 reshape.
for nan_position in (0, 4, 8):
    temp_2.insert(nan_position, np.nan)
df_2 = pd.DataFrame(
    data=np.array(temp_2).reshape(3, 3),
    columns=['Charlie', 'Sophie', 'Steven'],
)
df_2

Unnamed: 0,Charlie,Sophie,Steven
0,,14.520244,21.915749
1,78.393574,,67.777473
2,99.273823,12.72657,


In [208]:
pd.DataFrame(imputer.transform(df_2), columns=imputer.get_feature_names_out())

Unnamed: 0,Charlie,Sophie,Steven
0,55.221759,14.520244,21.915749
1,78.393574,0.449702,67.777473
2,99.273823,12.72657,35.560053


In [216]:
# Neighbour features (Sophie, Steven) of row 0, the row where Charlie is missing.
values_for_pred = df_2.iloc[0, 1:].to_numpy()
values_for_pred

# Prediction for Charlie.
# Both neighbour features of this row are observed, so the value transform() ends up with
# is the one produced by the most recently fitted Charlie model in .imputation_sequence_.
# The model is looked up by feat_idx rather than by a fixed position in the list:
# a fixed position is only right for one particular number of fitting iterations.
charlie_feat_idx = 0
latest_charlie_model = [
    triplet.estimator
    for triplet in imputer.imputation_sequence_
    if triplet.feat_idx == charlie_feat_idx
][-1]
prediction = latest_charlie_model.predict(values_for_pred.reshape(1, -1))
prediction

array([14.52024364, 21.91574917])

array([55.22175933])

In [217]:
# Neighbour features (Charlie, Steven) of row 1, the row where Sophie is missing.
values_for_pred = df_2.iloc[1, [0, 2]].to_numpy()
values_for_pred

# Prediction for Sophie.
# Both neighbour features of this row are observed, so the value transform() ends up with
# is the one produced by the most recently fitted Sophie model in .imputation_sequence_.
# The model is looked up by feat_idx rather than by a fixed position in the list:
# a fixed position is only right for one particular number of fitting iterations.
sophie_feat_idx = 1
latest_sophie_model = [
    triplet.estimator
    for triplet in imputer.imputation_sequence_
    if triplet.feat_idx == sophie_feat_idx
][-1]
prediction = latest_sophie_model.predict(values_for_pred.reshape(1, -1))
prediction

array([78.39357385, 67.77747312])

array([0.44970209])

In [219]:
# Neighbour features (Charlie, Sophie) of row 2, the row where Steven is missing.
values_for_pred = df_2.iloc[2, :2].to_numpy()
values_for_pred

# Prediction for Steven.
# Both neighbour features of this row are observed, so the value transform() ends up with
# is the one produced by the most recently fitted Steven model in .imputation_sequence_.
# The model is looked up by feat_idx rather than by a fixed position in the list:
# a fixed position is only right for one particular number of fitting iterations.
steven_feat_idx = 2
latest_steven_model = [
    triplet.estimator
    for triplet in imputer.imputation_sequence_
    if triplet.feat_idx == steven_feat_idx
][-1]
prediction = latest_steven_model.predict(values_for_pred.reshape(1, -1))
prediction

array([99.2738229 , 12.72656975])

array([35.56005303])

# Simulation of IterativeImputer(); UNSEEN data, different NaNs distribution

In [6]:
def gen_rand_num():
    """Return a random float in [1, 101): a random integer from 1 to 100 plus a random fraction in [0, 1)."""
    whole_part = np.random.randint(1, 101)
    fractional_part = np.random.random()
    return whole_part + fractional_part

In [3]:
# Build a fresh 3x3 training frame with NaN on the main diagonal, calling gen_rand_num()
# instead of repeating its expression inline.
# NOTE: this rebinds `temp` and `df`; the cells below refer to this new frame.
temp = [gen_rand_num() for _ in range(6)]
for nan_position in (0, 4, 8):
    temp.insert(nan_position, np.nan)
df = pd.DataFrame(np.array(temp).reshape(3, 3), columns=['Charlie', 'Sophie', 'Steven'])
df

Unnamed: 0,Charlie,Sophie,Steven
0,,23.954847,79.786868
1,49.860382,,32.377146
2,50.579794,100.773471,


In [4]:
imputer = IterativeImputer(max_iter=100)
imputer.fit_transform(df)

array([[ 51.30089873,  23.95484738,  79.78686809],
       [ 49.86038167, 177.41083537,  32.3771463 ],
       [ 50.57979351, 100.77347115,  56.05400744]])

In [7]:
# Unseen data with a different NaN pattern: rows 0-2 each miss one value, row 3 misses two (Sophie and Steven).
temp_2 = np.array([[np.nan, gen_rand_num(), gen_rand_num()],
         [gen_rand_num(), np.nan, gen_rand_num()],
         [gen_rand_num(), gen_rand_num(), np.nan],
         [gen_rand_num(), np.nan, np.nan]])

# Column names are read from the fitted imputer.
df_2 = pd.DataFrame(temp_2, columns=imputer.get_feature_names_out())
df_2

Unnamed: 0,Charlie,Sophie,Steven
0,,89.699334,66.065905
1,76.681927,,6.16595
2,98.517026,37.585851,
3,51.46483,,


In [8]:
pd.DataFrame(imputer.transform(df_2), columns=imputer.get_feature_names_out())

Unnamed: 0,Charlie,Sophie,Steven
0,50.701194,89.699334,66.065905
1,76.681927,259.537685,6.16595
2,98.517026,37.585851,75.712907
3,51.46483,100.693783,56.081191


In [None]:
# Finding out how IterativeImputer() works with the ipdb debugging package.
# The breakpoint is kept commented out so that "Run All" does not stop here waiting for debugger input;
# uncomment it to step into transform() interactively.
# import ipdb; ipdb.set_trace()
pd.DataFrame(imputer.transform(df_2), columns=imputer.get_feature_names_out())

In [18]:
# These means (of the DataFrame on which the imputer was fitted) are used as the initial values when transforming another DataFrame
imputer.initial_imputer_.statistics_

# The same means computed by hand from the training DataFrame, for comparison with the array above
charlie_mean = df['Charlie'].mean()
sophie_mean = df['Sophie'].mean()
steven_mean = df['Steven'].mean()
charlie_mean, sophie_mean, steven_mean

array([50.22008759, 62.36415926, 56.0820072 ])

(50.22008759407888, 62.36415926181575, 56.08200719948012)

In [45]:
# Manual replay of imputer.transform(df_2): the fitted models in imputation_sequence_ are applied in order.
# Each input holds the neighbour-feature values of the row being imputed, typed in from df_2.
# The trailing comment gives the predicted value and the [row, column] cell of df_2 that it fills.
# Row 3 misses both Sophie and Steven, so each prediction for it is fed into the next step that needs it.

# 1 - first iteration, model for Charlie
imputer.imputation_sequence_[0][2].predict(np.array([[89.699334, 66.065905]]))    # 50.51316052   [0, 0]


# 2 - first iteration, model for Sophie (row 3 has no Steven value yet, so the training mean is used)
imputer.imputation_sequence_[1][2].predict(np.array([[76.681927, 6.165950]]))     # 265.33439896   [1, 1]
imputer.imputation_sequence_[1][2].predict(np.array([[51.464830, steven_mean]]))     # 100.87539932  [3, 1]   -> put this value in step # 3

# 3 - first iteration, model for Steven
imputer.imputation_sequence_[2][2].predict(np.array([[98.517026, 37.585851]]))     # 75.56346395   [2, 2]
imputer.imputation_sequence_[2][2].predict(np.array([[51.464830, 100.87539932]]))     # 56.02229554  [3, 2]  -> put this value in step # 5


# 4 - second iteration, model for Charlie
imputer.imputation_sequence_[3][2].predict(np.array([[89.699334, 66.065905]]))    # 50.70119195   [0, 0]

# 5 - second iteration, model for Sophie
imputer.imputation_sequence_[4][2].predict(np.array([[76.681927, 6.165950]]))     # 259.53772246   [1, 1]
imputer.imputation_sequence_[4][2].predict(np.array([[51.464830, 56.02229554]]))     # 100.78905961  [3, 1]  -> put this value in step # 6

# 6 - second iteration, model for Steven
imputer.imputation_sequence_[5][2].predict(np.array([[98.517026, 37.585851]]))     # 75.7129053   [2, 2]
imputer.imputation_sequence_[5][2].predict(np.array([[51.464830, 100.78905961]]))     # 56.05175796  [3, 2]  -> put this value in step # 8


# 7 - third iteration, model for Charlie
imputer.imputation_sequence_[6][2].predict(np.array([[89.699334, 66.065905]]))    # 50.70119404   [0, 0]


# 8 - third iteration, model for Sophie
imputer.imputation_sequence_[7][2].predict(np.array([[76.681927, 6.165950]]))     # 259.53768254   [1, 1]
imputer.imputation_sequence_[7][2].predict(np.array([[51.464830, 56.05175796]]))     # 100.69378316  [3, 1]   -> put this value in step # 9

# 9 - third iteration, model for Steven; the values from steps 7-9 are the ones transform() returned
imputer.imputation_sequence_[8][2].predict(np.array([[98.517026, 37.585851]]))     # 75.71290633   [2, 2]
imputer.imputation_sequence_[8][2].predict(np.array([[51.464830, 100.69378316]]))     # 56.08119085  [3, 2]

array([50.51316052])

array([265.33439896])

array([100.87539932])

array([75.56346395])

array([56.02229554])

array([50.70119195])

array([259.53772246])

array([100.78905961])

array([75.7129053])

array([56.05175796])

array([50.70119404])

array([259.53768254])

array([100.69378316])

array([75.71290633])

array([56.08119085])

# Conclusion

How does IterativeImputer() work when transforming a new, unseen DataFrame? First, wherever initial values are needed, it takes the feature means from imputer.initial_imputer_.statistics_ .
These are the means of a different DataFrame - the one on which the imputer was fitted. Then it computes the missing values with the corresponding models from the triplets in imputer.imputation_sequence_ .
It calculates a value with every model from imputer.imputation_sequence_, in order, and replaces the previous value with the newly calculated one. In the example above there are 9 models in imputer.imputation_sequence_, so 9 different models are used.
There is no data leakage because the feature means are taken from the training dataset and the models in imputer.imputation_sequence_ are fitted on the training dataset.

## Snippet - find unique sequences

In [19]:
imputer.imputation_sequence_

[_ImputerTriplet(feat_idx=0, neighbor_feat_idx=array([1, 2]), estimator=BayesianRidge()),
 _ImputerTriplet(feat_idx=1, neighbor_feat_idx=array([0, 2]), estimator=BayesianRidge()),
 _ImputerTriplet(feat_idx=2, neighbor_feat_idx=array([0, 1]), estimator=BayesianRidge()),
 _ImputerTriplet(feat_idx=0, neighbor_feat_idx=array([1, 2]), estimator=BayesianRidge()),
 _ImputerTriplet(feat_idx=1, neighbor_feat_idx=array([0, 2]), estimator=BayesianRidge()),
 _ImputerTriplet(feat_idx=2, neighbor_feat_idx=array([0, 1]), estimator=BayesianRidge()),
 _ImputerTriplet(feat_idx=0, neighbor_feat_idx=array([1, 2]), estimator=BayesianRidge()),
 _ImputerTriplet(feat_idx=1, neighbor_feat_idx=array([0, 2]), estimator=BayesianRidge()),
 _ImputerTriplet(feat_idx=2, neighbor_feat_idx=array([0, 1]), estimator=BayesianRidge())]

In [24]:
# Flatten every triplet into one row [target feature index, neighbour feature indices...],
# then count how many times each distinct row occurs in the sequence.
imp_seq = imputer.imputation_sequence_
all_arrays = [
    [triplet[0], *triplet[1].tolist()]
    for triplet in imp_seq
]
unique, counts = np.unique(np.array(all_arrays), axis=0, return_counts=True)

In [25]:
unique

array([[0, 1, 2],
       [1, 0, 2],
       [2, 0, 1]])

In [26]:
counts

array([3, 3, 3])

In [None]:
###################

In [32]:
# Sanity check: len(counts) is the number of distinct triplet patterns found above (one per feature here)
imputer.n_iter_
(len(counts) * imputer.n_iter_) == len(imputer.imputation_sequence_)
# number of features * number of iterations = number of triplets -> 3 * 3 = 9

3

True