# Statistical Matching imputation

This notebook demonstrates how to use MicroImpute's Matching imputer to impute values using the statistical matching approach. Statistical matching (also known as data fusion or synthetic matching) is a technique used to integrate information from different data sources.

In [12]:
# Import needed libraries and setup R environment
import sys
import os
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from rpy2.robjects import pandas2ri
from sklearn.datasets import load_diabetes

# Import MicroImpute tools
from microimpute.comparisons.data import preprocess_data
from microimpute.evaluations import *
from microimpute.models import Matching
from microimpute.config import QUANTILES

In [13]:
# Fetch scikit-learn's diabetes dataset and wrap its feature matrix in a
# DataFrame whose columns carry the dataset's feature names
diabetes = load_diabetes()
df = pd.DataFrame(data=diabetes.data, columns=diabetes.feature_names)

# Preview the first rows to check the columns and value scale
df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641


In [14]:
# Columns used as predictors, and the column whose values will be imputed
# ('s1' is total serum cholesterol)
predictors = ["age", "sex", "bmi", "bp"]
imputed_variables = ["s1"]

# Keep only the columns the imputer needs: predictors first, then the target
diabetes_df = df[[*predictors, *imputed_variables]]

# Summary statistics for the selected columns
diabetes_df.describe()

Unnamed: 0,age,sex,bmi,bp,s1
count,442.0,442.0,442.0,442.0,442.0
mean,-2.511817e-19,1.23079e-17,-2.245564e-16,-4.79757e-17,-1.3814990000000001e-17
std,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905
min,-0.1072256,-0.04464164,-0.0902753,-0.1123988,-0.1267807
25%,-0.03729927,-0.04464164,-0.03422907,-0.03665608,-0.03424784
50%,0.00538306,-0.04464164,-0.007283766,-0.005670422,-0.004320866
75%,0.03807591,0.05068012,0.03124802,0.03564379,0.02835801
max,0.1107267,0.05068012,0.1705552,0.1320436,0.1539137


In [15]:
# Split the selected columns into training and testing frames
X_train, X_test = preprocess_data(diabetes_df)

# Report the number of records in each split, one line per split
print(
    f"Training set size: {X_train.shape[0]} records",
    f"Testing set size: {X_test.shape[0]} records",
    sep="\n",
)

Training set size: 353 records
Testing set size: 89 records


## Simulating missing data

For this example, we'll simulate missing data in our test set by removing the values we want to impute.

In [16]:
# Keep the true values of the imputed column(s) for the later comparison
actual_values = X_test[imputed_variables].copy()

# Work on a copy of the test set so X_test itself stays intact, then blank out
# the column(s) to be imputed
X_test_missing = X_test.copy()
X_test_missing.loc[:, imputed_variables] = np.nan

X_test_missing.head()

Unnamed: 0,age,sex,bmi,bp,s1
287,0.952161,-0.937474,-0.130325,-0.335978,
211,1.943844,-0.937474,0.775037,0.45932,
72,1.333577,1.064282,-0.085057,-0.263679,
321,2.020127,-0.937474,1.091914,1.664559,
73,0.265611,1.064282,-0.424568,-0.046779,


## Training and using the Matching imputer

Now we'll train the Matching imputer and use it to impute the missing values in our test set.

In [17]:
# Initialize the Matching imputer
matching_imputer = Matching()

# Fit the imputer on the training data, passing the predictor column names and
# the column(s) to impute.
# Matching performs statistical matching (see this notebook's introduction),
# not a linear regression fit.
# NOTE(review): presumably fit() keeps the training records as the donor pool
# for matching — confirm against the MicroImpute documentation.
fitted_matching_imputer = matching_imputer.fit(
    X_train, predictors, imputed_variables
)

In [18]:
# Impute values in the test set
# predict() receives the test frame whose imputed column(s) were set to NaN,
# plus QUANTILES from microimpute.config; the result is indexed by quantile.
imputed_values = fitted_matching_imputer.predict(X_test_missing, QUANTILES)

# Display the first few imputed values at the median (0.5 quantile)
# NOTE(review): the index labels in the output below (a blank label, "NA.1")
# do not match X_test's index, and later cells pair rows by position —
# confirm that predict() preserves the row order of its input.
imputed_values[0.5].head()

Unnamed: 0,s1
287,-0.148528
,0.313792
211,-0.206318
NA.1,0.487162
72,-1.391014


## Evaluating the imputation results

Now let's compare the imputed values with the actual values to evaluate the performance of our imputer.

In [None]:
# The median (0.5 quantile) predictions serve as the point estimates
median_predictions = imputed_values[0.5]

# Pair actual and imputed values by position in one tidy frame for plotting
plot_df = pd.DataFrame(
    {
        "Actual": actual_values.to_numpy().ravel(),
        "Imputed": median_predictions.to_numpy().ravel(),
    }
)

# Overall extent of both series, used as the end points of the reference line
min_val = min(actual_values.min().min(), median_predictions.min().min())
max_val = max(actual_values.max().max(), median_predictions.max().max())

# Scatter of actual (x) against imputed (y) values
fig = px.scatter(
    plot_df,
    x="Actual",
    y="Imputed",
    opacity=0.7,
    title="Comparison of Actual vs. Imputed Values using Matching",
)

# Dashed red y = x line: points on it would be perfectly imputed
perfect_prediction_line = go.Scatter(
    x=[min_val, max_val],
    y=[min_val, max_val],
    mode="lines",
    line={"color": "red", "dash": "dash"},
    name="Perfect Prediction",
)
fig.add_trace(perfect_prediction_line)

# Axis titles, figure size, theme, and margins
fig.update_layout(
    xaxis_title="Actual Values",
    yaxis_title="Imputed Values",
    width=650,
    height=500,
    template="plotly_white",
    margin={"l": 50, "r": 50, "t": 80, "b": 50},
)

fig.show()

## Examining quantile predictions

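The Matching imputer's `predict` method returns imputed values for each requested quantile, which for some imputers is useful for understanding the uncertainty in the imputation. Note that in the table below the Matching imputer returns the same value at every quantile for a given record, so here the quantile columns carry no additional information about uncertainty.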

In [20]:
# Compare predictions at different quantiles for the first few records
quantiles_to_show = QUANTILES
n_preview = 5  # number of leading test records to display

# Build the table in one step: a column of actual values followed by one
# column per quantile, all paired by position with the first `n_preview`
# test records.
# Column labels use round() before int(): bare int() truncates, so a quantile
# such as 0.29 (0.29 * 100 == 28.999999999999996) would be labelled "Q28".
comparison_df = pd.DataFrame(
    {
        "Actual": actual_values.iloc[:n_preview, 0].values,
        **{
            f"Q{int(round(q * 100))}": imputed_values[q]
            .iloc[:n_preview, 0]
            .values
            for q in quantiles_to_show
        },
    }
)

comparison_df

Unnamed: 0,Actual,Q5,Q10,Q30,Q50,Q70,Q90,Q95
0,2.625393,-0.148528,-0.148528,-0.148528,-0.148528,-0.148528,-0.148528,-0.148528
1,-0.524163,0.313792,0.313792,0.313792,0.313792,0.313792,0.313792,0.313792
2,2.163073,-0.206318,-0.206318,-0.206318,-0.206318,-0.206318,-0.206318,-0.206318
3,1.151747,0.487162,0.487162,0.487162,0.487162,0.487162,0.487162,0.487162
4,0.805007,-1.391014,-1.391014,-1.391014,-1.391014,-1.391014,-1.391014,-1.391014


## Assessing the method's performance

To check whether our model is overfitting and to ensure robust results, we can perform cross-validation and visualize the results.

In [21]:
# Run cross-validation on the same data set
# The Matching class itself (not the fitted instance) is passed, together with
# the full diabetes_df and the same predictor / imputed column lists as above.
# cross_validate_model is not imported by name; it is expected to come from the
# wildcard import of microimpute.evaluations in the first cell.
matching_results = cross_validate_model(
    Matching, diabetes_df, predictors, imputed_variables
)

# Display the results
# The output below has one row each for train and test and one column per
# quantile.
# NOTE(review): the values are presumably an average loss per quantile —
# confirm the metric against the MicroImpute documentation.
matching_results

Unnamed: 0,0.05,0.10,0.30,0.50,0.70,0.90,0.95
train,0.027467,0.027372,0.026989,0.026606,0.026222,0.025839,0.025744
test,0.023086,0.023339,0.02435,0.02536,0.026371,0.027382,0.027635


In [None]:
# Plot the results
# Passes the train/test cross-validation table from the previous cell to the
# plotting helper.
# NOTE(review): this cell has no execution count or saved output — confirm it
# renders a figure when run on a fresh kernel.
plot_train_test_performance(matching_results)