# Estimating missing data with K-nearest neighbors

In this notebook, we will replace each missing value with the mean of the values shown by its k closest neighbors.

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from feature_engine.wrappers import SklearnTransformerWrapper

## Load data

In [2]:
# Read only the numerical variables and the target
# from the credit approval dataset:

variables = [
    "A2",
    "A3",
    "A8",
    "A11",
    "A14",
    "A15",
    "target",
]

data = pd.read_csv(
    "credit_approval_uci.csv",
    usecols=variables,
)

data.head()

Unnamed: 0,A2,A3,A8,A11,A14,A15,target
0,30.83,0.0,1.25,1,202.0,0,1
1,58.67,4.46,3.04,6,43.0,560,1
2,24.5,,1.5,0,280.0,824,1
3,27.83,1.54,3.75,5,100.0,3,1
4,20.17,5.625,1.71,0,120.0,0,1


## Split data into train and test sets

In [3]:
# Hold out 30% of the observations as a test set. The target
# column is split off from the predictors, and the fixed
# random_state keeps the partition reproducible:

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=["target"]),
    data["target"],
    test_size=0.3,
    random_state=0,
)

(X_train.shape, X_test.shape)

((483, 6), (207, 6))

In [4]:
# Share of missing values in each training column
# (the mean of the boolean NaN mask):

X_train.isna().mean()

A2     0.022774
A3     0.140787
A8     0.132505
A11    0.000000
A14    0.014493
A15    0.000000
dtype: float64

In [5]:
# Configure a KNN imputer that uses the 5 closest neighbors
# (with scikit-learn's default distance metric) and weights
# them by distance, so that the furthest neighbors have a
# smaller influence on the estimate. The transform output
# is returned as a pandas DataFrame:

imputer = KNNImputer(n_neighbors=5, weights="distance")
imputer = imputer.set_output(transform="pandas")

In [6]:
# Fit the imputer on the training set only.
# NOTE(review): KNNImputer appears to just store the training
# data at this step; the neighbor search itself happens
# in transform() — confirm against the scikit-learn docs:

imputer.fit(X_train)

In [7]:
# Replace the missing values by the distance-weighted
# mean of the values shown by the neighbors. Both sets are
# transformed with the imputer fitted on the train set:

X_train_t = imputer.transform(X_train)
X_test_t = imputer.transform(X_test)

In [8]:
# Inspect the first rows of the imputed train set:
X_train_t.head()

Unnamed: 0,A2,A3,A8,A11,A14,A15
596,46.08,3.0,2.375,8.0,396.0,4159.0
303,15.92,2.875,0.085,0.0,120.0,0.0
204,36.33,2.125,0.085,1.0,50.0,1187.0
351,22.17,0.585,0.0,0.0,100.0,0.0
118,57.83,7.04,14.0,6.0,360.0,1332.0


## Find neighbors based on specific variables

In [9]:
# Configure a wrapper that applies a KNNImputer (with its
# default settings) to 4 of the numerical variables only,
# so that neighbors are found based on those variables.
# Note that this rebinds `imputer`:

imputer = SklearnTransformerWrapper(
    variables=["A2", "A3", "A8", "A11"],
    transformer=KNNImputer(),
)

In [10]:
# Fit the wrapped imputer on the train set and replace
# its missing data by the estimates, then apply the
# fitted imputer to the test set. This overwrites the
# X_train_t and X_test_t created earlier in the notebook.
# NOTE(review): variables outside the wrapper's list (A14, A15)
# presumably pass through unchanged, so A14 may still contain
# missing values — confirm:

X_train_t = imputer.fit_transform(X_train)
X_test_t = imputer.transform(X_test)