# Finding extreme values for imputation

In this recipe, we will replace missing values with a value at the end of the distribution, estimated with either a Gaussian approximation (the mean plus three standard deviations) or the inter-quartile range proximity rule, using pandas and Feature-engine.

In [1]:
import pandas as pd

# to split the datasets:
from sklearn.model_selection import train_test_split

# to impute missing data with Feature-engine:
from feature_engine.imputation import EndTailImputer

## Load data

In [2]:
# Read the credit approval data set into a DataFrame and preview
# its first five rows.
data = pd.read_csv(filepath_or_buffer="credit_approval_uci.csv")

data.head(n=5)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,target
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,,,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


## Select numerical variables

In [3]:
# Keep only the numeric predictors. Selecting with include="number" states
# the intent directly: unlike exclude="O", it cannot let boolean, datetime,
# categorical or string-typed columns slip into the list.
# The target is numeric too, so it is dropped explicitly: it is the label,
# not a predictor.

numeric_vars = [
    var for var in data.select_dtypes(include="number").columns if var != "target"
]

## Split data into train and test

In [4]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps
# the split reproducible. Only the numeric predictors go into X.
X_train, X_test, y_train, y_test = train_test_split(
    data.loc[:, numeric_vars],
    data.loc[:, "target"],
    random_state=0,
    test_size=0.3,
)

# (rows, columns) of the train and test partitions:
tuple(part.shape for part in (X_train, X_test))

((483, 6), (207, 6))

## Find inter-quartile range

In [5]:
# Inter-quartile range of every variable: the distance between its
# 75th and 25th percentiles, computed on the train set only.
IQR = X_train.quantile(0.75).sub(X_train.quantile(0.25))

IQR

A2      16.4200
A3       6.5825
A8       2.8350
A11      3.0000
A14    212.0000
A15    450.0000
dtype: float64

## Find values beyond the right end of the distribution

In [6]:
# IQR proximity rule for the right tail: 75th percentile + 1.5 x IQR,
# stored as a {variable: value} mapping.
imputation_dict = X_train.quantile(0.75).add(IQR.mul(1.5)).to_dict()

imputation_dict

{'A2': 63.550000000000004,
 'A3': 17.43625,
 'A8': 7.2524999999999995,
 'A11': 7.5,
 'A14': 590.0,
 'A15': 1125.0}

In [7]:
# Fill the missing entries of both partitions with the per-variable
# values in imputation_dict; the results are stored under new names,
# X_train and X_test are not reassigned.

X_train_t, X_test_t = (
    frame.fillna(value=imputation_dict) for frame in (X_train, X_test)
)

## Find imputation values with mean and standard deviation

In [8]:
# Gaussian approximation for the right tail: mean + 3 standard
# deviations of each train-set variable, as a {variable: value} mapping.
imputation_dict = X_train.mean().add(X_train.std().mul(3)).to_dict()

imputation_dict

{'A2': 68.35771260807589,
 'A3': 19.98993346546277,
 'A8': 12.839303728846648,
 'A11': 18.320547522636247,
 'A14': 710.6258760585449,
 'A15': 12740.850618383225}

In [9]:
# Substitute the missing entries in the train and test sets with the
# per-variable values held in imputation_dict:

X_train_t, X_test_t = [
    df.fillna(value=imputation_dict) for df in (X_train, X_test)
]

## End tail imputation with Feature-engine

In [10]:
# Set up the imputer to find extreme values based on the
# inter-quartile range proximity rule, placing the estimates
# at the right tail and using 3 times the IQR. The imputer is
# then fitted on the train set.

imputer = EndTailImputer(
    imputation_method="iqr",
    tail="right",
    fold=3,
    variables=None,
).fit(X_train)

# fit() hands back the fitted imputer, which is displayed here:
imputer

In [11]:
# The replacement value the fitted imputer holds for each
# variable, keyed by column name:

imputer.imputer_dict_

{'A2': 88.18,
 'A3': 27.31,
 'A8': 11.504999999999999,
 'A11': 12.0,
 'A14': 908.0,
 'A15': 1800.0}

In [12]:
# Run the fitted imputer over both partitions; the results are
# stored under new names.

X_train_t, X_test_t = map(imputer.transform, (X_train, X_test))