# Random Sample Imputation

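In this recipe, we will perform random sample imputation, that is, we will replace missing values with values drawn at random from the observed values of the same variable, using pandas and Feature-engine.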

In [1]:
import pandas as pd

# to split the data sets:
from sklearn.model_selection import train_test_split

# to impute missing data with Feature-engine:
from feature_engine.imputation import RandomSampleImputer

## Load data

In [2]:
# File that holds the credit approval data set:
DATA_PATH = "credit_approval_uci.csv"

data = pd.read_csv(DATA_PATH)

# Preview the first rows:
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,target
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,,u,g,q,h,,,,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


## Split data into train and test

In [3]:
# Separate the predictors from the target before splitting:
predictors = data.drop("target", axis=1)
target = data["target"]

# Hold out 30% of the rows for the test set; the fixed
# random_state makes the split reproducible:
X_train, X_test, y_train, y_test = train_test_split(
    predictors, target, test_size=0.3, random_state=0
)

# Size of each partition:
X_train.shape, X_test.shape

((483, 15), (207, 15))

In [4]:
# Count the missing values in each variable of the train set:

X_train.isna().sum()

A1      4
A2     11
A3     68
A4      4
A5      4
A6      4
A7      4
A8     68
A9     68
A10    68
A11     0
A12     0
A13     0
A14     7
A15     0
dtype: int64

## Random sample imputation with pandas

In [5]:
# Count how many observations of the variable A2
# are missing in the train set:

number_na = X_train["A2"].isna().sum()

number_na

11

In [6]:
# Keep only the observed (non-missing) values of A2 ...
observed_a2 = X_train["A2"].dropna()

# ... and draw from them as many values as there are
# missing observations in the variable:
random_sample_train = observed_a2.sample(n=number_na, random_state=0)

random_sample_train

331    33.25
293    35.75
316    21.17
25     15.83
320    21.25
577    25.17
356    41.17
53     34.92
361    23.08
419    26.58
584    28.08
Name: A2, dtype: float64

In [7]:
# Flag the rows of the train set where A2 is missing:
a2_is_missing = X_train["A2"].isnull()

# Give the random sample the index of those rows, so that
# pandas aligns each sampled value with one missing
# observation when we join it to our original data:
random_sample_train.index = X_train.loc[a2_is_missing].index

random_sample_train

97     33.25
500    35.75
329    21.17
83     15.83
254    21.25
608    25.17
445    41.17
450    34.92
515    23.08
286    26.58
86     28.08
Name: A2, dtype: float64

In [8]:
# Display the index of the rows where A2 is missing, to
# compare it with the new index of the random sample:

X_train.loc[X_train["A2"].isna()].index

Int64Index([97, 500, 329, 83, 254, 608, 445, 450, 515, 286, 86], dtype='int64')

In [9]:
# Flag the rows where A2 is missing and fill them in with
# the re-indexed random sample:

a2_is_missing = X_train["A2"].isnull()
X_train.loc[a2_is_missing, "A2"] = random_sample_train

# Count the missing values that remain in A2:
X_train["A2"].isna().sum()

0

In [10]:
# Repeat the procedure, in a loop, for every variable that
# still has missing data, in both the train and test sets.

# Derive the variables from the data instead of hard-coding
# them, so that no variable with missing values is left out
# (a fixed list is easy to leave incomplete, and so far A2 was
# only imputed in the train set):
vars_with_na = [
    var for var in X_train.columns
    if X_train[var].isnull().any() or X_test[var].isnull().any()
]

for var in vars_with_na:

    # The replacement values always come from the observed values
    # of the train set, also when imputing the test set. Take them
    # before imputing, so both sets sample from the same pool.
    observed = X_train[var].dropna()

    for dataset in (X_train, X_test):

        na_mask = dataset[var].isnull()
        n_missing = int(na_mask.sum())

        # nothing to impute for this variable in this data set
        if n_missing == 0:
            continue

        # extract a random sample
        random_sample = observed.sample(n_missing, random_state=0)

        # re-index the random sample so it aligns with the
        # rows that have missing data
        random_sample.index = dataset.loc[na_mask].index

        # replace the NA
        dataset.loc[na_mask, var] = random_sample

In [11]:
# Corroborate that these variables no longer
# contain missing data in the train set:

imputed_vars = ["A1", "A3", "A4", "A5", "A6", "A7", "A8"]

X_train[imputed_vars].isna().sum()

A1    0
A3    0
A4    0
A5    0
A6    0
A7    0
A8    0
dtype: int64

## Random sample imputation with Feature-engine

In [12]:
# Let's split the data again, so that we start
# from train and test sets that still contain
# the missing values:

split_options = {"test_size": 0.3, "random_state": 0}

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns="target"),
    data["target"],
    **split_options,
)

In [13]:
# Let's set up the imputer to impute all variables
# (no variable list is passed), with a fixed seed
# for reproducibility:

imputer = RandomSampleImputer(random_state=0)

# The imputer takes a copy of the train set:

imputer.fit(X_train)

RandomSampleImputer(random_state=0)

In [14]:
# The imputer stores the train set in its X_ attribute;
# let's look at the first rows:

imputer.X_.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
596,a,46.08,3.0,u,g,c,v,2.375,t,t,8,t,g,396.0,4159
303,a,15.92,2.875,u,g,q,v,0.085,f,f,0,f,g,120.0,0
204,b,36.33,2.125,y,p,w,v,0.085,t,t,1,f,g,50.0,1187
351,b,22.17,0.585,y,p,ff,ff,0.0,f,f,0,f,g,100.0,0
118,b,57.83,7.04,u,g,m,v,14.0,t,t,6,t,g,360.0,1332


In [15]:
# Replace the missing values with random samples,
# in both the train and test sets, using the imputer
# that was fitted on the train set:

X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

In [16]:
# Corroborate that there is no missing data: the fraction
# of missing values should be 0 for every variable:

X_train.isna().mean()

A1     0.0
A2     0.0
A3     0.0
A4     0.0
A5     0.0
A6     0.0
A7     0.0
A8     0.0
A9     0.0
A10    0.0
A11    0.0
A12    0.0
A13    0.0
A14    0.0
A15    0.0
dtype: float64

## Seeding the random sampling on variable values

In [17]:
# Let's separate the data into train and test sets once
# more, to start again from data with missing values:

features, labels = data.drop(columns=["target"]), data["target"]

X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=0
)

In [18]:
# Let's set up the imputer to use the
# sum of the values in variables A3 and A8
# as seed to replace missing data for each
# observation:

# NOTE(review): A3 and A8 themselves contain missing values
# in this data set (68 each in the train set, per the count
# shown earlier) - confirm how the imputer derives the seed
# for observations where these variables are missing.

imputer = RandomSampleImputer(
    random_state=["A3", "A8"],
    seed="observation",
    seeding_method="add",
)

In [19]:
# Fit the imputer to the train set:

imputer.fit(X_train)

RandomSampleImputer(random_state=['A3', 'A8'], seed='observation')

In [20]:
# Replace the missing values in the train and test sets
# with the imputer fitted on the train set:

X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)