## Adding a missing value indicator variable

In this recipe, we will add binary variables to indicate that a value is missing using pandas, Scikit-learn and Feature-Engine, all open source Python libraries.

In [1]:
import pandas as pd
import numpy as np

# to split the data sets
from sklearn.model_selection import train_test_split

# to impute missing data with feature-engine
from feature_engine.missing_data_imputers import AddNaNBinaryImputer

In [2]:
# load data
data = pd.read_csv('creditApprovalUCI.csv')
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,,u,g,q,h,,,,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [3]:
# let's separate into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(
    data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

X_train.shape, X_test.shape

((483, 15), (207, 15))

In [4]:
# find the percentage of missing data within those variables

X_train.isnull().mean()

A1     0.008282
A2     0.022774
A3     0.140787
A4     0.008282
A5     0.008282
A6     0.008282
A7     0.008282
A8     0.140787
A9     0.140787
A10    0.140787
A11    0.000000
A12    0.000000
A13    0.000000
A14    0.014493
A15    0.000000
dtype: float64

## Add missing indicator with pandas

In [5]:
# add missing indicator

for var in ['A1', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8']:

    X_train[var+'_NA'] = np.where(X_train[var].isnull(), 1, 0)
    X_test[var+'_NA'] = np.where(X_test[var].isnull(), 1, 0)

    
# check the new missing indicator variables
X_train.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,A13,A14,A15,A1_NA,A3_NA,A4_NA,A5_NA,A6_NA,A7_NA,A8_NA
596,a,46.08,3.0,u,g,c,v,2.375,t,t,...,g,396.0,4159,0,0,0,0,0,0,0
303,a,15.92,2.875,u,g,q,v,0.085,f,f,...,g,120.0,0,0,0,0,0,0,0,0
204,b,36.33,2.125,y,p,w,v,0.085,t,t,...,g,50.0,1187,0,0,0,0,0,0,0
351,b,22.17,0.585,y,p,ff,ff,0.0,f,f,...,g,100.0,0,0,0,0,0,0,0,0
118,b,57.83,7.04,u,g,m,v,14.0,t,t,...,g,360.0,1332,0,0,0,0,0,0,0


In [6]:
# the mean of the missing indicator should be the same as the 
# percentage of missing values in the original variable

X_train['A3'].isnull().mean(), X_train['A3_NA'].mean()

(0.14078674948240166, 0.14078674948240166)

## Adding missing indicator with Feature Engine

In [7]:
# let's separate into training and testing set

X_train, X_test, y_train, y_test = train_test_split(
    data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

In [8]:
# let's create a random sample imputer

imputer = AddNaNBinaryImputer()

imputer.fit(X_train)

AddNaNBinaryImputer(variables=['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8',
                               'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15'])

In [9]:
# transform the data - replace the missing values

X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)



In [10]:
# check that null values were replaced
X_train.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,A6_na,A7_na,A8_na,A9_na,A10_na,A11_na,A12_na,A13_na,A14_na,A15_na
596,a,46.08,3.0,u,g,c,v,2.375,t,t,...,0,0,0,0,0,0,0,0,0,0
303,a,15.92,2.875,u,g,q,v,0.085,f,f,...,0,0,0,0,0,0,0,0,0,0
204,b,36.33,2.125,y,p,w,v,0.085,t,t,...,0,0,0,0,0,0,0,0,0,0
351,b,22.17,0.585,y,p,ff,ff,0.0,f,f,...,0,0,0,0,0,0,0,0,0,0
118,b,57.83,7.04,u,g,m,v,14.0,t,t,...,0,0,0,0,0,0,0,0,0,0


## Adding missing indicator with Scikit-learn

In [11]:
import pandas as pd
from sklearn.impute import MissingIndicator
from sklearn.model_selection import train_test_split

In [12]:
data = pd.read_csv('creditApprovalUCI.csv')

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

In [14]:
indicator = MissingIndicator(error_on_new=True, features='missing-only')
indicator.fit(X_train)  

MissingIndicator(error_on_new=True, features='missing-only', missing_values=nan,
                 sparse='auto')

In [15]:
# we can see the features with na:
# the result shows the column index in the NumPy array

indicator.features_

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 13], dtype=int64)

In [16]:
# with Sklearn we need to join the missing indicators dataframe
# to the original X_train

# let's create a column name for each of the new MissingIndicators
indicator_cols = [c+'_NA' for c in X_train.columns[indicator.features_]]

# and now let's concatenate the original dataset with the missing indicators
X_train = pd.concat([
    X_train.reset_index(),
    pd.DataFrame(indicator.transform(X_train), columns = indicator_cols)],
    axis=1)

X_train.head()

Unnamed: 0,index,A1,A2,A3,A4,A5,A6,A7,A8,A9,...,A2_NA,A3_NA,A4_NA,A5_NA,A6_NA,A7_NA,A8_NA,A9_NA,A10_NA,A14_NA
0,596,a,46.08,3.0,u,g,c,v,2.375,t,...,False,False,False,False,False,False,False,False,False,False
1,303,a,15.92,2.875,u,g,q,v,0.085,f,...,False,False,False,False,False,False,False,False,False,False
2,204,b,36.33,2.125,y,p,w,v,0.085,t,...,False,False,False,False,False,False,False,False,False,False
3,351,b,22.17,0.585,y,p,ff,ff,0.0,f,...,False,False,False,False,False,False,False,False,False,False
4,118,b,57.83,7.04,u,g,m,v,14.0,t,...,False,False,False,False,False,False,False,False,False,False
