## Adding a bespoke category

In this recipe, we will replace missing values in categorical variables with a new 'Missing' category, using pandas, Scikit-learn and Feature-engine, all open-source Python libraries.

In [1]:
import pandas as pd

# to split the datasets
from sklearn.model_selection import train_test_split

# to impute missing data with sklearn
from sklearn.impute import SimpleImputer

# to impute missing data with feature-engine
from feature_engine.missing_data_imputers import CategoricalVariableImputer

In [2]:
# read the credit approval dataset from disk and preview its first five rows
data = pd.read_csv('creditApprovalUCI.csv')
data.head(n=5)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,,u,g,q,h,,,,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [3]:
# hold out 30% of the rows for testing; column A16 is the target,
# every other column is a predictor
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['A16']),
    data['A16'],
    test_size=0.3,
    random_state=0,
)

# (rows, columns) of the train and test predictors
X_train.shape, X_test.shape

((483, 15), (207, 15))

In [4]:
# fraction of missing values in each variable of the train set
# (the mean of the boolean NA mask, i.e. a value between 0 and 1)
X_train.isna().mean()

A1     0.008282
A2     0.022774
A3     0.140787
A4     0.008282
A5     0.008282
A6     0.008282
A7     0.008282
A8     0.140787
A9     0.140787
A10    0.140787
A11    0.000000
A12    0.000000
A13    0.000000
A14    0.014493
A15    0.000000
dtype: float64

## Adding a bespoke category with pandas

In [5]:
# replace NA in some categorical variables with a new 'Missing' category

# NOTE: we assign the result of DataFrame.fillna back to the variable
# instead of calling `X_train[var].fillna('Missing', inplace=True)`.
# The in-place form acts on a temporary Series selected from the frame:
# it triggers chained-assignment warnings and, with pandas Copy-on-Write
# enabled, leaves the DataFrame itself unchanged.
# Passing a dict fills only the listed columns; every other column keeps
# its missing values.
missing_label_per_variable = dict.fromkeys(['A4', 'A5', 'A6', 'A7'], 'Missing')

X_train = X_train.fillna(value=missing_label_per_variable)
X_test = X_test.fillna(value=missing_label_per_variable)

In [6]:
# count the remaining missing values in the imputed variables
# (every count should be zero)
X_train[['A4', 'A5', 'A6', 'A7']].isna().sum()

A4    0
A5    0
A6    0
A7    0
dtype: int64

## Adding a bespoke category with Scikit-learn

In [7]:
# split again from the original data, this time keeping only the four
# categorical variables to impute as predictors
X_train, X_test, y_train, y_test = train_test_split(
    data[['A4', 'A5', 'A6', 'A7']],
    data['A16'],
    test_size=0.3,
    random_state=0,
)

In [8]:
# build a constant-value imputer that fills with the label 'Missing'
# and fit it to the train set in one step (fit returns the imputer itself)
imputer = SimpleImputer(fill_value='Missing', strategy='constant').fit(X_train)

# the value that will be used to fill each column:
imputer.statistics_

array(['Missing', 'Missing', 'Missing', 'Missing'], dtype=object)

In [9]:
# impute both partitions with the fitted imputer
# NOTE: transform returns NumPy arrays, not DataFrames!!!
X_train, X_test = imputer.transform(X_train), imputer.transform(X_test)

In [10]:
# wrap the array in a DataFrame to count missing values per column
pd.DataFrame(X_train).isna().sum()

0    0
1    0
2    0
3    0
dtype: int64

## Adding a bespoke category with Feature-engine

In [11]:
# start again from the original data: all predictors, 30% held out for testing
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='A16'),
    data['A16'],
    test_size=0.3,
    random_state=0,
)

In [12]:
# set up the feature-engine imputer, restricted to the four categorical
# variables we want to impute; no fill value is passed, so the transformer's
# default is used (presumably the string 'Missing' -- confirm against the
# installed feature-engine version)
imputer = CategoricalVariableImputer(variables=['A4', 'A5', 'A6', 'A7'])

# fit the imputer on the train set only
imputer.fit(X_train)

CategoricalVariableImputer(variables=['A4', 'A5', 'A6', 'A7'])

In [13]:
# impute the train and test sets with the fitted imputer
X_train, X_test = imputer.transform(X_train), imputer.transform(X_test)



In [14]:
# fraction of missing values left in the imputed variables (should be 0)
X_train[['A4', 'A5', 'A6', 'A7']].isna().mean()

A4    0.0
A5    0.0
A6    0.0
A7    0.0
dtype: float64

## Adding a bespoke category with Scikit-learn, selecting the features to impute

In [15]:
import pandas as pd

# to impute missing data with sklearn
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# to split the datasets
from sklearn.model_selection import train_test_split

In [16]:
# read the dataset again so this section starts from the raw data
data = pd.read_csv('creditApprovalUCI.csv')

# keep every column except the target A16 as predictors and
# hold out 30% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['A16']),
    data['A16'],
    test_size=0.3,
    random_state=0,
)

In [17]:
# first we make lists with the features to be imputed
features_arbitrary = ['A4', 'A5']
features_mode = ['A6', 'A7']

# then we instantiate each imputer within a pipeline:
# a constant 'Missing' label for the first group of features ...
arbitrary_imputer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
])

# ... and the most frequent category (the mode) for the second group
mode_imputer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

# then we put the feature lists and the imputers in the column transformer.
# Each step label names the strategy it applies, so the most-frequent
# imputer is labelled 'mode_imputer' (not 'mean_imputer').
# remainder='passthrough' keeps the columns that are not listed above
# instead of dropping them.
preprocessor = ColumnTransformer(transformers=[
    ('arbitrary_imputer', arbitrary_imputer, features_arbitrary),
    ('mode_imputer', mode_imputer, features_mode),
], remainder='passthrough')

In [18]:
# now we fit the preprocessor on the train set; each imputer pipeline is
# fitted on the feature list it was paired with in the ColumnTransformer
preprocessor.fit(X_train)

ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('arbitrary_imputer',
                                 Pipeline(memory=None,
                                          steps=[('imputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value='Missing',
                                                                missing_values=nan,
                                                                strategy='constant',
                                                                verbose=0))],
                                          verbose=False),
                                 ['A4', 'A5']),
                                ('mean_imputer',
                                 Pipeline(memory=None,
     

In [19]:
# impute the train and test sets with the fitted preprocessor
X_train, X_test = preprocessor.transform(X_train), preprocessor.transform(X_test)

In [20]:
# be careful: Scikit-learn transformers return NumPy arrays, not DataFrames,
# so the column names are no longer attached to the data!!
X_train

array([['u', 'g', 'c', ..., 'g', 396.0, 4159],
       ['u', 'g', 'q', ..., 'g', 120.0, 0],
       ['y', 'p', 'w', ..., 'g', 50.0, 1187],
       ...,
       ['u', 'g', 'w', ..., 'g', 220.0, 5],
       ['u', 'g', 'q', ..., 'g', 140.0, 2384],
       ['u', 'g', 'm', ..., 's', 400.0, 0]], dtype=object)