# Categorical variable imputation

Missing data in categorical variables are normally replaced by the most frequent category or by an arbitrary string.

In this recipe, we will replace missing values in categorical variables utilizing pandas, Scikit-learn and Feature-engine.

In [1]:
import pandas as pd

# to split the data sets:
from sklearn.model_selection import train_test_split

# to impute missing data with sklearn:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# to impute missing data with Feature-engine:
from feature_engine.imputation import CategoricalImputer

## Load data

In [2]:
# Read the data set from a CSV file located in the working directory:
data = pd.read_csv("credit_approval_uci.csv")

# Preview the first rows:
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,target
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,,,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


## Split data into train and test sets

In [3]:
# Hold out 30% of the rows as a test set. The predictors are every
# column except "target"; the fixed seed makes the split reproducible:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns="target"),
    data["target"],
    random_state=0,
    test_size=0.3,
)

# Size of the resulting train and test sets:
X_train.shape, X_test.shape

((483, 15), (207, 15))

## Select categorical variables

In [4]:
# Collect the names of the columns stored with the object dtype:
categorical_vars = list(X_train.select_dtypes(include="O").columns)

categorical_vars

['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

In [5]:
# Fraction of missing observations in each
# categorical variable of the train set:

X_train[categorical_vars].isna().mean()

A1     0.008282
A4     0.008282
A5     0.008282
A6     0.008282
A7     0.008282
A9     0.142857
A10    0.136646
A12    0.000000
A13    0.000000
dtype: float64

## pandas

[pandas.DataFrame.fillna](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.fillna.html)

### Frequent category imputation

In [6]:
# Learn the variables' most frequent values from the train set.
# mode() returns one row per mode, so when several categories are
# tied for most frequent, iloc[0] keeps the first one:

frequent_values = X_train[categorical_vars].mode().iloc[0].to_dict()

frequent_values

{'A1': 'b',
 'A4': 'u',
 'A5': 'g',
 'A6': 'c',
 'A7': 'v',
 'A9': 't',
 'A10': 'f',
 'A12': 'f',
 'A13': 'g'}

In [7]:
# Replace missing data by the frequent category. The same values,
# learned from the train set, are used for both data sets:

X_train_t = X_train.fillna(value=frequent_values)
X_test_t = X_test.fillna(value=frequent_values)

In [8]:
# Count the missing values left in the categorical
# variables of the train set (all should be zero):

X_train_t[categorical_vars].isna().sum()

A1     0
A4     0
A5     0
A6     0
A7     0
A9     0
A10    0
A12    0
A13    0
dtype: int64

In [9]:
# Count the missing values left in the categorical
# variables of the test set (all should be zero):

X_test_t[categorical_vars].isna().sum()

A1     0
A4     0
A5     0
A6     0
A7     0
A9     0
A10    0
A12    0
A13    0
dtype: int64

### Imputation with a string

In [10]:
# Map every categorical variable to the same replacement string:
imputation_dict = dict.fromkeys(categorical_vars, "no_data")

imputation_dict

{'A1': 'no_data',
 'A4': 'no_data',
 'A5': 'no_data',
 'A6': 'no_data',
 'A7': 'no_data',
 'A9': 'no_data',
 'A10': 'no_data',
 'A12': 'no_data',
 'A13': 'no_data'}

In [11]:
# Replace missing data with the string "no_data":

X_train_t = X_train.fillna(value=imputation_dict)
X_test_t = X_test.fillna(value=imputation_dict)

In [12]:
# Check the values of an imputed variable; the replacement
# string is counted as a category of its own:

X_train_t["A1"].value_counts()

A1
b          335
a          144
no_data      4
Name: count, dtype: int64

## Scikit-learn

[SimpleImputer](https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html)

### Frequent category imputation

In [13]:
# Make a list with the remaining (non-categorical) variables.
# In this data set, these are the numerical variables:

remaining_vars = [var for var in X_train.columns if var not in categorical_vars]

remaining_vars

['A2', 'A3', 'A8', 'A11', 'A14', 'A15']

In [14]:
# Imputer that fills missing data with the most
# frequent category of each variable:

imputer = SimpleImputer(strategy="most_frequent")

# Restrict the imputation to the categorical variables and pass the
# remaining columns through unchanged, returning pandas DataFrames:
ct = ColumnTransformer(
    transformers=[("imputer", imputer, categorical_vars)],
    remainder="passthrough",
)
ct.set_output(transform="pandas")

# Learn the most frequent value of each variable from the train set:
ct.fit(X_train)

In [15]:
# Most frequent categories learned by the fitted imputer,
# in the same order as categorical_vars:

ct.named_transformers_["imputer"].statistics_

array(['b', 'u', 'g', 'c', 'v', 't', 'f', 'f', 'g'], dtype=object)

In [16]:
# Replace missing data in the train and test sets:

X_train_t = ct.transform(X_train)
X_test_t = ct.transform(X_test)

# Preview the result. The column names carry the prefix of the step
# that produced them ("imputer__" or "remainder__"):
X_train_t.head()

Unnamed: 0,imputer__A1,imputer__A4,imputer__A5,imputer__A6,imputer__A7,imputer__A9,imputer__A10,imputer__A12,imputer__A13,remainder__A2,remainder__A3,remainder__A8,remainder__A11,remainder__A14,remainder__A15
596,a,u,g,c,v,t,t,t,g,46.08,3.0,2.375,8,396.0,4159
303,a,u,g,q,v,t,f,f,g,15.92,2.875,0.085,0,120.0,0
204,b,y,p,w,v,t,t,f,g,36.33,2.125,0.085,1,50.0,1187
351,b,y,p,ff,ff,f,f,f,g,22.17,0.585,0.0,0,100.0,0
118,b,u,g,m,v,t,t,t,g,57.83,7.04,14.0,6,360.0,1332


In [17]:
# The transformed columns carry the "imputer__" prefix,
# so rebuild the names of the imputed variables first:

var_names = ["imputer__" + var for var in categorical_vars]

# Count the missing values left in the train set:
X_train_t[var_names].isna().sum()

imputer__A1     0
imputer__A4     0
imputer__A5     0
imputer__A6     0
imputer__A7     0
imputer__A9     0
imputer__A10    0
imputer__A12    0
imputer__A13    0
dtype: int64

In [18]:
# Count the missing values left in the imputed
# variables of the test set (all should be zero):

X_test_t[var_names].isna().sum()

imputer__A1     0
imputer__A4     0
imputer__A5     0
imputer__A6     0
imputer__A7     0
imputer__A9     0
imputer__A10    0
imputer__A12    0
imputer__A13    0
dtype: int64

### Imputation with a string

In [19]:
# Constant-value imputer that fills missing data
# with the string "missing":

imputer = SimpleImputer(fill_value="missing", strategy="constant")

# Restrict the imputation to the categorical variables and pass the
# remaining columns through unchanged, returning pandas DataFrames:
ct = ColumnTransformer(
    transformers=[("imputer", imputer, categorical_vars)],
    remainder="passthrough",
)
ct.set_output(transform="pandas")

# Fit on the train set, then impute both data sets:
X_train_t = ct.fit_transform(X_train)
X_test_t = ct.transform(X_test)

In [20]:
# Inspect the values in an imputed variable. The column is looked up
# under its prefixed name, "imputer__A1":

X_train_t["imputer__A1"].value_counts()

imputer__A1
b          335
a          144
missing      4
Name: count, dtype: int64

## Feature-engine

[CategoricalImputer](https://feature-engine.readthedocs.io/en/latest/api_doc/imputation/CategoricalImputer.html)

### Frequent category imputation

In [21]:
# Set up the imputer to replace missing
# data with the most frequent category:

imputer = CategoricalImputer(
    imputation_method="frequent",
    variables=categorical_vars,
)

# Learn the most frequent category of each variable from the train set:
imputer.fit(X_train)

In [22]:
# Most frequent category per variable, as stored
# by the fitted imputer:

imputer.imputer_dict_

{'A1': 'b',
 'A4': 'u',
 'A5': 'g',
 'A6': 'c',
 'A7': 'v',
 'A9': 't',
 'A10': 'f',
 'A12': 'f',
 'A13': 'g'}

In [23]:
# Replace missing data with the most frequent category,
# using the imputer that was fitted on the train set:

X_train_t = imputer.transform(X_train)
X_test_t = imputer.transform(X_test)

In [24]:
# Count the missing values left in the categorical
# variables of the train set (all should be zero):

X_train_t[categorical_vars].isna().sum()

A1     0
A4     0
A5     0
A6     0
A7     0
A9     0
A10    0
A12    0
A13    0
dtype: int64

In [25]:
# Count the missing values left in the categorical
# variables of the test set (all should be zero):

X_test_t[categorical_vars].isna().sum()

A1     0
A4     0
A5     0
A6     0
A7     0
A9     0
A10    0
A12    0
A13    0
dtype: int64

### Imputation with a string

In [26]:
# Set up the imputer to replace missing data
# with the string "other":

imputer = CategoricalImputer(
    imputation_method="missing",
    fill_value="other",
    variables=categorical_vars,
)

# Replace missing data with the string "other":

X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

In [27]:
# Inspect the values in an imputed variable:

X_train["A1"].value_counts()

A1
b        335
a        144
other      4
Name: count, dtype: int64