## Imputing Missing data

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

In [2]:
%matplotlib inline

#### credit approval dataset from UCI Machine Learning Repository
- http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/

In [56]:
data = pd.read_csv('./data/crx.data', header=None)
data
# header= None implies column names will be given later

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.000,u,g,w,v,1.25,t,t,1,f,g,00202,0,+
1,a,58.67,4.460,u,g,q,h,3.04,t,t,6,f,g,00043,560,+
2,a,24.50,0.500,u,g,q,h,1.50,t,f,0,f,g,00280,824,+
3,b,27.83,1.540,u,g,w,v,3.75,t,t,5,t,g,00100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,00120,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,00260,0,-
686,a,22.67,0.750,u,g,c,v,2.00,f,t,2,t,g,00200,394,-
687,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1,t,g,00200,1,-
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,00280,750,-


In [57]:
varnames = ['A'+str(s) for s in range(1,17)]
varnames

['A1',
 'A2',
 'A3',
 'A4',
 'A5',
 'A6',
 'A7',
 'A8',
 'A9',
 'A10',
 'A11',
 'A12',
 'A13',
 'A14',
 'A15',
 'A16']

In [58]:
data.columns = varnames
data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.000,u,g,w,v,1.25,t,t,1,f,g,00202,0,+
1,a,58.67,4.460,u,g,q,h,3.04,t,t,6,f,g,00043,560,+
2,a,24.50,0.500,u,g,q,h,1.50,t,f,0,f,g,00280,824,+
3,b,27.83,1.540,u,g,w,v,3.75,t,t,5,t,g,00100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,00120,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,00260,0,-
686,a,22.67,0.750,u,g,c,v,2.00,f,t,2,t,g,00200,394,-
687,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1,t,g,00200,1,-
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,00280,750,-


In [59]:
data = data.replace('?', np.nan)
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [60]:
data.dtypes

A1      object
A2      object
A3     float64
A4      object
A5      object
A6      object
A7      object
A8     float64
A9      object
A10     object
A11      int64
A12     object
A13     object
A14     object
A15      int64
A16     object
dtype: object

In [61]:
# from the above it looks like a2 and a14 are float values but cast as object type
data['A2'] = data['A2'].astype('float64')
data.dtypes

A1      object
A2     float64
A3     float64
A4      object
A5      object
A6      object
A7      object
A8     float64
A9      object
A10     object
A11      int64
A12     object
A13     object
A14     object
A15      int64
A16     object
dtype: object

In [62]:
data['A14'] = data['A14'].astype('float64')
data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.000,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.460,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.50,0.500,u,g,q,h,1.50,t,f,0,f,g,280.0,824,+
3,b,27.83,1.540,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,260.0,0,-
686,a,22.67,0.750,u,g,c,v,2.00,f,t,2,t,g,200.0,394,-
687,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1,t,g,200.0,1,-
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,280.0,750,-


In [10]:
pd.get_dummies(data['A16'])

Unnamed: 0,+,-
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
685,0,1
686,0,1
687,0,1
688,0,1


In [11]:
data['A17'] = pd.get_dummies(data['A16'])['+']
data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16,A17
0,b,30.83,0.000,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+,1
1,a,58.67,4.460,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+,1
2,a,24.50,0.500,u,g,q,h,1.50,t,f,0,f,g,280.0,824,+,1
3,b,27.83,1.540,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,260.0,0,-,0
686,a,22.67,0.750,u,g,c,v,2.00,f,t,2,t,g,200.0,394,-,0
687,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1,t,g,200.0,1,-,0
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,280.0,750,-,0


In [12]:
data.drop('A16', axis=1, inplace=True)
data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A17
0,b,30.83,0.000,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.460,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.50,0.500,u,g,q,h,1.50,t,f,0,f,g,280.0,824,1
3,b,27.83,1.540,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,260.0,0,0
686,a,22.67,0.750,u,g,c,v,2.00,f,t,2,t,g,200.0,394,0
687,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1,t,g,200.0,1,0
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,280.0,750,0


In [13]:
data = data.rename(columns={'A17':'A16'})
data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.000,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.460,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.50,0.500,u,g,q,h,1.50,t,f,0,f,g,280.0,824,1
3,b,27.83,1.540,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,260.0,0,0
686,a,22.67,0.750,u,g,c,v,2.00,f,t,2,t,g,200.0,394,0
687,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1,t,g,200.0,1,0
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,280.0,750,0


### Read the data again before running the below cell

In [63]:
#### Alternate method to change the columns to numerical type
data['A16'] = data['A16'].map({'+':1, '-':0})
data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.000,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.460,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.50,0.500,u,g,q,h,1.50,t,f,0,f,g,280.0,824,1
3,b,27.83,1.540,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,260.0,0,0
686,a,22.67,0.750,u,g,c,v,2.00,f,t,2,t,g,200.0,394,0
687,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1,t,g,200.0,1,0
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,280.0,750,0


In [64]:
np.random.seed(42)
values = set([np.random.randint(0, len(data)) for p in range(0,100)])
values
# These values are used as indices

{1,
 13,
 14,
 20,
 21,
 34,
 40,
 52,
 58,
 62,
 64,
 71,
 80,
 87,
 91,
 99,
 102,
 105,
 106,
 121,
 128,
 130,
 138,
 156,
 160,
 161,
 166,
 187,
 189,
 191,
 201,
 205,
 214,
 216,
 241,
 243,
 251,
 252,
 269,
 270,
 273,
 276,
 295,
 308,
 313,
 315,
 330,
 337,
 339,
 343,
 345,
 366,
 372,
 379,
 385,
 387,
 389,
 401,
 413,
 427,
 435,
 454,
 455,
 458,
 459,
 461,
 466,
 471,
 474,
 475,
 476,
 484,
 491,
 492,
 498,
 504,
 508,
 510,
 520,
 555,
 560,
 561,
 562,
 564,
 565,
 566,
 592,
 600,
 614,
 646,
 647,
 661,
 663,
 681,
 686}

In [65]:
len(values)

95

### Let's introduce some random missing values in few columns
### A3, A8 A9 A10

In [67]:
# values are indices and var is column
for var in ['A3', 'A8', 'A9', 'A10']:
    data.loc[values, var] = np.nan

In [68]:
data.isna().sum()

A1     12
A2     12
A3     95
A4      6
A5      6
A6      9
A7      9
A8     95
A9     95
A10    95
A11     0
A12     0
A13     0
A14    13
A15     0
A16     0
dtype: int64

In [69]:
data.to_csv('./data/CreditApprovalUCI.csv', index=False)

## Removing observations with missing data

In [70]:
data = pd.read_csv('./data/CreditApprovalUCI.csv')
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,,u,g,q,h,,,,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [71]:
data.isnull().mean()

A1     0.017391
A2     0.017391
A3     0.137681
A4     0.008696
A5     0.008696
A6     0.013043
A7     0.013043
A8     0.137681
A9     0.137681
A10    0.137681
A11    0.000000
A12    0.000000
A13    0.000000
A14    0.018841
A15    0.000000
A16    0.000000
dtype: float64

In [72]:
data.isnull().mean().sort_values()

A11    0.000000
A12    0.000000
A13    0.000000
A15    0.000000
A16    0.000000
A4     0.008696
A5     0.008696
A6     0.013043
A7     0.013043
A1     0.017391
A2     0.017391
A14    0.018841
A3     0.137681
A8     0.137681
A9     0.137681
A10    0.137681
dtype: float64

In [73]:
# Complete Case Analysis - List wise deletion of cases
data_cca = data.dropna()

In [74]:
data_cca

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.000,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
2,a,24.50,0.500,u,g,q,h,1.50,t,f,0,f,g,280.0,824,1
3,b,27.83,1.540,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1
5,b,32.08,4.000,u,g,m,v,2.50,t,f,0,t,g,360.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,b,40.58,3.290,u,g,m,v,3.50,f,f,0,t,s,400.0,0,0
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,260.0,0,0
687,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1,t,g,200.0,1,0
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,280.0,750,0


In [75]:
len(data_cca)

564

In [76]:
len(data)

690

## Performing mean and median imputation

In [77]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from feature_engine.missing_data_imputers import MeanMedianImputer

In [78]:
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,,u,g,q,h,,,,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [79]:
X = data.drop('A16', axis=1)
y = data['A16']

In [80]:
X

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
0,b,30.83,0.000,u,g,w,v,1.25,t,t,1,f,g,202.0,0
1,a,58.67,,u,g,q,h,,,,6,f,g,43.0,560
2,a,24.50,0.500,u,g,q,h,1.50,t,f,0,f,g,280.0,824
3,b,27.83,1.540,u,g,w,v,3.75,t,t,5,t,g,100.0,3
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,260.0,0
686,a,22.67,,u,g,c,v,,,,2,t,g,200.0,394
687,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1,t,g,200.0,1
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,280.0,750


In [81]:
y

0      1
1      1
2      1
3      1
4      1
      ..
685    0
686    0
687    0
688    0
689    0
Name: A16, Length: 690, dtype: int64

In [82]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3)

In [83]:
X_train.shape, X_test.shape

((483, 15), (207, 15))

In [84]:
# calculates the percentage of missing values
X_train.isnull().mean()

A1     0.014493
A2     0.012422
A3     0.140787
A4     0.012422
A5     0.012422
A6     0.014493
A7     0.014493
A8     0.140787
A9     0.140787
A10    0.140787
A11    0.000000
A12    0.000000
A13    0.000000
A14    0.022774
A15    0.000000
dtype: float64

In [85]:
# replacing the missing values with mean/median values
for var in ['A2','A3','A8', 'A11', 'A15']:
    value = X_train[var].median()
    X_train[var] = X_train[var].fillna(value)
    X_test[var] = X_test[var].fillna(value)

In [86]:
X_train.isnull().sum()

A1      7
A2      0
A3      0
A4      6
A5      6
A6      7
A7      7
A8      0
A9     68
A10    68
A11     0
A12     0
A13     0
A14    11
A15     0
dtype: int64

In [87]:
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,,u,g,q,h,,,,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


### Imputing using scikit-learn

In [88]:
X_train, X_test, y_train, y_test = train_test_split(data[['A2','A3','A8','A11','A15']], data['A16'], test_size=0.3)

In [89]:
X_train.isnull().sum()

A2     10
A3     63
A8     63
A11     0
A15     0
dtype: int64

In [90]:
X_train.isnull().mean()

A2     0.020704
A3     0.130435
A8     0.130435
A11    0.000000
A15    0.000000
dtype: float64

### simple imputer needs numerical columns - it does mean and median imputation

In [91]:
imputer = SimpleImputer(strategy='median')

In [92]:
imputer.fit(X_train)

SimpleImputer(strategy='median')

In [93]:
imputer.statistics_

array([28.58,  3.  ,  1.  ,  0.  ,  6.  ])

In [94]:
X_train = imputer.transform(X_train)

In [95]:
X_test = imputer.transform(X_test)

In [96]:
data1 = pd.DataFrame(X_train, columns=['A2','A3','A8','A11','A15'])
data1.head()

Unnamed: 0,A2,A3,A8,A11,A15
0,29.75,0.665,0.25,0.0,0.0
1,33.58,2.75,4.25,6.0,0.0
2,27.58,2.04,2.0,3.0,560.0
3,25.33,2.085,2.75,0.0,1.0
4,29.92,1.835,4.335,0.0,200.0


In [97]:
data1.isnull().sum()

A2     0
A3     0
A8     0
A11    0
A15    0
dtype: int64

In [98]:
data1.isnull().mean()

A2     0.0
A3     0.0
A8     0.0
A11    0.0
A15    0.0
dtype: float64

### Using MeanMedianImputer

In [99]:
X_train, X_test, y_train, y_test = train_test_split(data[['A2','A3','A8','A11','A15']], data['A16'], test_size=0.3)

median_imputer = MeanMedianImputer(imputation_method='median', variables=['A2','A3','A8','A11','A15'])

In [100]:
median_imputer.fit(X_train)

MeanMedianImputer(variables=['A2', 'A3', 'A8', 'A11', 'A15'])

In [101]:
median_imputer.imputer_dict_

{'A2': 28.75, 'A3': 2.6025, 'A8': 1.0, 'A11': 0.0, 'A15': 4.0}

In [102]:
X_train = median_imputer.transform(X_train)
X_test = median_imputer.transform(X_test)

## NOTE 

* SimpleImputer returns numpy array
* MeanMedianImputer returns dataframe

In [103]:
X_train[['A2','A3','A8','A11','A15']].isnull().mean()

A2     0.0
A3     0.0
A8     0.0
A11    0.0
A15    0.0
dtype: float64

In [107]:
data.dtypes

A1      object
A2     float64
A3     float64
A4      object
A5      object
A6      object
A7      object
A8     float64
A9      object
A10     object
A11      int64
A12     object
A13     object
A14    float64
A15      int64
A16      int64
dtype: object

In [108]:
if data['A1'].dtype == 'O':
    print('Object')

Object


In [109]:
if data['A2'].dtype == 'O':
    print('Object')
else:
    print('Not Object')

Not Object


### Implementing Mode or Frequency category imputation

In [116]:
from feature_engine.missing_data_imputers import CategoricalVariableImputer

In [117]:
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,,u,g,q,h,,,,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [139]:
data = pd.read_csv('./data/CreditApprovalUCI.csv')
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,,u,g,q,h,,,,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [119]:
X= data.drop('A16', axis=1)
X.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0
1,a,58.67,,u,g,q,h,,,,6,f,g,43.0,560
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0


In [120]:
y = data['A16']
y.head()

0    1
1    1
2    1
3    1
4    1
Name: A16, dtype: int64

In [121]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3)

In [122]:
X_train.isnull().mean()

A1     0.018634
A2     0.016563
A3     0.136646
A4     0.008282
A5     0.008282
A6     0.010352
A7     0.010352
A8     0.136646
A9     0.136646
A10    0.136646
A11    0.000000
A12    0.000000
A13    0.000000
A14    0.022774
A15    0.000000
dtype: float64

In [128]:
# lets replace missing values in A4 A5 A6 A7 with mode values
for var in ['A4','A5','A6','A7']:
    value = X_train[var].mode()[0]
    X_train[var] = X_train[var].fillna(value)
    X_test[var] = X_test[var].fillna(value)

### imputing the missing values with most frequent category using scikit

In [132]:
X_train, X_test, y_train, y_test = train_test_split(data[['A4','A5','A6','A7']],data['A16'], test_size=0.3)

In [133]:
imputer = SimpleImputer(strategy='most_frequent')

In [134]:
imputer.fit(X_train)

SimpleImputer(strategy='most_frequent')

In [135]:
imputer.statistics_

array(['u', 'g', 'c', 'v'], dtype=object)

In [137]:
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

### Imputing using feature engine Categorical Variable Imputer

In [138]:
mode_imputer = CategoricalVariableImputer(variables=['A4','A5','A6','A7'], imputation_method='frequent')

In [140]:
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,,u,g,q,h,,,,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [141]:
X_train, X_test, y_train, y_test = train_test_split(data[['A4','A5','A6','A7']],data['A16'], test_size=0.3)

In [142]:
mode_imputer.fit(X_train)

CategoricalVariableImputer(imputation_method='frequent',
                           variables=['A4', 'A5', 'A6', 'A7'])

In [143]:
    mode_imputer.imputer_dict_

{'A4': 'u', 'A5': 'g', 'A6': 'c', 'A7': 'v'}

In [144]:
X_train = mode_imputer.transform(X_train)
X_test = mode_imputer.transform(X_test)

In [145]:
X_train[['A4','A5','A6','A7']].isnull().mean()

A4    0.0
A5    0.0
A6    0.0
A7    0.0
dtype: float64