In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# to split the datasets
from sklearn.model_selection import train_test_split

# multivariate imputation
from sklearn.impute import KNNImputer

In [2]:
# Numerical variables to load from the CSV.
# The final entry, 'SalePrice', is the column used as the target later on.
cols_to_use = [
    'MSSubClass', 'LotFrontage', 'LotArea',
    'OverallQual', 'OverallCond', 'YearBuilt',
    'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
    'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
    'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
    'FullBath', 'HalfBath', 'BedroomAbvGr',
    'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
    'GarageYrBlt', 'GarageCars', 'GarageArea',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
    '3SsnPorch', 'ScreenPorch', 'PoolArea',
    'MiscVal', 'MoSold', 'YrSold',
    'SalePrice',
]

In [4]:
# Read only the selected columns of the training file
ds = pd.read_csv(
    filepath_or_buffer='train.csv',
    usecols=cols_to_use,
)

In [6]:
# Count the missing values in each column
ds.isna().sum()

MSSubClass         0
LotFrontage      259
LotArea            0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
MasVnrArea         8
BsmtFinSF1         0
BsmtFinSF2         0
BsmtUnfSF          0
TotalBsmtSF        0
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       0
BsmtHalfBath       0
FullBath           0
HalfBath           0
BedroomAbvGr       0
KitchenAbvGr       0
TotRmsAbvGrd       0
Fireplaces         0
GarageYrBlt       81
GarageCars         0
GarageArea         0
WoodDeckSF         0
OpenPorchSF        0
EnclosedPorch      0
3SsnPorch          0
ScreenPorch        0
PoolArea           0
MiscVal            0
MoSold             0
YrSold             0
SalePrice          0
dtype: int64

Three features have missing values:

LotFrontage: 259 missing values

MasVnrArea: 8 missing values

GarageYrBlt: 81 missing values

In [7]:
# Split the train and test set

# Drop the target from the feature list by rebinding the name rather than
# calling list.remove(): remove() raises ValueError if this cell is run a
# second time, because 'SalePrice' has already been removed by then.
# Filtering is a no-op on re-run, so the cell is idempotent.
cols_to_use = [col for col in cols_to_use if col != 'SalePrice']

X_train, X_test, y_train, y_test = train_test_split(
    ds[cols_to_use],   # features only
    ds['SalePrice'],   # target
    test_size=0.3,
    random_state=0)    # fixed seed so the split is reproducible

# show the shape of X_train and X_test
X_train.shape, X_test.shape

((1022, 36), (438, 36))

In [8]:
# Replace the shuffled row labels with 0..n-1 so rows can be matched by
# position against the imputed data later on
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

### KNN Imputation

In [9]:
# Configure the KNN imputer:
#   metric        - how distances between rows are measured
#   n_neighbors   - the number of neighbours K
#   weights       - the weighting applied to the neighbours
#   add_indicator - whether to add a missing-value indicator
imputer = KNNImputer(
    metric='nan_euclidean',
    n_neighbors=5,
    weights='distance',
    add_indicator=False,
)

In [10]:
# Fit the imputer on the training features only
imputer.fit(X=X_train)

KNNImputer(weights='distance')

In [11]:
# Impute both splits with the imputer fitted on the train set
train_t = imputer.transform(X=X_train)
test_t = imputer.transform(X=X_test)

In [12]:
# At this point train_t is the raw array returned by the imputer (no column names yet)
print(train_t)

[[6.00000000e+01 7.01151417e+01 9.37500000e+03 ... 0.00000000e+00
  2.00000000e+00 2.00900000e+03]
 [1.20000000e+02 4.25330531e+01 2.88700000e+03 ... 0.00000000e+00
  1.10000000e+01 2.00800000e+03]
 [2.00000000e+01 5.00000000e+01 7.20700000e+03 ... 0.00000000e+00
  2.00000000e+00 2.01000000e+03]
 ...
 [9.00000000e+01 6.80000000e+01 8.93000000e+03 ... 0.00000000e+00
  4.00000000e+00 2.01000000e+03]
 [1.20000000e+02 4.30000000e+01 3.19600000e+03 ... 0.00000000e+00
  1.00000000e+01 2.00600000e+03]
 [6.00000000e+01 5.80000000e+01 1.67700000e+04 ... 0.00000000e+00
  6.00000000e+00 2.01000000e+03]]


In [13]:
# Wrap the imputed arrays in DataFrames, reusing the original column names
train_t = pd.DataFrame(data=train_t, columns=X_train.columns)
test_t = pd.DataFrame(data=test_t, columns=X_test.columns)


In [15]:
# Preview the first five imputed rows
train_t.head(n=5)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,60.0,70.115142,9375.0,7.0,5.0,1997.0,1998.0,573.0,739.0,0.0,...,645.0,576.0,36.0,0.0,0.0,0.0,0.0,0.0,2.0,2009.0
1,120.0,42.533053,2887.0,6.0,5.0,1996.0,1997.0,0.0,1003.0,0.0,...,431.0,307.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,2008.0
2,20.0,50.0,7207.0,5.0,7.0,1958.0,2008.0,0.0,696.0,0.0,...,0.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2010.0
3,50.0,60.0,9060.0,6.0,5.0,1939.0,1950.0,0.0,204.0,0.0,...,280.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,2009.0
4,30.0,60.0,8400.0,2.0,5.0,1920.0,1950.0,0.0,290.0,0.0,...,246.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2009.0


In [16]:
# Confirm that the three columns that had NaN are now fully populated
imputed_cols = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
train_t.loc[:, imputed_cols].isna().sum()

LotFrontage    0
MasVnrArea     0
GarageYrBlt    0
dtype: int64

In [17]:
# The observations with NaN in the MasVnrArea column of the original train set
X_train.loc[X_train['MasVnrArea'].isna(), 'MasVnrArea']

420   NaN
490   NaN
642   NaN
824   NaN
921   NaN
Name: MasVnrArea, dtype: float64

In [18]:
# The values the imputer put in place of those NaN (same row positions)
train_t.loc[X_train['MasVnrArea'].isna(), 'MasVnrArea']

420     99.765717
490     34.106592
642      0.000000
824    375.749332
921     85.817715
Name: MasVnrArea, dtype: float64

In [19]:
# The original (un-imputed) MasVnrArea column, for reference
X_train.loc[:, 'MasVnrArea']

0       573.0
1         0.0
2         0.0
3         0.0
4         0.0
        ...  
1017    673.0
1018      0.0
1019      0.0
1020     18.0
1021     30.0
Name: MasVnrArea, Length: 1022, dtype: float64

In [20]:
# Mean of the variable, i.e. the value that mean imputation would have used
X_train.loc[:, 'MasVnrArea'].mean()

103.55358898721731

### Imputing a slice of the dataframe
We can use Feature-engine to apply the KNNImputer to a slice of the dataframe.

In [21]:
!pip install feature_engine

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting feature_engine
  Downloading feature_engine-1.4.0-py2.py3-none-any.whl (276 kB)
[K     |████████████████████████████████| 276 kB 26.3 MB/s 
Installing collected packages: feature-engine
Successfully installed feature-engine-1.4.0


In [22]:
from feature_engine.wrappers import SklearnTransformerWrapper

In [23]:
# Reload the file with every column this time, not only the numerical ones
ds = pd.read_csv('train.csv')

# Everything except the target is a feature here
all_features = ds.drop(columns='SalePrice')
target = ds['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(
    all_features, target, test_size=0.3, random_state=0)

# show the shapes
X_train.shape, X_test.shape

((1022, 80), (438, 80))

In [24]:
# Wrap a KNNImputer so that it is applied only to the columns in cols_to_use
knn = KNNImputer(weights='distance')
imputer = SklearnTransformerWrapper(transformer=knn, variables=cols_to_use)

In [25]:
# Only these columns are passed to the wrapped imputer (the `variables` argument above);
# the remaining columns of the dataframe are left out of the imputation
cols_to_use

['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [26]:
# Fit the wrapper + KNNImputer on the train set, then impute both splits
train_t = imputer.fit(X_train).transform(X_train)
test_t = imputer.transform(X_test)

# Feature-engine returns a dataframe, so it can be previewed directly
train_t.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
64,65,60.0,RL,70.115142,9375.0,Pave,,Reg,Lvl,AllPub,...,0.0,0.0,,GdPrv,,0.0,2.0,2009.0,WD,Normal
682,683,120.0,RL,42.533053,2887.0,Pave,,Reg,HLS,AllPub,...,0.0,0.0,,,,0.0,11.0,2008.0,WD,Normal
960,961,20.0,RL,50.0,7207.0,Pave,,IR1,Lvl,AllPub,...,0.0,0.0,,,,0.0,2.0,2010.0,WD,Normal
1384,1385,50.0,RL,60.0,9060.0,Pave,,Reg,Lvl,AllPub,...,0.0,0.0,,MnPrv,,0.0,10.0,2009.0,WD,Normal
1100,1101,30.0,RL,60.0,8400.0,Pave,,Reg,Bnk,AllPub,...,0.0,0.0,,,,0.0,1.0,2009.0,WD,Normal


In [27]:
# Number of NaN left in MasVnrArea after imputation
train_t.loc[:, 'MasVnrArea'].isna().sum()

0

In [28]:
# The imputed values for the rows where MasVnrArea was originally NaN
train_t.loc[X_train['MasVnrArea'].isna(), 'MasVnrArea']

1278     99.765717
936      34.106592
650       0.000000
234     375.749332
973      85.817715
Name: MasVnrArea, dtype: float64

As we can see, the values are exactly the same as before, i.e. the ones computed directly with scikit-learn.

### Automatically find best imputation parameters


In [29]:
# import extra classes for modelling
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [30]:
# Split again, this time keeping only the numerical feature columns
features = ds[cols_to_use]   # just the features
target = ds['SalePrice']     # the target

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,     # share of observations in the test set
    random_state=0,    # for reproducibility
)

X_train.shape, X_test.shape

((1022, 36), (438, 36))

In [31]:
# Pipeline: impute -> scale -> regress.
# The imputer settings here are starting values; the grid search overrides them.
knn_step = KNNImputer(n_neighbors=5, weights='distance', add_indicator=False)

pipe = Pipeline(steps=[
    ('imputer', knn_step),
    ('scaler', StandardScaler()),
    ('regressor', Lasso(max_iter=2000)),
])

In [32]:
# Parameter values to test; keys are '<pipeline step>__<parameter>'
param_grid = dict(
    imputer__n_neighbors=[3, 5, 10],
    imputer__weights=['uniform', 'distance'],
    imputer__add_indicator=[True, False],
    regressor__alpha=[10, 100, 200],
)

# Where,
# cv=5 is the number of cross-validation folds
# n_jobs=-1 indicates to use all available cpus
# scoring='r2' indicates to evaluate using the r squared
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)

In [33]:
# Train over every combination of the parameters in the grid
grid_search.fit(X_train, y_train)

# Score of the search on the train set
train_score = grid_search.score(X_train, y_train)
print("best linear regression from grid search: %.3f" % train_score)

best linear regression from grid search: 0.845


In [34]:
# Score of the search on the held-out test set
test_score = grid_search.score(X_test, y_test)
print("best linear regression from grid search: %.3f" % test_score)

best linear regression from grid search: 0.730


In [35]:
# Show the parameter combination selected by the grid search
# (keys follow the '<pipeline step>__<parameter>' names used in param_grid)
grid_search.best_params_

{'imputer__add_indicator': True,
 'imputer__n_neighbors': 10,
 'imputer__weights': 'distance',
 'regressor__alpha': 200}

### Compare with univariate imputation


In [36]:
from sklearn.impute import SimpleImputer # univariate imputer

In [37]:
# Same split as for the KNN search (same columns, same seed), so the results are comparable
features = ds[cols_to_use]   # just the features
target = ds['SalePrice']     # the target

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,     # share of observations in the test set
    random_state=0,    # for reproducibility
)

X_train.shape, X_test.shape

((1022, 36), (438, 36))

In [38]:
# Same pipeline shape as before, with a univariate imputer in the first step.
# fill_value=-1 is the replacement used when the 'constant' strategy is selected.
simple_step = SimpleImputer(strategy='mean', fill_value=-1)

pipe = Pipeline(steps=[
    ('imputer', simple_step),
    ('scaler', StandardScaler()),
    ('regressor', Lasso(max_iter=2000)),
])

# Parameter values to test; keys are '<pipeline step>__<parameter>'
param_grid = dict(
    imputer__strategy=['mean', 'median', 'constant'],
    imputer__add_indicator=[True, False],
    regressor__alpha=[10, 100, 200],
)

grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)

# Train over every combination of the parameters in the grid
grid_search.fit(X_train, y_train)

# Score of the search on the train set
train_score = grid_search.score(X_train, y_train)
print("best linear regression from grid search: %.3f" % train_score)

best linear regression from grid search: 0.845


In [39]:
# Score of the search on the held-out test set
test_score = grid_search.score(X_test, y_test)
print("best linear regression from grid search: %.3f" % test_score)

best linear regression from grid search: 0.729


In [40]:
# Show the parameter combination selected by the grid search
# (keys follow the '<pipeline step>__<parameter>' names used in param_grid)
grid_search.best_params_

{'imputer__add_indicator': False,
 'imputer__strategy': 'constant',
 'regressor__alpha': 200}

We see that imputing the missing values with an arbitrary value of -1 returns approximately the same performance as KNN imputation (test R² of 0.729 vs. 0.730), so we might not want to add the additional complexity of training models to impute NA before going on to predict the real target we are interested in.

