## Missing Value Imputation Methods

#### Missing Value Imputation for Categorical Features

In [None]:
import pandas as pd
import numpy as np
# Toy single-column categorical feature; the final entry is missing (NaN) so
# the imputers below have something to fill in.
X = pd.Series(['square', 'square', 'oval', 'circle', np.nan], name='Shape').to_frame()

In [None]:

# Show the toy frame; the last row's Shape value is missing.
X

Unnamed: 0,Shape
0,square
1,square
2,oval
3,circle
4,


In [None]:

from sklearn.impute import SimpleImputer

In [None]:

# Mode imputation: each NaN is replaced by its column's most frequent value
# ('square' for this frame).
imputer = SimpleImputer(strategy='most_frequent').fit(X)
imputer.transform(X)

array([['square'],
       ['square'],
       ['oval'],
       ['circle'],
       ['square']], dtype=object)

In [None]:

# Constant imputation: each NaN is replaced by the literal string 'missing',
# which turns "value absent" into a category of its own.
imputer = SimpleImputer(strategy='constant', fill_value='missing').fit(X)
imputer.transform(X)

array([['square'],
       ['square'],
       ['oval'],
       ['circle'],
       ['missing']], dtype=object)

### Four options for handling missing values (NaNs):

#### Drop rows containing NaNs
#### Drop columns containing NaNs
#### Fill NaNs with imputed values
#### Use a model that natively handles NaNs

In [None]:
import pandas as pd
# Load the full training file.
# NOTE(review): both paths are absolute and end in '.csv.csv' — presumably the
# names of the uploaded files; confirm before running elsewhere.
train = pd.read_csv('/content/titanic_train.csv.csv')

# Load only the first 175 rows of the test file.
test = pd.read_csv('/content/titanic_test.csv.csv', nrows=175)

In [None]:

# Keep the target plus three features for training ...
train = train[['Survived', 'Age', 'Fare', 'Pclass']]

# ... and the same three features (no target) for the test rows.
test = test[['Age', 'Fare', 'Pclass']]

In [None]:

# count the number of NaNs in each column
# A cell only auto-displays its last expression, so the train counts must be
# displayed explicitly or they are computed and silently discarded.
# `display` is provided as a builtin by IPython/Jupyter.
display(train.isna().sum())
test.isna().sum()

Age       36
Fare       1
Pclass     0
dtype: int64

In [None]:

# Split off the target: pop() returns the 'Survived' column and removes it
# from `train` in place, leaving only the feature columns.
# NOTE(review): not idempotent — re-running this cell on its own raises
# KeyError because 'Survived' has already been removed.
label = train.pop('Survived')

In [None]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

In [None]:
# Gradient-boosting classifier with default hyperparameters; used below to
# show a model that is fitted on data still containing NaNs.
clf = HistGradientBoostingClassifier()

In [None]:

# no errors, despite NaNs in train and test!
# fit() returns the fitted estimator, so training and prediction can be
# chained; the cell's output is the predicted label for each test row.
clf.fit(train, label).predict(test)


array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

### Need something better than SimpleImputer for missing value imputation?

### Try KNNImputer or IterativeImputer (the latter is inspired by R's MICE package). Both are multivariate approaches: they use the other features to estimate each missing value.

In [None]:
import pandas as pd


# Keep three numeric columns for the multivariate imputation demos; of these,
# only Age contains missing values.
cols = ['SibSp', 'Fare', 'Age']
df = pd.read_csv('/content/titanic_train.csv.csv')[cols]

In [None]:

# Preview the selected columns (pandas truncates the display of long frames).
df

Unnamed: 0,SibSp,Fare,Age
0,1,7.2500,22.0
1,1,71.2833,38.0
2,0,7.9250,26.0
3,1,53.1000,35.0
4,0,8.0500,35.0
...,...,...,...
886,0,13.0000,27.0
887,0,30.0000,19.0
888,1,23.4500,
889,0,30.0000,26.0


In [None]:
# Dtypes and non-null counts per column — a quick way to see which columns
# contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   SibSp   891 non-null    int64  
 1   Fare    891 non-null    float64
 2   Age     714 non-null    float64
dtypes: float64(2), int64(1)
memory usage: 21.0 KB


In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
# Multivariate imputation with default settings: missing entries are estimated
# from the other columns rather than from the column's own statistics.
# The result is a NumPy array with the columns in the same order as `df`.
impute_it = IterativeImputer()
impute_it.fit_transform(df)

array([[ 1.        ,  7.25      , 22.        ],
       [ 1.        , 71.2833    , 38.        ],
       [ 0.        ,  7.925     , 26.        ],
       ...,
       [ 1.        , 23.45      , 26.82938751],
       [ 0.        , 30.        , 26.        ],
       [ 0.        ,  7.75      , 32.        ]])

### KNN Imputer

In [None]:
from sklearn.impute import KNNImputer

In [None]:

# KNN imputation on the numeric frame `df` (SibSp, Fare, Age) loaded earlier:
# each missing value is filled with the mean of that feature over the
# 2 nearest rows.
# NOTE(review): the features are unscaled, so Fare (largest range) presumably
# dominates the neighbour distance — consider scaling first; confirm.
impute_knn = KNNImputer(n_neighbors=2)
impute_knn.fit_transform(df)

array([[ 1.    ,  7.25  , 22.    ],
       [ 1.    , 71.2833, 38.    ],
       [ 0.    ,  7.925 , 26.    ],
       ...,
       [ 1.    , 23.45  , 29.    ],
       [ 0.    , 30.    , 26.    ],
       [ 0.    ,  7.75  , 32.    ]])