In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('data/income_evaluation.csv', na_values = ' ?')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
df.isna().sum()

age                   0
 workclass         1836
 fnlwgt               0
 education            0
 education-num        0
 marital-status       0
 occupation        1843
 relationship         0
 race                 0
 sex                  0
 capital-gain         0
 capital-loss         0
 hours-per-week       0
 native-country     583
 income               0
dtype: int64

In [4]:
# hours per week missing values
np.random.seed(seed=0)
h = np.random.choice(a=df.index, replace=False, size=20)
df.loc[h, ' hours-per-week'] = np.nan

In [5]:
# age missing values
np.random.seed(seed=10)
a = np.random.choice(a=df.index, replace=False, size=28)
df.loc[a, 'age'] = np.nan

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(' income', axis=1),
                                                    df[' income'], test_size=0.2,
                                                    random_state=30)

In [7]:
si_age = SimpleImputer(strategy='mean', add_indicator=True)

In [12]:
a = pd.DataFrame(si_age.fit_transform(X_train[['age']]))

In [11]:
si_age.statistics_

array([38.54201729])

In [16]:
a[a[1] == 1]

Unnamed: 0,0,1
2969,38.542017,1.0
3219,38.542017,1.0
3522,38.542017,1.0
4925,38.542017,1.0
5543,38.542017,1.0
5754,38.542017,1.0
6305,38.542017,1.0
7237,38.542017,1.0
8587,38.542017,1.0
11314,38.542017,1.0


In [21]:
si_occ = SimpleImputer(strategy='constant', add_indicator=True, fill_value='not available')

In [22]:
si_occ.fit_transform(X_train[[' occupation']])

array([[' Exec-managerial', False],
       [' Transport-moving', False],
       [' Transport-moving', False],
       ...,
       [' Other-service', False],
       [' Sales', False],
       [' Tech-support', False]], dtype=object)

In [23]:
pd.DataFrame(si_occ.fit_transform(X_train[[' occupation']]))

Unnamed: 0,0,1
0,Exec-managerial,False
1,Transport-moving,False
2,Transport-moving,False
3,Craft-repair,False
4,Adm-clerical,False
5,Sales,False
6,Machine-op-inspct,False
7,Farming-fishing,False
8,Adm-clerical,False
9,Machine-op-inspct,False


In [24]:
si_age.transform(X_test[['age']])

array([[48.,  0.],
       [63.,  0.],
       [33.,  0.],
       ...,
       [48.,  0.],
       [54.,  0.],
       [58.,  0.]])

In [26]:
b = pd.DataFrame(si_age.transform(X_test[['age']]))

In [29]:
b[b[1] == 1]

Unnamed: 0,0,1
2526,38.542017,1.0
4068,38.542017,1.0
4111,38.542017,1.0
5324,38.542017,1.0
5930,38.542017,1.0
