# Handling Missing Values in Python

## Loading Libraries

In [1]:
import pandas as pd
import numpy as np

## Reading Data

In [2]:
## Reading in data
# Load 'diabetes.csv' (path is relative to the working directory) into the
# DataFrame that the rest of the notebook works from.
diabetes = pd.read_csv('diabetes.csv')

In [3]:
# Preview the first 10 rows of the freshly loaded data.
diabetes.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [4]:
# Dropping target variable
# Reassign rather than mutate in place so this cell is safe to re-run:
# with errors='ignore' the drop is a no-op once 'Outcome' is already gone,
# whereas the in-place version raised KeyError on a second execution.
# `columns=` already selects the axis, so no separate `axis` argument is needed.
diabetes = diabetes.drop(columns=['Outcome'], errors='ignore')

# (rows, columns) of the feature-only frame.
diabetes.shape

(768, 8)

In [5]:
# Count missing entries per column before any values are removed.
diabetes.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
dtype: int64

In [6]:
# Inserting missing values
# Draw one uniform [0, 1) number per cell and blank out (set to NaN) the cells
# whose draw falls below 0.1, i.e. roughly 10% of all entries.
# The generator is seeded so that every Restart & Run All yields the same NaN
# pattern; with the unseeded global generator the pattern (and all imputation
# results derived from it) changed on each run.
mask_rng = np.random.RandomState(123)
diabetes_missing = diabetes.mask(mask_rng.random_sample(diabetes.shape) < 0.1)

In [7]:
# Count the NaN entries per column after the random masking.
diabetes_missing.isna().sum()

Pregnancies                 67
Glucose                     81
BloodPressure               73
SkinThickness               73
Insulin                     87
BMI                         86
DiabetesPedigreeFunction    75
Age                         63
dtype: int64

In [8]:
# Preview the first 10 rows of the masked data; blanked cells show up as NaN.
diabetes_missing.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0
1,1.0,85.0,66.0,29.0,0.0,,0.351,31.0
2,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0
3,1.0,89.0,66.0,23.0,94.0,28.1,,21.0
4,0.0,137.0,40.0,35.0,168.0,43.1,,33.0
5,5.0,116.0,74.0,0.0,0.0,25.6,0.201,30.0
6,3.0,78.0,50.0,32.0,88.0,31.0,0.248,26.0
7,10.0,115.0,0.0,0.0,0.0,35.3,0.134,29.0
8,2.0,197.0,70.0,45.0,543.0,30.5,0.158,53.0
9,8.0,125.0,96.0,0.0,0.0,,0.232,54.0


## Listwise Deletion

In [9]:
# Listwise deletion: discard every row that has a NaN in any column.
diabetes_remove = diabetes_missing.dropna(axis='index', how='any')

# Shape of what is left after the incomplete rows are removed.
diabetes_remove.shape

(320, 8)

## Zero Value Imputation

In [10]:
# Zero imputation: substitute the constant 0 for every NaN entry.
diabetes_0 = diabetes_missing.fillna(value=0)

# Preview the first 10 rows of the zero-filled frame.
diabetes_0.head(n=10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0
1,1.0,85.0,66.0,29.0,0.0,0.0,0.351,31.0
2,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.0,21.0
4,0.0,137.0,40.0,35.0,168.0,43.1,0.0,33.0
5,5.0,116.0,74.0,0.0,0.0,25.6,0.201,30.0
6,3.0,78.0,50.0,32.0,88.0,31.0,0.248,26.0
7,10.0,115.0,0.0,0.0,0.0,35.3,0.134,29.0
8,2.0,197.0,70.0,45.0,543.0,30.5,0.158,53.0
9,8.0,125.0,96.0,0.0,0.0,0.0,0.232,54.0


## Central Value Imputation

In [11]:
from sklearn.impute import SimpleImputer

In [12]:
# Imputer configured to treat NaN as the missing marker and to fill it
# using the 'mean' strategy.
num_imputer = SimpleImputer(
    strategy='mean',
    missing_values=np.nan,
)

In [13]:
# Raw NumPy array of the masked frame; this is what the imputers are fed.
vals = diabetes_missing.to_numpy()

In [14]:
# Display the array to confirm the NaN entries carried over from the DataFrame.
vals

array([[  6.   , 148.   ,  72.   , ...,  33.6  ,   0.627,  50.   ],
       [  1.   ,  85.   ,  66.   , ...,     nan,   0.351,  31.   ],
       [  8.   , 183.   ,  64.   , ...,  23.3  ,   0.672,  32.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,  26.2  ,   0.245,  30.   ],
       [  1.   , 126.   ,     nan, ...,  30.1  ,   0.349,  47.   ],
       [  1.   ,  93.   ,  70.   , ...,  30.4  ,   0.315,  23.   ]])

In [15]:
# Fit imputer to data to retrieve central measure:
# (the imputer was created with strategy='mean'; fit() returns the fitted
# estimator, whose repr is what this cell displays)
num_imputer.fit(vals)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)

In [16]:
# Transforming missing data points to mean value:
# NOTE: the imputed array is only displayed here, not assigned to a variable,
# and `diabetes_missing` itself is not modified by this call.
num_imputer.transform(vals)

array([[  6.        , 148.        ,  72.        , ...,  33.6       ,
          0.627     ,  50.        ],
       [  1.        ,  85.        ,  66.        , ...,  31.95674487,
          0.351     ,  31.        ],
       [  8.        , 183.        ,  64.        , ...,  23.3       ,
          0.672     ,  32.        ],
       ...,
       [  5.        , 121.        ,  72.        , ...,  26.2       ,
          0.245     ,  30.        ],
       [  1.        , 126.        ,  69.30647482, ...,  30.1       ,
          0.349     ,  47.        ],
       [  1.        ,  93.        ,  70.        , ...,  30.4       ,
          0.315     ,  23.        ]])

In [17]:
# Re-inspect the source frame: it still contains NaN because transform()
# returned a new array instead of writing the imputed values back.
diabetes_missing.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0
1,1.0,85.0,66.0,29.0,0.0,,0.351,31.0
2,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0
3,1.0,89.0,66.0,23.0,94.0,28.1,,21.0
4,0.0,137.0,40.0,35.0,168.0,43.1,,33.0
5,5.0,116.0,74.0,0.0,0.0,25.6,0.201,30.0
6,3.0,78.0,50.0,32.0,88.0,31.0,0.248,26.0
7,10.0,115.0,0.0,0.0,0.0,35.3,0.134,29.0
8,2.0,197.0,70.0,45.0,543.0,30.5,0.158,53.0
9,8.0,125.0,96.0,0.0,0.0,,0.232,54.0


## Regression Imputation

In [18]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [19]:
# Iterative (regression-based) imputer: at most 10 imputation rounds, with a
# fixed random_state so repeated runs give the same result.
reg_imputer = IterativeImputer(
    random_state=123,
    max_iter=10,
)

In [20]:
# Fit the iterative imputer on the array with missing values; fit() returns
# the fitted estimator, whose repr is displayed.
reg_imputer.fit(vals)

IterativeImputer(add_indicator=False, estimator=None,
                 imputation_order='ascending', initial_strategy='mean',
                 max_iter=10, max_value=None, min_value=None,
                 missing_values=nan, n_nearest_features=None, random_state=123,
                 sample_posterior=False, skip_complete=False, tol=0.001,
                 verbose=0)

In [21]:
# Fill the NaN entries with the fitted iterative imputer's estimates.
# The resulting array is displayed only; it is not stored in a variable.
reg_imputer.transform(vals)

array([[  6.        , 148.        ,  72.        , ...,  33.6       ,
          0.627     ,  50.        ],
       [  1.        ,  85.        ,  66.        , ...,  31.5472973 ,
          0.351     ,  31.        ],
       [  8.        , 183.        ,  64.        , ...,  23.3       ,
          0.672     ,  32.        ],
       ...,
       [  5.        , 121.        ,  72.        , ...,  26.2       ,
          0.245     ,  30.        ],
       [  1.        , 126.        ,  70.65835346, ...,  30.1       ,
          0.349     ,  47.        ],
       [  1.        ,  93.        ,  70.        , ...,  30.4       ,
          0.315     ,  23.        ]])

## k-NN Imputation

In [22]:
from sklearn.impute import KNNImputer

In [23]:
# k-NN imputer that bases each filled-in value on the 3 nearest neighbours.
knn_imputer = KNNImputer(n_neighbors=3)

In [24]:
# Fit the k-NN imputer on the array with missing values; fit() returns the
# fitted estimator, whose repr is displayed.
knn_imputer.fit(vals)

KNNImputer(add_indicator=False, copy=True, metric='nan_euclidean',
           missing_values=nan, n_neighbors=3, weights='uniform')

In [25]:
# Fill the NaN entries from the neighbours found by the fitted k-NN imputer.
# The resulting array is displayed only; it is not stored in a variable.
knn_imputer.transform(vals)

array([[  6.        , 148.        ,  72.        , ...,  33.6       ,
          0.627     ,  50.        ],
       [  1.        ,  85.        ,  66.        , ...,  34.4       ,
          0.351     ,  31.        ],
       [  8.        , 183.        ,  64.        , ...,  23.3       ,
          0.672     ,  32.        ],
       ...,
       [  5.        , 121.        ,  72.        , ...,  26.2       ,
          0.245     ,  30.        ],
       [  1.        , 126.        ,  85.33333333, ...,  30.1       ,
          0.349     ,  47.        ],
       [  1.        ,  93.        ,  70.        , ...,  30.4       ,
          0.315     ,  23.        ]])