# Missing Data

Let's explore a few methods for missing value treatment:

In [1]:
import numpy as np
import pandas as pd

In [6]:
# Toy frame used throughout this notebook: columns A and B contain missing
# values (np.nan), while column C is fully populated.
df = pd.DataFrame(
    {
        "A": [10, 23, 0, 7, 8, np.nan],
        "B": [19, 13, 0, 5, np.nan, np.nan],
        "C": [11, 2, 43, 12, 10, 9],
    }
)

In [7]:
# Show the raw frame; missing entries are rendered as NaN (blank in the exported table).
df

Unnamed: 0,A,B,C
0,10.0,19.0,11
1,23.0,13.0,2
2,0.0,0.0,43
3,7.0,5.0,12
4,8.0,,10
5,,,9


In [8]:
# Boolean mask with the same shape as df: True wherever a value is missing.
df.isna()

Unnamed: 0,A,B,C
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,True,False
5,True,True,False


In [9]:
# Pitfall: count() counts non-missing entries, and the boolean mask returned by isna()
# has none missing, so this just yields the number of rows for every column.
# It does NOT tell us how many values are missing; sum() on the mask does that.
df.isna().count()

A    6
B    6
C    6
dtype: int64

In [10]:
# True is summed as 1, so summing the mask gives the number of missing values per column.
df.isna().sum()

A    1
B    2
C    0
dtype: int64

In [11]:
# In pandas, isnull() is simply an alias of isna(): both return the same boolean mask.
# Two names exist for historical reasons: pandas borrowed the naming from R, where
# is.na() and is.null() are distinct functions (is.na() flags missing values,
# is.null() tests for the NULL object). pandas makes no such distinction.
df.isnull()

Unnamed: 0,A,B,C
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,True,False
5,True,True,False


In [12]:
# NaN propagates through arithmetic: any row where A or B is missing yields NaN.
df['A'] + df['B']

0    29.0
1    36.0
2     0.0
3    12.0
4     NaN
5     NaN
dtype: float64

In [13]:
# Default dropna(): drop every row that contains at least one missing value.
# The result is a new frame; df itself is not modified.
df.dropna()

Unnamed: 0,A,B,C
0,10.0,19.0,11
1,23.0,13.0,2
2,0.0,0.0,43
3,7.0,5.0,12


In [14]:
# axis=1 switches to columns: drop every column that contains at least one missing value.
# Only C is fully populated, so only C survives.
df.dropna(axis=1)

Unnamed: 0,C
0,11
1,2
2,43
3,12
4,10
5,9


In [15]:
# Remove column B explicitly by label (unlike dropna, this does not look at
# whether values are missing). Returns a new frame; df is left untouched.
df.drop(columns=['B'])

Unnamed: 0,A,C
0,10.0,11
1,23.0,2
2,0.0,43
3,7.0,12
4,8.0,10
5,,9


In [16]:
# thresh=2: keep only rows that have at least 2 non-missing values.
# Row 4 (A and C present) is kept; row 5 (only C present) is dropped.
df.dropna(thresh=2)

Unnamed: 0,A,B,C
0,10.0,19.0,11
1,23.0,13.0,2
2,0.0,0.0,43
3,7.0,5.0,12
4,8.0,,10


In [20]:
# Replace every NaN with a constant. Note the side effect of using a string here:
# A and B can no longer stay numeric (they stop printing as floats in the output),
# so a string placeholder is rarely what you want before further numeric work.
df.fillna(value='Fill Value')

Unnamed: 0,A,B,C
0,10,19,11
1,23,13,2
2,0,0,43
3,7,5,12
4,8,Fill Value,10
5,Fill Value,Fill Value,9


In [18]:
# Mean imputation for a single column. mean() skips NaN, so the fill value is
# (10 + 23 + 0 + 7 + 8) / 5 = 9.6.
df['A'].fillna(value=df['A'].mean())

0    10.0
1    23.0
2     0.0
3     7.0
4     8.0
5     9.6
Name: A, dtype: float64

In [21]:
# None of the dropna/drop/fillna calls above were assigned back, and each returns a
# new object, so df still contains its original NaNs.
df

Unnamed: 0,A,B,C
0,10.0,19.0,11
1,23.0,13.0,2
2,0.0,0.0,43
3,7.0,5.0,12
4,8.0,,10
5,,,9
