## Handling Missing Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# A single NaN in the array is enough to make the plain sum evaluate to NaN.
vals2 = np.array([1, np.nan, 3, 4])
vals2.sum()

nan

In [3]:
# sum, min and max all return NaN when the array contains a NaN.
vals2.sum(), vals2.min(), vals2.max()

(nan, nan, nan)

In [4]:
# Arithmetic with NaN yields NaN.
1 + np.nan

nan

In [5]:
# Even multiplying by zero does not clear a NaN.
0 *  np.nan

nan

In [6]:
# Same plain aggregations as earlier, repeated for contrast with the
# NaN-aware versions in the next cell.
vals2.sum(), vals2.min(), vals2.max()

(nan, nan, nan)

In [7]:
# NaN handling: the nan* variants ignore missing entries when aggregating.
np.nansum(vals2), np.nanmin(vals2), np.nanmax(vals2)

(8.0, 1.0, 4.0)

### NaN and None in Pandas

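- Upcasting and type conversion: when a column contains missing values, pandas may change its dtype (for example, an integer column becomes `float64`, and mixed values fall back to `object`).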

In [9]:
# Numeric data: both np.nan and None end up as NaN, so the column is
# stored as floating point.
df = pd.DataFrame(
    {'value': [1, np.nan, 2, None]}
)
df

Unnamed: 0,value
0,1.0
1,
2,2.0
3,


In [10]:
# The column is float64 even though the non-missing values were given as integers.
df.dtypes

value    float64
dtype: object

In [11]:
# With a boolean mixed in, the column is stored with object dtype.
df = pd.DataFrame(
    {'value': [True, np.nan, 2, None]}
)
df

Unnamed: 0,value
0,True
1,
2,2
3,


In [12]:
# Mixed bool / int / missing values give an object column.
df.dtypes

value    object
dtype: object

In [13]:
# A string mixed with a number and missing values: the column is object dtype.
df = pd.DataFrame(
    {'value': ['abc', np.nan, 2, None]}
)
df

Unnamed: 0,value
0,abc
1,
2,2
3,


In [14]:
# Mixed str / int / missing values also give an object column.
df.dtypes

value    object
dtype: object

In [15]:
# Integers only: the column keeps an integer dtype.
df = pd.DataFrame({'value': [1, 3, 2, 4]})
print(df.dtypes)

print('------------')

# Replacing one integer with NaN upcasts the whole column to float64.
df = pd.DataFrame({'value': [np.nan, 3, 2, 4]})
print(df.dtypes)

value    int64
dtype: object
------------
value    float64
dtype: object


## Operating on Null Values

- ``isnull()``: Generate a boolean mask indicating missing values
- ``notnull()``: Opposite of ``isnull()``
- ``dropna()``: Return a filtered version of the data
- ``fillna()``: Return a copy of the data with missing values filled or imputed

### Detecting null values

In [17]:
# Sample frame with two missing entries (one np.nan, one None) used by the
# detection examples that follow.
df = pd.DataFrame(
    {'value': [1, np.nan, 2, None]}
)
df

Unnamed: 0,value
0,1.0
1,
2,2.0
3,


In [18]:
# Boolean mask with the same shape as df: True where the value is missing.
df.isnull()

Unnamed: 0,value
0,False
1,True
2,False
3,True


In [19]:
# Inverse mask: True where a value is present.
df.notnull()

Unnamed: 0,value
0,True
1,False
2,True
3,False


In [20]:
# NOTE: indexing with a boolean DataFrame does not filter rows; it keeps the
# shape and blanks out every cell where the mask is False, so the result here
# is all NaN. To select only the rows with a missing value, index with a
# boolean Series instead, e.g. df[df['value'].isnull()].
df[df.isnull()]

Unnamed: 0,value
0,
1,
2,
3,


In [21]:
# NOTE: a boolean DataFrame mask keeps the shape, so this simply reproduces df
# (cells where the mask is False are NaN already). To keep only the rows that
# have a value, index with a boolean Series, e.g. df[df['value'].notnull()].
df[df.notnull()]

Unnamed: 0,value
0,1.0
1,
2,2.0
3,


### Dropping null values

In [22]:
# 3x3 frame: one NaN in column 0, one NaN in column 1, column 2 complete.
df = pd.DataFrame(
    [
        [1, np.nan, 2],
        [2, 3, 5],
        [np.nan, 4, 6],
    ]
)
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [23]:
# With no arguments, every row containing at least one NaN is dropped.
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [24]:
# Drop every column that contains at least one NaN.
df.dropna(axis='columns')

Unnamed: 0,2
0,2
1,5
2,6


In [25]:
# Column drop: axis=1 is the numeric spelling of axis='columns'.
df.dropna(axis=1)

Unnamed: 0,2
0,2
1,5
2,6


In [26]:
# Row drop: axis=0 gives the same result as calling dropna() with no arguments.
df.dropna(axis=0)  # equivalent spelling: df.dropna(axis='rows')

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [27]:
# how='all' drops a row only when every value in it is NaN; no row here
# qualifies, so the frame is unchanged.
df.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [28]:
# how='any' drops a row as soon as it contains one NaN (same result as dropna()).
df.dropna(how='any')

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [29]:
# Threshold: keep a row only if it has at least `thresh` non-missing values.
# Every row has at least one, so nothing is dropped.
df.dropna(axis='rows', thresh=1)

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [30]:
# Each row still has at least two non-missing values, so thresh=2 also keeps all rows.
df.dropna(axis='rows', thresh=2)

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


### Filling null values

In [31]:
# Work on an independent copy so that any later in-place change to `data`
# cannot leak back into `df`; a plain `df[:]` slice does not guarantee that.
data = df.copy()
data

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [32]:
# Replace every missing value with the constant 0; a new frame is returned.
data.fillna(0)

Unnamed: 0,0,1,2
0,1.0,0.0,2
1,2.0,3.0,5
2,0.0,4.0,6


In [33]:
# Forward-fill: propagate the last valid value downward within each column.
# A NaN in the first row has nothing above it and stays NaN.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument
# is deprecated since pandas 2.1.
data.ffill()

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,2.0,4.0,6


In [34]:
# Forward-fill along each row (axis=1): a NaN takes the value to its left.
# A NaN in the first column has nothing to its left and stays NaN.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
data.ffill(axis=1)

Unnamed: 0,0,1,2
0,1.0,1.0,2.0
1,2.0,3.0,5.0
2,,4.0,6.0


In [35]:
# Back-fill: propagate the next valid value upward within each column.
# A NaN in the last row has nothing below it and stays NaN.
# DataFrame.bfill() replaces fillna(method='bfill'), whose `method` argument
# is deprecated since pandas 2.1.
data.bfill()

Unnamed: 0,0,1,2
0,1.0,3.0,2
1,2.0,3.0,5
2,,4.0,6


In [34]:
# Back-fill along each row (axis=1): a NaN takes the value to its right.
# DataFrame.bfill() replaces the deprecated fillna(method='bfill').
data.bfill(axis=1)

Unnamed: 0,0,1,2
0,1.0,2.0,2.0
1,2.0,3.0,5.0
2,4.0,4.0,6.0
