# 1. 누락된 데이터 처리하기

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Series of strings with a single missing entry (np.nan) at position 2.
string_data = pd.Series(
    data=["aardvark", "artichoke", np.nan, "avocado"],
)
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [3]:
string_data.isnull() # boolean mask: True where the value is missing

0    False
1    False
2     True
3    False
dtype: bool

## 1.1 누락된 데이터 골라내기

In [4]:
from numpy import nan as NA

In [5]:
# Numeric Series with missing values at positions 1 and 3.
data = pd.Series(data=[1, NA, 3.5, NA, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [6]:
data.dropna() # drop the missing values, keeping the original index labels

0    1.0
2    3.5
4    7.0
dtype: float64

In [7]:
data[data.notnull()] # boolean-mask selection; gives the same result as data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [8]:
# DataFrame example: one complete row, two partially missing rows,
# and one row (index 2) that is entirely NA.
data = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],
        [1, NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3.0],
    ]
)
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [9]:
data.dropna() # drops every row that contains at least one NA value

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [10]:
data.dropna(how='all') # drops only the rows whose values are all NA

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [11]:
# Add an all-NA column (label 4) so that column-wise dropping can be demonstrated.
data[4]=NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [12]:
data.dropna(axis='columns', how='all')  # drop the columns whose values are all NA

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [13]:
# Keep only the rows that contain at least a given number of non-NA values,
# using the `thresh` argument of dropna.

In [14]:
# 7x3 DataFrame of random draws from np.random.randn (values differ on every run).
df=pd.DataFrame(np.random.randn(7,3))
df

Unnamed: 0,0,1,2
0,-0.833243,-0.428892,0.082694
1,-1.271012,-0.110784,-0.062554
2,0.745536,-1.385732,0.012147
3,-0.106934,0.793041,0.010031
4,-0.673229,-0.113002,-0.781023
5,0.701747,-0.757256,-0.522743
6,-0.926166,-0.299325,2.219337


In [15]:
# Punch holes into the frame: the first two rows of column position 2
# and the first four rows of column position 1 become NA.
df.iloc[:2, 2] = NA
df.iloc[:4, 1] = NA
df

Unnamed: 0,0,1,2
0,-0.833243,,
1,-1.271012,,
2,0.745536,,0.012147
3,-0.106934,,0.010031
4,-0.673229,-0.113002,-0.781023
5,0.701747,-0.757256,-0.522743
6,-0.926166,-0.299325,2.219337


In [16]:
df.dropna() # default: keep only the rows without any NA value

Unnamed: 0,0,1,2
4,-0.673229,-0.113002,-0.781023
5,0.701747,-0.757256,-0.522743
6,-0.926166,-0.299325,2.219337


In [17]:
df.dropna(thresh=2) # keep only rows with at least 2 non-NA values (thresh counts present values, not missing ones)

Unnamed: 0,0,1,2
2,0.745536,,0.012147
3,-0.106934,,0.010031
4,-0.673229,-0.113002,-0.781023
5,0.701747,-0.757256,-0.522743
6,-0.926166,-0.299325,2.219337


## 1.2 결측치 채우기

In [18]:
# Redisplay df, which still contains the NaNs, before filling them.
df

Unnamed: 0,0,1,2
0,-0.833243,,
1,-1.271012,,
2,0.745536,,0.012147
3,-0.106934,,0.010031
4,-0.673229,-0.113002,-0.781023
5,0.701747,-0.757256,-0.522743
6,-0.926166,-0.299325,2.219337


In [19]:
df.fillna(value=0)  # fill every missing value with 0

Unnamed: 0,0,1,2
0,-0.833243,0.0,0.0
1,-1.271012,0.0,0.0
2,0.745536,0.0,0.012147
3,-0.106934,0.0,0.010031
4,-0.673229,-0.113002,-0.781023
5,0.701747,-0.757256,-0.522743
6,-0.926166,-0.299325,2.219337


In [20]:
# The fillna(0) call above returned a new object; df itself still holds the NaNs.
df

Unnamed: 0,0,1,2
0,-0.833243,,
1,-1.271012,,
2,0.745536,,0.012147
3,-0.106934,,0.010031
4,-0.673229,-0.113002,-0.781023
5,0.701747,-0.757256,-0.522743
6,-0.926166,-0.299325,2.219337


In [21]:
# Passing a dict fills each column with its own value:
# column 1 gets 0.5 and column 2 gets 0.
df.fillna(value={1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,-0.833243,0.5,0.0
1,-1.271012,0.5,0.0
2,0.745536,0.5,0.012147
3,-0.106934,0.5,0.010031
4,-0.673229,-0.113002,-0.781023
5,0.701747,-0.757256,-0.522743
6,-0.926166,-0.299325,2.219337


In [22]:
# Fill the NaNs with 0 in place, modifying df itself; the call's return
# value is discarded by assigning it to the throwaway name `_`.
_=df.fillna(0,inplace=True)
df

Unnamed: 0,0,1,2
0,-0.833243,0.0,0.0
1,-1.271012,0.0,0.0
2,0.745536,0.0,0.012147
3,-0.106934,0.0,0.010031
4,-0.673229,-0.113002,-0.781023
5,0.701747,-0.757256,-0.522743
6,-0.926166,-0.299325,2.219337


In [23]:
# Fresh 6x3 random frame whose lower part is missing:
# column position 2 is NA from row 4 on, column position 1 from row 2 on.
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[4:, 2] = NA
df.iloc[2:, 1] = NA
df

Unnamed: 0,0,1,2
0,-0.508716,0.164735,1.043274
1,-0.776975,0.773177,0.40399
2,-0.294259,,0.902396
3,-0.369914,,0.265877
4,-0.951508,,
5,-1.198246,,


In [24]:
# Forward fill: propagate the last valid value in each column downwards.
# Uses DataFrame.ffill() because the `method=` keyword of fillna is
# deprecated in recent pandas releases.
df.ffill()

Unnamed: 0,0,1,2
0,-0.508716,0.164735,1.043274
1,-0.776975,0.773177,0.40399
2,-0.294259,0.773177,0.902396
3,-0.369914,0.773177,0.265877
4,-0.951508,0.773177,0.265877
5,-1.198246,0.773177,0.265877


In [25]:
# Forward fill, but fill at most 2 consecutive missing values per gap.
# Uses DataFrame.ffill() because the `method=` keyword of fillna is
# deprecated in recent pandas releases.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,-0.508716,0.164735,1.043274
1,-0.776975,0.773177,0.40399
2,-0.294259,0.773177,0.902396
3,-0.369914,0.773177,0.265877
4,-0.951508,,0.265877
5,-1.198246,,0.265877


In [26]:
# Float Series with missing values at positions 1 and 3.
data = pd.Series(data=[1.0, NA, 3.5, NA, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [27]:
# Fill the gaps with a statistic of the Series itself; here the mean
# (the median could be passed in the same way).
data.fillna(value=data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64