# Ch7. 데이터 정제 및 준비

## 7.1 누락된 데이터 처리하기

In [3]:
import pandas as pd
import numpy as np

In [4]:
string_data= pd.Series(['aardvark', 'artichoke', 'np.nan', 'avocado'])
string_data

0     aardvark
1    artichoke
2       np.nan
3      avocado
dtype: object

In [5]:
string_data.isnull()

0    False
1    False
2    False
3    False
dtype: bool

In [6]:
string_data[0]= None

In [7]:
string_data.isnull()

0     True
1    False
2    False
3    False
dtype: bool

### 누락된 데이터 골라내기

In [9]:
from numpy import nan as NA

In [10]:
data = pd.Series([1,NA,3.5,NA,7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [11]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [12]:
data=pd.DataFrame([[1.,6.5,3.], [1., NA, NA], [NA, NA, NA], [NA, 6.5,3]])

In [13]:
cleaned =data.dropna()

In [14]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [15]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [16]:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [18]:
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [19]:
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


### 결측치 채우기

In [24]:
df= pd.DataFrame(np.random.randn(6,3))

In [26]:
df.iloc[2:, 1]= NA

In [27]:
df.iloc[4:, 2]= NA

In [28]:
df

Unnamed: 0,0,1,2
0,-1.790515,0.401314,0.599313
1,0.362201,0.497891,-0.530481
2,-1.975051,,0.51071
3,1.464568,,1.637272
4,0.812094,,
5,-0.455247,,


In [29]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,-1.790515,0.401314,0.599313
1,0.362201,0.497891,-0.530481
2,-1.975051,0.497891,0.51071
3,1.464568,0.497891,1.637272
4,0.812094,0.497891,1.637272
5,-0.455247,0.497891,1.637272


In [30]:
df.fillna(method='ffill', limit=2)

Unnamed: 0,0,1,2
0,-1.790515,0.401314,0.599313
1,0.362201,0.497891,-0.530481
2,-1.975051,0.497891,0.51071
3,1.464568,0.497891,1.637272
4,0.812094,,1.637272
5,-0.455247,,1.637272


In [31]:
data=pd.Series([1., NA, 3.5, NA, 7])

In [32]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

## 7.2 데이터 변형

### 중복 제거하기

In [33]:
data=pd.DataFrame({'k1':['one','two']* 3 + ['two'],
                  'k2': [1,1,2,3,3,4,4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [34]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [35]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [36]:
data['v1'] = range(7)

In [38]:
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [39]:
data.drop_duplicates(['k1','k2'],keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


#### 값 치환하기

In [41]:
data= pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [42]:
data.replace(-999,np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [43]:
data.replace([-999,-1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

### 특잇값을 찾고 제외하기

In [45]:
data= pd.DataFrame(np.random.randn(1000,4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.011367,0.001288,-0.017609,0.060692
std,0.996,1.022533,1.024945,1.013971
min,-3.079265,-3.218105,-3.10873,-3.413984
25%,-0.674909,-0.656179,-0.711331,-0.589084
50%,-0.032342,-0.000784,-0.014145,0.083719
75%,0.641179,0.688185,0.664706,0.787818
max,4.503516,4.492664,2.91709,3.267003


In [46]:
col = data[2]

In [47]:
col[np.abs(col) >3]

646   -3.10873
Name: 2, dtype: float64

In [48]:
data[(np.abs(data)>3).any(1)]

Unnamed: 0,0,1,2,3
337,3.437731,0.05497,0.18275,-0.483145
392,4.503516,-0.112261,-0.11815,0.958596
420,0.286917,-3.218105,-1.47506,1.166591
472,-0.532348,4.237103,1.074161,-0.616033
518,2.141006,3.085045,-1.615903,-1.391623
552,-0.137809,4.492664,0.536954,0.23929
572,-3.079265,-1.795574,-1.863986,0.394301
610,1.273633,-3.145292,0.064535,-1.364474
642,3.223466,1.495385,-1.001761,0.471477
646,0.070894,-0.82379,-3.10873,0.265818


In [49]:
data[np.abs(data)>3] = np.sign(data)*3

In [51]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.013453,-0.001004,-0.0175,0.060738
std,0.987915,1.010775,1.024622,1.011524
min,-3.0,-3.0,-3.0,-3.0
25%,-0.674909,-0.656179,-0.711331,-0.589084
50%,-0.032342,-0.000784,-0.014145,0.083719
75%,0.641179,0.688185,0.664706,0.787818
max,3.0,3.0,2.91709,3.0


In [52]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,-1.0,-1.0,-1.0,-1.0
1,-1.0,1.0,1.0,1.0
2,1.0,-1.0,1.0,-1.0
3,1.0,-1.0,-1.0,-1.0
4,1.0,1.0,-1.0,-1.0


#### 치환과 임의 샘플링

In [53]:
df= pd.DataFrame(np.arange(5*4).reshape((5,4)))

In [54]:
sampler=np.random.permutation(5)
sampler

array([2, 1, 0, 4, 3])

In [55]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [56]:
df.take(sampler)

Unnamed: 0,0,1,2,3
2,8,9,10,11
1,4,5,6,7
0,0,1,2,3
4,16,17,18,19
3,12,13,14,15


In [57]:
df.sample(n=3)

Unnamed: 0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
4,16,17,18,19


## 7.3 문자열 다루기

### 문자열 객체 메서드

In [58]:
val = 'a, b, guido'

In [59]:
val.split(',')

['a', ' b', ' guido']

In [60]:
pieces = [x.strip() for x in val.split(',')]
pieces

['a', 'b', 'guido']

In [61]:
first, second, third, = pieces

In [63]:
first + '::' + second  + '::' + third

'a::b::guido'

In [64]:
'::'.join(pieces)

'a::b::guido'

In [65]:
'guido' in val

True

In [66]:
val.index(',')

1

In [67]:
val.find(':')

-1

In [68]:
val.count(',')

2

In [69]:
val.replace(',', '::')

'a:: b:: guido'

In [70]:
val.replace(',', '')

'a b guido'

### Pandas의 벡터화된 문자열 함수

In [72]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
       'Rob': 'rob@gmail.com','Wes': np.nan}

In [74]:
data= pd.Series(data)
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [75]:
data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [76]:
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object