## 7.1 누락된 데이터 처리하기
-----

 * 누락 데이터의 처리는 pandas의 설계목표중 하나
 * pandas의 모든 기술통계는 누락데이터를 배제하고 처리한다.
 * 완벽한 것은 아니나 누락데이터를 NaN(실숫값)으로 취급하여 쉽게 찾을 수 있게 돕는다.


In [57]:
import pandas as pd
import numpy as np

In [24]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])

In [25]:
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [26]:
string_data.isnull() 

0    False
1    False
2     True
3    False
dtype: bool

#### pandas는 R에서 결측치를 NA(Not Available)로 취급하는 개념을 차용하였다.

1. NA데이터의 의미
  * 분석 과정에서 데이터가 존재하지 않았음.
  * 데이터가 존재하더라도, 수집 과정에서 검출되지 않았음.
  
  <br>
  
2. 데이터 정제과정에서 결측치의 용도
  * 데이터 수집과정에서의 실수 파악
  * 결측치로 인한 잠재적 편향 파악
  

In [27]:
string_data[0] = None

In [28]:
string_data.isnull() # Compared with the isnull() result of In [26], index 0 is now True as well.

0     True
1    False
2     True
3    False
dtype: bool

### 7.1.1 누락된 데이터 골라내기

#### dropna
non-null 데이터와 색인값만 들어있는 Series 반환

1. Series

In [58]:
from numpy import nan as NA

data = pd.Series([1, NA, 3.5, NA, 7])

In [34]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

위 코드와 아래 코드는 동일하다

In [36]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

2. DataFrame

In [44]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                    [NA, NA, NA], [NA, 6.5, 3.]])

In [45]:
cleaned = data.dropna()

In [46]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [47]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [48]:
# Drop only the rows in which every value is NA
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [49]:
data[4] = NA

In [50]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [51]:
# Drop only the columns in which every value is NA (axis=1 selects columns)
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [59]:
# Sample frame for the thresh option of dropna (used in the following cells);
# thresh keeps only the rows that have at least that many non-NA values
df = pd.DataFrame(np.random.rand(7,3))

In [60]:
df.iloc[:4, 1] = NA

In [61]:
df.iloc[:2, 2] = NA

In [55]:
df

Unnamed: 0,0,1,2
0,0.849948,,
1,0.192606,,
2,0.309329,,0.481012
3,0.030148,,0.473001
4,0.672973,0.561254,0.642846
5,0.201095,0.937186,0.685778
6,0.267239,0.131756,0.952688


In [62]:
df.dropna()

Unnamed: 0,0,1,2
4,0.179557,0.06722,0.555981
5,0.134145,0.055196,0.132323
6,0.649843,0.94589,0.658331


In [63]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.838671,,0.332038
3,0.771247,,0.683776
4,0.179557,0.06722,0.555981
5,0.134145,0.055196,0.132323
6,0.649843,0.94589,0.658331


### 7.1.2 결측치 채우기

* fillna 메서드를 사용한다
* 누락된 값을 새로운 값으로 채운 새로운 객체를 반환한다.

In [59]:
df.fillna(0)

Unnamed: 0,0,1,2
0,0.849948,0.0,0.0
1,0.192606,0.0,0.0
2,0.309329,0.0,0.481012
3,0.030148,0.0,0.473001
4,0.672973,0.561254,0.642846
5,0.201095,0.937186,0.685778
6,0.267239,0.131756,0.952688


In [62]:
# fillna에 dictionary값을 넘겨 각 컬럼마다 다른 값 채우기
df.fillna({1: 0.5, 2: 0}) 

Unnamed: 0,0,1,2
0,0.849948,0.5,0.0
1,0.192606,0.5,0.0
2,0.309329,0.5,0.481012
3,0.030148,0.5,0.473001
4,0.672973,0.561254,0.642846
5,0.201095,0.937186,0.685778
6,0.267239,0.131756,0.952688


In [63]:
# 새로운 객체 반환없이 기존객체 변경
df.fillna(0, inplace=True)

In [64]:
df

Unnamed: 0,0,1,2
0,0.849948,0.0,0.0
1,0.192606,0.0,0.0
2,0.309329,0.0,0.481012
3,0.030148,0.0,0.473001
4,0.672973,0.561254,0.642846
5,0.201095,0.937186,0.685778
6,0.267239,0.131756,0.952688


In [65]:
df = pd.DataFrame(np.random.randn(6,3))

In [67]:
df.iloc[2:, 1] = NA

In [68]:
df.iloc[4: ,2] = NA

In [69]:
df

Unnamed: 0,0,1,2
0,0.756351,0.278301,0.491457
1,-0.115797,-0.489699,-0.416984
2,0.96947,,0.396248
3,0.518545,,0.202706
4,1.209206,,
5,-0.570478,,


In [70]:
# Forward-fill: propagate the last valid value of each column down into NaNs.
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated
# since pandas 2.1.
df.ffill()

Unnamed: 0,0,1,2
0,0.756351,0.278301,0.491457
1,-0.115797,-0.489699,-0.416984
2,0.96947,-0.489699,0.396248
3,0.518545,-0.489699,0.202706
4,1.209206,-0.489699,0.202706
5,-0.570478,-0.489699,0.202706


In [77]:
# fillna(method='ffill') is deprecated since pandas 2.1; use ffill() instead.
df.ffill(limit=1)  # fill at most 1 consecutive NaN after each valid value

Unnamed: 0,0,1,2
0,0.756351,0.278301,0.491457
1,-0.115797,-0.489699,-0.416984
2,0.96947,-0.489699,0.396248
3,0.518545,,0.202706
4,1.209206,,0.202706
5,-0.570478,,


In [76]:
# fillna(method='ffill') is deprecated since pandas 2.1; use ffill() instead.
df.ffill(limit=2)  # fill at most 2 consecutive NaNs after each valid value

Unnamed: 0,0,1,2
0,0.756351,0.278301,0.491457
1,-0.115797,-0.489699,-0.416984
2,0.96947,-0.489699,0.396248
3,0.518545,-0.489699,0.202706
4,1.209206,,0.202706
5,-0.570478,,0.202706


In [80]:
# fillna(method='ffill') is deprecated since pandas 2.1; use ffill() instead.
df.ffill(limit=3)  # fill at most 3 consecutive NaNs after each valid value

Unnamed: 0,0,1,2
0,0.756351,0.278301,0.491457
1,-0.115797,-0.489699,-0.416984
2,0.96947,-0.489699,0.396248
3,0.518545,-0.489699,0.202706
4,1.209206,-0.489699,0.202706
5,-0.570478,,0.202706


In [81]:
# fillna(method='ffill') is deprecated since pandas 2.1; use ffill() instead.
df.ffill(limit=4)  # fill at most 4 consecutive NaNs after each valid value

Unnamed: 0,0,1,2
0,0.756351,0.278301,0.491457
1,-0.115797,-0.489699,-0.416984
2,0.96947,-0.489699,0.396248
3,0.518545,-0.489699,0.202706
4,1.209206,-0.489699,0.202706
5,-0.570478,-0.489699,0.202706


In [83]:
# 결측치를 평균값으로 대체하기
data = pd.Series([1., NA, 3.5, NA, 7])
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

## 7.2 데이터 변형
-----

### 7.2.1 중복 제거하기

In [66]:
# Build a frame that contains a duplicated row (the last two rows are identical).
data = pd.DataFrame({
    'k1': ['one', 'two', 'one', 'two', 'one', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})

In [67]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [68]:
# 중복된 row 찾기
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [69]:
# duplicated에서 False인 행만있는 DataFrame 반환
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [70]:
# 기존객체의 값은 그대로이다 = 새로운 객체 반환
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


drop_duplicates와 duplicated는 모든 컬럼에 적용된다(기본)

In [71]:
# Add a v1 column; the following cells pass a subset of columns to
# drop_duplicates to decide which rows count as duplicates
data['v1'] = range(7)

In [72]:
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [75]:
data.drop_duplicates(['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


### 7.2.2 함수, 매핑을 이용한 데이터 변형

In [118]:
# 수집한 육류에 대한 정보 
data = pd.DataFrame({'food': ['bacon', 'pulled pork' ,'bacon',
                             'Pastrami', 'corned beef', 'Bacon',
                             'pastrami', 'honey ham', 'nova lox'],
                      'ounces': [4, 3 , 12, 6, 7.5, 8, 3, 5, 6]})

In [119]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [128]:
# 각 육류별 동물
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
}

In [129]:
# 모든 요소를 소문자로 변경
lowercased = data['food'].str.lower()

In [130]:
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [131]:
# map을 이용하면 편리하게 lowercaed의 각 행을 meat_to_animal의 밸류값과 매칭되는 값으로 바꾼 새 컬럼을 만들 수 있다.
data['animal'] = lowercased.map(meat_to_animal)

In [132]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [134]:
# 함수로 lowercaed와 같은 작업 수행
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

### 7.2.3 값 치환하기 - replace()

In [135]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])

In [136]:
# -999로 누락된 값 표현
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [137]:
# replace로 -999 nan으로 치환한 새로운 Series 생성
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [138]:
# 기존객체 변경
data.replace(-999, np.nan, inplace=True)

In [139]:
data

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

#### 리스트안의 요소들 한번에 치환

In [140]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [141]:
# 리스트안의 요소들을 치환할 값들의 리스트로 각각 치환  
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [143]:
# dictionary 이용
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

### 7.2.4 축 색인 이름 바꾸기

In [145]:
data = pd.DataFrame(np.arange(12).reshape((3,4)),
                index=['Ohio', 'Colorado', 'New York'],
                columns=['one', 'two', 'three', 'four'])

In [146]:
# Helper used to rewrite axis labels: first four characters, upper-cased.
def transform(x):
    """Return the first four characters of *x* converted to upper case."""
    return x[:4].upper()
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [None]:
# 대문자로 변경된 축이름을 DataFrame Index에 바로 대입
data.index = data.index.map(transform)

In [152]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


#### rename을 사용하여 새로운 객체 생성

In [153]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [154]:
data.rename(index=str.title, columns=str.upper, inplace=True)

In [155]:
data

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


### 7.2.5 개별화와 양자화
* 연속된 데이터의 개별분할 
* 연속된 데이터의 그룹화

In [156]:
# 수업에 참여하는 학생들의 나이
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [157]:
bins = [18, 25, 35, 60, 100]

In [166]:
# bins에서 설정한 값을 기준으로 age를 그룹화한다.
cats = pd.cut(ages, bins)

In [159]:
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

#### Categorical 객체
* 그룹 이름이 담긴 배열
* codes 속성이 가리키는 데이터의 카테고리 이름을 categories 속성에 내부적으로 담고 있다.

In [160]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [163]:
# '(' marks an exclusive bound and ']' an inclusive one: left < x <= right
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [164]:
# Count how many values fall into each bin produced by pd.cut.
# The top-level pd.value_counts() is deprecated since pandas 2.1;
# wrap the Categorical in a Series and call its value_counts() instead.
pd.Series(cats).value_counts()

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

In [167]:
# right=False flips the closed side: '[' is inclusive, ')' exclusive (left <= x < right)
pd.cut(ages, [18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

#### labels 옵션으로 그룹 이름 직접 넘기기

In [168]:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']

In [169]:
pd.cut(ages, bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

#### cut함수에 그룹의 경곗값 넘기지 않고 그룹의 개수를 넘겨주어 그룹화
데이터에서 최솟값과 최댓값을 기준으로 균등한 길이의 그룹 자동 계산

In [170]:
data = np.random.rand(20)

In [171]:
# Passing an integer to cut splits the data range into that many equal-width bins
pd.cut(data, 4, precision=2) # 4 equal-width bins (not quartiles); precision=2 limits the bin labels to two decimal digits

[(0.042, 0.26], (0.26, 0.48], (0.48, 0.7], (0.042, 0.26], (0.042, 0.26], ..., (0.042, 0.26], (0.26, 0.48], (0.26, 0.48], (0.48, 0.7], (0.7, 0.92]]
Length: 20
Categories (4, interval[float64]): [(0.042, 0.26] < (0.26, 0.48] < (0.48, 0.7] < (0.7, 0.92]]

#### qcut

* 표본 변위치(quantile)를 기반으로 데이터를 나눠준다.
* 적당히 **비슷한 크기의 그룹**으로 나눌 수 있다.   
  (cut의 경우 데이터의 분산에 따라 각 그룹마다 데이터 수가 다르게 나뉘는 수가 많다.)

In [172]:
# qcut 사용
data = np.random.randn(1000) # 정규분포

In [173]:
cats = pd.qcut(data, 4) # 4분위로 분류

In [174]:
cats

[(-0.706, -0.0366], (-2.8649999999999998, -0.706], (-0.0366, 0.695], (-0.706, -0.0366], (0.695, 2.748], ..., (0.695, 2.748], (-0.0366, 0.695], (-0.0366, 0.695], (0.695, 2.748], (-0.706, -0.0366]]
Length: 1000
Categories (4, interval[float64]): [(-2.8649999999999998, -0.706] < (-0.706, -0.0366] < (-0.0366, 0.695] < (0.695, 2.748]]

In [175]:
# Count the members of each quantile bin. pd.value_counts() is deprecated
# since pandas 2.1, so go through Series.value_counts() instead.
pd.Series(cats).value_counts()

(-2.8649999999999998, -0.706]    250
(-0.706, -0.0366]                250
(-0.0366, 0.695]                 250
(0.695, 2.748]                   250
dtype: int64

In [177]:
# cut 함수처럼 변위치 직접 지정
pd.qcut(data, [0., 0.1, 0.5, 0.9, 1.])

[(-1.275, -0.0366], (-1.275, -0.0366], (-0.0366, 1.224], (-1.275, -0.0366], (-0.0366, 1.224], ..., (1.224, 2.748], (-0.0366, 1.224], (-0.0366, 1.224], (1.224, 2.748], (-1.275, -0.0366]]
Length: 1000
Categories (4, interval[float64]): [(-2.8649999999999998, -1.275] < (-1.275, -0.0366] < (-0.0366, 1.224] < (1.224, 2.748]]

#### cut과 qcut 같은 이산함수는 그룹분석과 변위치 다룰 때 유용

### 7.2.6 특잇값 찾고 제외하기
* 배열연산시 특잇값(outlier)을 제외/대체 하는 것 중요

In [83]:
data = pd.DataFrame(np.random.randn(1000, 4))

In [84]:
# 적절히 분산되어 있다.
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.050235,-0.008409,0.029291,0.015311
std,1.016561,1.020637,1.024169,1.010616
min,-2.748691,-3.154819,-3.310472,-3.232902
25%,-0.748204,-0.681881,-0.670044,-0.660514
50%,-0.089383,0.035024,0.013754,-0.022715
75%,0.597162,0.703507,0.708746,0.700731
max,3.005672,2.936937,2.776772,3.372054


#### 절댓값 3초과하는 것 찾기

In [85]:
col = data[2]

In [86]:
col[np.abs(col) > 3]

626   -3.310472
Name: 2, dtype: float64

In [97]:
# From the transposed frame, select the original columns that contain at least
# one value whose absolute value exceeds 3. The axis is passed by keyword
# because DataFrame.any() no longer accepts it positionally in pandas >= 2.0.
data.T[(np.abs(data) > 3).any(axis=0)]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,-1.03704,0.280114,0.348141,0.404818,-0.252379,-0.496607,-0.801455,0.010109,0.518206,1.259485,...,-1.175073,-0.355527,-0.891619,-1.670753,-0.172193,0.997816,0.704568,-0.847005,0.0883,2.256119
1,0.480368,-0.658197,0.323081,-1.106004,0.222831,-0.206471,1.258577,0.478437,-0.103478,-1.710111,...,-0.948752,0.432391,-1.325932,0.411172,1.052084,0.018778,0.499831,0.943988,-0.896484,1.506937
2,-0.732494,-0.640299,-0.282703,1.206464,-0.661767,0.91111,-1.00427,1.746622,-0.510934,-0.704893,...,-0.440009,-0.549347,1.92974,-0.47323,0.918225,-0.816405,-1.22783,-0.674538,0.741609,-0.946734
3,-0.962969,-1.392586,-0.756485,0.648723,-1.276991,-0.350624,-1.36672,0.960641,-0.785663,-0.519845,...,-1.033335,-0.638294,-0.166657,0.43504,-1.862281,0.761295,-0.021104,0.698034,-0.464865,1.372058


In [96]:
# Boolean mask per row: True where any value in the row has |x| > 3.
# The axis is passed by keyword because DataFrame.any() no longer accepts it
# positionally in pandas >= 2.0.
(np.abs(data) > 3).any(axis=1)

0      False
1      False
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998    False
999    False
Length: 1000, dtype: bool

In [186]:
# Cap values outside [-3, 3]: x < -3 becomes -3 and x > 3 becomes 3
data[np.abs(data) > 3 ] = np.sign(data) * 3

In [187]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.012877,0.034329,0.001791,-0.007808
std,1.031802,1.000609,0.992134,0.986311
min,-3.0,-3.0,-2.866316,-3.0
25%,-0.684784,-0.625814,-0.644758,-0.676616
50%,0.00411,0.025001,0.00939,-0.013527
75%,0.663414,0.704429,0.641861,0.635498
max,2.898936,3.0,3.0,3.0


In [189]:
# sign은 양수 = 1.0 음수 = -1.0 로 만들어 반환
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,1.0,1.0,1.0,-1.0
1,-1.0,-1.0,-1.0,1.0
2,1.0,-1.0,1.0,1.0
3,1.0,-1.0,1.0,1.0
4,1.0,1.0,-1.0,-1.0


### 7.2.7 치환과 임의 샘플링
#### numpy.random.permutation
* Series나 DataFrame의 row를 임의 순서로 재배치 한다.
* 순서를 바꾸고 싶은 만큼의 길이를 인자로 넘긴다.

In [190]:
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))

In [197]:
sampler = np.random.permutation(5)

In [198]:
sampler

array([0, 4, 2, 1, 3])

#### take
또는 iloc

In [None]:
df

In [203]:
# sampler의 행순서대로 정렬
df.take(sampler)

Unnamed: 0,0,1,2,3
0,0,1,2,3
4,16,17,18,19
2,8,9,10,11
1,4,5,6,7
3,12,13,14,15


#### Series(DataFrame).sample()

In [202]:
# 치환없이 일부만 임의 선택
df.sample(n=3)

Unnamed: 0,0,1,2,3
0,0,1,2,3
4,16,17,18,19
2,8,9,10,11


In [205]:
# replace옵션으로 반복 선택 허용하여 표본을 치환을 통해 생성
choices = pd.Series([5,7,-1,6,4])
draws = choices.sample(n=10, replace=True)

In [206]:
draws

1    7
0    5
4    4
4    4
3    6
4    4
0    5
2   -1
0    5
1    7
dtype: int64

### 7.2.8 표시자/더미 변수 계산하기

#### pandas.get_dummies
* 분류값을 '더미'나 '표시자'행렬로 전환
* One-Hot 인코딩

In [207]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                  'data1':range(6)})

In [208]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


#### get_dummies(DataFrame['컬럼명'], prefix)

In [209]:
dummies = pd.get_dummies(df['key'], prefix='key')

In [210]:
df_with_dummy = df[['data1']].join(dummies)

In [211]:
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


#### MovieLens 영화평점 데이터에서 각 장르별로 표시자값 추가하기
아래의 예제는 [GroupLens.org](https://grouplens.org/datasets/movielens/)에서 제공하는 Movielens 데이터셋 사용

In [212]:
mnames = ['movie_id', 'title', 'genres']

In [217]:
# Use a forward-slash path: it works on every OS, whereas the backslash in
# 'ml-latest-small\movies.csv' forms the invalid escape sequence '\m'.
# read_csv already defaults to sep=',', so read_table(sep=',') is unnecessary.
# NOTE: header=None means the file's own header line is loaded as data row 0.
movies = pd.read_csv('ml-latest-small/movies.csv', header=None, names=mnames)

In [218]:
movies[:10]

Unnamed: 0,movie_id,title,genres
0,movieId,title,genres
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,2,Jumanji (1995),Adventure|Children|Fantasy
3,3,Grumpier Old Men (1995),Comedy|Romance
4,4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,5,Father of the Bride Part II (1995),Comedy
6,6,Heat (1995),Action|Crime|Thriller
7,7,Sabrina (1995),Comedy|Romance
8,8,Tom and Huck (1995),Adventure|Children
9,9,Sudden Death (1995),Action


In [300]:
all_genres = []

In [301]:
# Collect every genre label of every movie (duplicates included);
# the unique genre list is derived from this list afterwards.
for x in movies.genres:
    # The CSV header line was loaded as a data row; skip it.
    if x == 'genres':
        continue
    all_genres.extend(x.split('|'))
print(all_genres)
    

['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy', 'Adventure', 'Children', 'Fantasy', 'Comedy', 'Romance', 'Comedy', 'Drama', 'Romance', 'Comedy', 'Action', 'Crime', 'Thriller', 'Comedy', 'Romance', 'Adventure', 'Children', 'Action', 'Action', 'Adventure', 'Thriller', 'Comedy', 'Drama', 'Romance', 'Comedy', 'Horror', 'Adventure', 'Animation', 'Children', 'Drama', 'Action', 'Adventure', 'Romance', 'Crime', 'Drama', 'Drama', 'Romance', 'Comedy', 'Comedy', 'Action', 'Comedy', 'Crime', 'Drama', 'Thriller', 'Comedy', 'Crime', 'Thriller', 'Crime', 'Drama', 'Horror', 'Mystery', 'Thriller', 'Action', 'Crime', 'Thriller', 'Drama', 'Sci-Fi', 'Drama', 'Romance', 'Drama', 'Children', 'Drama', 'Drama', 'Romance', 'Adventure', 'Drama', 'Fantasy', 'Mystery', 'Sci-Fi', 'Crime', 'Drama', 'Drama', 'Mystery', 'Sci-Fi', 'Thriller', 'Children', 'Drama', 'Crime', 'Drama', 'Children', 'Comedy', 'Comedy', 'Romance', 'Drama', 'Drama', 'War', 'Action', 'Crime', 'Drama', 'Drama', 'Action', 'Adventure',

In [302]:
genres = pd.unique(all_genres)

In [303]:
genres

array(['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Mystery', 'Sci-Fi', 'War', 'Musical', 'Documentary', 'IMAX',
       'Western', 'Film-Noir', '(no genres listed)'], dtype=object)

In [304]:
# 표시자 DataFrame 생성(0으로 초기화)
zero_metrix = np.zeros((len(movies), len(genres)))

In [305]:
dummies = pd.DataFrame(zero_metrix, columns=genres)
dummies

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9738,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9739,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9740,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9741,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [312]:
# 각 영화 순회하며 dummies 각 row항목을 1로 설정
gen = movies.genres[1]
gen.split('|')

['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']

In [313]:
dummies.columns.get_indexer(gen.split('|'))

array([0, 1, 2, 3, 4], dtype=int64)

In [314]:
# Set the indicator columns of each movie's genres to 1 via positional .iloc indexing.
for i, gen in enumerate(movies.genres):
    if gen != 'genres':  # the CSV header line was loaded as a data row
        indices = dummies.columns.get_indexer(gen.split('|'))
        dummies.iloc[i, indices] = 1

In [315]:
movies_windic = movies.join(dummies.add_prefix('Genre_'))

In [319]:
movies_windic.iloc[1]

movie_id                                                              1
title                                                  Toy Story (1995)
genres                      Adventure|Animation|Children|Comedy|Fantasy
Genre_Adventure                                                     1.0
Genre_Animation                                                     1.0
Genre_Children                                                      1.0
Genre_Comedy                                                        1.0
Genre_Fantasy                                                       1.0
Genre_Romance                                                       0.0
Genre_Drama                                                         0.0
Genre_Action                                                        0.0
Genre_Crime                                                         0.0
Genre_Thriller                                                      0.0
Genre_Horror                                                    

#### get_dummies와 cut의 조합
* 이 같은 이산함수의 조합을 잘 활용하면 통계 애플리케이션에서 유용하다

In [321]:
# 난수 seed 값을 지정하여 값이 불변하게 함.
np.random.seed(12345)

In [322]:
values = np.random.rand(10)

In [323]:
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [324]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]

In [325]:
pd.get_dummies(pd.cut(values, bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


### To be continued...
7.3부터는 다음 시간에 

### 소정님 물어보신 것

In [76]:
import pandas as pd
# Forward slashes keep the path portable and avoid the invalid '\m' escape
# that the backslash form 'ml-latest-small\movies.csv' contains.
# With chunksize set, read_csv returns an iterator of DataFrames rather than
# one DataFrame.
chunker = pd.read_csv('ml-latest-small/movies.csv', chunksize=1000)
chunker

<pandas.io.parsers.TextFileReader at 0x1d143a8c6d0>

In [80]:
next(chunker)

Unnamed: 0,movieId,title,genres
3000,4016,"Emperor's New Groove, The (2000)",Adventure|Animation|Children|Comedy|Fantasy
3001,4017,Pollock (2000),Drama
3002,4018,What Women Want (2000),Comedy|Romance
3003,4019,Finding Forrester (2000),Drama
3004,4020,"Gift, The (2000)",Thriller
...,...,...,...
3995,5636,Welcome to Collinwood (2002),Comedy|Crime
3996,5637,Flirting (1991),Drama
3997,5638,Godzilla vs. Mothra (Mosura tai Gojira) (1964),Action|Adventure|Fantasy|Sci-Fi
3998,5640,"Godzilla, King of the Monsters! (Kaijû-ô Gojir...",Horror|Sci-Fi


In [54]:
print('__iter__' in dir(chunker), '__next__' in dir(chunker)) 

True True


In [None]:
list(chunker)

In [24]:
import csv

In [26]:
class my_dialect(csv.Dialect):
    """CSV dialect with ';' as field separator and '"' as quote character."""

    # Field and quote characters
    delimiter = ';'
    quotechar = '"'
    # Quote a field only when it contains a special character
    quoting = csv.QUOTE_MINIMAL
    # Line ending emitted by csv.writer
    lineterminator = '\n'

with open('ml-latest-small\movies.csv') as f:
    reader = csv.reader(f, dialect=my_dialect)