# Chapter7 ~ 시험범위인 7.2까지

# Data Cleaning and Preparation

In [6]:
import numpy as np
import pandas as pd
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)

# 누락된 데이터 처리하기

산술 데이터에 한해 pandas는 누락된 데이터를 실숫값인 NaN으로 취급한다.

In [7]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [8]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

분석 애플리케이션에서 NA 데이터는 데이터가 존재하지 않거나, 존재하더라도 데이터를 수집하는 과정 등에서 검출되지 않았음을 의미.

분석을 위해 데이터를 정제하는 과정에서 결측치 자체를 데이터 수집 과정에서 실수나 결측치로 인한 잠재적인 편향을 찾아내는 수단으로 인식하는 것은 중요하다.

파이썬이 내장 None 값 또한 NA 값으로 취급한다.

In [9]:
string_data[0] = None
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

# 누락된 데이터 골라내기

pandas.isnull이나 불리언 색인을 사용해 직접 손으로 제거하는 것도 한가지 방법이지만, dropna를 매우 유용하게 사용할 수 있다.

Series에 driopna 메서드를 적용하는 널이 아닌 데이터와 색인값만 들어 있는 Series를 반환한다.

In [10]:
from numpy import nan as NA
data = pd.Series([1, NA, 3.5, NA, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [11]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [12]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

DataFrame 객체의 경우에는 조금 복잡한데, 모두 NA 값인 로우나 컬럼을 제외시키거나 NA값을 하나라도 포함하고 있는 로우나 컬럼을 제외시킬 수 있다.

dropna는 기본적으로 NA 값을 하나라도 포함하고 있는 로우를 제외시킨다.

In [13]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                     [NA, NA, NA], [NA, 6.5, 3.]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [14]:
cleaned = data.dropna()
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


how='all' 옵션을 넘기면 모두 NA 값인 로우만 제외시킨다.

In [15]:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


컬럼을 제외시키는 방법도 동일하게 동작한다. 옵션으로 axis = 1을 넘겨주면 된다.

In [16]:
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [17]:
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


DataFrame의 로우를 제외시키는 방법은 시계열 데이터에 주로 사용되는 경향이 있다.

몇 개 이상의 값이 들어있는 로우만 살펴보고 싶다면 thresh 인자에 원하는 값을 넘기면 된다.

In [18]:
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df
df.dropna()
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.092908,,0.769023
3,1.246435,,-1.296221
4,0.274992,0.228913,1.352917
5,0.886429,-2.001637,-0.371843
6,1.669025,-0.43857,-0.539741


In [19]:
df.dropna()

Unnamed: 0,0,1,2
4,0.274992,0.228913,1.352917
5,0.886429,-2.001637,-0.371843
6,1.669025,-0.43857,-0.539741


In [20]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.092908,,0.769023
3,1.246435,,-1.296221
4,0.274992,0.228913,1.352917
5,0.886429,-2.001637,-0.371843
6,1.669025,-0.43857,-0.539741


# 결측치 채우기

누락된 값을 제외시키지 않고(잠재적으로 다른 데이터도 함께 버려질 가능성이 있다) 데이터상의 '구멍'을 어떻게든 매우고 싶은 경우가 있다.

이 경우 fillna 메서드를 활용하면 되는데, fillna 메서드에 채워 넣고 싶은 값을 넘겨주면 된다.

In [21]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.204708,0.0,0.0
1,-0.55573,0.0,0.0
2,0.092908,0.0,0.769023
3,1.246435,0.0,-1.296221
4,0.274992,0.228913,1.352917
5,0.886429,-2.001637,-0.371843
6,1.669025,-0.43857,-0.539741


fillna에 사전값을 넘겨서 각 컬럼마다 다른 값을 채울 수도 있다.

In [22]:
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,-0.204708,0.5,0.0
1,-0.55573,0.5,0.0
2,0.092908,0.5,0.769023
3,1.246435,0.5,-1.296221
4,0.274992,0.228913,1.352917
5,0.886429,-2.001637,-0.371843
6,1.669025,-0.43857,-0.539741


fillna는 새로운 객체를 반환하지만 다음처럼 기존 객체를 변경할 수도 있다.

In [23]:
_ = df.fillna(0, inplace=True)
df

Unnamed: 0,0,1,2
0,-0.204708,0.0,0.0
1,-0.55573,0.0,0.0
2,0.092908,0.0,0.769023
3,1.246435,0.0,-1.296221
4,0.274992,0.228913,1.352917
5,0.886429,-2.001637,-0.371843
6,1.669025,-0.43857,-0.539741


재색인에서 사용 가능한 보간 메서드는 fillna 메서드에서도 사용가능하다.

In [24]:
df = pd.DataFrame(np.random.randn(6, 3))
df

Unnamed: 0,0,1,2
0,0.476985,3.248944,-1.021228
1,-0.577087,0.124121,0.302614
2,0.523772,0.00094,1.34381
3,-0.713544,-0.831154,-2.370232
4,-1.860761,-0.860757,0.560145
5,-1.265934,0.119827,-1.063512


In [25]:
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df

Unnamed: 0,0,1,2
0,0.476985,3.248944,-1.021228
1,-0.577087,0.124121,0.302614
2,0.523772,,1.34381
3,-0.713544,,-2.370232
4,-1.860761,,
5,-1.265934,,


In [26]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,0.476985,3.248944,-1.021228
1,-0.577087,0.124121,0.302614
2,0.523772,0.124121,1.34381
3,-0.713544,0.124121,-2.370232
4,-1.860761,0.124121,-2.370232
5,-1.265934,0.124121,-2.370232


In [27]:
df.fillna(method='ffill', limit=2)

Unnamed: 0,0,1,2
0,0.476985,3.248944,-1.021228
1,-0.577087,0.124121,0.302614
2,0.523772,0.124121,1.34381
3,-0.713544,0.124121,-2.370232
4,-1.860761,,-2.370232
5,-1.265934,,-2.370232


fillna는 Series의 평균값이나 중간값을 전달 할 수도 있다.

In [28]:
data = pd.Series([1., NA, 3.5, NA, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [29]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

# 데이터 변형

## 중복 제거하기

In [30]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


DataFrame의 duplicated 메서드는 각 로우가 중복인지 아닌지 알려주는 불리언 Series를 반환한다.

In [31]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

drop_duplicates는 duplicated 배열이 False인 DataFrame을 반환한다.

In [32]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


이 두 메서드는 기본적으로 모든 컬럼에 적용되며 중복을 찾아내기 위한 부분합을 따로 지정해 줄 수도 있다. 

새로운 컬럼을 하나 추가하고 'k1' 컬럼에 기반해서 중복을 걸러내려면 다음과 같이 해야한다.

In [33]:
data['v1'] = range(7)
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


duplicated와 drop_duplicates는 기본적으로 처음 발견된 값을 유지한다.

keep='last' 옵션을 넘기면 마지막으로 발견된 값을 반환한다.

In [34]:
data.drop_duplicates(['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


# 함수나 매핑을 이용해서 데이터 변형하기

In [35]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                              'Pastrami', 'corned beef', 'Bacon',
                              'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


# ------------------------------------------------------------------

# 교수님 설명

In [36]:
meat_to_animal = {
  'bacon': 'pig',
  'pulled pork': 'pig',
  'pastrami': 'cow',
  'corned beef': 'cow',
  'honey ham': 'pig',
  'nova lox': 'salmon'
}

In [37]:
meat_to_animal[data['food'][3]] # 밑에있는 딕셔너리에서 못찾음 따라서 오류남

KeyError: 'Pastrami'

In [38]:
meat_to_animal[data['food'].str.lower()[3]] # 그래서 str.lower()로 소문자로 바꿔줌

'cow'

Series의 map 메서드는 사전류의 객체나 어떤 함수를 받을 수 있는데, 위 데이터에는 육류 이름에 대소문자가 섞여 있는 사소한 문제가 있으므로 str.lower 메서드를 사용해서 모두 소문자로 변경한다.

In [39]:
lowercased = data['food'].str.lower()
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [40]:
data['animal'] = lowercased.map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [41]:
meat_to_animal[data['food'].str.lower()[0]]

'pig'

In [42]:
data['food'].map(lambda x: meat_to_animal[x.lower()])
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [43]:
data['animal'] = data['food'].map(lambda x: meat_to_animal[x.lower()])
# meat_to_animal이 x.lower()안에 소문자로 바뀌고 딕셔너리 안으로 들어감
# for loop문 대신에 'lambda x:' 가 소문자로 변환된 객체로 바뀌어서 전체로 적용이 됨 
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


map 메서드를 사용하면 데이터의 요소별 변환 및 데이터를 다듬든 작업을 편리하게 수행 가능

# 값 치환하기

fillna 메서드를 사용해서 누락된 값을 채우는 일은 일반적인 값 치환 작업이라 볼 수 있다.

map 메서드는 한 객체 안에서 값의 부분집합을 변경하는데 사용했다면 replace 메서드는 같은 작업에 대해 좀 더 간단하고 유연한 방법을 제공한다.

In [44]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

-999는 누락된 데이터를 나타내기 위한 값이다. replace 메서드를 이용하면 이 값을 pandas에서 인식할 수 있는 NA 값으로 치환한 새로운 Series를 생성할 수 있다. (인자로 inplace = True를 넘기지 않았다면).

In [45]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

여러 개의 값을 한 번에 치환하려면 하나의 값 대신 치환하려는 값의 리스트를 넘기면 된다.

In [46]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

치환하려는 값마다 다른 값으로 치환하려면 누락된 값 대신 새로 지정할 값의 리스트를 사용하면 된다.

In [47]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

두 개의 리스트 대신 사전을 이용하는 것도 가능하다.

In [48]:
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

# 축 색인 이름 바꾸기

In [49]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


Series와 마찬가지로 축 색인에도 map 메서드가 있다.

In [50]:
transform = lambda x: x[:4].upper()
transform

<function __main__.<lambda>(x)>

In [51]:
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

대문자로 변경된 축 이름을 DataFrame의 index에 바로 대입할 수 있다.

In [52]:
data.index = data.index.map(transform)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


원래 객체를 변경하지 않고 새로운 객체를 생성하려면 rename 메서드를 사용한다.

In [53]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


특히 rename 메서드는 사전 형식의 객체를 이용해서 축 이름 중 일부만 변경하는 것도 가능하다.

In [54]:
data.rename(index={'OHIO': 'INDIANA'},
            columns={'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


rename 메서드를 사용하면 DataFrame을 직접 복사해서 index와 columns 속성을 갱신할 필요없이 바로 변경할 수 있다. 

원본 데이터를 바로 변경하려면 inplace=True 옵션을 넘겨주면 된다.

In [55]:
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


# 개별화와 양자화

In [56]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
ages

[20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

이 데이터를 pandas의 cut 함수를 이용해서 18-25, 25-35, 35-60, 60 이상 그룹으로 나누어보자.

In [57]:
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

pandas에서 반환하는 객체는 Categorical이라는 특수한 객체다. 

결과에서 보이는 그룹은 pandas.cut으로 계산된 것이다. 이 객체는 그룹 이름이 담긴 배열이라고 생각하면 된다.

이 Categorical 객체는 codes 속성에 있는 ages 데이터에 대한 카테고리 이름을 categories라는 배열에 내부적으로 담고 있다.

In [58]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [59]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [60]:
pd.value_counts(cats)

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

pd.value_counts(cats)는 pandas.cut 결과에 대한 그룹 수다.

간격을 나타내는 표기법은 중괄호로 시작해서 대괄호로 끝나는데 중괄호 쪽의 값은 포함하지 않고 대괄호 쪽의 값은 포함하는 간격을 나타낸다.

right=False를 넘겨서 중괄호 대신 대괄호 쪽이 포함되지 않도록 바꿀 수 있다.

In [61]:
pd.cut(ages, [18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64, left]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

# -----------------------------------------------

# Quiz

위의 ages 객체를 10대, 20대, 30대, 40대, 50대, 60대 이상으로 구분해 봅시다.

In [62]:
ages = [20, 22, 25, 27, 21, 23, 23, 37, 31, 61, 45, 41, 32]
group_names = ['10s', '20s', '30s', '40s', '50s', 'over_60']
bins = [10, 20, 30, 40, 50, 60, 100]
pd.cut(ages, bins, labels = group_names, right=False)

['20s', '20s', '20s', '20s', '20s', ..., '30s', 'over_60', '40s', '40s', '30s']
Length: 13
Categories (6, object): ['10s' < '20s' < '30s' < '40s' < '50s' < 'over_60']

In [63]:
group_names = ['Youth', 'YoungAdult', 'NiddleAged', 'Senior']
bins = [18, 25, 35, 60, 100]
pd.cut(ages, bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'NiddleAged', 'NiddleAged', 'YoungAdult']
Length: 13
Categories (4, object): ['Youth' < 'YoungAdult' < 'NiddleAged' < 'Senior']

In [64]:
data = np.random.rand(20)
data

array([0.4896, 0.3773, 0.8486, 0.9111, 0.3838, 0.3155, 0.5684, 0.1878,
       0.1258, 0.6876, 0.7996, 0.5735, 0.9732, 0.6341, 0.8884, 0.4954,
       0.3516, 0.7142, 0.5039, 0.2256])

In [65]:
pd.cut(data, 4, precision=2)

[(0.34, 0.55], (0.34, 0.55], (0.76, 0.97], (0.76, 0.97], (0.34, 0.55], ..., (0.34, 0.55], (0.34, 0.55], (0.55, 0.76], (0.34, 0.55], (0.12, 0.34]]
Length: 20
Categories (4, interval[float64, right]): [(0.12, 0.34] < (0.34, 0.55] < (0.55, 0.76] < (0.76, 0.97]]

# Quiz

학점을 매기는데, A학점은 상위 30%, B학점은 30~70%, C학점은 70% 이하에게 준다고 했을 때, data의 학점을 매겨봅시다.

# ------------------------------------------------

labels 옵션으로 그룹의 이름을 직접 넘겨줄 수도 있다.

In [66]:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 13
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

만약 cut 함수에 명시적으로 그룹의 경곗값을 넘기지 않고 그룹의 개수를 넘겨주면 데이터에서 최솟값과 최댓값을 기준으로 균등한 길이의 그룹을 자동으로 계산한다.

In [67]:
data = np.random.rand(20)
pd.cut(data, 4, precision=2)

[(0.11, 0.32], (0.74, 0.95], (0.32, 0.53], (0.74, 0.95], (0.74, 0.95], ..., (0.74, 0.95], (0.74, 0.95], (0.74, 0.95], (0.11, 0.32], (0.74, 0.95]]
Length: 20
Categories (4, interval[float64, right]): [(0.11, 0.32] < (0.32, 0.53] < (0.53, 0.74] < (0.74, 0.95]]

precision=2 옵션은 소수점 아래 2자리까지로 제한한다.

이를 위한 가장 적합한 함수로 qcut이 있는데 표본 변위치를 기반으로 데이터를 나눠준다.

cut 함수를 사용하면 데이터의 분산에 따라 각각의 그룹마다 데이터 수가 다르게 나뉘는 경우가 많다. qcut은 표준 변위치를 사용하기 때문에 적당히 같은 크기의 그룹으로 나눌 수 있다.

In [68]:
data = np.random.randn(1000)  # Normally distributed
cats = pd.qcut(data, 4)  # 4분위로 분류
cats

[(-0.0309, 0.613], (0.613, 3.928], (-0.686, -0.0309], (-0.686, -0.0309], (-2.9499999999999997, -0.686], ..., (-2.9499999999999997, -0.686], (-2.9499999999999997, -0.686], (-0.686, -0.0309], (0.613, 3.928], (-2.9499999999999997, -0.686]]
Length: 1000
Categories (4, interval[float64, right]): [(-2.9499999999999997, -0.686] < (-0.686, -0.0309] < (-0.0309, 0.613] < (0.613, 3.928]]

In [69]:
pd.value_counts(cats)

(-2.9499999999999997, -0.686]    250
(-0.686, -0.0309]                250
(-0.0309, 0.613]                 250
(0.613, 3.928]                   250
dtype: int64

cut 와 유사하게 변위치를 직접 지정해줄 수 있다.(변위치는 0부터 1까지다.)

In [70]:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

[(-0.0309, 1.287], (-0.0309, 1.287], (-1.187, -0.0309], (-1.187, -0.0309], (-1.187, -0.0309], ..., (-1.187, -0.0309], (-1.187, -0.0309], (-1.187, -0.0309], (1.287, 3.928], (-2.9499999999999997, -1.187]]
Length: 1000
Categories (4, interval[float64, right]): [(-2.9499999999999997, -1.187] < (-1.187, -0.0309] < (-0.0309, 1.287] < (1.287, 3.928]]

# 특잇값을 찾고 제외하기

배열 연산을 수행할 때는 특잇값을 제외하거나 적당한 값으로 대체하는 것이 중요하다.

In [71]:
data = pd.DataFrame(np.random.randn(1000, 4))
data

Unnamed: 0,0,1,2,3
0,-0.669619,0.781199,-0.395813,-0.180737
1,1.337508,-0.416584,0.329313,-0.732599
2,0.137889,-0.719672,1.927640,-0.315813
3,-1.869341,-0.428579,-0.017905,-1.720285
4,0.010867,0.824170,-0.489180,1.832492
...,...,...,...,...
995,-0.485783,1.181563,-2.314042,-0.865834
996,-0.457931,0.240360,-1.839647,0.578781
997,-1.365114,0.117161,-0.138244,-1.251215
998,-1.088757,-2.093583,-0.111520,-0.475629


In [72]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.046794,0.025429,-0.006855,-0.047439
std,0.997575,1.009162,0.993734,0.997187
min,-3.64586,-3.184377,-3.745356,-3.428254
25%,-0.599807,-0.612162,-0.697084,-0.743886
50%,0.043663,-0.01107,-0.029924,-0.086309
75%,0.746527,0.695298,0.694459,0.624413
max,2.653656,3.525865,2.735527,3.366626


이 DataFrame의 한 컬럼에서 절댓값이 3을 초과하는 값을 찾아내자

In [73]:
col = data[2]
col

0     -0.395813
1      0.329313
2      1.927640
3     -0.017905
4     -0.489180
         ...   
995   -2.314042
996   -1.839647
997   -0.138244
998   -0.111520
999    0.345292
Name: 2, Length: 1000, dtype: float64

In [74]:
col[np.abs(col) > 3]

37    -3.399312
132   -3.745356
Name: 2, dtype: float64

절댓값이 3을 초과하는 값이 들어 있는 모든 로우를 선택하려면 불리언 DataFrame에서 any 메서드를 사용하면 된다.

In [75]:
data[(np.abs(data) > 3).any(1)]

Unnamed: 0,0,1,2,3
37,0.457246,-0.025907,-3.399312,-0.974657
56,1.951312,3.260383,0.963301,1.201206
132,0.508391,-0.196713,-3.745356,-1.520113
231,-0.242459,-3.05699,1.918403,-0.578828
254,0.682841,0.326045,0.425384,-3.428254
318,1.179227,-3.184377,1.369891,-1.074833
540,-3.548824,1.553205,-2.186301,1.277104
631,-0.578093,0.193299,1.397822,3.366626
778,-0.207434,3.525865,0.28307,0.544635
799,-3.64586,0.255475,-0.549574,-1.907459


이 기준대로 쉽게 값을 선택할 수 있으며, 아래 코드로 -3이나 3을 초과하는 값을 -3 또는 3으로 지정할 수 있다.

In [76]:
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.047988,0.024884,-0.005711,-0.047378
std,0.993553,1.005924,0.989915,0.994634
min,-3.0,-3.0,-3.0,-3.0
25%,-0.599807,-0.612162,-0.697084,-0.743886
50%,0.043663,-0.01107,-0.029924,-0.086309
75%,0.746527,0.695298,0.694459,0.624413
max,2.653656,3.0,2.735527,3.0


np.sign(data)는 data 값이 양수인지 음수인지에 따라 1이나 -1이 담긴 배열을 반환한다.

In [77]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,-1.0,1.0,-1.0,-1.0
1,1.0,-1.0,1.0,-1.0
2,1.0,-1.0,1.0,-1.0
3,-1.0,-1.0,-1.0,-1.0
4,1.0,1.0,-1.0,1.0


# Quiz

data에서 모든 컬럼의 값이 양수인 행을 추출해 봅시다.

In [78]:
data1 = pd.DataFrame(np.random.randn(1000, 4))
data1

Unnamed: 0,0,1,2,3
0,-1.394656,-0.281825,0.109331,-0.444135
1,1.442611,0.342310,0.299675,-0.550621
2,1.809901,0.025721,-0.161442,2.771398
3,0.460863,1.694641,-0.935589,0.766984
4,-2.040757,-0.882594,-1.131414,0.490516
...,...,...,...,...
995,-0.887796,0.372837,-0.850192,-0.686489
996,0.207363,-0.266300,0.253912,-1.270547
997,0.682816,-0.648243,0.433294,0.157510
998,1.386222,0.010130,0.457039,-0.338315


In [79]:
data1[np.abs(data1) > 3] = np.sign(data1) * 3
data1.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.002955,-0.001256,-0.05686,-0.050641
std,1.03888,1.014022,0.978234,1.001213
min,-3.0,-3.0,-3.0,-3.0
25%,-0.740508,-0.696704,-0.753809,-0.736424
50%,0.016668,0.034523,-0.054956,-0.038821
75%,0.700698,0.702548,0.616784,0.601663
max,3.0,2.961194,3.0,2.916153


# Quiz

다음 데이터프레임에서 3번 컬럼의 상위 5% 값을 제거해 봅시다.

In [80]:
data = pd.DataFrame(np.random.randn(1000, 4))
data

np.quantile(data[3],0.95)

1.5931306853307015

In [81]:
outlier = np.quantile(data[3],0.95)
outlier

1.5931306853307015

# 치환과 임의 샘플링

numpy.random.permutation 함수를 이용하면 Series나 DataaFrame의 로우를 쉽게 임의 순서로 재배치 할 수 있다.

순서를 바꾸고 싶은 만큼의 길이를 permuatation 함수로 넘기면 바뀐 순서가 담긴 정수 배열이 생성된다.

In [82]:
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
sampler = np.random.permutation(5)
sampler

array([3, 2, 4, 0, 1])

In [83]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [84]:
df.take(sampler)

Unnamed: 0,0,1,2,3
3,12,13,14,15
2,8,9,10,11
4,16,17,18,19
0,0,1,2,3
1,4,5,6,7


치환 없이 일부만 임의로 선택하려면 Series나 DataFrame의 sample 메서드를 사용하면 된다.

In [85]:
df.sample(n=3)

Unnamed: 0,0,1,2,3
2,8,9,10,11
0,0,1,2,3
4,16,17,18,19


(반복 선택을 허용하며) 표본을 치환을 통해 생성해내려면 sample에 replace=True 옵션을 넘긴다.

In [86]:
choices = pd.Series([5, 7, -1, 6, 4])
draws = choices.sample(n=10, replace=True)
draws

1    7
0    5
1    7
1    7
0    5
3    6
3    6
0    5
3    6
1    7
dtype: int64

# 표시자/더미 변수 계산하기

통계 모델이나 머신러닝 애플리케이션을 위한 또 다른 데이터 변호나은 분류값을 '더미'나 '표시자' 행렬로 전환하는 것이다. 만약 어떤 DataFrame의 한 컬럼에 k가지의 값이 있다면 k개의 컬럼이 있는 DataFrame이나 행렬을 만들고 값으로는 1과 0을 채워 넣을 것이다.

pandas의 get_dummies가 이를 위한 함수이다.

In [87]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                  'datal' : range(6)})
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


표시자 DataFrame 안에 있는 컬럼에 접두어를 추가한 후 다른 데이터와 병합하고 싶을 경우가 있다.

get_dummies 함수의 prefix 인자를 사용하면 이를 수행할 수 있다.

In [88]:
dummies = pd.get_dummies(df['key'], prefix='key')
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [89]:
df_with_dummy = df[['datal']].join(dummies)
df_with_dummy

Unnamed: 0,datal,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [90]:
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('/Users/otaehun/Desktop/2022_1학기/major/python/chapter7/movies.dat.txt', sep='::',
                       header=None, names=mnames)
movies[:10]

  return func(*args, **kwargs)


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


각 장르마다 표시자 값을 추출하기

In [91]:
all_genres = []
for x in movies.genres:
    all_genres.extend(x.split('|'))
genres = pd.unique(all_genres)
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

표시자 DataFrame을 생성하기 위한 0으로 초기화된 DataFrame 생성

In [92]:
zero_matrix = np.zeros((len(movies), len(genres)))
zero_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [93]:
dummies = pd.DataFrame(zero_matrix, columns = genres)
dummies

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


각 영화를 순회하면서 dummies의 각 로우의 항목을 1로 설정한다.

각 장르의 컬럼 색인을 계산하기 위해 dummies.columns를 사용한다.

In [94]:
gen = movies.genres[0]
gen.split('|')

['Animation', "Children's", 'Comedy']

In [95]:
# 다른방법
movies.genres[0].split('|')

['Animation', "Children's", 'Comedy']

In [96]:
dummies.columns.get_indexer(gen.split('|'))

array([0, 1, 2])

.iloc를 이용해서 색인에 맞게 값을 대입

In [97]:
for i, gen in enumerate(movies.genres):
    indices = dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i, indices] = 1

In [98]:
dummies

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3879,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3881,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


movies와 조합

In [99]:
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.iloc[0]

movie_id                                      1
title                          Toy Story (1995)
genres              Animation|Children's|Comedy
Genre_Animation                             1.0
Genre_Children's                            1.0
                               ...             
Genre_War                                   0.0
Genre_Musical                               0.0
Genre_Mystery                               0.0
Genre_Film-Noir                             0.0
Genre_Western                               0.0
Name: 0, Length: 21, dtype: object

In [100]:
movies_windic.head()

Unnamed: 0,movie_id,title,genres,Genre_Animation,Genre_Children's,Genre_Comedy,Genre_Adventure,Genre_Fantasy,Genre_Romance,Genre_Drama,...,Genre_Crime,Genre_Thriller,Genre_Horror,Genre_Sci-Fi,Genre_Documentary,Genre_War,Genre_Musical,Genre_Mystery,Genre_Film-Noir,Genre_Western
0,1,Toy Story (1995),Animation|Children's|Comedy,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),Comedy|Drama,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),Comedy,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


get_dummies와 cut 같은 이산함수를 잘 조합하면 통계 애플리케이션에서 유용하게 사용가능

In [101]:
np.random.seed(12345)
values = np.random.rand(10)
values

array([0.9296, 0.3164, 0.1839, 0.2046, 0.5677, 0.5955, 0.9645, 0.6532,
       0.7489, 0.6536])

In [102]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values, bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


In [103]:
pd.cut(values, bins)

[(0.8, 1.0], (0.2, 0.4], (0.0, 0.2], (0.2, 0.4], (0.4, 0.6], (0.4, 0.6], (0.8, 1.0], (0.6, 0.8], (0.6, 0.8], (0.6, 0.8]]
Categories (5, interval[float64, right]): [(0.0, 0.2] < (0.2, 0.4] < (0.4, 0.6] < (0.6, 0.8] < (0.8, 1.0]]

# Quiz

아래와 같은 사람의 키와 나이가 있는 데이터프레임이 있다고 합시다.

나이를 18-25세, 26-35세, 36-60세, 61-100세로 카테고리화 하고, 해당 변수를 Dummy variable로 변환해봅시다.

최종적으로 생성한 dummy 변수들을 기존의 데이터프레임에 합쳐봅시다.

In [115]:
ages = pd.DataFrame({'height':['175', '172','185', '179', '197', '157',
                              '160', '190', '171', '180'],
            'age':[20, 22, 25, 27, 21, 23, 37, 31, 61, 45]})
ages

Unnamed: 0,height,age
0,175,20
1,172,22
2,185,25
3,179,27
4,197,21
5,157,23
6,160,37
7,190,31
8,171,61
9,180,45


In [116]:
bins = [18, 26, 36, 61, 100]

In [117]:
ages_1 = list(ages['age'])
ages_1

[20, 22, 25, 27, 21, 23, 37, 31, 61, 45]

In [118]:
cats = pd.cut(ages_1, bins)
cats

[(18, 26], (18, 26], (18, 26], (26, 36], (18, 26], (18, 26], (36, 61], (26, 36], (36, 61], (36, 61]]
Categories (4, interval[int64, right]): [(18, 26] < (26, 36] < (36, 61] < (61, 100]]

In [119]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 2, 2], dtype=int8)

In [120]:
cats.categories

IntervalIndex([(18, 26], (26, 36], (36, 61], (61, 100]], dtype='interval[int64, right]')

In [121]:
dummies = pd.get_dummies(cats.codes)
dummies

Unnamed: 0,0,1,2
0,1,0,0
1,1,0,0
2,1,0,0
3,0,1,0
4,1,0,0
5,1,0,0
6,0,0,1
7,0,1,0
8,0,0,1
9,0,0,1


In [122]:
ages['age_cate'] = pd.cut(ages['age'], bins)
ages

Unnamed: 0,height,age,age_cate
0,175,20,"(18, 26]"
1,172,22,"(18, 26]"
2,185,25,"(18, 26]"
3,179,27,"(26, 36]"
4,197,21,"(18, 26]"
5,157,23,"(18, 26]"
6,160,37,"(36, 61]"
7,190,31,"(26, 36]"
8,171,61,"(36, 61]"
9,180,45,"(36, 61]"


In [123]:
result = pd.concat([ages, dummies], axis=1)
result

Unnamed: 0,height,age,age_cate,0,1,2
0,175,20,"(18, 26]",1,0,0
1,172,22,"(18, 26]",1,0,0
2,185,25,"(18, 26]",1,0,0
3,179,27,"(26, 36]",0,1,0
4,197,21,"(18, 26]",1,0,0
5,157,23,"(18, 26]",1,0,0
6,160,37,"(36, 61]",0,0,1
7,190,31,"(26, 36]",0,1,0
8,171,61,"(36, 61]",0,0,1
9,180,45,"(36, 61]",0,0,1
