In [1]:
import pandas as pd
import numpy as np
import sys,os

# Cap displayed DataFrame/Series output at 10 rows to keep cell outputs short.
pd.set_option('display.max_rows', 10)

# Ch10. 데이터 집계와 그룹 연산

## 10.1. GroupBy 메카닉  
분리 - 적용 - 결합 (split-apply-combine)

In [2]:
# Seed NumPy's global RNG so the random columns (and every result derived
# from them) come out the same on each Restart & Run All. The seed value
# itself is arbitrary.
np.random.seed(12345)

# Example frame: two categorical key columns plus two columns of
# standard-normal random numbers.
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.022561,-0.654161
1,a,two,0.334923,-0.146149
2,b,one,0.755614,-0.657047
3,b,two,0.251561,-2.476287
4,a,one,1.225589,0.383743


위 데이터를 key1으로 묶고 각 그룹에서 data1의 평균을 구하기  
groupby 메서드를 호출하면됨

In [3]:
# Split data1 by the values of key1. The groupby is built once and the
# variable itself is displayed, instead of constructing the same object twice.
grouped = df['data1'].groupby(df['key1'])
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001F28156FC18>

GroupBy 객체는 그룹 키(df['key1'])에 대한 중간 정보만 가지고 있을 뿐, 아직 아무것도 계산하지 않은 객체임  
그룹연산에 필요한 모든 정보를 포함하고 있어, 각 그룹에 연산을 적용할수 있게함

In [4]:
# Per-group total of data1, indexed by the distinct key1 values.
grouped.sum()
# Per-group mean of data1.
grouped.mean()

key1
a    1.583073
b    1.007175
Name: data1, dtype: float64

key1
a    0.527691
b    0.503587
Name: data1, dtype: float64

데이터가 그룹색인에 따라 수집되고 key1컬럼의 유니크한 값으로 색인되는 새로운 Series 객체가 생성됨

여러개의 배열을 넘기면 계층 색인을 가지는 Series가 나옴

In [5]:
# Passing a list of key Series groups by every (key1, key2) combination;
# the resulting Series carries a two-level index.
means = df['data1'].groupby([df['key1'],df['key2']]).mean()
means

key1  key2
a     one     0.624075
      two     0.334923
b     one     0.755614
      two     0.251561
Name: data1, dtype: float64

groupby에 넘기는 객체는 길이만 같으면 어떤것도 상관없음

In [6]:
# The group keys do not have to come from the frame itself: here two
# standalone arrays with one entry per row of df are used as keys.
states = np.array(['Ohio','California','California','Ohio','Ohio'])
years = np.array([2005,2005,2006,2005,2006])
df['data1'].groupby([states,years]).mean()

California  2005    0.334923
            2006    0.755614
Ohio        2005    0.137061
            2006    1.225589
Name: data1, dtype: float64

한 그룹으로 묶을 정보를 같은 DataFrame안에서 찾을 경우 컬럼이름을 넘겨서 사용할수 있음

In [7]:
# Group by a column name. numeric_only=True excludes the string column key2
# explicitly: older pandas dropped such "nuisance" columns silently, while
# pandas >= 2.0 raises TypeError when asked to average strings.
df.groupby('key1').mean(numeric_only=True)
'''key2의 경우 숫자데이터가 아니기 때문에 (성가신컬럼,nuisance column)결과에서 제외됨'''
# With both key columns used as group keys only numeric columns remain.
df.groupby(['key1','key2']).mean()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.527691,-0.138856
b,0.503587,-1.566667


'key2의 경우 숫자데이터가 아니기 때문에 (성가신컬럼,nuisance column)결과에서 제외됨'

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.624075,-0.135209
a,two,0.334923,-0.146149
b,one,0.755614,-0.657047
b,two,0.251561,-2.476287


GroupBy 메서드 중 size메서드는 그룹의 크기를 돌려주므로 유용함

In [8]:
# Number of rows in each (key1, key2) group.
df.groupby(['key1','key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

### 10.1.1 그룹간 순회하기  
groupby 객체는 이터레이션을 지원함  
그룹이름과 그에 따른 데이터 묶음을 반환

In [9]:
# Iterating a GroupBy yields (group key, sub-frame) pairs; print each pair
# followed by a separator line.
for name, data in df.groupby('key1'):
    print(name, data, '--------------', sep='\n')

a
  key1 key2     data1     data2
0    a  one  0.022561 -0.654161
1    a  two  0.334923 -0.146149
4    a  one  1.225589  0.383743
--------------
b
  key1 key2     data1     data2
2    b  one  0.755614 -0.657047
3    b  two  0.251561 -2.476287
--------------


In [10]:
# With several group keys, the first element of each yielded pair is a tuple
# holding the key values of that group.
for k, group in df.groupby(['key1', 'key2']):
    print(k, group, sep='\n')

('a', 'one')
  key1 key2     data1     data2
0    a  one  0.022561 -0.654161
4    a  one  1.225589  0.383743
('a', 'two')
  key1 key2     data1     data2
1    a  two  0.334923 -0.146149
('b', 'one')
  key1 key2     data1     data2
2    b  one  0.755614 -0.657047
('b', 'two')
  key1 key2     data1     data2
3    b  two  0.251561 -2.476287


In [11]:
# Collect the split pieces into a dict keyed by the key1 value.
pieces = {key: frame for key, frame in df.groupby('key1')}
pieces
# Look up the sub-frame of a single group.
pieces['a']

{'a':   key1 key2     data1     data2
 0    a  one  0.022561 -0.654161
 1    a  two  0.334923 -0.146149
 4    a  one  1.225589  0.383743, 'b':   key1 key2     data1     data2
 2    b  one  0.755614 -0.657047
 3    b  two  0.251561 -2.476287}

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.022561,-0.654161
1,a,two,0.334923,-0.146149
4,a,one,1.225589,0.383743


기본적으로 axis=0 으로 그룹을 만들지만 다른축도 가능함,  
데이터 타입에 따라 쪼개기

In [12]:
df.dtypes
# Group the columns (axis=1) by their dtype.
# NOTE(review): the `axis` keyword of DataFrame.groupby is deprecated in
# pandas >= 2.1 — confirm the target pandas version before upgrading.
grouped = df.groupby(df.dtypes,axis=1)
# Each iteration yields the dtype and the sub-frame of columns having it.
for k,data in grouped:
    print(k,'\n',data)

key1      object
key2      object
data1    float64
data2    float64
dtype: object

float64 
       data1     data2
0  0.022561 -0.654161
1  0.334923 -0.146149
2  0.755614 -0.657047
3  0.251561 -2.476287
4  1.225589  0.383743
object 
   key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


### 10.1.2 컬럼이나 컬럼의 일부만 선택하기

In [13]:
# Selecting one column as a Series before grouping gives a SeriesGroupBy ...
df['data1'].groupby(df['key1'])
# ... while selecting with a list (a one-column DataFrame) gives a DataFrameGroupBy.
df[['data2']].groupby(df['key1'])

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001F281A97BA8>

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001F281A97550>

In [14]:
# Syntactic sugar for the previous cell: group first, then select the column(s).
# Both lines group by key1 so they are the exact equivalents of
# df['data1'].groupby(df['key1']) and df[['data2']].groupby(df['key1']).
df.groupby('key1')['data1']
df.groupby('key1')[['data2']]

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001F281A97C50>

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001F281AA5198>

컬럼 이름의 리스트(배열)를 넘기면 DataFrame 형태(DataFrameGroupBy)로, 단일 컬럼 이름을 넘기면 Series 형태(SeriesGroupBy)로 반환됨

In [15]:
# A list of column names keeps the result two-dimensional (DataFrame) ...
df.groupby(['key1','key2'])[['data1']].mean()
# ... a single column name produces a Series.
df.groupby(['key1','key2'])['data1'].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1
key1,key2,Unnamed: 2_level_1
a,one,0.624075
a,two,0.334923
b,one,0.755614
b,two,0.251561


key1  key2
a     one     0.624075
      two     0.334923
b     one     0.755614
      two     0.251561
Name: data1, dtype: float64

### 10.1.3. 사전과 Series에서 그루핑하기

In [16]:
# 5x5 frame of standard-normal draws: one row per person, columns 'a'..'e'.
people = pd.DataFrame(
    np.random.randn(5, 5),
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
    columns=list('abcde'),
)
# Blank out Wes's 'b' and 'c' entries so the frame contains missing values.
people.loc['Wes', ['b', 'c']] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,0.055102,-0.440212,1.178929,0.143431,-0.035396
Steve,1.274939,-0.175007,0.286698,-0.286489,-0.727177
Wes,-0.688031,,,-0.695083,1.612965
Jim,-0.126124,0.11659,-0.494622,-0.899572,-0.894956
Travis,-1.821673,1.334003,0.638526,1.149901,0.375524


컬럼에 매핑되는 새로운 그룹이 있고, 그룹별로 더하기

In [17]:
# Column label -> colour group. 'f' matches no column of `people`; the
# grouped result below only contains 'blue' and 'red'.
mapping={
    'a':'red',
    'b':'red',
    'c':'blue',
    'd':'blue',
    'e':'red',
    'f':'orange'
}

# Group the columns by colour. DataFrame.groupby(axis=1) is deprecated
# (pandas >= 2.1), so group the rows of the transposed frame instead and
# transpose the aggregated result back to people-by-colour.
by_col = people.T.groupby(mapping)
by_col.sum().T

Unnamed: 0,blue,red
Joe,1.32236,-0.420506
Steve,0.000209,0.372755
Wes,-0.695083,0.924933
Jim,-1.394194,-0.904491
Travis,1.788427,-0.112145


In [18]:
# The same column -> colour mapping expressed as a Series works as a group key too.
map_series = pd.Series(mapping)
map_series
# Count the non-missing values per colour group for every person.
# Grouping is done on the transposed frame because groupby(axis=1) is
# deprecated (pandas >= 2.1); the result is transposed back afterwards.
people.T.groupby(map_series).count().T

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


### 10.1.4 함수로 그루핑하기  
사전이나 Series를 이용하는것 보다 파이썬 함수를 이용하는것이 조금더 일반적임  
넘긴 함수는 색인값 하나마다 한번씩 호출됨

이름의 길이가 같은 사람끼리 묶으려면 len 함수를 넘기면 됨

In [19]:
# Group the rows by the length of their index label (the person's name).
# Rows are the default grouping axis, so the `axis` keyword (deprecated in
# pandas >= 2.1) is not passed.
a = people.groupby(len)
# The distinct group keys, i.e. the name lengths that occur.
[key for key, _group in a]
a.sum()

[3, 5, 6]

Unnamed: 0,a,b,c,d,e
3,-0.759054,-0.323622,0.684307,-1.451223,0.682612
5,1.274939,-0.175007,0.286698,-0.286489,-0.727177
6,-1.821673,1.334003,0.638526,1.149901,0.375524


내부적으로 모두 배열로 변환되므로 함수와 다른것을 섞어도 문제가 안됨

In [20]:
# A function and a plain list can be combined as group keys: rows are grouped
# by (length of the index label, corresponding key_list entry).
key_list = ['one','one','one','two','two']

people.groupby([len,key_list]).mean()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-0.316465,-0.440212,1.178929,-0.275826,0.788784
3,two,-0.126124,0.11659,-0.494622,-0.899572,-0.894956
5,one,1.274939,-0.175007,0.286698,-0.286489,-0.727177
6,two,-1.821673,1.334003,0.638526,1.149901,0.375524


### 10.1.5 색인 단계로 그루핑하기  
계층색인에서 하나를 선택하기 위한 기능

In [21]:
# Two-level column index: country ('cty') on the outer level, 'tenor' on the inner one.
columns = pd.MultiIndex.from_arrays([['US','US','US','JP','JP'],
                                   [1,3,5,1,3]],names=['cty','tenor'])
hier_df = pd.DataFrame(np.random.randn(4,5), columns = columns)
hier_df
# Count the columns per country for every row by grouping on the 'cty' level.
# groupby(axis=1) is deprecated (pandas >= 2.1), so the grouping is done on
# the transposed frame and the result is transposed back.
hier_df.T.groupby(level='cty').count().T

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-2.831089,-0.057284,0.249122,0.727816,-1.299475
1,-0.124871,-0.178453,-1.531086,-2.216218,0.497358
2,0.87164,0.585247,-0.272664,0.41667,0.076208
3,-0.985712,1.494674,0.446053,-0.018998,0.990033


cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


## 10.2. 데이터 집계  
배열로부터 스칼라값을 만들어 내는 모든 데이터 변환 작업  
- count,sum,mean,median,std,var,min,max,prod,first,last 등등

GroupBy 메서드는 아니지만 Series메서드인 quantile등도 사용가능

In [22]:
# 0.9 quantile (90th percentile) of data1 within each key1 group.
df.groupby('key1')['data1'].quantile(0.9)

key1
a    1.047456
b    0.705209
Name: data1, dtype: float64

직접 만든 집계함수를 사용할수도 있음  
- aggregate나 agg 메서드에 함수를 넘기면됨

In [23]:
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum.

    ``arr`` must provide ``max()`` and ``min()`` methods whose results can be
    subtracted (for example a NumPy array or a pandas Series of numbers).
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest
# Apply the custom aggregation to the numeric columns only. peak_to_peak
# subtracts values, which is not defined for the string column key2; older
# pandas dropped that column silently, pandas >= 2.0 raises TypeError.
df.groupby('key1')[['data1', 'data2']].agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.203028,1.037904
b,0.504054,1.81924


In [24]:
# Summary statistics (count, mean, std, min, quartiles, max) per key1 group.
df.groupby('key1').describe()

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,0.527691,0.624251,0.022561,0.178742,0.334923,0.780256,1.225589,3.0,-0.138856,0.518991,-0.654161,-0.400155,-0.146149,0.118797,0.383743
b,2.0,0.503587,0.35642,0.251561,0.377574,0.503587,0.629601,0.755614,2.0,-1.566667,1.286397,-2.476287,-2.021477,-1.566667,-1.111857,-0.657047


사용자 정의 함수는 일반적으로 GroupBy에 최적화된 내장 메서드보다 매우 느린데, 중간 데이터를 생성하는 과정에서 함수 호출이나 데이터 정렬 같은 오버헤드가 발생하기 때문

### 10.2.1. 컬럼에 여러가지 함수 적용하기

In [25]:
# Load the tipping data set and append the tip as a fraction of the total bill.
tips = pd.read_csv('Datas/tips.csv')
tips = tips.assign(tip_pct=tips['tip'] / tips['total_bill'])
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808


In [26]:
# Column means of the numeric columns. The string columns (smoker, day, time)
# are excluded explicitly because pandas >= 2.0 raises TypeError instead of
# silently skipping them, and the function is passed by name ('mean') rather
# than as the np.mean callable, which newer pandas warns about.
tips.select_dtypes(include='number').aggregate('mean')

total_bill    19.785943
tip            2.998279
size           2.569672
tip_pct        0.160803
dtype: float64

모든 컬럼을 집계하는것은 mean등의 메서드를 사용하거나 원하는 함수에 aggregate를 사용하면되지만  
컬럼에 따라 다른 함수를 사용해서 집계를 수행하거나, 여러개의 함수를 한번에 적용하려면 다음과 같이..

In [27]:
# Group tips by day and smoker.
grouped = tips.groupby(['day','smoker'])
grouped_pct = grouped['tip_pct']
# Built-in GroupBy methods can be passed to agg by name as a string;
# the call below gives the same result as grouped_pct.mean().
grouped_pct.agg('mean')

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

함수 목록이나 함수 이름을 넘기면 함수 이름을 컬럼으로 하는 DataFrame을 얻음

In [28]:
# A list of functions / function names produces one result column per entry,
# named after the function (the anonymous function appears as '<lambda>').
grouped_pct.agg([
    'mean',
    'std',
    'count',
    peak_to_peak,
    lambda x: '람다! {}'.format(len(x)),
])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,count,peak_to_peak,<lambda>
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Fri,No,0.15165,0.028123,4,0.067349,람다! 4
Fri,Yes,0.174783,0.051293,15,0.159925,람다! 15
Sat,No,0.158048,0.039767,45,0.235193,람다! 45
Sat,Yes,0.147906,0.061375,42,0.290095,람다! 42
Sun,No,0.160113,0.042347,57,0.193226,람다! 57
Sun,Yes,0.18725,0.154134,19,0.644685,람다! 19
Thur,No,0.160298,0.038774,45,0.19335,람다! 45
Thur,Yes,0.163863,0.039389,17,0.15124,람다! 17


각 함수의 결과가 들어갈 컬럼의 이름을 따로 지정해 줄 수 있음  
[(컬럼이름1, 함수1),(컬럼이름2,함수2)...]

In [29]:
# (result column name, function) pairs give each aggregated column an explicit name.
grouped_pct.agg([
    ('평균', 'mean'),
    ('표준편차', 'std'),
    ('갯수', 'count'),
    ('P2P', peak_to_peak),
    ('람다', lambda x: '람다! {}'.format(len(x))),
])

Unnamed: 0_level_0,Unnamed: 1_level_0,평균,표준편차,갯수,P2P,람다
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Fri,No,0.15165,0.028123,4,0.067349,람다! 4
Fri,Yes,0.174783,0.051293,15,0.159925,람다! 15
Sat,No,0.158048,0.039767,45,0.235193,람다! 45
Sat,Yes,0.147906,0.061375,42,0.290095,람다! 42
Sun,No,0.160113,0.042347,57,0.193226,람다! 57
Sun,Yes,0.18725,0.154134,19,0.644685,람다! 19
Thur,No,0.160298,0.038774,45,0.19335,람다! 45
Thur,Yes,0.163863,0.039389,17,0.15124,람다! 17


DataFrame은 컬럼마다 다른 함수를 적용하거나, 여러개의 함수를 모든 컬럼에 적용할 수 있다.

In [30]:
# Apply the same list of aggregations to two columns; the result has
# hierarchical columns (source column on top, function name below).
functions=['count','mean','max']
# Select several columns with a list: the tuple form grouped['a','b'] was
# deprecated and no longer works in pandas >= 2.0.
result = grouped[['tip_pct','total_bill']].agg(functions)
result

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Fri,No,4,0.15165,0.187735,4,18.42,22.75
Fri,Yes,15,0.174783,0.26348,15,16.813333,40.17
Sat,No,45,0.158048,0.29199,45,19.661778,48.33
Sat,Yes,42,0.147906,0.325733,42,21.276667,50.81
Sun,No,57,0.160113,0.252672,57,20.506667,48.17
Sun,Yes,19,0.18725,0.710345,19,24.12,45.35
Thur,No,45,0.160298,0.266312,45,17.113111,41.19
Thur,Yes,17,0.163863,0.241255,17,19.190588,43.11


아래처럼 concat으로 이어붙이는것이랑 같은결과

In [31]:
# Aggregate each column separately ...
a = grouped['tip_pct'].agg(functions)
b = grouped['total_bill'].agg(functions)
# ... and glue the two results side by side, labelling each with its source column.
pd.concat([a,b],axis=1,keys=['tip_pct','total_bill'])

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Fri,No,4,0.15165,0.187735,4,18.42,22.75
Fri,Yes,15,0.174783,0.26348,15,16.813333,40.17
Sat,No,45,0.158048,0.29199,45,19.661778,48.33
Sat,Yes,42,0.147906,0.325733,42,21.276667,50.81
Sun,No,57,0.160113,0.252672,57,20.506667,48.17
Sun,Yes,19,0.18725,0.710345,19,24.12,45.35
Thur,No,45,0.160298,0.266312,45,17.113111,41.19
Thur,Yes,17,0.163863,0.241255,17,19.190588,43.11


In [32]:
# (result column name, function) pairs applied to both selected columns.
functions=[('갯수','count'),('평균','mean'),('최대값','max')]
# Select several columns with a list: the tuple form grouped['a','b'] was
# deprecated and no longer works in pandas >= 2.0.
result = grouped[['tip_pct','total_bill']].agg(functions)
result

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,갯수,평균,최대값,갯수,평균,최대값
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Fri,No,4,0.15165,0.187735,4,18.42,22.75
Fri,Yes,15,0.174783,0.26348,15,16.813333,40.17
Sat,No,45,0.158048,0.29199,45,19.661778,48.33
Sat,Yes,42,0.147906,0.325733,42,21.276667,50.81
Sun,No,57,0.160113,0.252672,57,20.506667,48.17
Sun,Yes,19,0.18725,0.710345,19,24.12,45.35
Thur,No,45,0.160298,0.266312,45,17.113111,41.19
Thur,Yes,17,0.163863,0.241255,17,19.190588,43.11


**컬럼마다 다른 함수를 적용하고 싶다면 agg메서드에 컬럼 이름에 대응하는 함수가 들어 있는 사전을 넘기면됨**

In [33]:
# A dict maps each column to its own aggregation. Both functions are given by
# name, consistent with the other cells; passing the np.max callable triggers
# a FutureWarning on pandas >= 2.1.
grouped.agg({'tip':'max','size':'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,3.5,9
Fri,Yes,4.73,31
Sat,No,9.0,115
Sat,Yes,10.0,104
Sun,No,6.0,167
Sun,Yes,6.5,49
Thur,No,6.7,112
Thur,Yes,5.0,40


In [34]:
# A dict value may itself be a list of functions; the result then has
# hierarchical columns (source column on top, function name below).
grouped.agg({'tip_pct':['mean','max','std'],'size':'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,size
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,max,std,sum
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Fri,No,0.15165,0.187735,0.028123,9
Fri,Yes,0.174783,0.26348,0.051293,31
Sat,No,0.158048,0.29199,0.039767,115
Sat,Yes,0.147906,0.325733,0.061375,104
Sun,No,0.160113,0.252672,0.042347,167
Sun,Yes,0.18725,0.710345,0.154134,49
Thur,No,0.160298,0.266312,0.038774,112
Thur,Yes,0.163863,0.241255,0.039389,40


### 10.2.2 색인되지 않은 형태로 집계된 데이터 반환하기  
as_index=False

In [35]:
# as_index=False returns the group keys as ordinary columns instead of an index.
# numeric_only=True keeps the string column `time` out of the mean explicitly;
# pandas >= 2.0 raises TypeError instead of silently dropping it.
tips.groupby(['day','smoker'],as_index=False).mean(numeric_only=True)

Unnamed: 0,day,smoker,total_bill,tip,size,tip_pct
0,Fri,No,18.42,2.8125,2.25,0.15165
1,Fri,Yes,16.813333,2.714,2.066667,0.174783
2,Sat,No,19.661778,3.102889,2.555556,0.158048
3,Sat,Yes,21.276667,2.875476,2.47619,0.147906
4,Sun,No,20.506667,3.167895,2.929825,0.160113
5,Sun,Yes,24.12,3.516842,2.578947,0.18725
6,Thur,No,17.113111,2.673778,2.488889,0.160298
7,Thur,Yes,19.190588,3.03,2.352941,0.163863


집계 결과에 reset_index()를 호출해도 같은 결과를 얻을 수 있지만, as_index=False 옵션을 쓰면 불필요한 연산을 피할 수 있음