In [1]:
import pandas as pd
import numpy as np

In [2]:
# Small demo frame: two string key columns plus two columns of
# standard-normal draws. The seed makes the random values identical on
# every Restart & Run All, so the displayed numbers can be reproduced.
np.random.seed(42)
df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                   'key2': ['one', 'two', 'one', 'two', 'one'],
                   'data1': np.random.randn(5),
                   'data2': np.random.randn(5)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.046282,1.397819
1,a,two,-0.218011,0.421334
2,b,one,-0.543885,0.104163
3,b,two,-0.349449,-0.584049
4,a,one,-0.119999,-2.69564


In [3]:
# Split df into groups according to the values in key1.
grouped = df.groupby(by='key1')

In [4]:
# quantile is a Series method; here it gives the 0.9 quantile of data1
# within each group.
grouped['data1'].quantile(q=0.9)

key1
a   -0.061025
b   -0.368893
Name: data1, dtype: float64

* 자신만의 데이터 집계함수 사용

In [5]:
# A custom aggregation is used by passing the function to the
# aggregate / agg method.
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum."""
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest

# Aggregate only the numeric columns: peak_to_peak subtracts values, which
# is undefined for the string column key2. pandas >= 2.0 raises a TypeError
# there instead of silently dropping the column.
grouped[['data1', 'data2']].agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.171729,4.093459
b,0.194436,0.688212


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for each group.
grouped.describe()

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,-0.128097,0.08615,-0.218011,-0.169005,-0.119999,-0.083141,-0.046282,3.0,-0.292163,2.137969,-2.69564,-1.137153,0.421334,0.909576,1.397819
b,2.0,-0.446667,0.137487,-0.543885,-0.495276,-0.446667,-0.398058,-0.349449,2.0,-0.239943,0.486639,-0.584049,-0.411996,-0.239943,-0.06789,0.104163


## 1. 컬럼에 여러 가지 함수 적용하기

In [7]:
# Load the tipping data and add tip_pct, the tip as a fraction of the bill.
tips = pd.read_csv('tips.csv').assign(
    tip_pct=lambda frame: frame['tip'] / frame['total_bill']
)
tips.head(6)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808
5,25.29,4.71,No,Sun,Dinner,4,0.18624


In [8]:
# Group the tips by day and smoker, then pick out the tip_pct column.
group_keys = ['day', 'smoker']
grouped = tips.groupby(group_keys)
grouped_pct = grouped['tip_pct']
# For descriptive statistics, the function name is passed as a string.
grouped_pct.agg('mean')

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

In [9]:
# Naming the output columns:
# pass a list of (name, function) tuples; each name becomes a column label.
named_funcs = [('foo', 'mean'), ('bar', 'max')]
grouped_pct.agg(named_funcs)

Unnamed: 0_level_0,Unnamed: 1_level_0,foo,bar
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,0.15165,0.187735
Fri,Yes,0.174783,0.26348
Sat,No,0.158048,0.29199
Sat,Yes,0.147906,0.325733
Sun,No,0.160113,0.252672
Sun,Yes,0.18725,0.710345
Thur,No,0.160298,0.266312
Thur,Yes,0.163863,0.241255


In [10]:
# Apply several functions to every selected column. The result has
# hierarchical columns: the selected column on top, the function name below.
# Multiple columns are selected with a list; the tuple form
# grouped['a', 'b'] is deprecated and was removed in pandas 2.0.
functions = ['count', 'mean', 'max']
result = grouped[['tip_pct', 'total_bill']].agg(functions)
result

  result=grouped['tip_pct','total_bill'].agg(functions)


Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Fri,No,4,0.15165,0.187735,4,18.42,22.75
Fri,Yes,15,0.174783,0.26348,15,16.813333,40.17
Sat,No,45,0.158048,0.29199,45,19.661778,48.33
Sat,Yes,42,0.147906,0.325733,42,21.276667,50.81
Sun,No,57,0.160113,0.252672,57,20.506667,48.17
Sun,Yes,19,0.18725,0.710345,19,24.12,45.35
Thur,No,45,0.160298,0.266312,45,17.113111,41.19
Thur,Yes,17,0.163863,0.241255,17,19.190588,43.11


In [11]:
# Select the sub-table of statistics that were computed for tip_pct.
result['tip_pct']

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,max
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,4,0.15165,0.187735
Fri,Yes,15,0.174783,0.26348
Sat,No,45,0.158048,0.29199
Sat,Yes,42,0.147906,0.325733
Sun,No,57,0.160113,0.252672
Sun,Yes,19,0.18725,0.710345
Thur,No,45,0.160298,0.266312
Thur,Yes,17,0.163863,0.241255


In [12]:
# Pass a list of (name, function) tuples to set custom column names.
# Multiple columns are selected with a list; the tuple form
# grouped['a', 'b'] is deprecated and was removed in pandas 2.0.
# NOTE(review): 'Abweichung' is German for "deviation", yet it labels the
# group minimum here. Confirm whether 'var'/'std' or another label was meant.
ftuples = [('Durchschnitt', 'mean'), ('Abweichung', 'min')]
grouped[['tip_pct', 'total_bill']].agg(ftuples)

  grouped['tip_pct','total_bill'].agg(ftuples)


Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,Durchschnitt,Abweichung,Durchschnitt,Abweichung
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Fri,No,0.15165,0.120385,18.42,12.46
Fri,Yes,0.174783,0.103555,16.813333,5.75
Sat,No,0.158048,0.056797,19.661778,7.25
Sat,Yes,0.147906,0.035638,21.276667,3.07
Sun,No,0.160113,0.059447,20.506667,8.77
Sun,Yes,0.18725,0.06566,24.12,7.25
Thur,No,0.160298,0.072961,17.113111,7.51
Thur,Yes,0.163863,0.090014,19.190588,10.34


In [13]:
# Apply a different function to each column via a {column: function} dict.
# Both functions are given by name so pandas uses its built-in group
# reductions; passing np.max triggers a FutureWarning on pandas >= 2.1.
grouped.agg({'tip': 'max', 'size': 'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,3.5,9
Fri,Yes,4.73,31
Sat,No,9.0,115
Sat,Yes,10.0,104
Sun,No,6.0,167
Sun,Yes,6.5,49
Thur,No,6.7,112
Thur,Yes,5.0,40


In [14]:
# Per-column aggregation spec: a column may map to a list of functions
# (giving hierarchical columns) or to a single function.
column_funcs = {
    'tip': ['min', 'max', 'mean', 'std'],
    'size': 'sum',
}
grouped.agg(column_funcs)

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,tip,tip,tip,size
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,std,sum
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Fri,No,1.5,3.5,2.8125,0.898494,9
Fri,Yes,1.0,4.73,2.714,1.077668,31
Sat,No,1.0,9.0,3.102889,1.642088,115
Sat,Yes,1.0,10.0,2.875476,1.63058,104
Sun,No,1.01,6.0,3.167895,1.224785,167
Sun,Yes,1.5,6.5,3.516842,1.261151,49
Thur,No,1.25,6.7,2.673778,1.282964,112
Thur,Yes,2.0,5.0,3.03,1.113491,40


## 2. 색인되지 않은 형태로 집계된 데이터 반환하기

In [15]:
# as_index=False returns the group keys as ordinary columns, not an index.
# numeric_only=True leaves out non-numeric columns such as 'time', which
# cannot be averaged; pandas >= 2.0 raises on them otherwise.
tips.groupby(['day', 'smoker'], as_index=False).mean(numeric_only=True)

Unnamed: 0,day,smoker,total_bill,tip,size,tip_pct
0,Fri,No,18.42,2.8125,2.25,0.15165
1,Fri,Yes,16.813333,2.714,2.066667,0.174783
2,Sat,No,19.661778,3.102889,2.555556,0.158048
3,Sat,Yes,21.276667,2.875476,2.47619,0.147906
4,Sun,No,20.506667,3.167895,2.929825,0.160113
5,Sun,Yes,24.12,3.516842,2.578947,0.18725
6,Thur,No,17.113111,2.673778,2.488889,0.160298
7,Thur,Yes,19.190588,3.03,2.352941,0.163863
