In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.DataFrame({'key1':['a','a','b','b','a'],
                'key2':['one','two','one','two','one'],
                'data1':np.random.randn(5),
                'data2':np.random.randn(5)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,1.344717,0.06897
1,a,two,1.435413,-1.816822
2,b,one,0.606245,-0.065414
3,b,two,-0.148924,-0.013699
4,a,one,-0.360904,0.669582


In [3]:
# data1에 대해 groupby 메서드를 호출하고 key1 컬럼을 넘긴다.
grouped=df['data1'].groupby(df['key1'])
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001BB7C32B610>

In [4]:
# 그룹별 평균구하기
grouped.mean()

key1
a    0.806409
b    0.228660
Name: data1, dtype: float64

In [5]:
means=df['data1'].groupby([df['key1'],df['key2']]).mean()
means

key1  key2
a     one     0.491907
      two     1.435413
b     one     0.606245
      two    -0.148924
Name: data1, dtype: float64

In [6]:
# Series 객체 그룹색인
states=np.array(['Ohio','California','California','Ohio','Ohio'])
years=np.array([2005,2005,2006,2005,2006])
df['data1'].groupby([states,years]).mean()

California  2005    1.435413
            2006    0.606245
Ohio        2005    0.597896
            2006   -0.360904
Name: data1, dtype: float64

In [7]:
# size메서드-그룹의 크기를 담고 있는 Series 객체 반환
df.groupby(['key1','key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

* 그룹 색인에서 누락된 값은 결과에서 제외된다.

## 1. 그룹 간 순회하기

* 그룹 이름과 그에 따른 데이터 묶음을 튜플로 반환

In [8]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,1.344717,0.06897
1,a,two,1.435413,-1.816822
2,b,one,0.606245,-0.065414
3,b,two,-0.148924,-0.013699
4,a,one,-0.360904,0.669582


In [9]:
for name,group in df.groupby('key1'):
    print(name)
    print(group)

a
  key1 key2     data1     data2
0    a  one  1.344717  0.068970
1    a  two  1.435413 -1.816822
4    a  one -0.360904  0.669582
b
  key1 key2     data1     data2
2    b  one  0.606245 -0.065414
3    b  two -0.148924 -0.013699


* 여러 색인이 존재하는 경우 튜플의 첫 번째 원소가 색인값이 된다.

In [10]:
for (k1,k2),group in df.groupby(['key1','key2']):
    print((k1,k2))
    print(group)

('a', 'one')
  key1 key2     data1     data2
0    a  one  1.344717  0.068970
4    a  one -0.360904  0.669582
('a', 'two')
  key1 key2     data1     data2
1    a  two  1.435413 -1.816822
('b', 'one')
  key1 key2     data1     data2
2    b  one  0.606245 -0.065414
('b', 'two')
  key1 key2     data1     data2
3    b  two -0.148924 -0.013699


* 그룹별 데이터를 사전형으로 바꿔서 원하는 데이터만 골라내기

In [11]:
pieces=dict(list(df.groupby('key1')))
pieces['b']

Unnamed: 0,key1,key2,data1,data2
2,b,one,0.606245,-0.065414
3,b,two,-0.148924,-0.013699


* 다른 축으로 그룹을 만들기

In [12]:
df.dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [13]:
grouped=df.groupby(df.dtypes,axis=1)
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001BB7C37CA00>

In [14]:
for dtype,group in grouped:
    print(dtype)
    print(group)

float64
      data1     data2
0  1.344717  0.068970
1  1.435413 -1.816822
2  0.606245 -0.065414
3 -0.148924 -0.013699
4 -0.360904  0.669582
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


## 2. 컬럼이나 컬럼의 일부만 선택하기

In [15]:
# 특정 컬럼에 대해서만 평균을 구할때
df.groupby(['key1','key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,0.369276
a,two,-1.816822
b,one,-0.065414
b,two,-0.013699


## 3. 사전과 Series에서 그룹핑하기
* 배열이 아닌 형태

In [16]:
people=pd.DataFrame(np.random.randn(5,5),
                   columns=['a','b','c','d','e'],
                   index=['Joe','Steve','Wes','Jim','Travis']) # people이라는 데이터프레임

people.iloc[2:3,[1,2]]=np.nan # nan값 추가
people

Unnamed: 0,a,b,c,d,e
Joe,-0.335895,-1.522016,-0.249295,0.291547,-0.140654
Steve,0.4746,-0.140943,1.001918,1.300694,1.516114
Wes,-1.734126,,,0.076499,0.66596
Jim,0.456135,0.071466,-0.923911,-0.434972,1.529085
Travis,-1.33205,0.199706,-0.208843,0.142666,-0.05031


In [17]:
# 각 컬럼을 나타낼 그룹 목록
mapping={'a':'red','b':'red','c':'blue',
        'd':'blue','e':'red','f':'orange'}
mapping

{'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}

In [18]:
by_columns=people.groupby(mapping,axis=1)
by_columns.sum()

Unnamed: 0,blue,red
Joe,0.042252,-1.998565
Steve,2.302612,1.849771
Wes,0.076499,-1.068166
Jim,-1.358883,2.056685
Travis,-0.066178,-1.182654


In [19]:
# Series
map_series=pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [20]:
people.groupby(map_series,axis=1).count()

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


## 4. 함수로 그룹핑하기

In [21]:
people

Unnamed: 0,a,b,c,d,e
Joe,-0.335895,-1.522016,-0.249295,0.291547,-0.140654
Steve,0.4746,-0.140943,1.001918,1.300694,1.516114
Wes,-1.734126,,,0.076499,0.66596
Jim,0.456135,0.071466,-0.923911,-0.434972,1.529085
Travis,-1.33205,0.199706,-0.208843,0.142666,-0.05031


In [22]:
# 이름의 길이별로 그룹을 묶고 싶을때 (색인)
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,-1.613886,-1.45055,-1.173205,-0.066927,2.054391
5,0.4746,-0.140943,1.001918,1.300694,1.516114
6,-1.33205,0.199706,-0.208843,0.142666,-0.05031


In [23]:
# 함수를 배열,사전 또는 Series와 섞어 쓸때
key_list=['one','one','one','two','two']
people.groupby([len,key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-1.734126,-1.522016,-0.249295,0.076499,-0.140654
3,two,0.456135,0.071466,-0.923911,-0.434972,1.529085
5,one,0.4746,-0.140943,1.001918,1.300694,1.516114
6,two,-1.33205,0.199706,-0.208843,0.142666,-0.05031


## 5.색인 단계로 그룹핑하기

In [24]:
columns=pd.MultiIndex.from_arrays([['US','US','US','JP','JP'],
                                  [1,3,5,1,3]],
                                 names=['cty','tenor'])

In [25]:
hier_df=pd.DataFrame(np.random.randn(4,5),columns=columns)
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,0.813559,0.478572,-0.424642,-0.189277,0.403342
1,-1.637794,-0.97531,-0.765304,1.074092,-1.128156
2,0.383669,0.077656,0.872589,0.58123,-1.142966
3,-0.402631,1.796185,1.124946,-0.780265,-0.344936


In [26]:
# level 예약어 사용
hier_df.groupby(level='cty',axis=1).count()

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3
