In [1]:
import pandas as pd
import numpy as np

In [2]:
# Demo frame: two categorical key columns (A, B) and two columns of
# standard-normal random values (C, D), eight rows in total.
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})

In [3]:
# Display the whole 8-row frame built above.
df

Unnamed: 0,A,B,C,D
0,foo,one,-0.260414,0.06857
1,bar,one,0.331701,-0.965936
2,foo,two,2.405985,0.5137
3,bar,three,0.158105,1.701912
4,foo,two,-0.307115,0.349799
5,bar,two,0.448139,0.391535
6,foo,one,0.547008,1.221357
7,foo,three,0.514044,-0.380083


In [4]:
# Group the rows by the values found in column 'A'.
grouped = df.groupby('A')

In [7]:
# Count the entries of every remaining column within each group.
grouped.count()

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,3,3,3
foo,5,5,5


In [8]:
# Rebind `grouped` to a grouping on the combination of columns 'A' and 'B'.
grouped = df.groupby(['A', 'B'])

In [10]:
# With two keys the counts are indexed by (A, B) pairs.
grouped.count()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1,1
bar,three,1,1
bar,two,1,1
foo,one,2,2
foo,three,1,1
foo,two,2,2


In [14]:
# When grouping by several keys, get_group must be given a tuple with one value per key.
grouped.get_group(('foo', 'one'))

Unnamed: 0,C,D
0,-0.260414,0.06857
6,0.547008,1.221357


可以指定axis来对column还是row进行group，

同时group的方式可以是简单的groupby keys也可以是group by function

In [15]:
def get_letter_type(letter):
    """Classify a single letter as 'vowel' or 'consonant'.

    Parameters
    ----------
    letter : str
        The label to classify; the comparison ignores case.

    Returns
    -------
    str
        'vowel' when ``letter`` is exactly one of a, e, i, o, u (any case),
        otherwise 'consonant'.
    """
    # Compare against the individual vowels instead of doing a substring test
    # on 'aeiou', which would also accept '' and runs such as 'ei' or 'io'.
    if letter.lower() in ('a', 'e', 'i', 'o', 'u'):
        return 'vowel'
    return 'consonant'

In [16]:
# Apply get_letter_type to every column label and group the columns by its result.
# NOTE(review): the `axis` keyword of groupby is deprecated in recent pandas
# releases (2.1+) — confirm the target version before re-running. Grouping the
# transposed frame is the usual replacement, but it yields transposed results.
grouped = df.groupby(get_letter_type, axis=1)

In [17]:
# Per row: how many values fall under consonant-labelled and vowel-labelled columns.
grouped.count()

Unnamed: 0,consonant,vowel
0,3,1
1,3,1
2,3,1
3,3,1
4,3,1
5,3,1
6,3,1
7,3,1


##### get_group() 获取某个group

In [19]:
# Fetch the columns whose label was classified as 'consonant' (B, C and D here).
grouped.get_group('consonant')

Unnamed: 0,B,C,D
0,one,-0.260414,0.06857
1,one,0.331701,-0.965936
2,two,2.405985,0.5137
3,three,0.158105,1.701912
4,two,-0.307115,0.349799
5,two,0.448139,0.391535
6,one,0.547008,1.221357
7,three,0.514044,-0.380083


Index 对象支持重复的值（这里指同一个 index 值出现多次，而不是 MultiIndex）：当用含重复值的 index 作为 group key 时，index 值相同的行会被分到同一个 group；

groupby的时候可以指定level进行group

In [20]:
# Index labels deliberately repeat so that several rows share one label.
lst = [1, 2, 3, 1, 2, 3]
s = pd.Series(data=[1, 2, 3, 10, 20, 30], index=lst)
# level=0 groups on the (only) index level, i.e. on the labels themselves.
grouped = s.groupby(level=0)

In [21]:
# Display the Series; note that the index labels 1, 2 and 3 each appear twice.
s

1     1
2     2
3     3
1    10
2    20
3    30
dtype: int64

groupby 默认会对group的key进行**排序**，例如上面的几个例子中，bar都出现在foo之前（但在df中bar并非出现在foo前面）

使用sort=False可以加快操作的运算速度

In [22]:
# Mapping from each group key to the index labels that belong to that group.
grouped.groups

{1: [1L, 1L], 2: [2L, 2L], 3: [3L, 3L]}

查看grouped之后的大体情况除了使用前面出现过的count()

最正确的方法应该是使用groups：groups返回的是一个字典，key是group的名字（也就是传给get_group的key），value是属于该group的index label组成的list

一个比较有用的语法糖：groupby之后，如果只想取想要的列，可以直接传入列名：


In [None]:
# Group once, then pick individual columns from the same GroupBy object
# instead of grouping again for every column.
grouped = df.groupby(['A'])
grouped_C, grouped_D = grouped['C'], grouped['D']

##### groups的遍历
可以直接使用for循环进行遍历；

但是要注意的是当使用多个key进行group的情况，group name将会是一个tuple

In [23]:
# Rebuild the demo frame: same A/B keys as before, but C and D are drawn
# again, so the numbers differ from the frame shown earlier.
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})

In [24]:
# Iterating a GroupBy yields (group key, sub-frame) pairs, one per group.
grouped = df.groupby('A')
for key, rows in grouped:
    print(key)
    print(rows)

bar
     A      B         C         D
1  bar    one -1.609096 -0.666839
3  bar  three -0.462851  0.240259
5  bar    two  2.081165  0.968236
foo
     A      B         C         D
0  foo    one  0.428460 -0.954949
2  foo    two  0.148060  1.409837
4  foo    two  0.062750  0.165301
6  foo    one -3.685025  0.032920
7  foo  three  0.622798 -0.231273


In [25]:
# Mapping from each value of 'A' to the row labels that fall into that group.
grouped.groups

{'bar': [1L, 3L, 5L], 'foo': [0L, 2L, 4L, 6L, 7L]}

#### group之后的实用操作

In [26]:
# Sum the numeric columns within each group. They are selected explicitly
# because newer pandas releases no longer drop the string column 'B'
# silently, and the string name 'sum' is the documented way to request the
# built-in group sum (passing np.sum only worked as an alias for it).
grouped[['C', 'D']].aggregate('sum')

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.009218,0.541656
foo,-2.422957,0.421836


In [34]:
# reset_index() turns the group key 'A' back into an ordinary column, so,
# compared with the previous result, the key is no longer used as the index.
# Numeric columns are selected explicitly and summed by name so that the
# string column 'B' never takes part in the aggregation.
grouped[['C', 'D']].aggregate('sum').reset_index()

Unnamed: 0,A,C,D
0,bar,0.009218,0.541656
1,foo,-2.422957,0.421836


In [32]:
# Select only column 'B' from the existing GroupBy.
grouped_b = grouped['B']
grouped_b.groups
# Only column B is left in grouped_b

{'bar': [1L, 3L, 5L], 'foo': [0L, 2L, 4L, 6L, 7L]}

In [33]:
# Walk the B-only groups: each step yields the key and that group's 'B' values.
for key, b_values in grouped_b:
    print(key)
    print(b_values)

bar
1      one
3    three
5      two
Name: B, dtype: object
foo
0      one
2      two
4      two
6      one
7    three
Name: B, dtype: object


Another simple aggregation example is to compute the size of each group. This is included in GroupBy as the size method. It returns a Series whose index consists of the group names and whose values are the sizes of the corresponding groups.

In [35]:
# Number of rows in each group.
grouped.size()

A
bar    3
foo    5
dtype: int64

In [36]:
# Summary statistics (count, mean, std, min, quartiles, max) for each group.
grouped.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,count,3.0,3.0
bar,mean,0.003073,0.180552
bar,std,1.888735,0.819171
bar,min,-1.609096,-0.666839
bar,25%,-1.035973,-0.21329
bar,50%,-0.462851,0.240259
bar,75%,0.809157,0.604247
bar,max,2.081165,0.968236
foo,count,5.0,5.0
foo,mean,-0.484591,0.084367


In [37]:
 # Several aggregations at once on column 'C'; each one becomes an output
 # column. String names request the built-in group reductions directly
 # instead of going through the numpy-callable aliases.
 grouped['C'].agg(['sum', 'mean', 'std'])

Unnamed: 0_level_0,sum,mean,std
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,0.009218,0.003073,1.888735
foo,-2.422957,-0.484591,1.802954


In [38]:
# Aggregate column 'D' and give the results custom column names. Passing a
# {new_name: func} dict to a single-column agg is rejected by current pandas,
# so aggregate by name first and rename the resulting columns afterwards;
# this form works on old and new pandas alike.
grouped['D'].agg(['sum', 'mean']).rename(columns={'sum': 'result1',
                                                  'mean': 'result2'})

Unnamed: 0_level_0,result2,result1
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.180552,0.541656
foo,0.084367,0.421836


In [39]:
# Apply sum, mean and std to every numeric column; the result has a two-level
# column index (column, statistic). The numeric columns are selected
# explicitly because mean/std cannot be computed for the string column 'B'
# and newer pandas releases no longer skip such columns silently.
grouped[['C', 'D']].agg(['sum', 'mean', 'std'])

Unnamed: 0_level_0,C,C,C,D,D,D
Unnamed: 0_level_1,sum,mean,std,sum,mean,std
A,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
bar,0.009218,0.003073,1.888735,0.541656,0.180552,0.819171
foo,-2.422957,-0.484591,1.802954,0.421836,0.084367,0.858178


### Group by range

* 在处理实时数据的时候，一种需要经常用到的方式是将原始log按每五分钟group在一起再计算
* 其他情况例如按某个range跨度group等等。。


In [40]:
# testing data: data/20151006
# 4730 20.00 2015-10-06 00:00:03
# 9087 10.00 2015-10-06 00:00:09
import pandas as pd

In [41]:
# Load the space-separated log; it has no header row, so the columns are
# numbered 0..3 (counter, volume, date, clock time).
raw_log = pd.read_csv('D:/20151006', sep=' ', header=None)
# Join the date and clock-time text and parse it into one datetime column.
raw_log["time"] = pd.to_datetime(raw_log[2].map(str) + ' ' + raw_log[3])
# Drop the two text columns now that "time" replaces them, then name the rest.
realtime = raw_log.drop([2, 3], axis=1)
realtime.columns = ['counter', 'volumn', 'time']

以上：
* line1 默认read_csv的sep是逗号，因此当源log的分隔使用空格时需要特别指定；
* 其实那个map不是必须的：map可以接收一个函数并把它应用到该列的每一个cell（元素）上，这里的作用是将realtime[2]列转换成string，然后直接使用加号相连；如果它是int或者float而且没被转换的话就会出错。
* to_datetime()是十分的好用，把string等转变为datetime类型，可以指定format同时能指定出现错误时候的处理方法
    * errors : {‘ignore’, ‘raise’, ‘coerce’}, default ‘raise’
    * If ‘raise’, then invalid parsing will raise an exception
    * If ‘coerce’, then invalid parsing will be set as NaT
    * If ‘ignore’, then invalid parsing will return the input

参考http://pandas.pydata.org/pandas-docs/stable/generated/pandas.to_datetime.html


In [42]:
# Preview the first five parsed rows.
realtime.head()

Unnamed: 0,counter,volumn,time
0,4730,20,2015-10-06 00:00:03
1,9087,10,2015-10-06 00:00:09
2,2240,100,2015-10-06 00:00:16
3,2240,100,2015-10-06 00:00:16
4,6808,50,2015-10-06 00:00:16


In [43]:
# Check the column dtypes; 'time' should now be a datetime column.
realtime.dtypes

counter             int64
volumn            float64
time       datetime64[ns]
dtype: object

In [44]:
# Bucket the rows into consecutive 5-minute windows keyed on the 'time' column.
five_minute_bins = pd.Grouper(freq='5Min', key='time')
time_groups = realtime.groupby(five_minute_bins)

In [None]:
# Inspect the group-key mapping of the 5-minute grouping (output not kept here).
time_groups.groups

In [47]:
# Fetch the window that starts at 10:00:00 (given as a string key) and show its first five rows.
time_groups.get_group('2015-10-06 10:00:00').head(5)

Unnamed: 0,counter,volumn,time
10642,7571,50,2015-10-06 10:00:01
10643,32066,20,2015-10-06 10:00:02
10644,22808,50,2015-10-06 10:00:02
10645,19524,50,2015-10-06 10:00:02
10646,7674,40,2015-10-06 10:00:02


一些时间相关的操作：



In [48]:
from datetime import datetime, timedelta
# Parse the log's date, written as YYYYMMDD, into a datetime at midnight.
date_object = datetime.strptime('20151006', '%Y%m%d')

In [50]:
# Step one 5-minute interval (1*5 minutes) past midnight.
time_line = date_object + timedelta(minutes=1*5)
time_line

datetime.datetime(2015, 10, 6, 0, 5)

In [51]:
# A datetime object can be used directly as the group key.
time_groups.get_group(time_line).head(5)

Unnamed: 0,counter,volumn,time
103,298,100,2015-10-06 00:05:02
104,8934,50,2015-10-06 00:05:03
105,9279,100,2015-10-06 00:05:07
106,5786,150,2015-10-06 00:05:08
107,21831,100,2015-10-06 00:05:08


In [52]:
# Another time idiom: truncate "now" to midnight, then step forward in
# 5-minute units (241 steps = 20 h 05 min after midnight).
i = datetime.now()
i = i.replace(hour=0, minute=0, second=0, microsecond=0)
# Call form of print: valid on Python 3 and, for a single argument, prints
# the same text on Python 2; it also matches the print(...) calls used in
# the other cells.
print(i + timedelta(minutes=5*241))

2015-11-05 20:05:00


In [53]:
# Group by the 'counter' column, here specified through pd.Grouper(key=...).
counter_groups = realtime.groupby(pd.Grouper(key='counter'))

In [None]:
# Inspect the group-key mapping of the per-counter grouping (output not kept here).
counter_groups.groups

In [58]:
# In some scenarios calling value_counts() directly may be faster than a groupby.
# Count once and reuse the result for both the boolean mask and the selection,
# instead of running value_counts() twice.
counter_counts = realtime.counter.value_counts()
counter_counts[counter_counts > 500]

17163    1988
dtype: int64