# Chapter 9: 数据聚合与分组计算 (Data Aggregation and Group Operations)

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt

In [2]:
# Small demo frame: two label columns to group on and two columns of
# standard-normal noise to aggregate.
df = DataFrame(dict(
    key1=list('aabba'),
    key2=['one', 'two', 'one', 'two', 'one'],
    data1=np.random.randn(5),
    data2=np.random.randn(5),
))

In [5]:
# Group the whole frame by key1. key2 holds strings, so restrict the mean to
# numeric columns: pandas >= 2.0 raises TypeError instead of silently
# dropping non-numeric columns.
grouped = df.groupby(df['key1'])
print(grouped.mean(numeric_only=True))

# A Series can be grouped by any aligned Series: median of data1 per key2 label.
grouped2 = df['data1'].groupby(df['key2'])
print(grouped2.median())

         data1     data2
key1                    
a    -0.012554  0.138652
b    -0.259696  0.152595
key2
one   -0.345399
two   -0.059033
Name: data1, dtype: float64


In [6]:
# Mean of data1 for every (key1, key2) pair; the result is a Series with a
# two-level index.
df.groupby(['key1', 'key2'])['data1'].mean()

key1  key2
a     one    -0.046795
      two     0.055929
b     one    -0.345399
      two    -0.173994
Name: data1, dtype: float64

In [7]:
# Passing the column name is enough; df['key1'] is not needed.
# numeric_only=True skips the string column key2, which pandas >= 2.0 would
# otherwise refuse to average (TypeError).
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.012554,0.138652
b,-0.259696,0.152595


In [8]:
# Number of rows in each (key1, key2) combination.
df.groupby(['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

In [9]:
# Iterating a GroupBy yields (group key, sub-frame) pairs.
for name, group in df.groupby('key1'):
    print(name, group, sep='\n')

a
  key1 key2     data1     data2
0    a  one -1.037185  0.260148
1    a  two  0.055929 -1.432500
4    a  one  0.943596  1.588308
b
  key1 key2     data1     data2
2    b  one -0.345399  0.952972
3    b  two -0.173994 -0.647781


In [10]:
# With several grouping keys, the group key is a tuple that can be unpacked
# directly in the loop header.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print(f'{k1} {k2}', group, sep='\n')

a one
  key1 key2     data1     data2
0    a  one -1.037185  0.260148
4    a  one  0.943596  1.588308
a two
  key1 key2     data1   data2
1    a  two  0.055929 -1.4325
b one
  key1 key2     data1     data2
2    b  one -0.345399  0.952972
b two
  key1 key2     data1     data2
3    b  two -0.173994 -0.647781


In [11]:
# Materialise the groups as a {key1 value: sub-frame} mapping.
pieces = {key: chunk for key, chunk in df.groupby('key1')}
print(pieces)

{'a':   key1 key2     data1     data2
0    a  one -1.037185  0.260148
1    a  two  0.055929 -1.432500
4    a  one  0.943596  1.588308, 'b':   key1 key2     data1     data2
2    b  one -0.345399  0.952972
3    b  two -0.173994 -0.647781}


In [12]:
# Split the columns by dtype. DataFrame.groupby(axis=1) is deprecated
# (pandas 2.1), so build the {dtype: sub-frame} mapping with a boolean column
# mask instead of a column-wise groupby. grouped_col is now that mapping.
grouped_col = {dtype: df.loc[:, df.dtypes == dtype] for dtype in df.dtypes.unique()}
grouped_col

{dtype('float64'):       data1     data2
 0 -1.037185  0.260148
 1  0.055929 -1.432500
 2 -0.345399  0.952972
 3 -0.173994 -0.647781
 4  0.943596  1.588308, dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}

In [21]:
# Selecting with a single label gives a Series, selecting with a list of
# labels gives a DataFrame; the grouped mean keeps that shape.
print(df['data1'].groupby(df['key1']).mean(),
      df[['data1']].groupby(df['key1']).mean(),
      sep='\n')
print('\n')
print(type(df['data1']), type(df[['data1']]), sep='\n')

key1
a   -0.012554
b   -0.259696
Name: data1, dtype: float64
         data1
key1          
a    -0.012554
b    -0.259696


<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>


In [22]:
# 5x5 frame of standard-normal noise; rows are people, columns are a-e.
people = DataFrame(np.random.randn(5, 5),
                   columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])

# Map each column label to a colour and group the columns (axis=1) by it.
# NOTE(review): groupby(axis=1) is deprecated as of pandas 2.1; the
# replacement is people.T.groupby(groupby_map) with the result transposed back.
groupby_map = {'a' : 'red', 'b' : 'blue', 'c' : 'red', 'd' : 'red', 'e' : 'blue'}
by_column = people.groupby(groupby_map, axis=1)

In [25]:
# Aggregate across the columns that share a colour. groupby(axis=1) is
# deprecated (pandas 2.1), so group the transposed frame by the mapping and
# transpose each result back to one row per person.
by_colour = people.T.groupby(groupby_map)
print(by_colour.mean().T)
print(by_colour.count().T)

            blue       red
Joe     0.304454  0.345136
Steve  -0.176742 -0.301336
Wes    -0.274221  0.054564
Jim     0.859158 -0.843103
Travis -0.254842 -0.924578
        blue  red
Joe        2    3
Steve      2    3
Wes        2    3
Jim        2    3
Travis     2    3


In [27]:
# Scratch check: len() of a string is its character count.
sl = ['sam', 'jack', 'luc', 'rat']
len(sl[0])

# A function passed to groupby is called on each index label, so the rows of
# people are bucketed by the length of the name in the index.
people.groupby(len).sum()

3

In [28]:
# Mix a function (length of each index label) with an explicit per-row key
# list; the minimum is taken within each (length, key) pair.
key_list = ['one'] * 3 + ['two'] * 2
people.groupby(by=[len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-0.438113,0.176505,-1.320249,-0.668175,-0.724947
3,two,-0.516204,0.842492,-0.125942,-1.887164,0.875823
5,one,-0.563468,-0.087251,0.165671,-0.50621,-0.266233
6,two,-0.330661,0.159476,-1.966509,-0.476565,-0.66916


In [30]:
# Frame whose columns carry two levels: country ('cty') and tenor.
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],
                                     [1, 3, 5, 1, 3]], names=['cty', 'tenor'])
hier_df = DataFrame(np.random.randn(4, 5), columns=columns)

# Count the columns per country for every row. groupby(axis=1) is deprecated
# (pandas 2.1), so group the transposed frame on the 'cty' level and
# transpose the counts back.
hier_df.T.groupby(level='cty').count().T

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


In [31]:
# Number of columns under each country. groupby(axis=1) is deprecated
# (pandas 2.1); grouping the transposed frame on the same level gives the
# same per-country sizes.
hier_df.T.groupby(level='cty').size()

cty
JP    2
US    3
dtype: int64

In [32]:
def peak_to_peak(arr):
    """Return the spread of ``arr``: its largest value minus its smallest."""
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest
    
# Apply the custom aggregation per key1 group. Select the numeric columns
# explicitly: peak_to_peak subtracts values, which fails on the string column
# key2, and pandas >= 2.0 no longer drops such columns silently.
df.groupby('key1')[['data1', 'data2']].agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.980781,3.020808
b,0.171405,1.600753


In [35]:
# Summary statistics computed separately for each key1 group.
df.groupby('key1').describe()

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,-0.012554,0.992164,-1.037185,-0.490628,0.055929,0.499762,0.943596,3.0,0.138652,1.514064,-1.4325,-0.586176,0.260148,0.924228,1.588308
b,2.0,-0.259696,0.121202,-0.345399,-0.302548,-0.259696,-0.216845,-0.173994,2.0,0.152595,1.131904,-0.647781,-0.247593,0.152595,0.552784,0.952972


In [38]:
# Keep the per-group summary, then pull one statistic for one column:
# select the data1 block first and take its 'count' column.
grouped_summary = df.groupby('key1').describe()
grouped_summary['data1'].loc[:, 'count']

key1
a    3.0
b    2.0
Name: count, dtype: float64

In [41]:
# Load the tips dataset from the working directory; header=0 takes the
# column names from the first row of the file.
tips = pd.read_csv('tips.csv', header=0)
print(tips.columns)

Index(['total_bill', 'tip', 'smoker', 'day', 'time', 'size'], dtype='object')


In [47]:
# Group tips by day and smoker status, then narrow to the tip column.
grouped = tips.groupby(['day', 'smoker'])
grouped_pct = grouped['tip']
# An aggregation can be named by string or passed as a callable.
print(grouped_pct.agg('mean'), grouped_pct.agg(peak_to_peak), sep='\n')

day   smoker
Fri   No        2.812500
      Yes       2.714000
Sat   No        3.102889
      Yes       2.875476
Sun   No        3.167895
      Yes       3.516842
Thur  No        2.673778
      Yes       3.030000
Name: tip, dtype: float64
day   smoker
Fri   No        2.00
      Yes       3.73
Sat   No        8.00
      Yes       9.00
Sun   No        4.99
      Yes       5.00
Thur  No        5.45
      Yes       3.00
Name: tip, dtype: float64


In [48]:
# A list of aggregations yields one output column per entry.
grouped_pct.agg(['mean', peak_to_peak])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,peak_to_peak
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,2.8125,2.0
Fri,Yes,2.714,3.73
Sat,No,3.102889,8.0
Sat,Yes,2.875476,9.0
Sun,No,3.167895,4.99
Sun,Yes,3.516842,5.0
Thur,No,2.673778,5.45
Thur,Yes,3.03,3.0


In [52]:
# (name, function) tuples set the output column names. Use the string
# aliases rather than np.mean/np.std: passing the NumPy callables is
# deprecated (pandas 2.1), and a direct np.std call would switch from the
# sample standard deviation (ddof=1) to the population one (ddof=0).
grouped_pct.agg([('foo', 'mean'), ('bar', 'std')])

Unnamed: 0_level_0,Unnamed: 1_level_0,foo,bar
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,2.8125,0.898494
Fri,Yes,2.714,1.077668
Sat,No,3.102889,1.642088
Sat,Yes,2.875476,1.63058
Sun,No,3.167895,1.224785
Sun,Yes,3.516842,1.261151
Thur,No,2.673778,1.282964
Thur,Yes,3.03,1.113491


In [51]:
# Sanity check: np.mean on a plain list returns the arithmetic mean.
np.mean([1, 2, 3, 4])

2.5

In [53]:
# A dict applies a different aggregation to each column. 'max' replaces
# np.max: passing NumPy callables to agg is deprecated (pandas 2.1) in favour
# of the string alias.
grouped.agg({'tip' : 'max', 'size' : 'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,3.5,9
Fri,Yes,4.73,31
Sat,No,9.0,115
Sat,Yes,10.0,104
Sun,No,6.0,167
Sun,Yes,6.5,49
Thur,No,6.7,112
Thur,Yes,5.0,40


In [55]:
# A dict maps each column to its own aggregation(s); giving a list of
# aggregations for a column produces hierarchical result columns.
grouped.agg({'tip' : ['min', 'max', 'mean', 'std'], 'size' : 'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,tip,tip,tip,size
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,std,sum
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Fri,No,1.5,3.5,2.8125,0.898494,9
Fri,Yes,1.0,4.73,2.714,1.077668,31
Sat,No,1.0,9.0,3.102889,1.642088,115
Sat,Yes,1.0,10.0,2.875476,1.63058,104
Sun,No,1.01,6.0,3.167895,1.224785,167
Sun,Yes,1.5,6.5,3.516842,1.261151,49
Thur,No,1.25,6.7,2.673778,1.282964,112
Thur,Yes,2.0,5.0,3.03,1.113491,40


In [56]:
# Group keys become the (hierarchical) index of the result. numeric_only=True
# skips the string column time, which pandas >= 2.0 would otherwise refuse to
# average (TypeError).
tips.groupby(['day', 'smoker'], as_index=True).mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,size
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,18.42,2.8125,2.25
Fri,Yes,16.813333,2.714,2.066667
Sat,No,19.661778,3.102889,2.555556
Sat,Yes,21.276667,2.875476,2.47619
Sun,No,20.506667,3.167895,2.929825
Sun,Yes,24.12,3.516842,2.578947
Thur,No,17.113111,2.673778,2.488889
Thur,Yes,19.190588,3.03,2.352941


In [58]:
# as_index=False keeps the group keys as ordinary columns. numeric_only=True
# skips the string column time, which pandas >= 2.0 would otherwise refuse to
# average (TypeError).
tips.groupby(['day', 'smoker'], as_index=False).mean(numeric_only=True)

Unnamed: 0,day,smoker,total_bill,tip,size
0,Fri,No,18.42,2.8125,2.25
1,Fri,Yes,16.813333,2.714,2.066667
2,Sat,No,19.661778,3.102889,2.555556
3,Sat,Yes,21.276667,2.875476,2.47619
4,Sun,No,20.506667,3.167895,2.929825
5,Sun,Yes,24.12,3.516842,2.578947
6,Thur,No,17.113111,2.673778,2.488889
7,Thur,Yes,19.190588,3.03,2.352941


In [60]:
# Per-key1 means of the numeric columns. numeric_only=True is required on
# pandas >= 2.0 because the string column key2 cannot be averaged.
k1_means = df.groupby('key1').mean(numeric_only=True)
print(k1_means)

         data1     data2
key1                    
a    -0.012554  0.138652
b    -0.259696  0.152595


In [61]:
# Prefix every column label with 'mean_'; returns a new frame and leaves
# k1_means itself unchanged.
k1_means.add_prefix('mean_')

Unnamed: 0_level_0,mean_data1,mean_data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.012554,0.138652
b,-0.259696,0.152595


In [64]:
# transform broadcasts each group's mean back onto the rows of that group, so
# the result has the same shape as people. 'mean' replaces np.mean: passing
# NumPy callables to transform is deprecated (pandas 2.1).
people.groupby(['one', 'two', 'one', 'two', 'one']).transform('mean')

Unnamed: 0,a,b,c,d,e
Joe,0.307943,0.238251,-0.381687,-0.451133,-0.38799
Steve,-0.539836,0.377621,0.019864,-1.196687,0.304795
Wes,0.307943,0.238251,-0.381687,-0.451133,-0.38799
Jim,-0.539836,0.377621,0.019864,-1.196687,0.304795
Travis,0.307943,0.238251,-0.381687,-0.451133,-0.38799


In [67]:
def demean(arr):
    """Centre ``arr`` by subtracting its own mean from every element."""
    center = arr.mean()
    return arr - center

# Subtract each group's column means from the rows belonging to that group.
demeaned = (
    people
    .groupby(['one', 'two', 'one', 'two', 'one'])
    .transform(demean)
)
print(demeaned)

               a         b         c         d         e
Joe    -0.746055  0.140520  2.523382 -0.217042  0.618128
Steve  -0.023632 -0.464872  0.145806  0.690477 -0.571028
Wes     1.384660 -0.061746 -0.938561  0.242473 -0.336957
Jim     0.023632  0.464872 -0.145806 -0.690477  0.571028
Travis -0.638604 -0.078775 -1.584821 -0.025431 -0.281170


In [69]:
# 1000 rows of standard-normal noise in two columns.
frame = DataFrame(dict(data1=np.random.randn(1000),
                       data2=np.random.randn(1000)))

# Bucket data1 into four equal-width intervals.
factor = pd.cut(frame['data1'], bins=4)

def get_stats(group):
    """Summarise ``group`` as a dict holding its min, max, count and mean."""
    stat_names = ('min', 'max', 'count', 'mean')
    return {stat: getattr(group, stat)() for stat in stat_names}

# Summarise data2 within each data1 bin. factor is categorical, so state
# observed=False explicitly: relying on the implicit default is deprecated
# (pandas 2.1) because that default is changing to True.
grouped = frame.data2.groupby(factor, observed=False)

# Each group's stats dict becomes an inner index level; unstack pivots it
# into one column per statistic.
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-3.491, -1.723]",-1.919176,1.318596,32.0,-0.342536
"(-1.723, 0.0373]",-2.758056,2.915914,458.0,0.073108
"(0.0373, 1.798]",-2.928448,3.16873,474.0,0.00959
"(1.798, 3.559]",-1.485034,1.647399,36.0,0.124687
