In [2]:
import seaborn as sns
import numpy as np
import pandas as pd

In [3]:
# Load seaborn's bundled 'planets' example dataset and report its
# (rows, columns) shape.
planets = sns.load_dataset('planets')
planets.shape

(1035, 6)

In [4]:
# Preview the first rows to see which columns the dataset provides.
planets.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


## 简单的累计功能

In [5]:
# Fixed seed so the random values shown below are reproducible.
rng = np.random.RandomState(seed=42)
# Five uniform draws from [0, 1) wrapped in a Series with a default index.
ser = pd.Series(rng.random_sample(5))
ser

0    0.374540
1    0.950714
2    0.731994
3    0.598658
4    0.156019
dtype: float64

In [6]:
# Aggregate the Series down to a single number: the sum of its values.
ser.sum()

2.811925491708157

In [7]:
# Arithmetic mean of the Series values.
ser.mean()

0.5623850983416314

In [9]:
# Two columns of five random draws each, taken from the generator seeded
# earlier (column 'A' is drawn first, then 'B').
df = pd.DataFrame({label: rng.rand(5) for label in ('A', 'B')})
df

Unnamed: 0,A,B
0,0.183405,0.611853
1,0.304242,0.139494
2,0.524756,0.292145
3,0.431945,0.366362
4,0.291229,0.45607


In [11]:
df.mean()  # by default the aggregate is computed per column

A    0.347115
B    0.373185
dtype: float64

In [12]:
# axis='columns' (the named form of axis=1) averages across the columns,
# giving one value per row.
df.mean(axis='columns')

0    0.397629
1    0.221868
2    0.408451
3    0.399153
4    0.373650
dtype: float64

In [13]:
# Summary statistics computed only over rows that have no missing values.
planets.dropna().describe()

Unnamed: 0,number,orbital_period,mass,distance,year
count,498.0,498.0,498.0,498.0,498.0
mean,1.73494,835.778671,2.50932,52.068213,2007.37751
std,1.17572,1469.128259,3.636274,46.596041,4.167284
min,1.0,1.3283,0.0036,1.35,1989.0
25%,1.0,38.27225,0.2125,24.4975,2005.0
50%,1.0,357.0,1.245,39.94,2009.0
75%,2.0,999.6,2.8675,59.3325,2011.0
max,6.0,17337.5,25.0,354.0,2014.0


In [14]:
# Small example frame: three keys, each appearing twice, with data 0..5.
df = pd.DataFrame(
    {'key': list('ABCABC'), 'data': range(6)},
    columns=['key', 'data'],
)
df


Unnamed: 0,key,data
0,A,0
1,B,1
2,C,2
3,A,3
4,B,4
5,C,5


In [15]:
# groupby() on its own returns a DataFrameGroupBy object rather than a
# result table; an aggregation still has to be applied to it.
df.groupby('key')

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x000001BE6E090EB8>

In [16]:
# Lazy evaluation lets most common aggregations be applied in a way that is
# almost transparent to the user: here, the per-key sum.
df.groupby('key').sum()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,3
B,5
C,7


In [17]:
# Column selection on a groupby: start by grouping on the 'method' column.
planets.groupby('method')

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x000001BE6E0C14E0>

In [18]:
# Indexing the grouped frame by a column name yields a SeriesGroupBy.
planets.groupby('method')['orbital_period']

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x000001BE6E0C3EB8>

In [19]:
# Median of orbital_period within each 'method' group.
planets.groupby('method')['orbital_period'].median()

method
Astrometry                         631.180000
Eclipse Timing Variations         4343.500000
Imaging                          27500.000000
Microlensing                      3300.000000
Orbital Brightness Modulation        0.342887
Pulsar Timing                       66.541900
Pulsation Timing Variations       1170.000000
Radial Velocity                    360.200000
Transit                              5.714932
Transit Timing Variations           57.011000
Name: orbital_period, dtype: float64

In [27]:
# Iterating over a groupby yields (group key, sub-frame) pairs; print each
# key left-aligned in a 50-character field followed by the sub-frame shape.
row_template = "{0:50s} shape={1}"
for method, group in planets.groupby('method'):
    print(row_template.format(method, group.shape))

Astrometry                                         shape=(2, 6)
Eclipse Timing Variations                          shape=(9, 6)
Imaging                                            shape=(38, 6)
Microlensing                                       shape=(23, 6)
Orbital Brightness Modulation                      shape=(3, 6)
Pulsar Timing                                      shape=(5, 6)
Pulsation Timing Variations                        shape=(1, 6)
Radial Velocity                                    shape=(553, 6)
Transit                                            shape=(397, 6)
Transit Timing Variations                          shape=(4, 6)


In [39]:
# Dispatch methods: methods called on a groupby are applied to every group.
# unstack() then pivots describe()'s statistic columns (count, mean, ...) into
# the index, producing a Series indexed by (statistic, method).
planets.groupby('method')['year'].describe().unstack()

       method                       
count  Astrometry                          2.000000
       Eclipse Timing Variations           9.000000
       Imaging                            38.000000
       Microlensing                       23.000000
       Orbital Brightness Modulation       3.000000
       Pulsar Timing                       5.000000
       Pulsation Timing Variations         1.000000
       Radial Velocity                   553.000000
       Transit                           397.000000
       Transit Timing Variations           4.000000
mean   Astrometry                       2011.500000
       Eclipse Timing Variations        2010.000000
       Imaging                          2009.131579
       Microlensing                     2009.782609
       Orbital Brightness Modulation    2011.666667
       Pulsar Timing                    1998.400000
       Pulsation Timing Variations      2007.000000
       Radial Velocity                  2007.518987
       Transit             

In [40]:
# Aggregate, filter, transform, and apply: example frame used by the
# following cells. Seeded so that data2 is reproducible.
rng = np.random.RandomState(seed=0)
df = pd.DataFrame(
    {
        'key': list('ABCABC'),
        'data1': range(6),
        'data2': rng.randint(0, 10, size=6),
    },
    columns=['key', 'data1', 'data2'],
)
df


Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


In [42]:
# Compute several aggregates in one call by passing a list of aggregations.
# Each one is named by string: handing aggregate() the callables np.median or
# the builtin max is deprecated in recent pandas (>= 2.1), while the string
# names produce the same min / median / max columns without warnings.
df.groupby('key').aggregate(['min', 'median', 'max'])

Unnamed: 0_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,min,median,max,min,median,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,0,1.5,3,3,4.0,5
B,1,2.5,4,0,3.5,7
C,2,3.5,5,3,6.0,9


In [43]:
# Approach 2: a dict maps each column name to the aggregation applied to it.
per_column_ops = {'data1': 'min', 'data2': 'max'}
df.groupby('key').agg(per_column_ops)


Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,5
B,1,7
C,2,9


In [44]:
# Filter
def filter_func(x, threshold=4):
    """Decide whether a group should be kept by ``groupby(...).filter``.

    Parameters
    ----------
    x : pandas.DataFrame
        The rows of one group; must contain a ``'data2'`` column.
    threshold : float, optional
        Cut-off for the standard deviation of ``data2``. Defaults to 4, the
        value that was previously hard-coded.

    Returns
    -------
    bool
        True when the standard deviation of the group's ``data2`` values is
        strictly greater than ``threshold``.
    """
    return x['data2'].std() > threshold

# Show the input, the per-key standard deviations, and the rows of the
# groups that pass filter_func.
print(df)
key_spread = df.groupby('key').std()
print(key_spread)
kept_rows = df.groupby('key').filter(filter_func)
print(kept_rows)


  key  data1  data2
0   A      0      5
1   B      1      0
2   C      2      3
3   A      3      3
4   B      4      7
5   C      5      9
       data1     data2
key                   
A    2.12132  1.414214
B    2.12132  4.949747
C    2.12132  4.242641
  key  data1  data2
1   B      1      0
2   C      2      3
4   B      4      7
5   C      5      9


In [45]:
# Transform: the result has the same shape as the input; here each value is
# centered by subtracting the mean of its own key group.
by_key = df.groupby('key')
by_key.transform(lambda values: values - values.mean())


Unnamed: 0,data1,data2
0,-1.5,1.0
1,-1.5,-3.5
2,-1.5,-3.0
3,1.5,-1.0
4,1.5,3.5
5,1.5,3.0


In [46]:
# Per-key mean of every remaining column.
df.groupby('key').mean()

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,1.5,4.0
B,2.5,3.5
C,3.5,6.0


In [48]:
# The apply() method
def norm_by_data2(x):
    """Normalize ``data1`` by the total of ``data2`` within one group.

    Parameters
    ----------
    x : pandas.DataFrame
        The rows of one group; must contain ``'data1'`` and ``'data2'``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``x`` in which ``data1`` has been
        divided by ``x['data2'].sum()``. ``x`` itself is left unmodified.
    """
    # Build a new frame instead of assigning into `x`: functions passed to
    # groupby.apply must not mutate the group they receive.
    return x.assign(data1=x['data1'] / x['data2'].sum())
# Show the input, the per-key totals, and the result of applying
# norm_by_data2 to each key group.
print(df)
key_totals = df.groupby('key').sum()
print(key_totals)
normalized = df.groupby('key').apply(norm_by_data2)
print(normalized)


  key  data1  data2
0   A      0      5
1   B      1      0
2   C      2      3
3   A      3      3
4   B      4      7
5   C      5      9
     data1  data2
key              
A        3      8
B        5      7
C        7     12
  key     data1  data2
0   A  0.000000      5
1   B  0.142857      0
2   C  0.166667      3
3   A  0.375000      3
4   B  0.571429      7
5   C  0.416667      9
