In [1]:
import pandas as pd
import numpy as np

# 透视
## pivot_table
比groupby更灵活。

- 可以通过 `columns` 参数设定列标签；
- `margins` 参数控制是否添加合计行/列（`All`）；

In [25]:
# Demo observations, one tuple per row: (state, year, pop, area, num).
rows = [
    ('Ohio', 2000, 1.5, 5.5, 'one'),
    ('Ohio', 2001, 1.7, 7.7, 'two'),
    ('Ohio', 2002, 3.6, 6.6, 'three'),
    ('Nevada', 2001, 2.4, 4.4, 'four'),
    ('Nevada', 2002, 2.9, 9.9, 'five'),
]
fields = ['state', 'year', 'pop', 'area', 'num']

# Transpose the records into the column-oriented dict that DataFrame expects.
data = {field: list(column) for field, column in zip(fields, zip(*rows))}

df = pd.DataFrame(data)

df.head()

Unnamed: 0,state,year,pop,area,num
0,Ohio,2000,1.5,5.5,one
1,Ohio,2001,1.7,7.7,two
2,Ohio,2002,3.6,6.6,three
3,Nevada,2001,2.4,4.4,four
4,Nevada,2002,2.9,9.9,five


### 设定行

In [33]:
# Pivot with row labels only; the default aggregation (mean) is applied
# to every value column within each (num, state) group.
df.pivot_table(
    index=['num', 'state'],     # row labels
    values=['pop', 'area'],     # columns to aggregate
)

Unnamed: 0_level_0,Unnamed: 1_level_0,area,pop
num,state,Unnamed: 2_level_1,Unnamed: 3_level_1
five,Nevada,9.9,2.9
four,Nevada,4.4,2.4
one,Ohio,5.5,1.5
three,Ohio,6.6,3.6
two,Ohio,7.7,1.7


### 增加聚合方式

In [37]:
# Apply several aggregations at once: each entry in `aggfunc` becomes a
# top-level column group in the result.
# String aliases are used instead of np.sum / np.mean because recent pandas
# versions emit a FutureWarning when NumPy callables are passed as aggfunc;
# the resulting column labels ('sum', 'mean') are unchanged.
pd.pivot_table(df,
               index=['num', 'state'],       # row labels
               values=['pop', 'area'],       # columns to aggregate
               aggfunc=['sum', 'mean']       # aggregation functions
              )

  pd.pivot_table(df,
  pd.pivot_table(df,


Unnamed: 0_level_0,Unnamed: 1_level_0,sum,sum,mean,mean
Unnamed: 0_level_1,Unnamed: 1_level_1,area,pop,area,pop
num,state,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
five,Nevada,9.9,2.9,9.9,2.9
four,Nevada,4.4,2.4,4.4,2.4
one,Ohio,5.5,1.5,5.5,1.5
three,Ohio,6.6,3.6,6.6,3.6
two,Ohio,7.7,1.7,7.7,1.7


### 设定行列

In [40]:
# Pivot with both row and column labels: one row per `num`, one column per
# (value, state) pair. Combinations absent from the data are filled with 0.
# 'sum' is passed as a string alias because recent pandas versions emit a
# FutureWarning when the NumPy callable np.sum is used as aggfunc.
pd.pivot_table(df,
               index=['num'],                # row labels
               columns=['state'],            # column labels
               values=['pop', 'area'],       # columns to aggregate
               aggfunc=['sum'],
               fill_value=0                  # replacement for missing cells
              )

  pd.pivot_table(df,


Unnamed: 0_level_0,sum,sum,sum,sum
Unnamed: 0_level_1,area,area,pop,pop
state,Nevada,Ohio,Nevada,Ohio
num,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
five,9.9,0.0,2.9,0.0
four,4.4,0.0,2.4,0.0
one,0.0,5.5,0.0,1.5
three,0.0,6.6,0.0,3.6
two,0.0,7.7,0.0,1.7


In [41]:
# Pivot by `num` (rows) and `state` (columns) and append the "All"
# totals row and column via `margins`.
# 'sum' is passed as a string alias because recent pandas versions emit a
# FutureWarning when the NumPy callable np.sum is used as aggfunc, and
# `margins` is given as the documented bool rather than the integer 1.
pd.pivot_table(df,
               index=['num'],                # row labels
               columns=['state'],            # column labels
               values=['pop', 'area'],       # columns to aggregate
               aggfunc=['sum'],
               fill_value=0,                 # replacement for missing cells
               margins=True                  # add the "All" totals row/column
              )

  pd.pivot_table(df,
  pd.pivot_table(df,
  pd.pivot_table(df,


Unnamed: 0_level_0,sum,sum,sum,sum,sum,sum
Unnamed: 0_level_1,area,area,area,pop,pop,pop
state,Nevada,Ohio,All,Nevada,Ohio,All
num,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
five,9.9,0.0,9.9,2.9,0.0,2.9
four,4.4,0.0,4.4,2.4,0.0,2.4
one,0.0,5.5,5.5,0.0,1.5,1.5
three,0.0,6.6,6.6,0.0,3.6,3.6
two,0.0,7.7,7.7,0.0,1.7,1.7
All,14.3,19.8,34.1,5.3,6.8,12.1


## groupby

In [45]:
# groupby equivalent of the row-only pivot: sum `area` and `pop` per `num`.
# The string alias 'sum' replaces np.sum, which triggers a FutureWarning in
# recent pandas versions when passed to agg.
df.groupby(['num'])[['area', 'pop']].agg('sum').fillna(0)

  df.groupby(['num'])[['area', 'pop']].agg(np.sum).fillna(0)


Unnamed: 0_level_0,area,pop
num,Unnamed: 1_level_1,Unnamed: 2_level_1
five,9.9,2.9
four,4.4,2.4
one,5.5,1.5
three,6.6,3.6
two,7.7,1.7


# 逆透视
## stack
`stack` 将列标签（列名）转换为行索引的一层；再配合 `reset_index()` 即可把它变成普通的一列。

In [13]:
# Wide-format demo data: one row per product, one column per month.
# A seeded local generator replaces the unseeded np.random.randn calls so the
# values are identical on every Restart & Run All.
rng = np.random.default_rng(42)

data = {
    'product': list('ABCDE'),
    'Jan': rng.standard_normal(5),
    'Feb': rng.standard_normal(5),
    'Mar': rng.standard_normal(5),
    'Apr': rng.standard_normal(5),
    'May': rng.standard_normal(5),
    'Jun': rng.standard_normal(5)
}

df = pd.DataFrame(data)

df.head()

Unnamed: 0,product,Jan,Feb,Mar,Apr,May,Jun
0,A,0.927846,-0.285866,0.701058,-0.532041,0.734615,-1.243142
1,B,-1.14857,-1.360192,-0.440771,0.818754,-1.002218,-0.691591
2,C,-0.066699,0.542037,-0.729462,-0.467078,1.102197,2.384517
3,D,-0.499626,-1.717789,0.125577,0.469721,0.686662,1.069873
4,E,-2.234766,-0.660656,1.794696,-0.090528,0.151282,0.753376


In [14]:
# Unpivot: keep the identifier column as the index, move the month columns
# into an index level, then flatten everything back into regular columns.
# The level and value names are set inside the chain instead of by
# overwriting the column labels afterwards.
df_stacked = (
    df.set_index('product')
      .rename_axis(columns='month')   # name of the stacked column level
      .stack()
      .rename('sale_amt')             # name of the value column
      .reset_index()
)
df_stacked.head()

Unnamed: 0,product,month,sale_amt
0,A,Jan,0.927846
1,A,Feb,-0.285866
2,A,Mar,0.701058
3,A,Apr,-0.532041
4,A,May,0.734615


## 多个分类指标

In [15]:
# Wide-format demo data with two identifier columns (product, cat) and one
# column per month.
# A seeded local generator replaces the unseeded np.random.randn calls so the
# values are identical on every Restart & Run All.
rng = np.random.default_rng(43)

data = {
    'product': list('ABCDE'),
    'cat': list('一二一二一'),
    'Jan': rng.standard_normal(5),
    'Feb': rng.standard_normal(5),
    'Mar': rng.standard_normal(5),
    'Apr': rng.standard_normal(5),
    'May': rng.standard_normal(5),
    'Jun': rng.standard_normal(5)
}

df = pd.DataFrame(data)

df.head()

Unnamed: 0,product,cat,Jan,Feb,Mar,Apr,May,Jun
0,A,一,-0.125164,-1.262792,-0.822153,-0.433824,0.713923,0.084344
1,B,二,-1.595572,1.456221,0.856915,1.536777,1.48505,1.427357
2,C,一,3.402358,-0.416169,0.195352,-2.158873,-0.764844,2.043812
3,D,二,-0.921769,1.153112,-0.67325,0.228433,-1.405622,0.05665
4,E,一,0.84755,-0.345598,-0.801974,-1.082217,0.369749,0.034031


In [16]:
# Unpivot with two identifier columns: both stay in the index while the
# month columns are stacked into a third level, then everything is flattened
# back into regular columns. Names are set inside the chain instead of by
# overwriting the column labels afterwards.
id_columns = ['cat', 'product']
df_stacked = (
    df.set_index(id_columns)
      .rename_axis(columns='month')   # name of the stacked column level
      .stack()
      .rename('sale_amt')             # name of the value column
      .reset_index()
)

df_stacked.head()

Unnamed: 0,cat,product,month,sale_amt
0,一,A,Jan,-0.125164
1,一,A,Feb,-1.262792
2,一,A,Mar,-0.822153
3,一,A,Apr,-0.433824
4,一,A,May,0.713923
