In [14]:
## pandas多级搜索
import pandas as pd
import numpy as np


In [3]:
index = [('California', 2000), ('California', 2010),
         ('New York', 2000), ('New York', 2010),
         ('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956,
               18976457, 19378102,
               20851820, 25145561]

index =pd.MultiIndex.from_tuples(index)
index


MultiIndex(levels=[['California', 'New York', 'Texas'], [2000, 2010]],
           labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])

In [5]:
pop = pd.Series(populations, index)
pop


California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [6]:
pop.reindex(index)

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [7]:
pop[:,2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

## 高纬数据的多级搜索
unstack()可以快速的将一个多级索引的Serias转化为普通索引的DataFrame

In [9]:
pop_df = pop.unstack()
pop_df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


stack操作相反，将普通索引转化为多级

In [10]:
pop_df.stack()

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [11]:
pop_df = pd.DataFrame({'total': pop, 'under18': [9267089, 9264094,
                                                 4687374, 4139022,
                                                 5906301, 6879014]})
pop_df


Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37253956,9264094
New York,2000,18976457,4687374
New York,2010,19378102,4139022
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [12]:
f_u18 = pop_df['under18'] / pop_df['total']
f_u18.unstack()


Unnamed: 0,2000,2010
California,0.273594,0.248674
New York,0.24701,0.213593
Texas,0.283251,0.273568


## 创建多级索引

In [15]:
df = pd.DataFrame(np.random.rand(4, 2),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=['data1', 'data2']
                  )
df


Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.648671,0.523094
a,2,0.921697,0.322052
b,1,0.879016,0.706391
b,2,0.469114,0.060732


In [17]:
df.stack()

a  1  data1    0.648671
      data2    0.523094
   2  data1    0.921697
      data2    0.322052
b  1  data1    0.879016
      data2    0.706391
   2  data1    0.469114
      data2    0.060732
dtype: float64

In [18]:
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]], names=['year', 'visit'])

columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']], names=['subject', 'type'])

#模拟数据
data = np.round(np.random.rand(4, 6), 1)
data[:, ::2] * 10  #切片操作会改变原来数据
data += 37

#创建DataFrame
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data


Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,37.4,37.6,37.5,37.4,37.1,37.3
2013,2,37.7,37.6,37.7,37.4,37.8,37.1
2014,1,38.0,37.0,37.6,37.3,37.9,37.2
2014,2,37.6,37.6,37.7,37.4,37.9,37.9


In [19]:
health_data['Guido']

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,37.5,37.4
2013,2,37.7,37.4
2014,1,37.6,37.3
2014,2,37.7,37.4
