# Hierarchical Indexing

In [1]:
import pandas as pd
import numpy as np

## A Multiply Indexed DataFrame

### 1. 안좋은 방법

In [2]:
# Population figures keyed by (state, year); insertion order fixes the row order.
census = {
    ('California', 2000): 33871648,
    ('California', 2010): 37253956,
    ('New York', 2000): 18976457,
    ('New York', 2010): 19378102,
    ('Texas', 2000): 20851820,
    ('Texas', 2010): 25145561,
}
index = list(census)
populations = list(census.values())
# A plain list of tuples is used as the index: each row label is one tuple object.
pop = pd.DataFrame({'populations': populations}, index=index)
pop

Unnamed: 0,populations
"(California, 2000)",33871648
"(California, 2010)",37253956
"(New York, 2000)",18976457
"(New York, 2010)",19378102
"(Texas, 2000)",20851820
"(Texas, 2010)",25145561


In [3]:
# Inspect the index: the (state, year) tuples are stored as generic Python
# objects (dtype='object'), not as separate index levels.
pop.index

Index([('California', 2000), ('California', 2010),   ('New York', 2000),
         ('New York', 2010),      ('Texas', 2000),      ('Texas', 2010)],
      dtype='object')

In [4]:
# Demonstrates the failure of the tuple-as-label approach: .loc on a DataFrame
# reads a tuple as (row label, column label), so 'California' alone is looked up
# as a row label and is not found.
# The KeyError is caught and printed so that "Restart & Run All" continues past
# this cell instead of stopping here.
try:
    pop.loc[('California', 2000)]
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 'California'

In [5]:
# Workaround for tuple labels: resolve the label to its integer position first,
# then fetch that row by position.
row_position = pop.index.get_loc(('California', 2000))
pop.iloc[row_position]

populations    33871648
Name: (California, 2000), dtype: int64

In [6]:
# Select only the rows for year 2010: scan every tuple label, keep those whose
# second element is 2010, and translate each kept label to its position for .iloc.
pop.iloc[[pop.index.get_loc(i) for i in pop.index if i[1] == 2010]]

Unnamed: 0,populations
"(California, 2010)",37253956
"(New York, 2010)",19378102
"(Texas, 2010)",25145561


### 2. 올바른 방법

In [7]:
# Convert the list of (state, year) tuples into a real two-level MultiIndex.
# Note: this rebinds `index` from a list of tuples to a pd.MultiIndex.
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [8]:
# Re-label the existing rows with the MultiIndex. The tuple labels of `pop`
# line up with the MultiIndex entries, so every population value is carried over.
pop2 = pop.reindex(index)
pop2

Unnamed: 0,Unnamed: 1,populations
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


In [9]:
# Partial indexing: selecting on the first level alone returns every row under
# 'California', indexed by the remaining (year) level.
pop2.loc['California']

Unnamed: 0,populations
2000,33871648
2010,37253956


In [10]:
# Name the two index levels so later cells can refer to them by name.
# rename_axis returns a new frame rather than mutating the Index object in place;
# an in-place rename would also affect any other reference to that same Index.
# NOTE(review): the first level is called 'city' although its labels are US states.
pop2 = pop2.rename_axis(index=['city', 'year'])
pop2

Unnamed: 0_level_0,Unnamed: 1_level_0,populations
city,year,Unnamed: 2_level_1
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


In [11]:
# Named index levels can be referenced inside query() like columns;
# this keeps only the rows whose 'year' level equals 2000.
pop2.query('year==2000')

Unnamed: 0_level_0,Unnamed: 1_level_0,populations
city,year,Unnamed: 2_level_1
California,2000,33871648
New York,2000,18976457
Texas,2000,20851820


### MultiIndex as extra dimension

In [12]:
# unstack() pivots the innermost index level ('year') into the columns,
# turning the two-level row index into a table with one row per 'city' label.
pop_df = pop2.unstack()
pop_df

Unnamed: 0_level_0,populations,populations
year,2000,2010
city,Unnamed: 1_level_2,Unnamed: 2_level_2
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [13]:
# stack() reverses the unstack: the 'year' column level moves back into the row index.
pop_df.stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,populations
city,year,Unnamed: 2_level_1
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


## Methods of MultiIndex Creation

In [14]:
# Passing a list of label lists as `index` creates a two-level MultiIndex directly.
# The values are only filler; a seeded generator makes them identical on every
# Restart & Run All instead of changing with each execution.
rng = np.random.default_rng(0)
df = pd.DataFrame(rng.random((4, 2)),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=['data1', 'data2'])
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.43353,0.778077
a,2,0.564813,0.197471
b,1,0.601648,0.464812
b,2,0.759808,0.551236


In [15]:
# A dict with tuple keys is enough: pandas turns the keys into a two-level index.
state_years = [('California', 2000), ('California', 2010),
               ('Texas', 2000), ('Texas', 2010),
               ('New York', 2000), ('New York', 2010)]
head_counts = [33871648, 37253956,
               20851820, 25145561,
               18976457, 19378102]
data = dict(zip(state_years, head_counts))
pd.Series(data)

California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64

### Explicit MultiIndex constructors

In [16]:
# Build a MultiIndex from parallel arrays: one list of labels per level.
pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [17]:
# Build a MultiIndex from a list of tuples: one tuple per entry, holding a label for each level.
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )