# Hierarchical Indexing

* Incorporate multiple index levels within a single index.

# A. A Multiply Indexed Series

In [1]:
import pandas as pd
import numpy as np

### A.1. Pandas `MultiIndex`



In [2]:
# Build a two-level index directly from (state, year) tuples.
index = pd.MultiIndex.from_tuples([
    ('California', 2010), ('California', 2020),
    ('New York', 2010), ('New York', 2020),
    ('Texas', 2010), ('Texas', 2020),
])

# One population figure per index entry, in the same order as the tuples.
populations = [
    37253956, 39538223,  # California 2010, 2020
    19378102, 20201249,  # New York   2010, 2020
    25145561, 29145505,  # Texas      2010, 2020
]

# A Series whose index is the MultiIndex built above.
pop = pd.Series(populations, index=index)
pop


California  2010    37253956
            2020    39538223
New York    2010    19378102
            2020    20201249
Texas       2010    25145561
            2020    29145505
dtype: int64

In [3]:
# Partial indexing: a full slice on the first level plus 2020 on the second
# level returns the 2020 value for every state.
pop[:, 2020]

California    39538223
New York      20201249
Texas         29145505
dtype: int64

### A.2. MultiIndex as Extra Dimension

* The `unstack` method will quickly convert a multiply indexed Series into a conventionally indexed DataFrame.

In [4]:
# Pivot the inner index level (the years) into columns, giving one row per state.
pop_df = pop.unstack()
pop_df

Unnamed: 0,2010,2020
California,37253956,39538223
New York,19378102,20201249
Texas,25145561,29145505


In [5]:
# stack() goes the other way: the columns are folded back into an inner index level.
pop_df.stack()

California  2010    37253956
            2020    39538223
New York    2010    19378102
            2020    20201249
Texas       2010    25145561
            2020    29145505
dtype: int64

In [7]:
# With the MultiIndex as the row index, each additional measurement is just
# another column; the list is matched to pop's rows by position.
pop_df = pd.DataFrame(
    {
        'total': pop,
        'under18': [
            9284094, 8898092,  # California 2010, 2020
            4318033, 4181528,  # New York   2010, 2020
            6879014, 7432474,  # Texas      2010, 2020
        ],
    }
)
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2010,37253956,9284094
California,2020,39538223,8898092
New York,2010,19378102,4318033
New York,2020,20201249,4181528
Texas,2010,25145561,6879014
Texas,2020,29145505,7432474


In [8]:
# Fraction of the population under 18 for every (state, year) row,
# displayed with the years pivoted into columns.
f_u18 = pop_df['under18'].div(pop_df['total'])
f_u18.unstack()

Unnamed: 0,2010,2020
California,0.249211,0.22505
New York,0.222831,0.206994
Texas,0.273568,0.255013


In [9]:
# The same fractions in their stacked (multiply indexed Series) form.
f_u18

California  2010    0.249211
            2020    0.225050
New York    2010    0.222831
            2020    0.206994
Texas       2010    0.273568
            2020    0.255013
dtype: float64

# B. Methods of MultiIndex Creation

In [11]:
# Passing a list of two (or more) parallel arrays as `index` makes the
# constructor build a MultiIndex from them.
df = pd.DataFrame(
    np.random.rand(4, 2),
    index=[['a', 'a', 'b', 'b'],   # outer level
           [1, 2, 1, 2]],          # inner level
    columns=['data1', 'data2'],
)
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.887582,0.791925
a,2,0.094555,0.247174
b,1,0.046357,0.508852
b,2,0.859103,0.259699


In [12]:
# A dictionary keyed by (state, year) tuples: pandas turns the tuple keys
# into a MultiIndex when the dict is passed to Series.
data = dict(zip(
    [('California', 2010), ('California', 2020),
     ('New York', 2010), ('New York', 2020),
     ('Texas', 2010), ('Texas', 2020)],
    [37253956, 39538223,
     19378102, 20201249,
     25145561, 29145505],
))
pd.Series(data)

California  2010    37253956
            2020    39538223
New York    2010    19378102
            2020    20201249
Texas       2010    25145561
            2020    29145505
dtype: int64

### B.1. Explicit MultiIndex Constructors

* Any of these objects can be passed as the index argument when creating a Series or DataFrame, or be passed to the `reindex` method of an existing Series or DataFrame.

In [13]:
# Construct from a list of parallel arrays, one array per index level
pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [14]:
# Construct from a list of tuples, each tuple giving the full key of one entry
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [15]:
# Construct from the Cartesian product of the per-level label lists
pd.MultiIndex.from_product([['a', 'b'], [1, 2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [16]:
# Construct directly from `levels` (the distinct labels of each level) and
# `codes` (for every entry, the position of its label within that level).
pd.MultiIndex(levels=[['a', 'b'], [1, 2]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

### B.2. MultiIndex Level Names

In [18]:
# The population Series before its index levels are given names
pop

California  2010    37253956
            2020    39538223
New York    2010    19378102
            2020    20201249
Texas       2010    25145561
            2020    29145505
dtype: int64

In [20]:
# Name the two index levels by assigning to the `names` attribute of pop's
# existing index; the names then appear as a header row in the display.
pop.index.names = ['state', 'year']
pop

state       year
California  2010    37253956
            2020    39538223
New York    2010    19378102
            2020    20201249
Texas       2010    25145561
            2020    29145505
dtype: int64

### B.3. MultiIndex for Columns


In [21]:
# Hierarchical rows: every (year, visit) pair.
index = pd.MultiIndex.from_product(
    [[2013, 2014], [1, 2]],
    names=['year', 'visit'],
)
# Hierarchical columns: every (subject, type) pair.
columns = pd.MultiIndex.from_product(
    [['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
    names=['subject', 'type'],
)

# Mock measurements: standard-normal draws rounded to one decimal; the
# even-numbered columns (the HR columns) are scaled by 10, then everything
# is shifted by 37.
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] = data[:, ::2] * 10
data = data + 37
data

array([[32. , 36.5, 43. , 37.9, 49. , 37.7],
       [37. , 36.5, 26. , 37.2, 28. , 37.6],
       [16. , 35.4, 23. , 38.2, 33. , 35.6],
       [22. , 36.9, 27. , 35.9, 54. , 38.7]])

In [22]:
# Create the DataFrame with hierarchical row and column indices

health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,32.0,36.5,43.0,37.9,49.0,37.7
2013,2,37.0,36.5,26.0,37.2,28.0,37.6
2014,1,16.0,35.4,23.0,38.2,33.0,35.6
2014,2,22.0,36.9,27.0,35.9,54.0,38.7


In [23]:
# Indexing the top column level by a subject name returns that subject's
# HR and Temp columns as a smaller DataFrame.
health_data['Guido']

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,43.0,37.9
2013,2,26.0,37.2
2014,1,23.0,38.2
2014,2,27.0,35.9


# C. Indexing and Slicing a MultiIndex

### C.1. Multiply Indexed Series

In [24]:
# The population Series, now with named `state` and `year` index levels
pop

state       year
California  2010    37253956
            2020    39538223
New York    2010    19378102
            2020    20201249
Texas       2010    25145561
            2020    29145505
dtype: int64

In [25]:
# Full key: one label per level returns a single element
pop['California', 2010]

37253956

In [26]:
# Partial key: giving only the first level returns a Series indexed by the remaining level
pop['California']

year
2010    37253956
2020    39538223
dtype: int64

In [27]:
# Label slice on the first level; both endpoints are included in the result
pop.loc['California':'New York']

state       year
California  2010    37253956
            2020    39538223
New York    2010    19378102
            2020    20201249
dtype: int64

In [28]:
# Full slice on the first level, 2010 on the second: the 2010 value for every state
pop[:, 2010]

state
California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [29]:
# Boolean-mask selection: keep only the entries above 22,000,000
pop[pop > 22000000]

state       year
California  2010    37253956
            2020    39538223
Texas       2010    25145561
            2020    29145505
dtype: int64

In [30]:
# Fancy indexing: a list of first-level labels selects all rows for those states
pop[['California', 'Texas']]

state       year
California  2010    37253956
            2020    39538223
Texas       2010    25145561
            2020    29145505
dtype: int64

### C.2. Multiply Indexed DataFrames

In [31]:
# The health DataFrame built earlier, shown again as the starting point for indexing
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,32.0,36.5,43.0,37.9,49.0,37.7
2013,2,37.0,36.5,26.0,37.2,28.0,37.6
2014,1,16.0,35.4,23.0,38.2,33.0,35.6
2014,2,22.0,36.9,27.0,35.9,54.0,38.7


In [32]:
# A (subject, type) key addresses a single column and returns it as a Series
health_data['Guido', 'HR']

year  visit
2013  1        43.0
      2        26.0
2014  1        23.0
      2        27.0
Name: (Guido, HR), dtype: float64

In [34]:
# Positional indexing: first two rows and first two columns
health_data.iloc[:2, :2]

Unnamed: 0_level_0,subject,Bob,Bob
Unnamed: 0_level_1,type,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,32.0,36.5
2013,2,37.0,36.5


In [35]:
# Label indexing: all rows, and the column identified by the tuple ('Bob', 'HR')
health_data.loc[:, ('Bob', 'HR')]

year  visit
2013  1        32.0
      2        37.0
2014  1        16.0
      2        22.0
Name: (Bob, HR), dtype: float64

In [36]:
# pd.IndexSlice allows slice syntax inside the per-level keys passed to .loc.
idx = pd.IndexSlice
# Rows: every year, visit 1. Columns: every subject, type 'HR'.
health_data.loc[idx[:, 1], idx[:, 'HR']]

Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,32.0,43.0,49.0
2014,1,16.0,23.0,33.0


# D. Rearranging Multi-Indexes

### D.1. Sorted and Unsorted Indices
* Many of the MultiIndex Slicing operations will fail if the index is not sorted.

In [37]:
# Build a deliberately unsorted MultiIndex: the first level is ordered a, c, b.
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])
# Six uniform random values, one per index entry (no seed is set, so values vary per run).
data = pd.Series(np.random.rand(6), index=index)
# Name the levels by assigning to the Series' index in place.
data.index.names = ['char', 'int']
data

char  int
a     1      0.928030
      2      0.617498
c     1      0.816240
      2      0.843884
b     1      0.120676
      2      0.017223
dtype: float64

In [38]:
# Slicing by label on the unsorted index raises a KeyError; catch it and
# print the message so the failure is shown without aborting the cell.
try:
    data['a':'b']
except KeyError as e:
    print("KeyError", e)

KeyError 'Key length (1) was greater than MultiIndex lexsort depth (0)'


In [39]:
# Rebind `data` to a version ordered by its index labels (a, b, c).
data = data.sort_index()
data

char  int
a     1      0.928030
      2      0.617498
b     1      0.120676
      2      0.017223
c     1      0.816240
      2      0.843884
dtype: float64

In [40]:
# With the index sorted, the same label slice succeeds and includes both 'a' and 'b'
data['a':'b']

char  int
a     1      0.928030
      2      0.617498
b     1      0.120676
      2      0.017223
dtype: float64

### D.2. Stacking and Unstacking Indices

In [41]:
# The multiply indexed population Series used for the unstacking examples
pop

state       year
California  2010    37253956
            2020    39538223
New York    2010    19378102
            2020    20201249
Texas       2010    25145561
            2020    29145505
dtype: int64

In [44]:
# Move index level 0 (state) into the columns; years remain as rows
pop.unstack(level=0)

state,California,New York,Texas
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010,37253956,19378102,25145561
2020,39538223,20201249,29145505


In [45]:
# Move index level 1 (year) into the columns; states remain as rows
pop.unstack(level=1)


year,2010,2020
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,37253956,39538223
New York,19378102,20201249
Texas,25145561,29145505


### D.3. Index Setting and Resetting


In [46]:
# Turn the index levels into ordinary columns; `name` labels the column
# that holds the Series values.
pop_flat = pop.reset_index(name='population')
pop_flat

Unnamed: 0,state,year,population
0,California,2010,37253956
1,California,2020,39538223
2,New York,2010,19378102
3,New York,2020,20201249
4,Texas,2010,25145561
5,Texas,2020,29145505


In [47]:
# Rebuild a MultiIndex from the state and year columns (the result is displayed, not assigned)
pop_flat.set_index(['state', 'year'])

Unnamed: 0_level_0,Unnamed: 1_level_0,population
state,year,Unnamed: 2_level_1
California,2010,37253956
California,2020,39538223
New York,2010,19378102
New York,2020,20201249
Texas,2010,25145561
Texas,2020,29145505
