In [1]:
### Hierarchical indexing is an important feature of pandas that lets us have multiple (two or more) index levels on an axis.

### It provides a way for us to work with higher dimensional data in a lower dimensional form.


In [2]:
import pandas as pd
import numpy as np

In [3]:
### Ten random values labelled by a two-level index:
### letters on the outer level, integers on the inner level
data = pd.Series(
    np.random.randn(10),
    index=[
        ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
        [1, 2, 3, 1, 2, 3, 1, 2, 2, 3],
    ],
)

In [4]:
data

a  1    0.477359
   2   -1.469327
   3   -0.084139
b  1    0.170079
   2   -0.509953
   3    0.567489
c  1   -0.603541
   2    0.776937
d  2   -0.247770
   3   -1.509999
dtype: float64

In [5]:
### The output above is a prettified view of a Series that has a MultiIndex as its index.

data.index

MultiIndex(levels=[[u'a', u'b', u'c', u'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

In [6]:
### With a hierarchically indexed object, partial indexing is possible, enabling us to concisely select subsets of the data

In [7]:
data['a']

1    0.477359
2   -1.469327
3   -0.084139
dtype: float64

In [8]:
data['a':'c']

a  1    0.477359
   2   -1.469327
   3   -0.084139
b  1    0.170079
   2   -0.509953
   3    0.567489
c  1   -0.603541
   2    0.776937
dtype: float64

In [9]:
### Select several outer-level groups at once with the label-based .loc indexer
### (.ix was deprecated in pandas 0.20 and removed in pandas 1.0)
data.loc[['b','d']]

b  1    0.170079
   2   -0.509953
   3    0.567489
d  2   -0.247770
   3   -1.509999
dtype: float64

In [10]:
### Selection is even possible in some cases from an "inner" level

data[:,2]

a   -1.469327
b   -0.509953
c    0.776937
d   -0.247770
dtype: float64

In [11]:
### Inner-level selection spelled with the explicit label-based indexer:
### every outer label, inner label 2
data.loc[:, 2]

a   -1.469327
b   -0.509953
c    0.776937
d   -0.247770
dtype: float64

In [12]:
### Hierarchical indexing plays a critical role in reshaping data and in group-based operations such as building a pivot table.

### For example, the data Series could be rearranged into a DataFrame using its unstack method

data.unstack()

Unnamed: 0,1,2,3
a,0.477359,-1.469327,-0.084139
b,0.170079,-0.509953,0.567489
c,-0.603541,0.776937,
d,,-0.24777,-1.509999


In [13]:
### The reverse operation of unstack is stack

data.unstack().stack()

a  1    0.477359
   2   -1.469327
   3   -0.084139
b  1    0.170079
   2   -0.509953
   3    0.567489
c  1   -0.603541
   2    0.776937
d  2   -0.247770
   3   -1.509999
dtype: float64

In [14]:
### With a DataFrame, either axis can have a hierarchical index

### 4x3 frame of random values; both the rows and the columns carry two index levels
df = pd.DataFrame(
    np.random.randn(4, 3),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=[['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
)

In [15]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,-1.671588,1.189699,0.974884
a,2,0.110755,-0.137042,0.466734
b,1,-0.464769,0.112533,-1.028509
b,2,-0.199491,0.294608,-0.535509


In [16]:
df.index

MultiIndex(levels=[[u'a', u'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [17]:
df.columns

MultiIndex(levels=[[u'Colorado', u'Ohio'], [u'Green', u'Red']],
           labels=[[1, 1, 0], [0, 1, 0]])

In [18]:
### The hierarchical levels can have names (as strings or any Python objects).

### This assigns names to the two row-index levels, i.e. 'key1' to the a, b, ... level and 'key2' to the 1, 2, ... level
df.index.names = ['key1','key2']

In [19]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,-1.671588,1.189699,0.974884
a,2,0.110755,-0.137042,0.466734
b,1,-0.464769,0.112533,-1.028509
b,2,-0.199491,0.294608,-0.535509


In [20]:
### Now let's assign names to the column levels
df.columns.names = ['state','color']

In [21]:
df

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,-1.671588,1.189699,0.974884
a,2,0.110755,-0.137042,0.466734
b,1,-0.464769,0.112533,-1.028509
b,2,-0.199491,0.294608,-0.535509


In [22]:
### With partial column indexing you can similarly select groups of columns
df['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,-1.671588,1.189699
a,2,0.110755,-0.137042
b,1,-0.464769,0.112533
b,2,-0.199491,0.294608


In [87]:
### cust_index is not assigned in any earlier cell, so build it explicitly here;
### this keeps the notebook runnable from a fresh kernel.
### Two column levels: 'state' (outer) and 'color' (inner).
cust_index = pd.MultiIndex.from_arrays(
    [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
    names=['state', 'color'],
)
cust_index

MultiIndex(levels=[[u'Colorado', u'Ohio'], [u'Green', u'Red']],
           labels=[[1, 1, 0], [0, 1, 0]],
           names=[u'state', u'color'])

In [28]:
### A MultiIndex can label rows as well: three rows labelled by cust_index, four default-numbered columns.
### NOTE(review): this rebinds df, so the frame built earlier under the same name is replaced.
df = pd.DataFrame(np.random.randn(3,4), index = cust_index)

In [37]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3
state,color,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ohio,Green,-0.587313,0.307269,-1.961853,0.028354
Ohio,Red,0.228699,2.510734,0.443295,-0.408094
Colorado,Green,0.3076,-0.706806,-1.135073,0.135542


In [38]:
### For detailed info, refer to http://pandas.pydata.org/pandas-docs/stable/advanced.html

In [54]:
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
          ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]

In [55]:
arrays

[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
 ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]

In [56]:
### Pair up the two parallel lists position by position: one (first, second) tuple per entry
tuples = list(zip(*arrays))

In [57]:
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [58]:
cust_index1 = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])

In [59]:
cust_index1

MultiIndex(levels=[[u'bar', u'baz', u'foo', u'qux'], [u'one', u'two']],
           labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=[u'first', u'second'])

In [60]:
s = pd.Series(np.random.randn(8), index=cust_index1)

In [61]:
s

first  second
bar    one       0.004966
       two       0.496378
baz    one      -2.015924
       two      -0.149823
foo    one       0.368122
       two      -0.104881
qux    one       0.676339
       two      -0.164165
dtype: float64

In [62]:
### When you want every pairing of the elements in two iterables, it can be easier to use the MultiIndex.from_product function


In [63]:
iterables = [['bar', 'baz', 'foo', 'qux'], ['one', 'two']]

In [65]:
from_itr = pd.MultiIndex.from_product(iterables, names=['first', 'second'])

In [71]:
df = pd.DataFrame(np.random.randn(8,2), index=from_itr)

In [72]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1.147693,0.113693
bar,two,0.635327,-0.621988
baz,one,0.169449,-0.124641
baz,two,1.227051,0.13963
foo,one,-0.320438,1.155455
foo,two,-0.483355,0.635989
qux,one,-1.691191,0.299617
qux,two,-1.205191,-0.150242


In [73]:
### As a convenience, you can pass a list of arrays directly into Series or DataFrame to construct a MultiIndex automatically

In [75]:
### Two parallel NumPy label arrays; position i pairs an outer label with an inner label
from_arrays = [
    np.array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']),
    np.array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']),
]

In [79]:
s_fa = pd.Series(np.random.randn(8), index=from_arrays)

In [80]:
s_fa

bar  one   -0.134619
     two   -0.359423
baz  one    0.224108
     two   -0.868649
foo  one   -1.310265
     two   -0.323248
qux  one   -1.237893
     two   -0.312859
dtype: float64

In [81]:
df_fa = pd.DataFrame(np.random.randn(8, 4), index=from_arrays)

In [82]:
df_fa

Unnamed: 0,Unnamed: 1,0,1,2,3
bar,one,0.209345,-0.547609,-0.985437,-0.526135
bar,two,-1.804567,0.495107,-0.185384,-0.182224
baz,one,-0.938063,0.572297,-0.348711,-0.345012
baz,two,0.141238,0.959861,-1.084878,-0.365826
foo,one,-1.083868,-0.67118,-0.379585,-0.462061
foo,two,-1.431173,2.644074,-0.322535,-0.0902
qux,one,1.204697,0.475341,-0.995881,0.095592
qux,two,-0.282913,-0.320403,0.099258,-0.351209


In [83]:
### All of the MultiIndex constructors accept a names argument which stores string names for the levels themselves. 
### If no names are provided, None will be assigned
df_fa.index.names

FrozenList([None, None])

In [85]:
df_fac = pd.DataFrame(np.random.randn(3, 8), index=['A', 'B', 'C'], columns=from_arrays)

In [86]:
df_fac

Unnamed: 0_level_0,bar,bar,baz,baz,foo,foo,qux,qux
Unnamed: 0_level_1,one,two,one,two,one,two,one,two
A,1.071872,2.224337,-0.410465,0.105868,-0.136011,-0.868307,0.285897,-0.342632
B,1.368716,-0.487384,-0.056665,-0.410722,-0.277962,-0.78729,-0.361756,1.007742
C,0.281273,-0.066509,0.447445,-1.053085,-0.989202,0.283299,-1.251567,0.512025


In [None]:
### So going back to our first example

### df = pd.DataFrame(np.random.randn(4,3), index = [['a','a','b','b'], [1,2,1,2]], 
    ###              columns=[['Ohio','Ohio','Colorado'], ['Green','Red','Green']])

### This can be created by using our cust_index as follows

In [88]:
### Same row labels as the first example; the column labels come from the prebuilt cust_index
df_e1 = pd.DataFrame(
    np.random.randn(4, 3),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=cust_index,
)

In [89]:
df_e1

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
a,1,-0.386131,-0.566485,-0.357676
a,2,0.036605,-0.560146,-0.615468
b,1,0.145183,-0.01155,-2.030255
b,2,-0.151022,-0.743094,-0.275676


In [90]:
### The same row and column labels as df_e1 are produced by passing the nested lists directly
### (the values themselves differ because they are drawn at random)
df_o = pd.DataFrame(
    np.random.randn(4, 3),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=[['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
)

In [91]:
df_o

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0.66278,-0.934671,0.814717
a,2,-1.112588,-0.282671,1.275469
b,1,-0.63398,1.379521,0.15112
b,2,-2.852114,-0.354502,-0.022666


In [92]:
### except that we haven't assigned names to the column levels yet, which we can do as follows

df_o.columns.names = ['state','color']

In [93]:
df_o

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
a,1,0.66278,-0.934671,0.814717
a,2,-1.112588,-0.282671,1.275469
b,1,-0.63398,1.379521,0.15112
b,2,-2.852114,-0.354502,-0.022666


In [94]:
### Now df_o has the same row and column labels (including the column level names) as df_e1; only the random values differ

df_e1

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
a,1,-0.386131,-0.566485,-0.357676
a,2,0.036605,-0.560146,-0.615468
b,1,0.145183,-0.01155,-2.030255
b,2,-0.151022,-0.743094,-0.275676


In [None]:
### Awesome!!!!

In [None]:
### Now let's make it much simpler

In [95]:
arrays_simple = [['a', 'a', 'b', 'b'],[1,2,1,2]]

In [96]:
arrays_simple

[['a', 'a', 'b', 'b'], [1, 2, 1, 2]]

In [97]:
tuples_simple = list(zip(*arrays_simple))

In [98]:
tuples_simple

[('a', 1), ('a', 2), ('b', 1), ('b', 2)]

In [None]:
### Now let's modify the definition of df_e1
### which is currently df_e1 = pd.DataFrame(np.random.randn(4,3), index = [['a','a','b','b'], [1,2,1,2]], columns=cust_index)

In [101]:
### Same construction as df_e1, with the row labels taken from the arrays_simple variable
df_e2 = pd.DataFrame(
    np.random.randn(4, 3),
    index=arrays_simple,
    columns=cust_index,
)

In [102]:
df_e2

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
a,1,0.038693,-1.653746,-0.802216
a,2,0.800885,-1.745823,-0.118305
b,1,-0.325193,0.289078,-0.626914
b,2,-0.118794,0.204492,1.498627


In [104]:
### If you pass the list of tuples as the index of a new data frame, each tuple is kept as a single label (see the output below), so it will look like this

### Row labels come from the list of tuples; column labels from cust_index
df_e3 = pd.DataFrame(
    np.random.randn(4, 3),
    index=tuples_simple,
    columns=cust_index,
)

In [105]:
df_e3

state,Ohio,Ohio,Colorado
color,Green,Red,Green
"(a, 1)",-0.56554,0.011967,0.684804
"(a, 2)",-0.876472,-1.21726,0.539996
"(b, 1)",-0.92806,-1.765981,0.149603
"(b, 2)",0.208658,-0.685984,-0.45246


In [None]:
### The index can back any axis of a pandas object, and the number of levels of the index is up to you

In [109]:
### Two-level MultiIndex built from the (first, second) tuples; this is the same
### construction used for cust_index1 earlier, bound to a new name
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])

In [110]:
index

MultiIndex(levels=[[u'bar', u'baz', u'foo', u'qux'], [u'one', u'two']],
           labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=[u'first', u'second'])

In [111]:
df_levels = pd.DataFrame(np.random.randn(6, 6), index=index[:6], columns=index[:6])

In [112]:
df_levels

Unnamed: 0_level_0,first,bar,bar,baz,baz,foo,foo
Unnamed: 0_level_1,second,one,two,one,two,one,two
first,second,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
bar,one,0.177796,-0.199724,2.413897,0.451433,-0.312347,-0.05774
bar,two,1.027775,0.36735,1.433007,0.151289,0.763538,-1.306808
baz,one,0.968945,-1.117478,1.163834,-0.618145,-0.188487,0.998391
baz,two,-0.668283,0.597563,-0.710684,-0.446482,-0.389792,0.794367
foo,one,-0.160965,1.038695,0.575504,-0.819334,-1.245296,0.793721
foo,two,-0.512341,-1.172836,0.654917,-0.088157,0.452898,0.37695


In [None]:
### We’ve “sparsified” the higher levels of the indexes to make the console output a bit easier on the eyes.

### It’s worth keeping in mind that there’s nothing preventing you from using tuples as atomic labels on an axis

In [113]:
ye_ex = pd.Series(np.random.randn(8), index=tuples)

In [114]:
ye_ex

(bar, one)    0.338553
(bar, two)   -0.155343
(baz, one)    0.136066
(baz, two)   -0.150944
(foo, one)   -0.902864
(foo, two)    0.263390
(qux, one)   -0.228368
(qux, two)   -0.563821
dtype: float64

In [None]:
### We have covered the basics pretty much