# Hierarchical Indexing
1. A multiply indexed Series
2. Methods of MultiIndex creation
3. Indexing and slicing a MultiIndex
4. Rearranging multi-indices
5. Data aggregations on multi-indices

In [3]:
# A multiply indexed Series
# A bad way: tuple-based indexing
import pandas as pd

# Keys are plain (state, year) tuples: every state paired with both census years.
index = [(state, year)
         for state in ('California', 'New York', 'Texas')
         for year in (2000, 2010)]
# Population figures, listed in the same order as the keys above.
populations = [
    33871648, 37253956,
    18976457, 19378102,
    20851820, 25145561,
]
pop = pd.Series(populations, index=index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [6]:
# Label-based slice over the tuple keys; both endpoints are included in the result.
start, stop = ('California', 2000), ('Texas', 2000)
pop[start:stop]

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
dtype: int64

In [7]:
# To retrieve every value for 2010 with tuple keys, the keys must be filtered by hand.
keys_2010 = [key for key in pop.index if key[1] == 2010]
pop[keys_2010]

(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64

In [8]:
# The better way: a pandas MultiIndex built from the same list of tuples.
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [9]:
# Reindexing the Series with this MultiIndex gives the hierarchical representation of the data.
pop = pop.reindex(index)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [10]:
# To access all data whose second index level is 2010, take every label of the
# first level and the single label 2010 of the second.
pop.loc[:, 2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [11]:
# MultiIndex as extra dimension



In [13]:
# unstack() quickly converts a multiply indexed Series into a conventionally
# indexed DataFrame (the inner level becomes the columns).
pop_df = pop.unstack()
pop_df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [14]:
# The stack() method provides the opposite operation: columns go back into an inner index level.
pop_df.stack()

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

# Data Wrangling: Join, Combine and Reshape

In [3]:
# Hierarchical indexing
# --- An important pandas feature that lets you have multiple (two or more) index levels on an axis. It provides a way
# --- for you to work with higher-dimensional data in a lower-dimensional form.
import pandas as pd
import numpy as np
# Fixed seed so the random values (and every output derived from them) are the
# same on each fresh run of the notebook.
SEED = 42
rng = np.random.default_rng(SEED)

# Two parallel label lists, one per index level; pandas combines them into a
# two-level MultiIndex.
outer_labels = ['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd']
inner_labels = [1, 2, 3, 1, 3, 1, 2, 2, 3]
data = pd.Series(rng.standard_normal(9), index=[outer_labels, inner_labels])
data

a  1   -0.248576
   2   -0.014273
   3    1.391910
b  1   -0.162114
   3    0.720321
c  1   -1.079288
   2    0.789252
d  2   -1.066543
   3   -0.700604
dtype: float64

In [4]:
# Inspect the index: the two label lists were combined into a single two-level MultiIndex.
data.index

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )

In [5]:
# Partial indexing: selecting one outer label returns the inner-level sub-Series.
data.loc['b']

1   -0.162114
3    0.720321
dtype: float64

In [6]:
# An integer slice passed to [] is positional, which is easy to confuse with the
# integer inner labels; .iloc makes that explicit (rows at positions 1 and 2).
data.iloc[1:3]

a  2   -0.014273
   3    1.391910
dtype: float64

In [7]:
# Positional slice again, written with .iloc so it cannot be mistaken for a
# label slice (rows at positions 1, 2 and 3).
data.iloc[1:4]

a  2   -0.014273
   3    1.391910
b  1   -0.162114
dtype: float64

In [8]:
# Label slice on the outer level; both 'b' and 'c' are included.
data.loc['b':'c']

b  1   -0.162114
   3    0.720321
c  1   -1.079288
   2    0.789252
dtype: float64

In [10]:
# Select several outer labels at once by passing them as a list.
outer_keys = ['b', 'c']
data.loc[outer_keys]

b  1   -0.162114
   3    0.720321
c  1   -1.079288
   2    0.789252
dtype: float64

In [11]:
# Selection on the inner level: every outer label, inner label 2 only.
data.loc[:, 2]

a   -0.014273
c    0.789252
d   -1.066543
dtype: float64

In [12]:
# Pivot the inner index level into columns; (outer, inner) pairs absent from the
# Series show up as missing values in the resulting table.
data.unstack()

Unnamed: 0,1,2,3
a,-0.248576,-0.014273,1.39191
b,-0.162114,,0.720321
c,-1.079288,0.789252,
d,,-1.066543,-0.700604


In [13]:
# Redisplay the original Series for comparison with the unstacked table.
data

a  1   -0.248576
   2   -0.014273
   3    1.391910
b  1   -0.162114
   3    0.720321
c  1   -1.079288
   2    0.789252
d  2   -1.066543
   3   -0.700604
dtype: float64

In [14]:
# Round trip: unstack into a table, then stack the columns back into the inner level.
data_wide = data.unstack()
data_wide.stack()

a  1   -0.248576
   2   -0.014273
   3    1.391910
b  1   -0.162114
   3    0.720321
c  1   -1.079288
   2    0.789252
d  2   -1.066543
   3   -0.700604
dtype: float64

In [15]:
# With a DataFrame, either axis can have a hierarchical index; here both the
# rows and the columns carry two levels.
row_labels = [['a', 'a', 'b', 'b'], [1, 2, 1, 2]]
column_labels = [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']]
frame = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=row_labels,
    columns=column_labels,
)
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [16]:
# Name the two levels of the row index.
frame.index.names = ['key1', 'key2']

In [17]:
# Display the frame again; the row-index levels now show their names.
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [18]:
# Name the two levels of the column index as well.
frame.columns.names = ['state', 'color']
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [20]:
# Partial column indexing: pick the group of columns under the outer label 'Ohio'.
frame.loc[:, 'Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [21]:
# Reordering and Sorting Levels


In [None]:
# The swaplevel takes two level numbers or names
#---and returns a new object with the levels interchanged (but the data is otherwise unaltered):

In [22]:
# Interchange the two row-index levels by name; the data itself is unchanged.
frame.swaplevel(i='key1', j='key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [None]:
# sort_index sorts the data using only the values in a single level. When swapping levels, it's not
# uncommon to also use sort_index so that the result is lexicographically sorted by the indicated level.

In [23]:
# Sort the rows using only the second index level (position 1).
frame.sort_index(axis=0, level=1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [24]:
# Swap the two row levels, then sort by the new outermost level.
frame_swapped = frame.swaplevel(0, 1)
frame_swapped.sort_index(level=0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


In [25]:
# Note: data selection performs much better on hierarchically indexed objects
# when the index is lexicographically sorted starting with the outermost
# level, i.e. the result of calling sort_index(level=0) or sort_index().
#
# Summary statistics by level: add together the rows that share a 'key2' label.
# The level= argument of DataFrame.sum was deprecated in pandas 1.3 and removed
# in 2.0; grouping on the level and summing is the supported equivalent.
frame.groupby(level='key2').sum()

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [26]:
# Add together the columns that share a 'color' label. sum(level=..., axis=1)
# no longer exists in pandas 2.x and groupby(axis=1) is deprecated, so transpose,
# group on the (now row) level, sum, and transpose back to the original layout.
frame.T.groupby(level='color').sum().T

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


In [27]:
# A flat frame whose 'c' and 'd' columns will serve as index levels below.
frame = pd.DataFrame({
    'a': range(7),
    'b': range(7, 0, -1),
    'c': ['one'] * 3 + ['two'] * 4,
    'd': [0, 1, 2, 0, 1, 2, 3],
})
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [29]:
# DataFrame's set_index creates a new DataFrame that uses one or more of its
# columns as the index.
frame2 = frame.set_index(['c', 'd'])
frame2


Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [30]:
# By default the index columns are removed from the DataFrame; drop=False keeps
# them as regular columns too.
frame.set_index(keys=['c', 'd'], drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [31]:
# reset_index does the opposite of set_index: the hierarchical index levels are
# moved back into ordinary columns.
frame2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1
