<h2> Hierarchical Indexing </h2>

<p> Hierarchical indexing is useful for reshaping data and manipulating group based operations (like pivot tables)
 <p\>

In [1]:
import pandas as pd 
import numpy as np

<h3> Inner indexing  </h3>

In [2]:
data = pd.Series(np.random.randn(9),
...: index=[['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
...: [1, 2, 3, 1, 3, 1, 2, 2, 3]])

data

a  1   -1.193596
   2    0.868104
   3   -1.639469
b  1    1.438855
   3    0.864007
c  1   -0.900131
   2    0.846841
d  2   -1.685845
   3    0.746799
dtype: float64

In [3]:
data.loc[:, 2] #inner indexing 

a    0.868104
c    0.846841
d   -1.685845
dtype: float64

In [4]:
data.unstack() # reshape my Series to a Dataframe

Unnamed: 0,1,2,3
a,-1.193596,0.868104,-1.639469
b,1.438855,,0.864007
c,-0.900131,0.846841,
d,,-1.685845,0.746799


In [5]:
data.unstack().stack() #turn back to the initial Series

a  1   -1.193596
   2    0.868104
   3   -1.639469
b  1    1.438855
   3    0.864007
c  1   -0.900131
   2    0.846841
d  2   -1.685845
   3    0.746799
dtype: float64

In [6]:
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
....: index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
....: columns=[['Ohio', 'Ohio', 'Colorado'],
....: ['Green', 'Red', 'Green']])

frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [7]:
frame.index.names = ['key1', 'key2'] # set the names for the multi-index
frame.columns.names = ['State', 'Color'] # set the names for the column 

frame

Unnamed: 0_level_0,State,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


<h3> Aggregating values at a certain level </h3>

In [8]:
frame.sum(level = 'key1')

State,Ohio,Ohio,Colorado
Color,Green,Red,Green
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
a,3,5,7
b,15,17,19


In [9]:
frame.sum(level= 'State', axis = 1)

Unnamed: 0_level_0,State,Ohio,Colorado
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,1,2
a,2,7,5
b,1,13,8
b,2,19,11


In [10]:
frame.swaplevel(0,1).sort_index(0) 
#could also be written as frame.swaplevel('key1', 'key2').sort_index(0)

Unnamed: 0_level_0,State,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


<h3> Deeper indexing using the dataframe's columns </h3>

In [11]:
frame = pd.DataFrame({'a': range(7), 'b': range(7, 0, -1),
....: 'c': ['one', 'one', 'one', 'two', 'two',
....: 'two', 'two'],
....: 'd': [0, 1, 2, 0, 1, 2, 3]})

In [12]:
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [13]:
frame2 = frame.set_index(['c','d'])
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [14]:
frame.set_index(['c','d'], drop = False) 
#hold the initial columns of the dataframe
frame2.index.names = ['first' ,'second']
#we can now rename the index names so that we do not 
# mess up with the column names
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [15]:
frame2.reset_index() 
#return back to the initial object

Unnamed: 0,first,second,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


<h3> Combining and merging datasets </h3>

<ul>
    <li> pandas.merge() merges rows of different dataframes based on index values 
    <li> pandas.concat() simply 'stacks' together objects along an axis (0,1) 
    <li> combine_first fills values of one object with values from another object 

In [30]:
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
....: 'data1': range(7)})

df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [31]:
df2 = pd.DataFrame({'key': ['a', 'b', 'd'],
....: 'data2': range(3)})

df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [32]:
pd.merge(df1,df1)  #many to one join

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [33]:
pd.merge(df1, df2, on = "key") #good practice to specify on which column we will merge 

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [34]:
df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
....: 'data1': range(7)})
df3

Unnamed: 0,lkey,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [35]:
df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'],
....: 'data2': range(3)})

df4

Unnamed: 0,rkey,data2
0,a,0
1,b,1
2,d,2


In [36]:
pd.merge(df3, df4, left_on = 'lkey', right_on = 'rkey') #different keys so we specify each of them 

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


<p> Notice that until now merge() does by default an inner join, we need to specify excplicitly 
    an outer join.

In [37]:
pd.merge(df1, df2, how = "outer") #outer join, this obviously gives us some NaN values
                                  #in the non overlaping data

Unnamed: 0,key,data1,data2
0,b,0.0,1.0
1,b,1.0,1.0
2,b,6.0,1.0
3,a,2.0,0.0
4,a,4.0,0.0
5,a,5.0,0.0
6,c,3.0,
7,d,,2.0


<ul>
    <li> merge() can have non intuitive results on a many-to-many relationship between two 
        dataframes, usually in this occasion if three rows appear with the same values in two rows of 
        second dataframe then the fibal output will be 3x2 = rows.
    <li> merge() can also be used between overlaping indexes or a mix of index and column values 
         in this occasion syntax can go like pd.merge(df1, df2, lkey = "...", right_index = "..." how =          ['inner' |'outer'| 'left'])  
    <li> the .join can be applied by default on indices and in more than two dataframes.
        Syntax goes like df1.join(df2, how = "...", on = " "). Where the on attribute is used in case
        we want to join the index of the passed dataframe with the column of the called dataframe.
       

<h3> Concat dataframes and series </h3>

<p> concat() works by default along 0 axis.

In [38]:
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])

In [41]:
pd.concat([s1,s2,s3] )

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [42]:
pd.concat([s1,s2,s3], axis=1 )

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [44]:
s4 = pd.concat([s1, s3])
s4

a    0
b    1
f    5
g    6
dtype: int64

In [46]:
pd.concat([s1,s4], axis = 1, join= 'inner') #take the intersectio of concatenation 

Unnamed: 0,0,1
a,0,0
b,1,1


In [47]:
df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],
....: columns=['one', 'two'])

df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'],
....: columns=['three', 'four'])

In [51]:
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'], join= 'inner')

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5,6
c,4,5,7,8


In [52]:
# pd.concat({'level1': df1, 'level2': df2}, axis=1) 
# same as
# pd.concat([df1, df2], keys = ['level1, level2'], axis = 1)

In [53]:
 df1 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])

In [56]:
pd.concat([df1,df2], ignore_index= True) #reformating the index

Unnamed: 0,a,b,c,d
0,1.3458,0.624877,-1.313254,2.297402
1,0.16148,0.283213,-0.138357,-0.872183
2,-0.334826,-0.274476,-1.129067,-0.643784
3,-0.881489,-1.802397,,0.390486
4,2.025335,0.000781,,0.608936


<h3> Combining data with overlap </h3>

<p> combine_first() in the case of a dataframe 'patches' missing data column by column in the calling object with data from the object you pass.
    


<h3> Reshaping and pivoting  </h3>

In [57]:
data = pd.DataFrame(np.arange(6).reshape((2, 3)),
.....: index=pd.Index(['Ohio', 'Colorado'], name='state'),
.....: columns=pd.Index(['one', 'two', 'three'],
.....: name='number'))

In [58]:
data

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [61]:
result = data.stack()
result    #stack rotates the columns into rows resulting in hierarchical intexed Series 

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32

In [63]:
result.unstack() #by default the innnermost level will remain unstack

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [64]:
result.unstack(0) #select which level I would like to unstack

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


<p> Unstacking can introduce missing values  

In [67]:
s1 = pd.Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([4, 5, 6], index=['c', 'd', 'e'])

data2 = pd.concat([s1,s2], keys = ['key1', 'key2'])
data2

key1  a    0
      b    1
      c    2
      d    3
key2  c    4
      d    5
      e    6
dtype: int64

In [70]:
data2.unstack() #innermost level would get the same results as the number of the values 
                #within the innermost level are not the same. 

Unnamed: 0,a,b,c,d,e
key1,0.0,1.0,2.0,3.0,
key2,,,4.0,5.0,6.0


In [72]:
data2.unstack().stack() #from here we see that the stack method by default drops values 

key1  a    0.0
      b    1.0
      c    2.0
      d    3.0
key2  c    4.0
      d    5.0
      e    6.0
dtype: float64

In [74]:
data2.unstack().stack(dropna = False) 

key1  a    0.0
      b    1.0
      c    2.0
      d    3.0
      e    NaN
key2  a    NaN
      b    NaN
      c    4.0
      d    5.0
      e    6.0
dtype: float64

In [75]:
df = pd.DataFrame({'left': result, 'right': result + 5},
.....: columns=pd.Index(['left', 'right'], name='side'))

In [76]:
df

Unnamed: 0_level_0,side,left,right
state,number,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,one,0,5
Ohio,two,1,6
Ohio,three,2,7
Colorado,one,3,8
Colorado,two,4,9
Colorado,three,5,10


In [85]:
df.unstack(0)

side,left,left,right,right
state,Ohio,Colorado,Ohio,Colorado
number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,0,3,5,8
two,1,4,6,9
three,2,5,7,10


In [87]:
df.unstack(0).stack(0) #unstacking and stacking at the same level can bring different results

Unnamed: 0_level_0,state,Colorado,Ohio
number,side,Unnamed: 2_level_1,Unnamed: 3_level_1
one,left,3,0
one,right,8,5
two,left,4,1
two,right,9,6
three,left,5,2
three,right,10,7
