# Chapter 8: Data Wrangling: Join, Combine, and Reshape

## 8.1 Hierarchical Indexing

In [23]:
import pandas as pd
import numpy as np

In [24]:
data = pd.Series(np.random.randn(9),
        index=[['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
                [1,2,3,1,3,1,2,2,3]])
data

a  1   -0.814611
   2   -0.118912
   3    1.433833
b  1   -0.414861
   3   -0.315389
c  1    0.562111
   2   -0.446989
d  2   -0.754253
   3    0.377300
dtype: float64

In [25]:
data.index

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )

In [26]:
data['b']

1   -0.414861
3   -0.315389
dtype: float64

In [27]:
data['b':'c']

b  1   -0.414861
   3   -0.315389
c  1    0.562111
   2   -0.446989
dtype: float64

In [28]:
data.loc[['b','d']]

b  1   -0.414861
   3   -0.315389
d  2   -0.754253
   3    0.377300
dtype: float64

In [29]:
data.loc[:,2]

a   -0.118912
c   -0.446989
d   -0.754253
dtype: float64

In [30]:
data.unstack()

Unnamed: 0,1,2,3
a,-0.814611,-0.118912,1.433833
b,-0.414861,,-0.315389
c,0.562111,-0.446989,
d,,-0.754253,0.3773


In [31]:
data.unstack().stack()

a  1   -0.814611
   2   -0.118912
   3    1.433833
b  1   -0.414861
   3   -0.315389
c  1    0.562111
   2   -0.446989
d  2   -0.754253
   3    0.377300
dtype: float64

In [32]:
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
                     index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                     columns=[['Ohio', 'Ohio', 'Colorado'],
                              ['Green', 'Red', 'Green']])
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [33]:
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [34]:
frame['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [35]:
MultiIndex.from_arrays([['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']], 
                    names=['state', 'color'])

NameError: name 'MultiIndex' is not defined

## #What module is MultiIndex from?

### Reordering and Sorting Levels

In [None]:
frame.swaplevel('key1', 'key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [None]:
frame.sort_index(level=1)


Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [None]:
frame.swaplevel(0, 1).sort_index(level=0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


### Summary Statistics by Level

In [None]:
frame.sum(level='key2')


state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [None]:
frame.sum(level='color', axis=1)

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


### Indexing with a DataFrame's columns

In [None]:
frame = pd.DataFrame({'a': range(7), 'b': range(7, 0, -1),
                      'c': ['one', 'one', 'one', 'two', 'two',
                            'two', 'two'],
                      'd': [0, 1, 2, 0, 1, 2, 3]})
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [None]:
frame2 = frame.set_index(['c', 'd'])
frame2

In [None]:
frame.set_index(['c', 'd'], drop=False)

In [None]:
frame2.reset_index()

## 8.2 Combining and Merging Datasets