In [1]:
# data analysis with pandas

In [2]:
## Join, Combining and Reshap Data Frames 

In [3]:
### 8.1 Hierarchical Indexing

In [74]:
import pandas as pd
import numpy as np
data = pd.Series(np.random.randn(10),
        index=[['a', 'a', 'a','a', 'b', 'b', 'c', 'c', 'd', 'd'], ##follow the sequence
        [1, 2, 3, 1, 3, 1, 2, 2, 3,1] ] )
display(data)
#hierarchically indexed object partial indexing is possible


a  1   -0.603664
   2   -0.656246
   3    1.725656
   1   -0.722533
b  3   -0.105886
   1   -1.246304
c  2   -0.839721
   2    0.206758
d  3    0.217927
   1   -0.859156
dtype: float64

In [75]:
display( data['b'] )
#data['b':'c']
#data.loc[['b', 'd']]

#Selection is even possible from an “inner” level:
#data.loc[:, 2]


3   -0.105886
1   -1.246304
dtype: float64

In [76]:
data['b':'c']

b  3   -0.105886
   1   -1.246304
c  2   -0.839721
   2    0.206758
dtype: float64

In [77]:
data.loc[['b', 'd']]

b  3   -0.105886
   1   -1.246304
d  3    0.217927
   1   -0.859156
dtype: float64

In [78]:
data.loc[:, 2]   # :first dimenshion , second dimenshion and 2 index value

a   -0.656246
c   -0.839721
c    0.206758
dtype: float64

In [79]:
#Hierarchical indexing plays an important role in reshaping data and group-based
#operations like forming a pivot table. For example, 
#you could rearrange the data into
#a DataFrame using its unstack method:
df =  data.unstack()
display(df)
#df.stack()

ValueError: Index contains duplicate entries, cannot reshape

In [80]:
data.unstack().stack()

ValueError: Index contains duplicate entries, cannot reshape

In [117]:
#With a DataFrame, either axis can have a hierarchical index
# [18]
import pandas as pd
import numpy as np
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
        index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
        columns=[['Ohio', 'Ohio', 'Colorado'],
        ['Green', 'Red', 'Green']])
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
display(frame)
frame.loc[:,'Ohio']

#Be careful to distinguish the index names 'state' and 'color'
#from the row labels

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [108]:
frame = pd.DataFrame(np.arange(16).reshape((4, 4)),
        index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
        columns=[['Ohio', 'Ohio', 'Colorado','Colarado'],
        ['Green', 'Red', 'Green','Red']])
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
display(frame)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado,Colarado
Unnamed: 0_level_1,color,Green,Red,Green,Red
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,1,0,1,2,3
a,2,4,5,6,7
b,1,8,9,10,11
b,2,12,13,14,15


In [104]:
frame['Ohio']


Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,4,5
b,1,8,9
b,2,12,13


In [118]:
#@@  Reordering and sorting Levels
frame.swaplevel('key1', 'key2')
#can save in the same frame variables 
# or another variables like
# frame = frame.swaplevel('key1', 'key2')


Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [123]:
#sort_index , on the other hand, sorts the data using only the values in a single level.
#When swapping levels, it’s not uncommon to also use sort_index so that the result is
#lexicographically sorted by the indicated level:
frame.sort_index(level=1)
# compare the above statement output with following statement output
frame.swaplevel(0, 1).sort_index(level=0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


### Summary Statistics by Level
aggregate by level on either the rows or columns

In [57]:
display(frame)
frame.sum(level='key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [58]:
display(frame)

frame.sum(level='color', axis=1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


In [125]:
# Indexing with a DataFrame’s columns
frame = pd.DataFrame({'a': range(7), 
                      'b': range(7, 0, -1),
                      'c': ['one', 'one', 'one', 'two', 'two', 
                            'two', 'two'],
                      'd': [0, 1, 2, 0, 1, 2, 3]}
                    )
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [126]:
frame2 = frame.set_index(['c', 'd'])
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [127]:
#By default the columns are removed from the DataFrame, 
#though you can leave them in:
frame.set_index(['c', 'd'], drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [128]:
#reset_index , on the other hand, does the opposite of set_index ; the hierarchical
#index levels are moved into the columns
frame2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


### Combining and Merging Datasets
* pandas.merge connects rows in DataFrames based on one or more keys.
* pandas.concat concatenates or “stacks” together objects along an axis
* The combine_first instance method enables splicing together overlapping data  to fill in missing values in one object with values from another.

In [132]:
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                    'data1': range(7)})
print(df1)
df2 = pd.DataFrame({'key': ['a', 'b', 'd'],'data2': range(3)})
print(df2)
pd.merge(df1, df2, on='key') # on='key',how='Outer'
df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                    'data1': range(7)})
df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'],
        'data2': range(3)})
pd.merge(df3, df4, left_on='lkey', right_on='rkey')

  key  data1
0   b      0
1   b      1
2   a      2
3   c      3
4   a      4
5   a      5
6   b      6
  key  data2
0   a      0
1   b      1
2   d      2


Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


In [31]:
df1 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                    'data1': range(7)})
df2 = pd.DataFrame({'rkey': ['a', 'b', 'd'],'data2': range(3)})
pd.merge(df1, df2, left_on='lkey', right_on='rkey')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


In [134]:
# remmebr to import libararies if required

left1 = pd.DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],
                      'value': range(6)} )
right1 = pd.DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])

#pd.merge(left1, right1, left_on='key', right_index=True)
pd.merge(left1, right1, left_on='key', right_index=True, how='outer')
#pd.merge(left1, right1, left_on='key', right_index=True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0
5,c,5,


### Concatenating Along an Axis
* Another kind of data combination operation is referred to interchangeably as concat‐
enation, binding, or stacking

In [140]:
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])
pd.concat([s1, s2, s3])
#By default concat works along axis=0
#pd.concat([s1, s2, s3], axis=1)
#In this case there is no overlap on the other axis, which as you can see is the sorted
#union (the 'outer' join) of the indexes. You can instead intersect them by passing
#join='inner' :


a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [141]:
pd.concat([s1, s2, s3], axis=1) # axis 0 rows axis 1 column

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [71]:
#In [87]: s4 = pd.concat([s1, s3])
#In [89]: pd.concat([s1, s4], axis=1)
#In [90]: pd.concat([s1, s4], axis=1, join='inner')

#In this last example, the 'f' and 'g' labels disappeared because of the join='inner'
#option

#You can even specify the axes to be used on the other axes with join_axes :
#In [91]: pd.concat([s1, s4], axis=1, join_axes=[['a', 'c', 'b', 'e']])

In [144]:
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])
result = pd.concat([s1, s2, s3], keys=['one', 'two', 'three'])
result

one    a    0
       b    1
two    c    2
       d    3
       e    4
three  f    5
       g    6
dtype: int64

In [None]:
#A last consideration concerns DataFrames in which the row index 
#does not contain any relevant data:

In [75]:
df1 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
display(df1)
display( df2)
pd.concat([df1, df2], ignore_index=True)
pd.concat([df1, df2], ignore_index=True, sort=True)


Unnamed: 0,a,b,c,d
0,-0.616162,-0.334748,1.830146,-0.558541
1,1.59066,0.427168,1.124861,1.867629
2,-1.588619,-0.001291,-0.891064,-1.452449


Unnamed: 0,b,d,a
0,0.981109,1.348128,0.404504
1,-0.90402,1.225098,-1.44344


Unnamed: 0,a,b,c,d
0,-0.616162,-0.334748,1.830146,-0.558541
1,1.59066,0.427168,1.124861,1.867629
2,-1.588619,-0.001291,-0.891064,-1.452449
3,0.404504,0.981109,,1.348128
4,-1.44344,-0.90402,,1.225098


In [147]:
### Pivoting “Long” to “Wide” Format
data = pd.read_csv('macrodata.csv')
data.head()

FileNotFoundError: [Errno 2] File b'macrodata.csv' does not exist: b'macrodata.csv'

In [89]:
periods = pd.PeriodIndex(year=data.year, 
                         quarter=data.quarter, name='date')
columns = pd.Index(['realgdp', 'infl', 'unemp'], name='item')
data = data.reindex(columns=columns)
data.index = periods.to_timestamp('D', 'end')
ldata = data.stack().reset_index().rename(columns={0: 'value'})
ldata[:10]

Unnamed: 0,date,item,value
0,1959-03-31,realgdp,2710.349
1,1959-03-31,infl,0.0
2,1959-03-31,unemp,5.8
3,1959-06-30,realgdp,2778.801
4,1959-06-30,infl,2.34
5,1959-06-30,unemp,5.1
6,1959-09-30,realgdp,2775.488
7,1959-09-30,infl,2.74
8,1959-09-30,unemp,5.3
9,1959-12-31,realgdp,2785.204


### Pivoting “Wide” to “Long” Format
* An inverse operation to pivot for DataFrames is pandas.melt . Rather than transforming one column into many in a new DataFrame, it merges multiple columns into one, producing a DataFrame that is longer than the input.


In [92]:
df = pd.DataFrame({'key': ['foo', 'bar', 'baz'],
    'A': [1, 2, 3],
    'B': [4, 5, 6],
    'C': [7, 8, 9]})
display(df)
melted = pd.melt(df, ['key'])
melted

Unnamed: 0,key,A,B,C
0,foo,1,4,7
1,bar,2,5,8
2,baz,3,6,9


Unnamed: 0,key,variable,value
0,foo,A,1
1,bar,A,2
2,baz,A,3
3,foo,B,4
4,bar,B,5
5,baz,B,6
6,foo,C,7
7,bar,C,8
8,baz,C,9
