# Concatenating Along an Axis:

In [1]:
import numpy as np
import pandas as pd
# Build a 4x3 integer array holding 0..11 to use in the concatenation examples.
arr = np.arange(12).reshape((4, 3))
arr

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

In [7]:
# Join two copies of arr along axis 0 (rows): the result has 8 rows and 3 columns.
np.concatenate((arr, arr), axis=0)

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

In [8]:
# Join two copies of arr along axis 1 (columns): the result has 4 rows and 6 columns.
np.concatenate((arr, arr), axis=1)

array([[ 0,  1,  2,  0,  1,  2],
       [ 3,  4,  5,  3,  4,  5],
       [ 6,  7,  8,  6,  7,  8],
       [ 9, 10, 11,  9, 10, 11]])

In [9]:
# Two-element Series labelled 'a' and 'b'.
s1 = pd.Series([1, 3], index=list('ab'))
s1

a    1
b    3
dtype: int64

In [10]:
# Three-element Series whose labels 'a' and 'b' overlap with s1.
s2 = pd.Series([2, 5, 6], index=list('abc'))
s2

a    2
b    5
c    6
dtype: int64

In [12]:
# Three-element Series with labels ('e', 'f', 'g') that do not overlap s1 or s2.
s3 = pd.Series([7, 8, 9], index=list('efg'))
s3

e    7
f    8
g    9
dtype: int64

In [14]:
# Glue the three Series end to end (axis 0); duplicate labels are kept as-is.
s4 = pd.concat([s1, s2, s3], axis=0)
s4

a    1
b    3
a    2
b    5
c    6
e    7
f    8
g    9
dtype: int64

In [15]:
# Place the three Series side by side as columns; labels missing from a Series become NaN.
s5 = pd.concat([s1, s2, s3], axis='columns')
s5

Unnamed: 0,0,1,2
a,1.0,2.0,
b,3.0,5.0,
c,,6.0,
e,,,7.0
f,,,8.0
g,,,9.0


In [19]:
# Append s3 below s1 (axis 0).
s6 = pd.concat([s1, s3], axis=0)
s6

a    1
b    3
e    7
f    8
g    9
dtype: int64

In [20]:
# Column-wise concatenation with the default outer join: the row index is the union of labels.
s7 = pd.concat([s1, s6], axis='columns')
s7

Unnamed: 0,0,1
a,1.0,1
b,3.0,3
e,,7
f,,8
g,,9


In [23]:
# Column-wise concatenation keeping only the labels present in both inputs (inner join).
s8 = pd.concat([s1, s6], axis='columns', join='inner')
s8

Unnamed: 0,0,1
a,1,1
b,3,3


In [29]:
# Tag each input with an outer key, producing a two-level (hierarchical) index.
s9 = pd.concat([s2, s3], axis=0, keys=['one', 'two'])
s9

one  a    2
     b    5
     c    6
two  e    7
     f    8
     g    9
dtype: int64

In [31]:
# Pivot the innermost index level of s9 into columns.
s9.unstack(level=-1)

Unnamed: 0,a,b,c,e,f,g
one,2.0,5.0,6.0,,,
two,,,,7.0,8.0,9.0


In [32]:
# Same idea with three inputs: each Series gets its own outer index key.
pd.concat(
    [s1, s2, s3],
    keys=['one', 'two', 'three'],
)

one    a    1
       b    3
two    a    2
       b    5
       c    6
three  e    7
       f    8
       g    9
dtype: int64

In [33]:
# Build the keyed concatenation, then move the inner labels into columns.
pd.concat(
    [s1, s2, s3],
    keys=['one', 'two', 'three'],
).unstack()

Unnamed: 0,a,b,c,e,f,g
one,1.0,3.0,,,,
two,2.0,5.0,6.0,,,
three,,,,7.0,8.0,9.0


In [2]:
# 3x2 frame holding 0..5, rows labelled a-c, columns 'one' and 'two'.
data = pd.DataFrame(
    np.arange(6).reshape(3, 2),
    index=['a', 'b', 'c'],
    columns=['one', 'two'],
)
data

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [3]:
# 2x2 frame holding 5..8, rows labelled a-b, columns 'three' and 'four'.
data1 = pd.DataFrame(
    np.arange(5, 9).reshape(2, 2),
    index=['a', 'b'],
    columns=['three', 'four'],
)
data1

Unnamed: 0,three,four
a,5,6
b,7,8


In [4]:
# Column-wise concat of two frames; the keys become the outer level of the column index.
data2 = pd.concat(
    [data, data1],
    axis='columns',
    keys=['level1', 'level2'],
)
data2

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,7.0,8.0
c,4,5,,


In [5]:
# As above, additionally naming the two column levels 'upper' and 'lower'.
data3 = pd.concat(
    [data, data1],
    axis='columns',
    keys=['level1', 'level2'],
    names=['upper', 'lower'],
)
data3

upper,level1,level1,level2,level2
lower,one,two,three,four
a,0,1,5.0,6.0
b,2,3,7.0,8.0
c,4,5,,


In [8]:
# 3x4 frame of standard-normal draws (unseeded, so values differ on each run).
df = pd.DataFrame(
    np.random.randn(3, 4),
    columns=list('abcd'),
)
df

Unnamed: 0,a,b,c,d
0,-0.473916,-1.468063,-0.226965,-1.662722
1,1.254443,0.179534,0.747055,-1.263843
2,-0.440367,1.11038,0.754686,1.442067


In [9]:
# 2x3 frame of standard-normal draws with columns b, c, d (no 'a' column).
df1 = pd.DataFrame(
    np.random.randn(2, 3),
    columns=list('bcd'),
)
df1

Unnamed: 0,b,c,d
0,0.721133,-0.204445,1.063718
1,1.031525,-1.005619,-2.09245


In [11]:
# Stack the frames row-wise and discard the original row labels in favour of 0..n-1.
df3 = pd.concat([df, df1], axis=0, ignore_index=True)
df3

Unnamed: 0,a,b,c,d
0,-0.473916,-1.468063,-0.226965,-1.662722
1,1.254443,0.179534,0.747055,-1.263843
2,-0.440367,1.11038,0.754686,1.442067
3,,0.721133,-0.204445,1.063718
4,,1.031525,-1.005619,-2.09245


# Combining Data With Overlap

In [12]:
# Float Series with gaps (NaN) at labels f, d and a.
a = pd.Series(
    [np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
    index=list('fedcba'),
)
a

f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64

In [15]:
# Float Series 0.0..5.0 with the same length and labels as a.
b = pd.Series(
    np.arange(len(a), dtype=np.float64),
    index=list('fedcba'),
)
b

f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    5.0
dtype: float64

In [16]:
# Blank out the last element of b by position.
# .iloc is used because b has string labels: a bare integer key (b[-1]) relies on
# pandas' deprecated positional fallback and is treated as a label in newer versions.
b.iloc[-1] = np.nan

In [17]:
# Show b after its last element was set to NaN.
b

f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    NaN
dtype: float64

In [19]:
# Wherever a is null take the value of b at the same position, otherwise keep a.
# np.where works element by position and returns a plain ndarray (no index);
# a and b share the same labels in the same order here, so positions line up.
np.where(a.isna(), b, a)

array([0. , 2.5, 2. , 3.5, 4.5, nan])

In [20]:
# All of b except its last two elements (positional slice).
b.iloc[:-2]

f    0.0
e    1.0
d    2.0
c    3.0
dtype: float64

In [21]:
# a from its third element onward (positional slice).
a.iloc[2:]

d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64

In [22]:
# Prefer values from the slice of b; fall back to the slice of a for labels b lacks.
# The result is aligned on the union of both indexes.
b.iloc[:-2].combine_first(a.iloc[2:])

a    NaN
b    4.5
c    3.0
d    2.0
e    1.0
f    0.0
dtype: float64

In [23]:
# Four-row frame: columns a and b have alternating gaps, c is 2, 6, 10, 14.
df1 = pd.DataFrame({
    'a': [1, np.nan, 5, np.nan],
    'b': [np.nan, 2, np.nan, 6],
    'c': np.arange(2, 18, 4),
})
df1

Unnamed: 0,a,b,c
0,1.0,,2
1,,2.0,6
2,5.0,,10
3,,6.0,14


In [24]:
# Five-row frame with columns a and b only, each containing one gap.
df2 = pd.DataFrame({
    'a': [5, 4, np.nan, 3, 7],
    'b': [np.nan, 3, 4, 6, 8],
})
df2

Unnamed: 0,a,b
0,5.0,
1,4.0,3.0
2,,4.0
3,3.0,6.0
4,7.0,8.0


In [25]:
# Patch the gaps in df1 with the matching cells of df2 (row 4 comes from df2 only).
df1.combine_first(other=df2)

Unnamed: 0,a,b,c
0,1.0,,2.0
1,4.0,2.0,6.0
2,5.0,4.0,10.0
3,3.0,6.0,14.0
4,7.0,8.0,


# Reshaping With Hierarchical Indexing

In [22]:
# 2x3 frame with a named row index ('state') and a named column index ('number').
state_index = pd.Index(['colorado', 'florida'], name='state')
data4 = pd.DataFrame(
    np.arange(6).reshape(2, 3),
    index=state_index,
    columns=pd.Index(['one', 'two', 'three'], name='number'),
)
del state_index  # keep the notebook namespace unchanged
data4

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
colorado,0,1,2
florida,3,4,5


In [23]:
# Rotate the columns into the rows: a Series indexed by (state, number).
result = data4.stack(level=-1)
result

state     number
colorado  one       0
          two       1
          three     2
florida   one       3
          two       4
          three     5
dtype: int32

In [24]:
# Default unstack pivots the innermost level ('number') back into columns.
result.unstack(level=-1)

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
colorado,0,1,2
florida,3,4,5


In [25]:
# Pivot the outer level (position 0, i.e. 'state') into columns.
result.unstack(level=0)

state,colorado,florida
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [26]:
# Same pivot, selecting the level by its name instead of its position.
result.unstack(level='state')

state,colorado,florida
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [27]:
# Pivot the 'number' level into columns, selected by name.
result.unstack(level='number')

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
colorado,0,1,2
florida,3,4,5


In [28]:
# Pivot the level at position 1 ('number') into columns.
result.unstack(level=1)

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
colorado,0,1,2
florida,3,4,5


In [29]:
# Rebind s1: four values labelled a-d.
s1 = pd.Series([1, 2, 3, 4], index=list('abcd'))
s1

a    1
b    2
c    3
d    4
dtype: int64

In [30]:
# Rebind s2: three values labelled c-e (c and d overlap with s1).
s2 = pd.Series([5, 6, 7], index=list('cde'))
s2

c    5
d    6
e    7
dtype: int64

In [31]:
# Rebind s3: keyed concatenation of s1 and s2 with outer labels 'one' and 'two'.
s3 = pd.concat([s1, s2], axis=0, keys=['one', 'two'])
s3

one  a    1
     b    2
     c    3
     d    4
two  c    5
     d    6
     e    7
dtype: int64

In [32]:
# Inner labels become columns; combinations absent from s3 show up as NaN.
s3.unstack(level=-1)

Unnamed: 0,a,b,c,d,e
one,1.0,2.0,3.0,4.0,
two,,,5.0,6.0,7.0


In [42]:
# Round trip: unstack to a frame, then stack back to a two-level Series.
(
    s3
    .unstack()
    .stack()
)

one  a    1.0
     b    2.0
     c    3.0
     d    4.0
two  c    5.0
     d    6.0
     e    7.0
dtype: float64

In [33]:
# Same round trip, then turn both index levels into ordinary columns.
(
    s3
    .unstack()
    .stack()
    .reset_index()
)

Unnamed: 0,level_0,level_1,0
0,one,a,1.0
1,one,b,2.0
2,one,c,3.0
3,one,d,4.0
4,two,c,5.0
5,two,d,6.0
6,two,e,7.0


In [44]:
# Re-display the stacked (state, number) Series before building a frame from it.
result

state     number
colorado  one       0
          two       1
          three     2
florida   one       3
          two       4
          three     5
dtype: int32

In [48]:
# Two-column frame built from result: 'left' is result, 'right' is result shifted by 5.
# The column index is given the name 'side'.
df5 = pd.DataFrame(
    {'left': result, 'right': result + 5},
    columns=pd.Index(['left', 'right'], name='side'),
)
df5

Unnamed: 0_level_0,side,left,right
state,number,Unnamed: 2_level_1,Unnamed: 3_level_1
colorado,one,0,5
colorado,two,1,6
colorado,three,2,7
florida,one,3,8
florida,two,4,9
florida,three,5,10


In [49]:
# The unstacked level ('state') becomes the innermost (lowest) level of the columns.
df5.unstack(level='state')

side,left,left,right,right
state,colorado,florida,colorado,florida
number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,0,3,5,8
two,1,4,6,9
three,2,5,7,10


In [51]:
# Move 'number' from the row index to the innermost level of the columns.
df5.unstack(level='number')

side,left,left,left,right,right,right
number,one,two,three,one,two,three
state,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
colorado,0,1,2,5,6,7
florida,3,4,5,8,9,10


In [53]:
# Unstack 'number' into the columns, then stack 'side' down into the row index.
(
    df5
    .unstack(level='number')
    .stack(level='side')
)

Unnamed: 0_level_0,number,one,three,two
state,side,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
colorado,left,0,2,1
colorado,right,5,7,6
florida,left,3,5,4
florida,right,8,10,9


In [54]:
# Unstack 'state' into the columns, then stack 'side' down into the row index.
# This must operate on df5 (row levels state/number, column level 'side');
# the frame named `df` has neither a 'state' nor a 'side' level.
df5.unstack('state').stack('side')

Unnamed: 0_level_0,state,colorado,florida
number,side,Unnamed: 2_level_1,Unnamed: 3_level_1
one,left,0,3
one,right,5,8
two,left,1,4
two,right,6,9
three,left,2,5
three,right,7,10


# Pivoting "Long" to "Wide" Format:

In [2]:
# Load the macroeconomic sample data and preview its first rows.
# NOTE(review): this is an absolute, machine-specific Windows path, so the notebook
# cannot be re-run on another machine as-is — consider a path relative to the notebook.
df6=pd.read_csv(r'C:\Users\Shubhamay\Downloads\pydata-book-master_ch07_macrodata.csv')
df6.head()

Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint
0,1959,1,2710.349,1707.4,286.898,470.045,1886.9,28.98,139.7,2.82,5.8,177.146,0.0,0.0
1,1959,2,2778.801,1733.7,310.859,481.301,1919.7,29.15,141.7,3.08,5.1,177.83,2.34,0.74
2,1959,3,2775.488,1751.8,289.226,491.26,1916.4,29.35,140.5,3.82,5.3,178.657,2.74,1.09
3,1959,4,2785.204,1753.7,299.356,484.052,1931.3,29.37,140.0,4.33,5.6,179.386,0.27,4.06
4,1960,1,2847.699,1770.5,331.722,462.199,1955.5,29.54,139.6,3.5,5.2,180.007,2.31,1.19


In [3]:
# Build a PeriodIndex named 'date' from the year and quarter columns of df6.
# NOTE(review): constructing a PeriodIndex from year=/quarter= fields is deprecated in
# recent pandas in favour of PeriodIndex.from_fields — confirm against the pandas version in use.
periods = pd.PeriodIndex(year=df6.year, quarter=df6.quarter, name='date')

In [4]:
# The three series to keep, as a column index named 'item'.
columns = pd.Index(
    ['realgdp', 'infl', 'unemp'],
    name='item',
)

In [5]:
# Keep only the selected columns, in that order; df6 is rebound to the narrowed frame.
df6 = df6.reindex(labels=columns, axis='columns')

In [6]:
# Replace the row index with timestamps at the end of each period (daily frequency).
df6.index = periods.to_timestamp(freq='D', how='end')
df6

item,realgdp,infl,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-03-31 23:59:59.999999999,2710.349,0.00,5.8
1959-06-30 23:59:59.999999999,2778.801,2.34,5.1
1959-09-30 23:59:59.999999999,2775.488,2.74,5.3
1959-12-31 23:59:59.999999999,2785.204,0.27,5.6
1960-03-31 23:59:59.999999999,2847.699,2.31,5.2
...,...,...,...
2008-09-30 23:59:59.999999999,13324.600,-3.16,6.0
2008-12-31 23:59:59.999999999,13141.920,-8.79,6.9
2009-03-31 23:59:59.999999999,12925.410,0.94,8.1
2009-06-30 23:59:59.999999999,12901.504,3.37,9.2


In [7]:
# Long form: one value per (date, item) pair.
df6.stack(level=-1)

date                           item   
1959-03-31 23:59:59.999999999  realgdp     2710.349
                               infl           0.000
                               unemp          5.800
1959-06-30 23:59:59.999999999  realgdp     2778.801
                               infl           2.340
                                            ...    
2009-06-30 23:59:59.999999999  infl           3.370
                               unemp          9.200
2009-09-30 23:59:59.999999999  realgdp    12990.341
                               infl           3.560
                               unemp          9.600
Length: 609, dtype: float64

In [8]:
# Long form with 'date' and 'item' as ordinary columns; the values land in a column named 0.
(
    df6
    .stack()
    .reset_index()
)

Unnamed: 0,date,item,0
0,1959-03-31 23:59:59.999999999,realgdp,2710.349
1,1959-03-31 23:59:59.999999999,infl,0.000
2,1959-03-31 23:59:59.999999999,unemp,5.800
3,1959-06-30 23:59:59.999999999,realgdp,2778.801
4,1959-06-30 23:59:59.999999999,infl,2.340
...,...,...,...
604,2009-06-30 23:59:59.999999999,infl,3.370
605,2009-06-30 23:59:59.999999999,unemp,9.200
606,2009-09-30 23:59:59.999999999,realgdp,12990.341
607,2009-09-30 23:59:59.999999999,infl,3.560


In [9]:
# Long-format table: columns date, item and values (the stacked column 0, renamed).
ldata = (
    df6
    .stack()
    .reset_index()
    .rename(columns={0: 'values'})
)
ldata

Unnamed: 0,date,item,values
0,1959-03-31 23:59:59.999999999,realgdp,2710.349
1,1959-03-31 23:59:59.999999999,infl,0.000
2,1959-03-31 23:59:59.999999999,unemp,5.800
3,1959-06-30 23:59:59.999999999,realgdp,2778.801
4,1959-06-30 23:59:59.999999999,infl,2.340
...,...,...,...
604,2009-06-30 23:59:59.999999999,infl,3.370
605,2009-06-30 23:59:59.999999999,unemp,9.200
606,2009-09-30 23:59:59.999999999,realgdp,12990.341
607,2009-09-30 23:59:59.999999999,infl,3.560


In [14]:
# Long -> wide: one row per date, one column per item, cells taken from 'values'.
# Arguments are passed by keyword because DataFrame.pivot no longer accepts them
# positionally in pandas 2.0 and later; the keyword form works on older versions too.
pivoted = ldata.pivot(index='date', columns='item', values='values')
pivoted

item,infl,realgdp,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-03-31 23:59:59.999999999,0.00,2710.349,5.8
1959-06-30 23:59:59.999999999,2.34,2778.801,5.1
1959-09-30 23:59:59.999999999,2.74,2775.488,5.3
1959-12-31 23:59:59.999999999,0.27,2785.204,5.6
1960-03-31 23:59:59.999999999,2.31,2847.699,5.2
...,...,...,...
2008-09-30 23:59:59.999999999,-3.16,13324.600,6.0
2008-12-31 23:59:59.999999999,-8.79,13141.920,6.9
2009-03-31 23:59:59.999999999,0.94,12925.410,8.1
2009-06-30 23:59:59.999999999,3.37,12901.504,9.2


In [15]:
# Add a second value column of standard-normal noise, one draw per row (modifies ldata in place).
ldata['values2'] = np.random.randn(len(ldata))

In [16]:
# Show ldata with the newly added 'values2' column.
ldata

Unnamed: 0,date,item,values,values2
0,1959-03-31 23:59:59.999999999,realgdp,2710.349,-2.517337
1,1959-03-31 23:59:59.999999999,infl,0.000,-0.848785
2,1959-03-31 23:59:59.999999999,unemp,5.800,-0.249510
3,1959-06-30 23:59:59.999999999,realgdp,2778.801,-0.020568
4,1959-06-30 23:59:59.999999999,infl,2.340,0.084296
...,...,...,...,...
604,2009-06-30 23:59:59.999999999,infl,3.370,0.322498
605,2009-06-30 23:59:59.999999999,unemp,9.200,0.541263
606,2009-09-30 23:59:59.999999999,realgdp,12990.341,-0.256847
607,2009-09-30 23:59:59.999999999,infl,3.560,-0.139335


In [17]:
# Pivot without naming a value column: every remaining column ('values', 'values2')
# is pivoted, giving hierarchical columns with the value name on the outer level.
# Keyword arguments are used because pandas 2.0+ rejects positional ones here.
pivoted = ldata.pivot(index='date', columns='item')
pivoted

Unnamed: 0_level_0,values,values,values,values2,values2,values2
item,infl,realgdp,unemp,infl,realgdp,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1959-03-31 23:59:59.999999999,0.00,2710.349,5.8,-0.848785,-2.517337,-0.249510
1959-06-30 23:59:59.999999999,2.34,2778.801,5.1,0.084296,-0.020568,0.254730
1959-09-30 23:59:59.999999999,2.74,2775.488,5.3,-0.203623,-0.748108,-0.117239
1959-12-31 23:59:59.999999999,0.27,2785.204,5.6,-1.877428,3.260848,-0.098786
1960-03-31 23:59:59.999999999,2.31,2847.699,5.2,-0.780659,-0.196482,-1.323986
...,...,...,...,...,...,...
2008-09-30 23:59:59.999999999,-3.16,13324.600,6.0,0.256524,-0.756048,-2.181188
2008-12-31 23:59:59.999999999,-8.79,13141.920,6.9,-1.119455,1.763896,-0.597286
2009-03-31 23:59:59.999999999,0.94,12925.410,8.1,1.641899,0.427961,0.437522
2009-06-30 23:59:59.999999999,3.37,12901.504,9.2,0.322498,0.640368,0.541263


In [20]:
# Equivalent of the pivot above: index by (item, date), then move 'item' into the columns.
unstack = (
    ldata
    .set_index(keys=['item', 'date'])
    .unstack(level='item')
)

In [21]:
# Show the frame produced by set_index + unstack for comparison with the pivot output.
unstack

Unnamed: 0_level_0,values,values,values,values2,values2,values2
item,infl,realgdp,unemp,infl,realgdp,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1959-03-31 23:59:59.999999999,0.00,2710.349,5.8,-0.848785,-2.517337,-0.249510
1959-06-30 23:59:59.999999999,2.34,2778.801,5.1,0.084296,-0.020568,0.254730
1959-09-30 23:59:59.999999999,2.74,2775.488,5.3,-0.203623,-0.748108,-0.117239
1959-12-31 23:59:59.999999999,0.27,2785.204,5.6,-1.877428,3.260848,-0.098786
1960-03-31 23:59:59.999999999,2.31,2847.699,5.2,-0.780659,-0.196482,-1.323986
...,...,...,...,...,...,...
2008-09-30 23:59:59.999999999,-3.16,13324.600,6.0,0.256524,-0.756048,-2.181188
2008-12-31 23:59:59.999999999,-8.79,13141.920,6.9,-1.119455,1.763896,-0.597286
2009-03-31 23:59:59.999999999,0.94,12925.410,8.1,1.641899,0.427961,0.437522
2009-06-30 23:59:59.999999999,3.37,12901.504,9.2,0.322498,0.640368,0.541263


# Pivoting "Wide" to "Long" Format:

In [25]:
# Wide-format example: an identifier column 'key' plus three measurement columns.
df = pd.DataFrame({
    'key': ['foo', 'bar', 'baz'],
    'A': [1, 2, 3],
    'B': [4, 5, 6],
    'C': [7, 8, 9],
})
df

Unnamed: 0,key,A,B,C
0,foo,1,4,7
1,bar,2,5,8
2,baz,3,6,9


In [27]:
# Wide -> long: keep 'key' as the identifier and melt every other column
# into (variable, value) pairs.
df1 = pd.melt(df, id_vars=['key'])
df1

Unnamed: 0,key,variable,value
0,foo,A,1
1,bar,A,2
2,baz,A,3
3,foo,B,4
4,bar,B,5
5,baz,B,6
6,foo,C,7
7,bar,C,8
8,baz,C,9


In [32]:
# Undo the melt: one row per key, one column per variable, cells taken from 'value'.
# Keyword arguments are required by pandas 2.0+ (positional arguments to
# DataFrame.pivot are no longer accepted) and also work on older versions.
reshape = df1.pivot(index='key', columns='variable', values='value')
reshape

variable,A,B,C
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,2,5,8
baz,3,6,9
foo,1,4,7


In [33]:
# Move the row labels of the melted frame into a regular column named 'index'.
df1.reset_index(drop=False)

Unnamed: 0,index,key,variable,value
0,0,foo,A,1
1,1,bar,A,2
2,2,baz,A,3
3,3,foo,B,4
4,4,bar,B,5
5,5,baz,B,6
6,6,foo,C,7
7,7,bar,C,8
8,8,baz,C,9


In [35]:
# Same operation on the wide frame: the row labels become an 'index' column.
df.reset_index(drop=False)

Unnamed: 0,index,key,A,B,C
0,0,foo,1,4,7
1,1,bar,2,5,8
2,2,baz,3,6,9


In [41]:
# Melt only columns A, B and C with no identifier column: 'key' is dropped from the result.
df.melt(value_vars=['A', 'B', 'C'])

Unnamed: 0,variable,value
0,A,1
1,A,2
2,A,3
3,B,4
4,B,5
5,B,6
6,C,7
7,C,8
8,C,9


In [46]:
# Melt A, B and C while carrying 'key' along as the identifier column.
df.melt(id_vars='key', value_vars=['A', 'B', 'C'])

Unnamed: 0,key,variable,value
0,foo,A,1
1,bar,A,2
2,baz,A,3
3,foo,B,4
4,bar,B,5
5,baz,B,6
6,foo,C,7
7,bar,C,8
8,baz,C,9
