In [2]:
from IPython.display import display_html
def display_side_by_side(*args):
    html_str=''
    for df in args:
        html_str+=df.to_html()
    display_html(html_str.replace('table','table style="display:inline"'),raw=True)

<h2>Initialize data frame</h2>

In [2]:
import pandas as pd
import numpy as np
from IPython.display import display

df = pd.DataFrame(np.random.rand(4,4), columns = ['ff', 'ee', 'tt', 'uuu'])
display(df)

Unnamed: 0,ff,ee,tt,uuu
0,0.836274,0.304906,0.828242,0.189388
1,0.185621,0.923344,0.910663,0.127473
2,0.043625,0.276137,0.783036,0.720145
3,0.14508,0.033931,0.279397,0.922013


In [3]:
data = {'ID': [1, 2, 3],'Names': ['Kate', 'John', 'Max'],'Age': [50, 25, 41], 'Values': [3, 4, 8]}
df = pd.DataFrame(data=data, columns = ['ID', 'Names', 'Age', 'Values', 'Country'])
display(df)
df.set_index('ID',inplace=True)
df.index.name = None # remove index name
display(df)

Unnamed: 0,ID,Names,Age,Values,Country
0,1,Kate,50,3,
1,2,John,25,4,
2,3,Max,41,8,


Unnamed: 0,Names,Age,Values,Country
1,Kate,50,3,
2,John,25,4,
3,Max,41,8,


<h3>Frames/series with hierarchical index<h3/>

In [3]:
import pandas as pd
import numpy as np
from IPython.display import display

srs = pd.Series(data=np.random.rand(6), index = [['a','a','a','b','b','c'], [1,2,3] * 2])
display(srs)
srs['b':'c']
srs.loc[:,2]
df = pd.DataFrame(data = np.arange(12).reshape((4,3))
                  ,index = [['a','a','b','b',], [1,2] * 2]
                  ,columns = [['col1','col1','col2'], ['ff','gg','ff']])
#display(df)
#display(df['col1'])
display_side_by_side(df,df['col1'])



a  1    0.207674
   2    0.894987
   3    0.310570
b  1    0.449072
   2    0.476562
c  3    0.342421
dtype: float64

Unnamed: 0_level_0,Unnamed: 1_level_0,col1,col1,col2
Unnamed: 0_level_1,Unnamed: 1_level_1,ff,gg,ff
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11

Unnamed: 0,Unnamed: 1,ff,gg
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


<h3>Slicing data frame</h3>

In [5]:
import numpy as np
from IPython.display import display

df = pd.DataFrame(np.random.rand(4,4), columns = ['ff', 'ee', 'tt', 'uuu'], index = ['first','second','third','fourth'])

print('The data frame')
display(df)

print('Selecting columns')
df['ff']
df[['ff','tt']]
display(df.loc[:,['ee','tt']])
display(df.iloc[:,0]) # by position in frame

print('Selecting rows')
display(df.loc[['first','fourth']])
display(df[0:3])
display(df.iloc[0]) # by position in frame


The data frame


Unnamed: 0,ff,ee,tt,uuu
first,0.143934,0.336,0.231618,0.020785
second,0.370011,0.978159,0.556351,0.81159
third,0.094567,0.179212,0.224872,0.3612
fourth,0.306962,0.778874,0.141975,0.463785


Selecting columns


Unnamed: 0,ee,tt
first,0.336,0.231618
second,0.978159,0.556351
third,0.179212,0.224872
fourth,0.778874,0.141975


first     0.143934
second    0.370011
third     0.094567
fourth    0.306962
Name: ff, dtype: float64

Selecting rows


Unnamed: 0,ff,ee,tt,uuu
first,0.143934,0.336,0.231618,0.020785
fourth,0.306962,0.778874,0.141975,0.463785


Unnamed: 0,ff,ee,tt,uuu
first,0.143934,0.336,0.231618,0.020785
second,0.370011,0.978159,0.556351,0.81159
third,0.094567,0.179212,0.224872,0.3612


ff     0.143934
ee     0.336000
tt     0.231618
uuu    0.020785
Name: first, dtype: float64

<h3>Mappings for data frame columns</h3>

In [6]:
# Mapping from user defined dict
import pandas as pd
data = {'ID': [1, 2, 3,4],'Names': ['Kate', 'John', 'Max','Kate'],'Age': [50, 25, 41,89], 'Values': [3, 4, 8, 12]}
df = pd.DataFrame(data=data)
age_mapping = {25:0, 41:1, 50:2, 89:3}
df['AgeOrder'] = df['Age'].map(age_mapping)
df

Unnamed: 0,Age,ID,Names,Values,AgeOrder
0,50,1,Kate,3,2
1,25,2,John,4,0
2,41,3,Max,8,1
3,89,4,Kate,12,3


In [7]:
# Map unique column values to index values
d = {key: value for (key, value) in zip(df['Names'].unique(),range(len(df['Names'].unique())))}
df['NameID'] = df['Names'].map(d)
df

Unnamed: 0,Age,ID,Names,Values,AgeOrder,NameID
0,50,1,Kate,3,2,0
1,25,2,John,4,0,1
2,41,3,Max,8,1,2
3,89,4,Kate,12,3,0


<h3>Dropping fom data frame</h3>

In [8]:
import pandas as pd
from IPython.display import display
data = {'ID': [1, 2, 3],'Names': ['Kate', 'John', 'Max'],'Age': [50, 25, 41], 'Values': [3, 4, 8]}
df = pd.DataFrame(data=data, index = ['a','b','c'])
display(df)

# Drop columns by column label
df.drop(['Age', 'Names'], axis = 1, inplace = True)

# Drop column by fancy indexing
df.drop(df.columns[0], axis = 1, inplace = True)

# Drop rows by index label
df.drop(['b'], axis = 0 , inplace = True)

# Drop rows by index number
df.drop(df.index[[0]], axis = 0 , inplace = True)

display(df)

Unnamed: 0,Age,ID,Names,Values
a,50,1,Kate,3
b,25,2,John,4
c,41,3,Max,8


Unnamed: 0,Values
c,8


<h2>Database-like joins</h2>

In [9]:
from IPython.display import display
df = pd.DataFrame(data = np.arange(12).reshape((3,4)), columns = [['col1','col2','col3','col4']])
df2 = pd.DataFrame({'key': [0,1,2], 'vals': [4,6,8]})
display_side_by_side(df,df2)

Unnamed: 0,col1,col2,col3,col4
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11

Unnamed: 0,key,vals
0,0,4
1,1,6
2,2,8


In [10]:
# Different merges
df3 = pd.merge(df,df2,left_index = True, right_on = 'key',how = 'left')
df4 = pd.merge(df,df2,left_on = 'col2', right_on = 'key',how = 'left')
df5 = pd.merge(df,df2,left_on = 'col2', right_on = 'key',how = 'inner')
df6 = pd.merge(df,df2,left_on = 'col2', right_on = 'key',how = 'outer')
display_side_by_side(df3,df4,df5,df6)

Unnamed: 0,col1,col2,col3,col4,key,vals
0,0,1,2,3,0,4
1,4,5,6,7,1,6
2,8,9,10,11,2,8

Unnamed: 0,col1,col2,col3,col4,key,vals
0,0,1,2,3,1.0,6.0
1,4,5,6,7,,
2,8,9,10,11,,

Unnamed: 0,col1,col2,col3,col4,key,vals
0,0,1,2,3,1,6

Unnamed: 0,col1,col2,col3,col4,key,vals
0,0.0,1.0,2.0,3.0,1.0,6.0
1,4.0,5.0,6.0,7.0,,
2,8.0,9.0,10.0,11.0,,
3,,,,,0.0,4.0
4,,,,,2.0,8.0


<h2>Aggregating data frame</h2>

In [11]:
import pandas as pd
import numpy as np
from IPython.display import display

df = pd.DataFrame(np.random.rand(4,4), columns = ['ff', 'ee', 'tt', 'uuu'], index = ['first','second','third','fourth'])
df.iloc[:2,1] = np.nan

print('The data frame')
display(df)

print('Mean by column, chnage axis=0 to get rows')
display(df.mean(axis=1, skipna=False))
display(df.mean(axis=1, skipna=True))


The data frame


Unnamed: 0,ff,ee,tt,uuu
first,0.13786,,0.268174,0.51773
second,0.683569,,0.21612,0.892158
third,0.891695,0.83075,0.24169,0.067319
fourth,0.972094,0.326149,0.774039,0.294477


Mean by column, chnage axis=0 to get rows


first          NaN
second         NaN
third     0.507863
fourth    0.591690
dtype: float64

first     0.307921
second    0.597282
third     0.507863
fourth    0.591690
dtype: float64

<h2>Data cleaning</h2>

In [12]:
import pandas as pd
import numpy as np
from IPython.display import display

df = pd.DataFrame(np.random.rand(4,4), columns = ['ff', 'ee', 'tt', 'uuu'], index = ['first','second','third','fourth'])
df.iloc[2,1] = np.nan
display(df)
df.fillna(df['ee'].mean(),inplace=True)
display(df)

Unnamed: 0,ff,ee,tt,uuu
first,0.97862,0.628413,0.591251,0.907172
second,0.941204,0.515872,0.844508,0.013046
third,0.155279,,0.000618,0.282132
fourth,0.119215,0.495846,0.342536,0.739485


Unnamed: 0,ff,ee,tt,uuu
first,0.97862,0.628413,0.591251,0.907172
second,0.941204,0.515872,0.844508,0.013046
third,0.155279,0.54671,0.000618,0.282132
fourth,0.119215,0.495846,0.342536,0.739485


<h3>Import Excel file</h3>

In [13]:
import pandas as pd
import os
path = r'C:\myfolder'
file = 'myfile.xlsx'
df = pd.read_excel(os.path.join(path,file))

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\myfolder\\myfile.xlsx'