In [30]:
# Dependencies, auxiliary functions etc
import numpy as np
import pandas as pd
from IPython.display import display, display_html

def display_side_by_side(*args):
    """Render several DataFrames next to each other in a single output area.

    Each frame is converted with ``DataFrame.to_html()`` and the pieces are
    concatenated. Every opening ``<table`` tag then gets an inline style so
    the tables are laid out horizontally instead of being stacked.

    Parameters
    ----------
    *args : pandas.DataFrame
        Frames to show, left to right. With no arguments an empty HTML
        string is displayed.

    Returns
    -------
    None
        The combined markup is passed to ``display_html(..., raw=True)``.
    """
    inline_open_tag = '<table style="display:inline; border:0px"'
    # Patch only the opening tags: a blanket replace of 'table' also turns
    # '</table>' into the invalid '</table style=...>' and mangles any label
    # or cell text that happens to contain the word "table".
    html_str = ''.join(df.to_html() for df in args).replace('<table', inline_open_tag)
    display_html(html_str, raw=True)

<h2>Initialize data frame</h2>

In [29]:
# Four rows by four labelled columns of uniform random numbers in [0, 1).
df = pd.DataFrame(
    data=np.random.rand(4, 4),
    columns=['ff', 'ee', 'tt', 'uuu'],
)
display(df)

Unnamed: 0,ff,ee,tt,uuu
0,0.237892,0.102305,0.816686,0.052665
1,0.030196,0.087708,0.95114,0.574003
2,0.225154,0.769876,0.859193,0.081462
3,0.970363,0.038464,0.159075,0.17553


In [15]:
# Column-oriented input. 'Country' is requested in `columns` below but is not a
# key of `data`, so that column comes out empty (see the blank cells in the output).
data = {
    'ID': [1, 2, 3],
    'Names': ['Kate', 'John', 'Max'],
    'Age': [50, 25, 41],
    'Values': [3, 4, 8],
}
df = pd.DataFrame(data=data, columns=['ID', 'Names', 'Age', 'Values', 'Country'])

# Same data with 'ID' promoted to the row index; clearing the index name
# removes the extra header row that would otherwise show "ID".
df2 = df.set_index('ID')
df2.index.name = None
display_side_by_side(df, df2)

Unnamed: 0,ID,Names,Age,Values,Country
0,1,Kate,50,3,
1,2,John,25,4,
2,3,Max,41,8,

Unnamed: 0,Names,Age,Values,Country
1,Kate,50,3,
2,John,25,4,
3,Max,41,8,


<h3>Hierarchical indexing</h3>

In [18]:
# Series labelled on two levels: outer letters, inner integers.
srs = pd.Series(
    data=np.random.rand(6),
    index=[['a', 'a', 'a', 'b', 'b', 'c'], [1, 2, 3, 1, 2, 3]],
)
display(srs)

# Selection examples. They are evaluated but produce no output, because
# neither is wrapped in display() nor is the last expression of the cell.
srs['b':'c']     # slice on the outer level
srs.loc[:, 2]    # all outer labels, inner label 2

# Frame with two label levels on the rows and on the columns.
df = pd.DataFrame(
    data=np.arange(12).reshape((4, 3)),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=[['col1', 'col1', 'col2'], ['ff', 'gg', 'ff']],
)
display_side_by_side(df)

a  1    0.405561
   2    0.233002
   3    0.348025
b  1    0.581109
   2    0.128420
c  3    0.379645
dtype: float64

Unnamed: 0_level_0,Unnamed: 1_level_0,col1,col1,col2
Unnamed: 0_level_1,Unnamed: 1_level_1,ff,gg,ff
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


<h3>Slicing data frame</h3>

In [28]:
df = pd.DataFrame(
    np.random.rand(4, 4),
    columns=['ff', 'ee', 'tt', 'uuu'],
    index=['first', 'second', 'third', 'fourth'],
)

# --- Selecting columns ---
df['ff']                       # one column by label
df[['ff', 'tt']]               # several columns by label
df.loc[:, ['ee', 'tt']]        # label-based, all rows
df.iloc[:, 0]                  # by position in the frame

# --- Selecting rows ---
df.loc[['first', 'fourth']]    # by index label
df[0:3]                        # row slice through plain []
df.iloc[0:2]                   # by position in the frame

# Rule of thumb: whenever you see back-to-back square brackets, `][`,
# you are asking for trouble. Replace them with a single .loc[] or .iloc[]
# and you will be set.

display_side_by_side(df, df.loc[:, ['ee', 'tt']], df.iloc[0:2])

Unnamed: 0,ff,ee,tt,uuu
first,0.591025,0.480597,0.853743,0.782283
second,0.354305,0.173832,0.082631,0.370556
third,0.413977,0.32697,0.729281,0.443804
fourth,0.863846,0.307669,0.255678,0.06441

Unnamed: 0,ee,tt
first,0.480597,0.853743
second,0.173832,0.082631
third,0.32697,0.729281
fourth,0.307669,0.255678

Unnamed: 0,ff,ee,tt,uuu
first,0.591025,0.480597,0.853743,0.782283
second,0.354305,0.173832,0.082631,0.370556


<h3>Mappings for data frame columns</h3>

In [31]:
# Translate a column through a hand-written lookup table: each age is
# replaced by the rank stored for it in `age_mapping`.
data = {
    'ID': [1, 2, 3, 4],
    'Names': ['Kate', 'John', 'Max', 'Kate'],
    'Age': [50, 25, 41, 89],
    'Values': [3, 4, 8, 12],
}
df = pd.DataFrame(data=data)
age_mapping = {25: 0, 41: 1, 50: 2, 89: 3}
df['AgeOrder'] = df['Age'].map(age_mapping)
display(df)

Unnamed: 0,Age,ID,Names,Values,AgeOrder
0,50,1,Kate,3,2
1,25,2,John,4,0
2,41,3,Max,8,1
3,89,4,Kate,12,3


In [7]:
# Map unique column values to index values.
# enumerate() numbers the distinct names in the order unique() returns them,
# so unique() is evaluated once instead of twice as in the zip/range pairing.
# Relies on `df` (with a 'Names' column) built in the previous cell.
d = {name: name_id for name_id, name in enumerate(df['Names'].unique())}
df['NameID'] = df['Names'].map(d)
df

Unnamed: 0,Age,ID,Names,Values,AgeOrder,NameID
0,50,1,Kate,3,2,0
1,25,2,John,4,0,1
2,41,3,Max,8,1,2
3,89,4,Kate,12,3,0


<h3>Dropping from data frame</h3>

In [34]:
data = {
    'ID': [1, 2, 3],
    'Names': ['Kate', 'John', 'Max'],
    'Age': [50, 25, 41],
    'Values': [3, 4, 8],
}
df_orig = pd.DataFrame(data=data, index=['a', 'b', 'c'])
# Work on a copy so df_orig stays untouched for the side-by-side comparison.
df = df_orig.copy()

# Columns, addressed by label
df = df.drop(['Age', 'Names'], axis=1)

# Column, addressed by position: whichever column is first at this point
df = df.drop(df.columns[0], axis=1)

# Rows, addressed by index label
df = df.drop(['b'], axis=0)

# Rows, addressed by position: look up the label(s) at the given positions
df = df.drop(df.index[[0]], axis=0)

display_side_by_side(df_orig, df)

Unnamed: 0,Age,ID,Names,Values
a,50,1,Kate,3
b,25,2,John,4
c,41,3,Max,8

Unnamed: 0,Values
c,8


<h2>Database-like joins</h2>

In [9]:
# Create data
# `columns` is a flat list of labels. Wrapping it in a second list hands pandas
# a list of arrays, which it can interpret as levels of hierarchical column
# labels; the merges in the next cell address the key as the plain label 'col2'.
df = pd.DataFrame(
    data=np.arange(12).reshape((3, 4)),
    columns=['col1', 'col2', 'col3', 'col4'],
)
df2 = pd.DataFrame({'key': [0, 1, 2], 'vals': [4, 6, 8]})
display_side_by_side(df, df2)

Unnamed: 0,col1,col2,col3,col4
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11

Unnamed: 0,key,vals
0,0,4
1,1,6
2,2,8


In [10]:
# Different merges
# df3: match the left frame's row index against df2['key'], keeping every left row.
df3 = df.merge(df2, how='left', left_index=True, right_on='key')
# df4-df6: match df['col2'] against df2['key']; only the join type varies.
df4 = df.merge(df2, how='left', left_on='col2', right_on='key')
df5 = df.merge(df2, how='inner', left_on='col2', right_on='key')
df6 = df.merge(df2, how='outer', left_on='col2', right_on='key')
display_side_by_side(df3, df4, df5, df6)

Unnamed: 0,col1,col2,col3,col4,key,vals
0,0,1,2,3,0,4
1,4,5,6,7,1,6
2,8,9,10,11,2,8

Unnamed: 0,col1,col2,col3,col4,key,vals
0,0,1,2,3,1.0,6.0
1,4,5,6,7,,
2,8,9,10,11,,

Unnamed: 0,col1,col2,col3,col4,key,vals
0,0,1,2,3,1,6

Unnamed: 0,col1,col2,col3,col4,key,vals
0,0.0,1.0,2.0,3.0,1.0,6.0
1,4.0,5.0,6.0,7.0,,
2,8.0,9.0,10.0,11.0,,
3,,,,,0.0,4.0
4,,,,,2.0,8.0


<h2>Aggregating data frame</h2>

In [36]:
df = pd.DataFrame(
    np.random.rand(4, 4),
    columns=['ff', 'ee', 'tt', 'uuu'],
    index=['first', 'second', 'third', 'fourth'],
)
# Blank out the first two rows of the second column ('ee') to show NaN handling.
df.iloc[:2, 1] = np.nan
display(df)

# axis=1 averages across the columns, giving one mean per row;
# change to axis=0 to get one mean per column instead.
# skipna=False: a NaN anywhere in the row makes that row's mean NaN.
display(df.mean(axis=1, skipna=False))
# skipna=True: NaNs are left out and the mean uses the remaining values.
display(df.mean(axis=1, skipna=True))

Unnamed: 0,ff,ee,tt,uuu
first,0.733675,,0.670106,0.399541
second,0.464315,,0.741816,0.310936
third,0.409676,0.725885,0.537836,0.696239
fourth,0.386578,0.024976,0.084876,0.172063


first          NaN
second         NaN
third     0.592409
fourth    0.167123
dtype: float64

first     0.601107
second    0.505689
third     0.592409
fourth    0.167123
dtype: float64

<h2>Data cleaning</h2>

In [38]:
df = pd.DataFrame(
    np.random.rand(4, 4),
    columns=['ff', 'ee', 'tt', 'uuu'],
    index=['first', 'second', 'third', 'fourth'],
)
# Knock out one 'ee' value (third row) so there is something to impute.
df.iloc[2, 1] = np.nan
# Impute with the column's own mean. The {column: value} form limits the fill
# to 'ee'; a bare scalar would write the 'ee' mean into NaNs of every column.
# fillna returns a new frame, so df is left unchanged and no .copy() is needed.
df2 = df.fillna({'ee': df['ee'].mean()})
display_side_by_side(df, df2)

Unnamed: 0,ff,ee,tt,uuu
first,0.634278,0.672163,0.58653,0.253065
second,0.706616,0.99472,0.524647,0.368629
third,0.452015,,0.733817,0.138504
fourth,0.654829,0.69196,0.631872,0.303609

Unnamed: 0,ff,ee,tt,uuu
first,0.634278,0.672163,0.58653,0.253065
second,0.706616,0.99472,0.524647,0.368629
third,0.452015,0.786281,0.733817,0.138504
fourth,0.654829,0.69196,0.631872,0.303609


<h3>Import Excel file</h3>

In [13]:
import pandas as pd
import os
# Folder and file name are kept separate and joined with os.path.join.
# NOTE(review): the location looks like a placeholder - the recorded run of this
# cell ended in FileNotFoundError; point it at a real workbook before running.
path, file = r'C:\myfolder', 'myfile.xlsx'
df = pd.read_excel(os.path.join(path, file))

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\myfolder\\myfile.xlsx'