In [1]:
# Dependencies, auxiliary functions etc
import pandas as pd
import numpy as np
from IPython.display import display_html
from IPython.display import display

def display_side_by_side(*args):
    html_str=''
    for df in args:
        html_str+=df.to_html()
    html_str = html_str.replace('table','table style="display:inline; border:0px"')          
    display_html(html_str,raw=True)

<h1>Basic DataFrame operations</h1>

<h2>DataFrame initialization</h2>

In [6]:
# Initialize DataFrame with dummy data and custom indices
df = pd.DataFrame(np.random.rand(4,4), columns = ['ff', 'ee', 'tt', 'uuu'])
display(df)

Unnamed: 0,ff,ee,tt,uuu
0,0.904764,0.459753,0.888896,0.303693
1,0.113926,0.46483,0.186101,0.569967
2,0.568196,0.056763,0.915619,0.633067
3,0.885631,0.181725,0.749834,0.999536


In [5]:
# Initialize DataFrame with custom date, set one of the columns as index
data = {'ID': [1, 2, 3],'Names': ['Kate', 'John', 'Max'],'Age': [50, 25, 41], 'Values': [3, 4, 8]}
df = pd.DataFrame(data=data, columns = ['ID', 'Names', 'Age', 'Values', 'Country'])
df2 = df.set_index('ID')
df2.index.name = None # remove index name
display_side_by_side(df,df2)

Unnamed: 0,ID,Names,Age,Values,Country
0,1,Kate,50,3,
1,2,John,25,4,
2,3,Max,41,8,

Unnamed: 0,Names,Age,Values,Country
1,Kate,50,3,
2,John,25,4,
3,Max,41,8,


<h2>Dropping from DataFrame</h2>

In [14]:
data = {'ID': [1, 2, 3, 4],'Names': ['Kate', 'John', 'Max', 'Mary'],'Age': [50, 25, 41,99], 'Values': [3, 4, 8,4], 'col': [None]}
df_orig = pd.DataFrame(data=data, index = ['a','b','c','d'])
df = df_orig.copy()

# Drop columns by column label
df.drop(['Age', 'Names'], axis = 1, inplace = True)

# Drop column by fancy indexing
df.drop(df.columns[0], axis = 1, inplace = True)

# Drop column by ugin del; is directly inplace!
del df['col']

# Drop rows by index label
df.drop(['b'], axis = 0 , inplace = True)

# Drop rows by index number
df.drop(df.index[[0]], axis = 0 , inplace = True)

# Drop rows where value in column does not fulfill condition
df = df[df['Values'] > 5]

display_side_by_side(df_orig,df)

Unnamed: 0,Age,ID,Names,Values,col
a,50,1,Kate,3,
b,25,2,John,4,
c,41,3,Max,8,
d,99,4,Mary,4,

Unnamed: 0,Values
c,8


In [10]:
# Drop other columns except those we want to keep
data = {'ID': [1, 2, 3, 4],'Names': ['Kate', 'John', 'Max', 'Mary'],'Age': [50, 25, 41,99], 'Values': [3, 4, 8,4], 'col': [None]}
df_orig = pd.DataFrame(data=data, index = ['a','b','c','d'])
columns_to_keep = ['Names','Values']
df = df[columns_to_keep]
display(df)

Unnamed: 0,Names,Values
a,Kate,3
b,John,4
c,Max,8
d,Mary,4


<h2>Slicing data frame</h2>

In [8]:
df = pd.DataFrame(np.random.rand(4,4), columns = ['ff', 'ee', 'tt', 'uuu'], index = ['first','second','third','fourth'])

# Selecting columns
# Using single square brackets selects a Series from DataFrame. 
# Using multiple square brackets selects a slice of data frame (possibly with muliple columns) 
df['ff'] # returns Series
df[['ff']] # returns DataFrame with one column
df[['ff','ee']] # returns DataFrame with two columns
df.loc[:,['ee','tt']]
df.iloc[:,0] # by position in frame

# Selecting rows
df.loc['first'] # by index
df.loc[['first','fourth']] # by index multiple rows
df[0:3]
df.iloc[0:2] # by position in frame

# Using data[2] or data['label'] indexes the series itself, skipping the iloc/loc
# attributes. This is syntatic sugar, but it is safer to use iloc/loc attributes
# explicitly in order avoid confusion querying between index label and index position.
# The rough rule is any time you see back-to-back square brackets,
# ][, you're in asking for trouble. Replace that with a .loc[] or .iloc[]
# and you'll be set.

# Example printed out
display_side_by_side(df, df.loc[:,['ee','tt']], df.iloc[0:2])

Unnamed: 0,ff,ee,tt,uuu
first,0.906347,0.055478,0.573543,0.516064
second,0.971785,0.287883,0.994565,0.760811
third,0.747426,0.912622,0.914198,0.14196
fourth,0.749906,0.578614,0.273687,0.665843

Unnamed: 0,ee,tt
first,0.055478,0.573543
second,0.287883,0.994565
third,0.912622,0.914198
fourth,0.578614,0.273687

Unnamed: 0,ff,ee,tt,uuu
first,0.906347,0.055478,0.573543,0.516064
second,0.971785,0.287883,0.994565,0.760811


<h1>Hierarchical indexing</h1>

<h3>Initialize DataFrame with hierarchical indexing</h3>

In [11]:
srs = pd.Series(data=np.random.rand(6), index = [['a','a','a','b','b','c'], [1,2,3] * 2])
display(srs)
srs['b':'c']
srs.loc[:,2]
df = pd.DataFrame(data = np.arange(12).reshape((4,3))
                  ,index = [['a','a','b','b',], [1,2] * 2]
                  ,columns = [['col1','col1','col2'], ['ff','gg','ff']])
display_side_by_side(df)

a  1    0.020218
   2    0.832620
   3    0.778157
b  1    0.870012
   2    0.978618
c  3    0.799159
dtype: float64

Unnamed: 0_level_0,Unnamed: 1_level_0,col1,col1,col2
Unnamed: 0_level_1,Unnamed: 1_level_1,ff,gg,ff
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


<h3>Add second index to existing DataFrame</h3>

In [8]:
df = pd.DataFrame(data = np.arange(12).reshape((4,3))
                  ,columns = ['col1','col1','col2'])
df2 = df.copy()
df2.set_index([df2.index, 'col2'],inplace=True)
df2.index.names = ['Index1', 'Index2']
display_side_by_side(df,df2)

Unnamed: 0,col1,col1.1,col2
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11

Unnamed: 0_level_0,Unnamed: 1_level_0,col1,col1
Index1,Index2,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2,0,1
1,5,3,4
2,8,6,7
3,11,9,10


<h1>Conditional indexing</h1>

In [7]:
np.random.seed(0)
df = pd.DataFrame(np.random.rand(4,4), columns = ['ff', 'ee', 'tt', 'uuu'], index = ['first','second','third','fourth'])

# Conditional indexing by dropping rows with no match
df2 = df[df['ee'] > 0.5].copy()

# Multiple conditions
df[(df['ee'] > 0.5 ) & (df['tt'] < 8)]

# Conditional indexing by preserving original dimensions
df3 = df.where(df['ee'] > 0.5).copy()

display_side_by_side(df,df2, df3)

Unnamed: 0,ff,ee,tt,uuu
first,0.548814,0.715189,0.602763,0.544883
second,0.423655,0.645894,0.437587,0.891773
third,0.963663,0.383442,0.791725,0.528895
fourth,0.568045,0.925597,0.071036,0.087129

Unnamed: 0,ff,ee,tt,uuu
first,0.548814,0.715189,0.602763,0.544883
second,0.423655,0.645894,0.437587,0.891773
fourth,0.568045,0.925597,0.071036,0.087129

Unnamed: 0,ff,ee,tt,uuu
first,0.548814,0.715189,0.602763,0.544883
second,0.423655,0.645894,0.437587,0.891773
third,,,,
fourth,0.568045,0.925597,0.071036,0.087129


<h3>Mappings for data frame columns</h3>

In [31]:
# Mapping from user defined dict
data = {'ID': [1, 2, 3,4],'Names': ['Kate', 'John', 'Max','Kate'],'Age': [50, 25, 41,89], 'Values': [3, 4, 8, 12]}
df = pd.DataFrame(data=data)
age_mapping = {25:0, 41:1, 50:2, 89:3}
df['AgeOrder'] = df['Age'].map(age_mapping)
display(df)

Unnamed: 0,Age,ID,Names,Values,AgeOrder
0,50,1,Kate,3,2
1,25,2,John,4,0
2,41,3,Max,8,1
3,89,4,Kate,12,3


In [7]:
# Map unique column values to index values
d = {key: value for (key, value) in zip(df['Names'].unique(),range(len(df['Names'].unique())))}
df['NameID'] = df['Names'].map(d)
df

Unnamed: 0,Age,ID,Names,Values,AgeOrder,NameID
0,50,1,Kate,3,2,0
1,25,2,John,4,0,1
2,41,3,Max,8,1,2
3,89,4,Kate,12,3,0


<h2>Altering DataFrame values</h2>

In [29]:
df = pd.DataFrame(np.arange(12).reshape(6,2),columns = ['ff', 'er'])
df2 = df.copy()
df2['ff'] *= 0.8
df2['er'] -= 2
display_side_by_side(df,df2)

Unnamed: 0,ff,er
0,0,1
1,2,3
2,4,5
3,6,7
4,8,9
5,10,11

Unnamed: 0,ff,er
0,0.0,-1
1,1.6,1
2,3.2,3
3,4.8,5
4,6.4,7
5,8.0,9


<h2>Database-like joins</h2>

In [9]:
# Create date
df = pd.DataFrame(data = np.arange(12).reshape((3,4)), columns = [['col1','col2','col3','col4']])
df2 = pd.DataFrame({'key': [0,1,2], 'vals': [4,6,8]})
display_side_by_side(df,df2)

Unnamed: 0,col1,col2,col3,col4
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11

Unnamed: 0,key,vals
0,0,4
1,1,6
2,2,8


In [10]:
# Different merges
df3 = pd.merge(df,df2,left_index = True, right_on = 'key',how = 'left')
df4 = pd.merge(df,df2,left_on = 'col2', right_on = 'key',how = 'left')
df5 = pd.merge(df,df2,left_on = 'col2', right_on = 'key',how = 'inner')
df6 = pd.merge(df,df2,left_on = 'col2', right_on = 'key',how = 'outer')
display_side_by_side(df3,df4,df5,df6)

Unnamed: 0,col1,col2,col3,col4,key,vals
0,0,1,2,3,0,4
1,4,5,6,7,1,6
2,8,9,10,11,2,8

Unnamed: 0,col1,col2,col3,col4,key,vals
0,0,1,2,3,1.0,6.0
1,4,5,6,7,,
2,8,9,10,11,,

Unnamed: 0,col1,col2,col3,col4,key,vals
0,0,1,2,3,1,6

Unnamed: 0,col1,col2,col3,col4,key,vals
0,0.0,1.0,2.0,3.0,1.0,6.0
1,4.0,5.0,6.0,7.0,,
2,8.0,9.0,10.0,11.0,,
3,,,,,0.0,4.0
4,,,,,2.0,8.0


<h1>Aggregating DataFrame</h1>

In [38]:
df = pd.DataFrame(np.random.rand(4,4), columns = ['ff', 'ee', 'tt', 'uuu'], index = ['first','second','third','fourth'])
df.iloc[:2,1] = np.nan
display(df)

# Mean by column, chnage axis=0 to get rows
display(df.mean(axis=1, skipna=False))
display(df.mean(axis=1, skipna=True))

Unnamed: 0,ff,ee,tt,uuu
first,0.577229,,0.934214,0.613966
second,0.535633,,0.730122,0.311945
third,0.398221,0.209844,0.186193,0.944372
fourth,0.739551,0.490459,0.227415,0.254356


first          NaN
second         NaN
third     0.434658
fourth    0.427945
dtype: float64

first     0.708470
second    0.525900
third     0.434658
fourth    0.427945
dtype: float64

<h2>"Group by" in various forms</h2>

In [15]:
data = {'ID': [1, 1, 2],'Names': ['Kate', 'John', 'Max'],'Age': [50, 25, 40], 'Values': [3, 4, 8]}
df = pd.DataFrame(data=data, index = ['first','second','first'])
df.index.name = 'Index'

result1 = df.groupby(['ID'])['Values'].sum()
result1 = pd.DataFrame(result1)

result2 = df.groupby([df.index])['Age'].mean()
result2 = pd.DataFrame(result2)

result3 = df.groupby([df.index])['Age'].count()
result3 = pd.DataFrame(result3)

display_side_by_side(df, result1,result2,result3)

Unnamed: 0_level_0,Age,ID,Names,Values
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
first,50,1,Kate,3
second,25,1,John,4
first,40,2,Max,8

Unnamed: 0_level_0,Values
ID,Unnamed: 1_level_1
1,7
2,8

Unnamed: 0_level_0,Age
Index,Unnamed: 1_level_1
first,45
second,25

Unnamed: 0_level_0,Age
Index,Unnamed: 1_level_1
first,2
second,1


In [12]:
# Select sum of first greatest N values within group
data = {'ID': [1, 2, 1, 2, 1], 'Values': [3, 9, 8, 7, 3]}
df = pd.DataFrame(data=data)

df2 = df.copy()
df2.sort_values(['ID','Values'],ascending=[True, False],inplace=True)
df2 = df2.groupby('ID').head(2)
df2 = df2.groupby(['ID'])['Values'].sum().sort_values(ascending = False)

df2 = pd.DataFrame(df2)
display_side_by_side(df,df2)

Unnamed: 0,ID,Values
0,1,3
1,2,9
2,1,8
3,2,7
4,1,3

Unnamed: 0_level_0,Values
ID,Unnamed: 1_level_1
2,16
1,11


<h2>Data cleaning</h2>

In [24]:
df = pd.DataFrame(np.random.rand(4,4), columns = ['ff', 'ee', 'tt', 'uuu'], index = ['first','second','third','fourth'])
df.iloc[2,1] = np.nan
df.iloc[1,3] = np.nan
df.iloc[0,0] = np.nan
df2 = df.copy()

# Fill NaN by mean of the column
df2['ee'].fillna(df['ee'].mean(), inplace = True)

# Fill using value from previuous row
df2['uuu'].fillna(method='bfill', inplace = True)

# Fill using value from next row
df2['ff'].fillna(method='bfill', inplace = True)

display_side_by_side(df,df2)

Unnamed: 0,ff,ee,tt,uuu
first,,0.590873,0.574325,0.653201
second,0.652103,0.431418,0.896547,
third,0.435865,,0.806194,0.703889
fourth,0.100227,0.919483,0.714241,0.998847

Unnamed: 0,ff,ee,tt,uuu
first,0.652103,0.590873,0.574325,0.653201
second,0.652103,0.431418,0.896547,0.703889
third,0.435865,0.647258,0.806194,0.703889
fourth,0.100227,0.919483,0.714241,0.998847


<h3>Import Excel file</h3>

In [13]:
import pandas as pd
import os
path = r'C:\myfolder'
file = 'myfile.xlsx'
df = pd.read_excel(os.path.join(path,file))

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\myfolder\\myfile.xlsx'