<h3>Initialize data frame</h3>

In [1]:
import pandas as pd
import numpy as np
from IPython.display import display

df = pd.DataFrame(np.random.rand(4,4), columns = ['ff', 'ee', 'tt', 'uuu'])
display(df)

Unnamed: 0,ff,ee,tt,uuu
0,0.597796,0.355494,0.886386,0.721845
1,0.164886,0.330527,0.504979,0.033931
2,0.991173,0.378892,0.556317,0.685392
3,0.521153,0.914954,0.385518,0.079338


In [2]:
data = {'ID': [1, 2, 3],'Names': ['Kate', 'John', 'Max'],'Age': [50, 25, 41], 'Values': [3, 4, 8]}
df = pd.DataFrame(data=data, columns = ['ID', 'Names', 'Age', 'Values', 'Country'])
display(df)
df.set_index('ID',inplace=True)
df.index.name = None # remove index name
display(df)

Unnamed: 0,ID,Names,Age,Values,Country
0,1,Kate,50,3,
1,2,John,25,4,
2,3,Max,41,8,


Unnamed: 0,Names,Age,Values,Country
1,Kate,50,3,
2,John,25,4,
3,Max,41,8,


<h3>Slicing data frame</h3>

In [3]:
import numpy as np
from IPython.display import display

df = pd.DataFrame(np.random.rand(4,4), columns = ['ff', 'ee', 'tt', 'uuu'], index = ['first','second','third','fourth'])

print('The data frame')
display(df)

print('Selecting columns')
df['ff']
df[['ff','tt']]
display(df.loc[:,['ee','tt']])
display(df.iloc[:,0]) # by position in frame

print('Selecting rows')
display(df.loc[['first','fourth']])
display(df[0:3])
display(df.iloc[0]) # by position in frame


The data frame


Unnamed: 0,ff,ee,tt,uuu
first,0.774571,0.875179,0.81623,0.016936
second,0.313244,0.162631,0.476464,0.22091
third,0.754587,0.386446,0.827384,0.745314
fourth,0.842574,0.408784,0.442023,0.13059


Selecting columns


Unnamed: 0,ee,tt
first,0.875179,0.81623
second,0.162631,0.476464
third,0.386446,0.827384
fourth,0.408784,0.442023


first     0.774571
second    0.313244
third     0.754587
fourth    0.842574
Name: ff, dtype: float64

Selecting rows


Unnamed: 0,ff,ee,tt,uuu
first,0.774571,0.875179,0.81623,0.016936
fourth,0.842574,0.408784,0.442023,0.13059


Unnamed: 0,ff,ee,tt,uuu
first,0.774571,0.875179,0.81623,0.016936
second,0.313244,0.162631,0.476464,0.22091
third,0.754587,0.386446,0.827384,0.745314


ff     0.774571
ee     0.875179
tt     0.816230
uuu    0.016936
Name: first, dtype: float64

<h3>Mappings for data frame columns</h3>

In [4]:
# Mapping from user defined dict
import pandas as pd
data = {'ID': [1, 2, 3,4],'Names': ['Kate', 'John', 'Max','Kate'],'Age': [50, 25, 41,89], 'Values': [3, 4, 8, 12]}
df = pd.DataFrame(data=data)
age_mapping = {25:0, 41:1, 50:2, 89:3}
df['AgeOrder'] = df['Age'].map(age_mapping)
df

Unnamed: 0,Age,ID,Names,Values,AgeOrder
0,50,1,Kate,3,2
1,25,2,John,4,0
2,41,3,Max,8,1
3,89,4,Kate,12,3


In [5]:
# Map unique column values to index values
d = {key: value for (key, value) in zip(df['Names'].unique(),range(len(df['Names'].unique())))}
df['NameID'] = df['Names'].map(d)
df

Unnamed: 0,Age,ID,Names,Values,AgeOrder,NameID
0,50,1,Kate,3,2,0
1,25,2,John,4,0,1
2,41,3,Max,8,1,2
3,89,4,Kate,12,3,0


<h3>Dropping fom data frame</h3>

In [6]:
import pandas as pd
from IPython.display import display
data = {'ID': [1, 2, 3],'Names': ['Kate', 'John', 'Max'],'Age': [50, 25, 41], 'Values': [3, 4, 8]}
df = pd.DataFrame(data=data, index = ['a','b','c'])
display(df)

# Drop columns by column label
df.drop(['Age', 'Names'], axis = 1, inplace = True)

# Drop column by fancy indexing
df.drop(df.columns[0], axis = 1, inplace = True)

# Drop rows by index label
df.drop(['b'], axis = 0 , inplace = True)

# Drop rows by index number
df.drop(df.index[[0]], axis = 0 , inplace = True)

display(df)

Unnamed: 0,Age,ID,Names,Values
a,50,1,Kate,3
b,25,2,John,4
c,41,3,Max,8


Unnamed: 0,Values
c,8


<h2>Aggregating data frame</h2>

In [7]:
import pandas as pd
import numpy as np
from IPython.display import display

df = pd.DataFrame(np.random.rand(4,4), columns = ['ff', 'ee', 'tt', 'uuu'], index = ['first','second','third','fourth'])
df.iloc[:2,1] = np.nan

print('The data frame')
display(df)

print('Mean by column, chnage axis=0 to get rows')
display(df.mean(axis=1, skipna=False))
display(df.mean(axis=1, skipna=True))


The data frame


Unnamed: 0,ff,ee,tt,uuu
first,0.708487,,0.232488,0.000689
second,0.41238,,0.609906,0.233965
third,0.427902,0.075782,0.645744,0.633493
fourth,0.506208,0.205261,0.836643,0.775193


Mean by column, chnage axis=0 to get rows


first          NaN
second         NaN
third     0.445730
fourth    0.580826
dtype: float64

first     0.313888
second    0.418750
third     0.445730
fourth    0.580826
dtype: float64

<h2>Data cleaning</h2>

In [8]:
import pandas as pd
import numpy as np
from IPython.display import display

df = pd.DataFrame(np.random.rand(4,4), columns = ['ff', 'ee', 'tt', 'uuu'], index = ['first','second','third','fourth'])
df.iloc[2,1] = np.nan
display(df)
df.fillna(df['ee'].mean(),inplace=True)
display(df)

Unnamed: 0,ff,ee,tt,uuu
first,0.166914,0.606136,0.182982,0.676472
second,0.735339,0.599391,0.411002,0.280654
third,0.855802,,0.492494,0.991136
fourth,0.589054,0.436399,0.27199,0.464906


Unnamed: 0,ff,ee,tt,uuu
first,0.166914,0.606136,0.182982,0.676472
second,0.735339,0.599391,0.411002,0.280654
third,0.855802,0.547309,0.492494,0.991136
fourth,0.589054,0.436399,0.27199,0.464906


<h3>Import Excel file</h3>

In [9]:
import pandas as pd
import os
path = r'C:\myfolder'
file = 'myfile.xlsx'
df = pd.read_excel(os.path.join(path,file))