Cleaning Data - Casting Datatypes and Handling Missing Values

In [2]:
import pandas as pd
import numpy as np

In [13]:
# Example dataframe for the rest of the notebook. It deliberately mixes real
# missing values (None / np.nan) with placeholder strings ('NA', 'Missing')
# that only *look* like missing data. Ages are stored as strings.
people = {
    "first": ["Anish", "Ramish", None, "Samish", np.nan, "Bamish", np.nan, "NA"],
    "last": ["Khadka", "Mainali", "Shrestha", "Karki", "Mainali", np.nan, np.nan, "Missing"],
    "email": [
        None,
        "mainaliramish89@gmail.com",
        "shresthasamish28@gmail.com",
        None,
        "bamishkarki819@gmail.com",
        "bamishmainali78@gmail.com",
        np.nan,
        "Missing",
    ],
    "age": ["23", "24", "22", "26", "27", None, None, "Missing"],
}
mydf = pd.DataFrame(data=people)
mydf

Unnamed: 0,first,last,email,age
0,Anish,Khadka,,23
1,Ramish,Mainali,mainaliramish89@gmail.com,24
2,,Shrestha,shresthasamish28@gmail.com,22
3,Samish,Karki,,26
4,,Mainali,bamishkarki819@gmail.com,27
5,Bamish,,bamishmainali78@gmail.com,
6,,,,
7,,Missing,Missing,Missing


In [14]:
# Drop every row that contains at least one missing value
# (dropna() defaults to axis='index', how='any').
# Row 7 survives because 'NA' and 'Missing' are ordinary strings, not NaN.
mydf.dropna()

Unnamed: 0,first,last,email,age
1,Ramish,Mainali,mainaliramish89@gmail.com,24
7,,Missing,Missing,Missing


In [15]:
# Equivalent to a bare dropna(), with the defaults spelled out:
#   axis='index' -> drop rows (axis='columns' would drop columns instead)
#   how='any'    -> drop a row if any of its values is missing
#   how='all'    -> drop a row only if all of its values are missing
mydf.dropna(axis='index', how='any')

Unnamed: 0,first,last,email,age
1,Ramish,Mainali,mainaliramish89@gmail.com,24
7,,Missing,Missing,Missing


In [16]:
# how='all' drops a row only when every column in it is missing,
# so row 6 is the only one removed here.
mydf.dropna(axis='index', how='all')

Unnamed: 0,first,last,email,age
0,Anish,Khadka,,23
1,Ramish,Mainali,mainaliramish89@gmail.com,24
2,,Shrestha,shresthasamish28@gmail.com,22
3,Samish,Karki,,26
4,,Mainali,bamishkarki819@gmail.com,27
5,Bamish,,bamishmainali78@gmail.com,
7,,Missing,Missing,Missing


In [None]:
# Drop columns in which every value is missing. No column here is entirely
# empty, so the dataframe comes back unchanged.
mydf.dropna(axis='columns', how='all')

Unnamed: 0,first,last,email,age
0,Anish,Khadka,,23
1,Ramish,Mainali,mainaliramish89@gmail.com,24
2,,Shrestha,shresthasamish28@gmail.com,22
3,Samish,Karki,,26
4,,Mainali,bamishkarki819@gmail.com,27
5,Bamish,,bamishmainali78@gmail.com,
6,,,,
7,,Missing,Missing,Missing


In [None]:
# Keep only the rows that have an email: subset limits the missing-value check
# to the listed columns. Row 7 is kept because 'Missing' is a string, not NaN.
mydf.dropna(axis='index', how='any', subset=['email'])

Unnamed: 0,first,last,email,age
1,Ramish,Mainali,mainaliramish89@gmail.com,24
2,,Shrestha,shresthasamish28@gmail.com,22
4,,Mainali,bamishkarki819@gmail.com,27
5,Bamish,,bamishmainali78@gmail.com,
7,,Missing,Missing,Missing


In [None]:
# With how='all', a row is dropped only if both 'last' and 'email' are missing
# (row 6 here); how='any' would drop it if either one is missing.
# dropna() returns a new dataframe; passing inplace=True modifies mydf itself instead.
mydf.dropna(axis='index', how='all', subset=['last', 'email'])

Unnamed: 0,first,last,email,age
0,Anish,Khadka,,23
1,Ramish,Mainali,mainaliramish89@gmail.com,24
2,,Shrestha,shresthasamish28@gmail.com,22
3,Samish,Karki,,26
4,,Mainali,bamishkarki819@gmail.com,27
5,Bamish,,bamishmainali78@gmail.com,
7,,Missing,Missing,Missing


In [26]:
# The strings 'NA' and 'Missing' are placeholders that pandas does not treat as
# missing, so convert them to real NaN values.
# A single replace() call handles both placeholders in one pass. Rebinding the
# result (rather than using inplace=True) avoids mutating the dataframe object
# that earlier cells displayed, while later cells still see the cleaned `mydf`.
mydf = mydf.replace(['NA', 'Missing'], np.nan)
mydf

Unnamed: 0,first,last,email,age
0,Anish,Khadka,,23.0
1,Ramish,Mainali,mainaliramish89@gmail.com,24.0
2,,Shrestha,shresthasamish28@gmail.com,22.0
3,Samish,Karki,,26.0
4,,Mainali,bamishkarki819@gmail.com,27.0
5,Bamish,,bamishmainali78@gmail.com,
6,,,,
7,,,,


In [27]:
# Repeat the earlier subset dropna: now that the 'NA' / 'Missing' placeholders
# are real NaN values, row 7 is dropped along with row 6.
mydf.dropna(axis='index', how='all', subset=['last', 'email'])

Unnamed: 0,first,last,email,age
0,Anish,Khadka,,23.0
1,Ramish,Mainali,mainaliramish89@gmail.com,24.0
2,,Shrestha,shresthasamish28@gmail.com,22.0
3,Samish,Karki,,26.0
4,,Mainali,bamishkarki819@gmail.com,27.0
5,Bamish,,bamishmainali78@gmail.com,


In [28]:
# isna() returns a boolean mask with the same shape as the dataframe:
# True wherever a value is missing, False otherwise.
mydf.isna()

Unnamed: 0,first,last,email,age
0,False,False,True,False
1,False,False,False,False
2,True,False,False,False
3,False,False,True,False
4,True,False,False,False
5,False,True,False,True
6,True,True,True,True
7,True,True,True,True


In [29]:
# fillna() substitutes a value of your choice for every missing entry;
# here each NaN becomes the string 'MISSING'.
mydf.fillna('MISSING')

Unnamed: 0,first,last,email,age
0,Anish,Khadka,MISSING,23
1,Ramish,Mainali,mainaliramish89@gmail.com,24
2,MISSING,Shrestha,shresthasamish28@gmail.com,22
3,Samish,Karki,MISSING,26
4,MISSING,Mainali,bamishkarki819@gmail.com,27
5,Bamish,MISSING,bamishmainali78@gmail.com,MISSING
6,MISSING,MISSING,MISSING,MISSING
7,MISSING,MISSING,MISSING,MISSING


In [30]:
# The fill value can be any scalar, e.g. 0. Note that this also puts the
# number 0 into text columns such as 'first' and 'email'.
mydf.fillna(0)

Unnamed: 0,first,last,email,age
0,Anish,Khadka,0,23
1,Ramish,Mainali,mainaliramish89@gmail.com,24
2,0,Shrestha,shresthasamish28@gmail.com,22
3,Samish,Karki,0,26
4,0,Mainali,bamishkarki819@gmail.com,27
5,Bamish,0,bamishmainali78@gmail.com,0
6,0,0,0,0
7,0,0,0,0


In [None]:
# Casting data types
