Cleaning Data - Casting Datatypes and Handling Missing Values

In [2]:
import pandas as pd
import numpy as np

In [13]:
# Example data for the missing-value demos below.
# The columns deliberately mix several flavours of "missing":
#   * real missing markers: None and np.nan
#   * placeholder strings:  'NA' and 'Missing' (ordinary strings to pandas)
# Ages are stored as strings rather than numbers.
people = dict(
    first=["Anish", "Ramish", None, "Samish", np.nan, "Bamish", np.nan, "NA"],
    last=["Khadka", "Mainali", "Shrestha", "Karki", "Mainali", np.nan, np.nan, "Missing"],
    email=[
        None,
        "mainaliramish89@gmail.com",
        "shresthasamish28@gmail.com",
        None,
        "bamishkarki819@gmail.com",
        "bamishmainali78@gmail.com",
        np.nan,
        "Missing",
    ],
    age=["23", "24", "22", "26", "27", None, None, "Missing"],
)

# One row per person; column order follows the key order above.
mydf = pd.DataFrame(data=people)
mydf

Unnamed: 0,first,last,email,age
0,Anish,Khadka,,23
1,Ramish,Mainali,mainaliramish89@gmail.com,24
2,,Shrestha,shresthasamish28@gmail.com,22
3,Samish,Karki,,26
4,,Mainali,bamishkarki819@gmail.com,27
5,Bamish,,bamishmainali78@gmail.com,
6,,,,
7,,Missing,Missing,Missing


In [14]:
# Drop missing data using dropna() with its default arguments:
# every row that contains at least one missing value is removed.
# Row 7 is kept because 'NA' / 'Missing' are plain strings, not real
# missing values (None / np.nan), so dropna() does not treat them as missing.
mydf.dropna()

Unnamed: 0,first,last,email,age
1,Ramish,Mainali,mainaliramish89@gmail.com,24
7,,Missing,Missing,Missing


In [15]:
# Same result as the bare dropna() call, with the options spelled out:
#   axis="index" -> rows (not columns) are the things being dropped
#   how="any"    -> one missing value is enough to drop the row
#                   (how="all" would require every value in the row to be missing)
mydf.dropna(
    how="any",
    axis="index",
)

Unnamed: 0,first,last,email,age
1,Ramish,Mainali,mainaliramish89@gmail.com,24
7,,Missing,Missing,Missing


In [16]:
# Drop a row only when *every* value in it is missing.
# Only row 6 is entirely empty, so it is the only row removed.
mydf.dropna(
    how="all",
    axis="index",
)

Unnamed: 0,first,last,email,age
0,Anish,Khadka,,23
1,Ramish,Mainali,mainaliramish89@gmail.com,24
2,,Shrestha,shresthasamish28@gmail.com,22
3,Samish,Karki,,26
4,,Mainali,bamishkarki819@gmail.com,27
5,Bamish,,bamishmainali78@gmail.com,
7,,Missing,Missing,Missing


In [None]:
# Column-wise variant: a column is dropped only when all of its values are missing.
# No column here is completely empty, so the frame comes back unchanged.
mydf.dropna(
    how="all",
    axis="columns",
)

Unnamed: 0,first,last,email,age
0,Anish,Khadka,,23
1,Ramish,Mainali,mainaliramish89@gmail.com,24
2,,Shrestha,shresthasamish28@gmail.com,22
3,Samish,Karki,,26
4,,Mainali,bamishkarki819@gmail.com,27
5,Bamish,,bamishmainali78@gmail.com,
6,,,,
7,,Missing,Missing,Missing


In [None]:
# Restrict the missing-value check to the 'email' column:
# rows without an email are dropped; gaps in other columns are ignored.
mydf.dropna(
    subset=["email"],
    how="any",
    axis="index",
)

Unnamed: 0,first,last,email,age
1,Ramish,Mainali,mainaliramish89@gmail.com,24
2,,Shrestha,shresthasamish28@gmail.com,22
4,,Mainali,bamishkarki819@gmail.com,27
5,Bamish,,bamishmainali78@gmail.com,
7,,Missing,Missing,Missing


In [None]:
# Check only 'last' and 'email':
#   how="all" -> a row is dropped only when BOTH of these columns are missing
#   how="any" -> a row would be dropped when EITHER of them is missing
# dropna() returns a new DataFrame and leaves mydf untouched;
# pass inplace=True to modify mydf itself instead.
mydf.dropna(
    subset=["last", "email"],
    how="all",
    axis="index",
)

Unnamed: 0,first,last,email,age
0,Anish,Khadka,,23
1,Ramish,Mainali,mainaliramish89@gmail.com,24
2,,Shrestha,shresthasamish28@gmail.com,22
3,Samish,Karki,,26
4,,Mainali,bamishkarki819@gmail.com,27
5,Bamish,,bamishmainali78@gmail.com,
7,,Missing,Missing,Missing
