# Cleaning Data - Handling Missing Values

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Sample records that mix several representations of "missing":
# np.nan, None, and the placeholder strings 'NA' / 'Missing'.
# Ages are deliberately stored as strings.
people = dict(
    first=["Ali", "Jane", "Jhon", "Chris", np.nan, None, "NA"],
    last=["Raza", "Doe", "Doe", "Smith", np.nan, np.nan, "Missing"],
    email=[
        "aliraza@gmail.com",
        "janeDoe@gmail.com",
        "JhonDoe@hotmail.com",
        None,
        np.nan,
        "Anonymous@email.com",
        "NA",
    ],
    age=["33", "55", "63", "36", None, None, "Missing"],
)

In [22]:
# Build the working frame from the `people` dict: one column per key.
df = pd.DataFrame(data=people)

In [23]:
# Turn the placeholder strings 'NA' and 'Missing' into real missing values in a
# single pass. Rebinding `df` instead of using inplace=True keeps the cell
# chainable and avoids mutating the frame behind earlier displays.
df = df.replace(['NA', 'Missing'], np.nan)

In [24]:
# Inspect the frame after replacement: the 'NA' / 'Missing' placeholders now
# show up as missing values alongside the original None / np.nan entries.
df

Unnamed: 0,first,last,email,age
0,Ali,Raza,aliraza@gmail.com,33.0
1,Jane,Doe,janeDoe@gmail.com,55.0
2,Jhon,Doe,JhonDoe@hotmail.com,63.0
3,Chris,Smith,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [11]:
# Drop every column that contains at least one missing value. All four columns
# have a missing entry (row 4 is entirely empty), so no columns survive and
# only the index 0-6 is displayed.
df.dropna(axis = 'columns', how = 'any')

0
1
2
3
4
5
6


In [14]:
# Drop every row that has at least one missing value. axis='index' and
# how='any' are dropna's defaults, so this is equivalent to df.dropna().
# NOTE(review): the saved output below is In [14] and still shows 'Missing' in
# row 6, so it appears to predate the replace cell (In [23]) - re-run to refresh.
df.dropna(axis = 'index', how = 'any')

Unnamed: 0,first,last,email,age
0,Ali,Raza,aliraza@gmail.com,33
1,Jane,Doe,janeDoe@gmail.com,55
2,Jhon,Doe,JhonDoe@hotmail.com,63
6,,Missing,,Missing


In [26]:
# Restrict the check to the 'email' column: rows whose email is missing are
# dropped, while missing values in other columns are ignored (row 5 is kept
# even though its first/last/age are missing).
df.dropna(axis = 'index', how = 'any', subset = ['email'])

Unnamed: 0,first,last,email,age
0,Ali,Raza,aliraza@gmail.com,33.0
1,Jane,Doe,janeDoe@gmail.com,55.0
2,Jhon,Doe,JhonDoe@hotmail.com,63.0
5,,,Anonymous@email.com,


In [25]:
# Drop a row only when BOTH 'email' and 'last' are missing. A row with at least
# one of the two present is kept (row 3 has only 'last', row 5 only 'email').
df.dropna(axis = 'index', how = 'all', subset = ['email', 'last'])

Unnamed: 0,first,last,email,age
0,Ali,Raza,aliraza@gmail.com,33.0
1,Jane,Doe,janeDoe@gmail.com,55.0
2,Jhon,Doe,JhonDoe@hotmail.com,63.0
3,Chris,Smith,,36.0
5,,,Anonymous@email.com,


In [27]:
# Boolean mask with the same shape as df: True marks a missing value.
df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [28]:
# Show a copy of df with every missing value replaced by the string 'MISSING'.
# The result is not assigned, so df itself keeps its missing values.
df.fillna('MISSING')

Unnamed: 0,first,last,email,age
0,Ali,Raza,aliraza@gmail.com,33
1,Jane,Doe,janeDoe@gmail.com,55
2,Jhon,Doe,JhonDoe@hotmail.com,63
3,Chris,Smith,MISSING,36
4,MISSING,MISSING,MISSING,MISSING
5,MISSING,MISSING,Anonymous@email.com,MISSING
6,MISSING,MISSING,MISSING,MISSING


In [30]:
# 'age' is still object dtype because its values were entered as strings.
df.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [31]:
# Demonstration: 'age' is still object dtype (strings plus NaN), so averaging
# it fails. The error is caught and printed so that "Restart & Run All" can
# continue past this cell instead of stopping here.
try:
    df['age'].mean()
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: can only concatenate str (not "int") to str

In [33]:
# np.nan is a float, which is why a column containing NaN can be cast to float.
type(np.nan)

float

In [35]:
# Convert the string ages to numbers; float64 is used because the column
# also holds NaN.
df['age'] = df['age'].astype('float64')

In [36]:
# Confirm the cast: 'age' should now be float64 while the other columns stay object.
df.dtypes

first     object
last      object
email     object
age      float64
dtype: object

In [37]:
# mean() skips NaN by default, so this averages only the four known ages:
# (33 + 55 + 63 + 36) / 4 = 46.75.
df['age'].mean()

46.75