In [6]:
import pandas as pd
import numpy as np

In [7]:
# Toy records used throughout the notebook. Missing data appears in several
# guises on purpose: np.nan, None, and the placeholder strings 'NA' / 'Missing'.
people = dict(
    first=["Corey", "Jane", "John", "Chris", np.nan, None, "NA"],
    last=["Schafer", "Doe", "Doe", "Schafer", np.nan, np.nan, "Missing"],
    email=[
        "CoreyMSchafer@gmail.com",
        "JaneDoe@email.com",
        "JohnDoe@email.com",
        None,
        np.nan,
        "Anonymous@email.com",
        "NA",
    ],
    age=["33", "55", "63", "36", None, None, "Missing"],
)

In [19]:
# Build the frame and, in the same expression, turn the custom placeholder
# strings into real NaN so that dropna / isna / fillna treat them as missing.
df = pd.DataFrame(people).replace(["NA", "Missing"], np.nan)

In [20]:
# Show the frame: the 'NA' and 'Missing' placeholders now appear as missing values
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [21]:
# dropna() with its defaults (axis='index', how='any') removes every row
# that has at least one missing value
df.dropna()

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63


In [22]:
# how='all' drops a row only when every field in it is missing;
# how='any' (the default) would drop a row as soon as one field is missing
df.dropna(axis='index', how='all')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
5,,,Anonymous@email.com,


In [23]:
# With subset, only the listed columns are checked: a row is dropped
# only when both 'last' and 'email' are missing
df.dropna(axis='index', how='all', subset=['last', 'email'])

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
5,,,Anonymous@email.com,


In [24]:
# isna() returns a boolean frame of the same shape: True where a value is missing
df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [26]:
# Use fillna to substitute a chosen value (here 0) for every missing entry;
# the result is not assigned, so df itself is left unchanged
df.fillna(0)

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,0,36
4,0,0,0,0
5,0,0,Anonymous@email.com,0
6,0,0,0,0


In [28]:
# Every column is object dtype, including age: the ages were entered as strings
df.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [29]:
# Expected to fail: age still holds strings, so mean() raises a TypeError
df['age'].mean()

TypeError: can only concatenate str (not "int") to str

In [30]:
# Casting data types
df['age'] = df['age'].astype(int)
# Casting to int raises a TypeError here: the column contains None, which int() cannot convert

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType'

In [32]:
# Cast to float instead: float can hold missing values as NaN, so this succeeds
df['age'] = df['age'].astype(float)

In [33]:
# mean() skips NaN by default, so this is the average of the four known ages
df['age'].mean()

46.75