In [1]:
import pandas as pd
import numpy as np

In [2]:
# Sample records. Missing data is encoded on purpose in several ways:
# np.nan, None, and the placeholder strings 'NA' / 'Missing'.
people = dict(
    first=['Corey', 'Jane', 'John', 'Chris', np.nan, None, 'NA'],
    last=['Schafer', 'Doe', 'Doe', 'Schafer', np.nan, np.nan, 'Missing'],
    email=[
        'CoreyMSchafer@gmail.com',
        'JaneDoe@email.com',
        'JohnDoe@email.com',
        None,
        np.nan,
        'Anonymous@email.com',
        'NA',
    ],
    age=['33', '55', '63', '36', None, None, 'Missing'],
)

In [3]:
# Build the sample frame; each key of `people` becomes a column.
df = pd.DataFrame(data=people)
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [4]:
# Drop every row that contains at least one missing value (dropna defaults).
# The 'NA' / 'Missing' strings are ordinary values at this point, so row 6 is kept.
df.dropna()

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
6,,Missing,,Missing


In [5]:
# Same call as the default dropna(), with the arguments spelled out.
# axis: 'index' drops rows that contain missing values,
#       'columns' drops columns that contain missing values.
# how:  the criterion used to decide whether a row/column is dropped.
df.dropna(how='any', axis='index')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
6,,Missing,,Missing


In [6]:
# how='all' removes a record only when every one of its fields is missing.
df.dropna(how='all', axis='index')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [7]:
# Keep only the records that have an email address.
df.dropna(subset=['email'], how='any', axis='index')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [8]:
# In the call above, `how` is not achieving much: with a single-column subset,
# a parameter of 'any' or 'all' gives the same result. Records are dropped
# based on that one column's value.

In [9]:
df.dropna(
    subset=['last', 'email'],
    how='all',
    axis='index',
)

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [10]:
# The call above drops a row only if both last and email are missing. Here `how` does have a purpose.

In [11]:
# Handling custom missing values

In [12]:
# Treat the placeholder strings 'NA' and 'Missing' as real missing values by
# swapping them for NaN. Both placeholders are handled in one replace() call,
# and the result is assigned back instead of mutating with inplace=True, so
# the cell gives the same result however many times it is re-run.
df = df.replace(['NA', 'Missing'], np.nan)

In [13]:
# Show the frame again: the 'NA' and 'Missing' placeholders are now NaN.
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [14]:
# With the placeholders converted to NaN, the default dropna() now drops row 6 as well.
df.dropna()

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63


In [15]:
# Same subset filter as before the replacement; row 6 is now dropped too,
# because its last and email values are both NaN.
df.dropna(
    subset=['last', 'email'],
    how='all',
    axis='index',
)

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
5,,,Anonymous@email.com,


In [16]:
# We can compare the results from our executions above, and the results here

In [17]:
# To check which values pandas classifies as missing, build a boolean mask:
# True marks a value that counts as NA, False one that does not.
df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [18]:
# filling na with some other data

In [19]:
# Show every missing value as the string 'MISSING'.
# fillna returns a new frame here; df itself is left unchanged.
df.fillna(value='MISSING')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,MISSING,36
4,MISSING,MISSING,MISSING,MISSING
5,MISSING,MISSING,Anonymous@email.com,MISSING
6,MISSING,MISSING,MISSING,MISSING


In [22]:
# For numeric fields a sentinel such as 0 or -1 is a sensible fill value.
# Nothing prevents using the same sentinel for missing string values as well.
df.fillna(value=0)

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,0,36
4,0,0,0,0
5,0,0,Anonymous@email.com,0
6,0,0,0,0


In [23]:
# Data casting

In [25]:
# Goal: compute the average age of the people in the sample frame.
# First check the column dtypes: every column, including age, is currently
# `object`, which usually means strings or a mix of different data types.
df.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [26]:
# The age column still has dtype object (strings plus missing values), so
# averaging it raises a TypeError. The failure is what this cell demonstrates,
# so catch it and print the message instead of letting the exception abort a
# "Restart & Run All".
try:
    print(df['age'].mean())
except TypeError as err:
    print(f'TypeError: {err}')

TypeError: can only concatenate str (not "int") to str

In [27]:
# A column that contains NaN cannot be cast to a plain integer dtype; it has
# to be cast to float instead, because NaN itself is a float under the hood.

type(np.nan)

float

In [28]:
# Casting the age column to int fails on the missing values. The failure is
# what this cell demonstrates, so catch it and print the message instead of
# letting the exception abort a "Restart & Run All". When the cast fails the
# assignment never happens, so df['age'] is left as it was.
# NOTE(review): None entries raise TypeError; NaN entries are expected to
# raise ValueError instead, so both are caught -- confirm on your pandas version.
try:
    df['age'] = df['age'].astype(int)  # trying to change dtype of age
except (TypeError, ValueError) as err:
    print(f'{type(err).__name__}: {err}')

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType'

In [29]:
# The missing values (None/NaN) in the age column lead to the above error

In [30]:
# If the column has no missing values, the integer cast above works as is.
# If it does have missing values, there are two options: replace them with
# something else first (for example 0 via fillna), or cast the column to float.
# The goal here is an average, so filling with 0 might not be a good idea;
# cast to float instead.
df['age'] = df['age'].astype('float64')

In [32]:
# Confirm the cast: age should now be float64 while the other columns stay object.
df.dtypes

first     object
last      object
email     object
age      float64
dtype: object

In [33]:
# The mean now works on the float column; only the four known ages contribute to it.
df['age'].mean()

46.75

In [34]:
# If we have an entire df of numbers that we are trying to convert at once, then
# the df object has an astype method as well

# df.astype()

# We wont do using astype here because our dataset is mixed