<h3>Missing Values</h3>

In [21]:
import numpy as np
import pandas as pd

In [22]:
# Build a small demo DataFrame, one person per row, to use in the
# missing-value examples below.
people_columns = ["names", "ages", "gender", "rank"]
people_rows = [
    ("Steve", 20, "M", 2),
    ("John", 22, "M", 1),
    ("Richard", 20, "M", 4),
    ("Sarah", 21, "F", 5),
    ("Randy", 24, "M", 3),
    ("Michael", 23, "M", 7),
    ("Julie", 222, "F", 6),
]

# Transpose the row tuples into the column-name -> list-of-values mapping
# that the DataFrame constructor consumes.
data = {
    column: list(values)
    for column, values in zip(people_columns, zip(*people_rows))
}

ranking_df = pd.DataFrame(data)
print(ranking_df)

     names  ages gender  rank
0    Steve    20      M     2
1     John    22      M     1
2  Richard    20      M     4
3    Sarah    21      F     5
4    Randy    24      M     3
5  Michael    23      M     7
6    Julie   222      F     6


In [23]:
# Artificially inject missing values so the cells below have NaNs to work with.
#
# "ages" and "rank" are integer columns at this point and an integer column
# cannot hold NaN. Cast them to float64 explicitly before assigning: letting
# the .iloc assignment upcast the column silently is deprecated since
# pandas 2.1 (PDEP-6, "Setting an item of incompatible dtype") and is slated
# to become an error. The resulting frame is the same either way.
ranking_df[["ages", "rank"]] = ranking_df[["ages", "rank"]].astype("float64")

ranking_df.iloc[2:5, 1] = np.nan  # "ages" (column 1) missing for rows 2-4
ranking_df.iloc[3:6, 3] = np.nan  # "rank" (column 3) missing for rows 3-5
ranking_df.iloc[3, :] = np.nan    # row 3 missing in every column
print(ranking_df)

     names   ages gender  rank
0    Steve   20.0      M   2.0
1     John   22.0      M   1.0
2  Richard    NaN      M   4.0
3      NaN    NaN    NaN   NaN
4    Randy    NaN      M   NaN
5  Michael   23.0      M   NaN
6    Julie  222.0      F   6.0


In [24]:
# Element-wise missing-value mask: True marks a cell that holds NaN.
# (isna() and isnull() are two names for the same method.)
ranking_df.isna()

Unnamed: 0,names,ages,gender,rank
0,False,False,False,False
1,False,False,False,False
2,False,True,False,False
3,True,True,True,True
4,False,True,False,True
5,False,False,False,True
6,False,False,False,False


In [25]:
# Inverse of the missing-value mask: True marks a cell that holds a real
# (non-NaN) value. (notna() and notnull() are two names for the same method.)
ranking_df.notna()

Unnamed: 0,names,ages,gender,rank
0,True,True,True,True
1,True,True,True,True
2,True,False,True,True
3,False,False,False,False
4,True,False,True,False
5,True,True,True,False
6,True,True,True,True


In [26]:
# Boolean mask over the rows: True where the "ages" value is NaN.
bool_series = ranking_df["ages"].isna()

# Keep only the rows flagged by the mask, i.e. the rows with a missing age.
ranking_df.loc[bool_series]

Unnamed: 0,names,ages,gender,rank
2,Richard,,M,4.0
3,,,,
4,Randy,,M,


In [27]:
# Crudest possible imputation: replace every NaN with one fixed value (0 here).
# Note that the text columns receive the 0 as well (see row 3 in the output).
ranking_df.fillna(value=0)

Unnamed: 0,names,ages,gender,rank
0,Steve,20.0,M,2.0
1,John,22.0,M,1.0
2,Richard,0.0,M,4.0
3,0,0.0,0,0.0
4,Randy,0.0,M,0.0
5,Michael,23.0,M,0.0
6,Julie,222.0,F,6.0


In [28]:
# Forward fill: each NaN takes the previous valid (non-NaN) value from the
# same column, working down the rows.
# bfill() is the mirror image: it fills backwards from the next valid value.
ranking_df.ffill(axis="index")

Unnamed: 0,names,ages,gender,rank
0,Steve,20.0,M,2.0
1,John,22.0,M,1.0
2,Richard,22.0,M,4.0
3,Richard,22.0,M,4.0
4,Randy,22.0,M,4.0
5,Michael,23.0,M,4.0
6,Julie,222.0,F,6.0


In [29]:
# Ask pandas to convert object-dtype columns to a more specific dtype
# (e.g. int, float) wherever that is possible. Missing values are not touched:
# the NaNs are still present in the displayed result.
inferred_df = ranking_df.infer_objects()
inferred_df

Unnamed: 0,names,ages,gender,rank
0,Steve,20.0,M,2.0
1,John,22.0,M,1.0
2,Richard,,M,4.0
3,,,,
4,Randy,,M,
5,Michael,23.0,M,
6,Julie,222.0,F,6.0


In [30]:
# Drop every row that has a NaN in at least one column
# (the defaults, spelled out explicitly).
ranking_df.dropna(axis="index", how="any")

Unnamed: 0,names,ages,gender,rank
0,Steve,20.0,M,2.0
1,John,22.0,M,1.0
6,Julie,222.0,F,6.0


In [31]:
# Less aggressive variant of dropna(): a row is removed only when *all* of its
# values are NaN (here that is just the row with index 3).
ranking_df.dropna(axis="index", how="all")

Unnamed: 0,names,ages,gender,rank
0,Steve,20.0,M,2.0
1,John,22.0,M,1.0
2,Richard,,M,4.0
4,Randy,,M,
5,Michael,23.0,M,
6,Julie,222.0,F,6.0
