In [1]:
import numpy as np
import pandas as pd

# Dealing with Missing Values (NAs)

In [4]:
# Build a small 3x3 sample frame; two cells are deliberately left as NaN
# so the missing-value tools below have something to act on.
df = pd.DataFrame(
    [
        [1, np.nan, 2],
        [2, 3, 5],
        [np.nan, 4, 6],
    ]
)

In [5]:
# Show the sample frame; the missing cells are rendered as NaN
print(df)

     0    1  2
0  1.0  NaN  2
1  2.0  3.0  5
2  NaN  4.0  6


In [7]:
# Build a boolean mask of the frame: True marks a missing (NA) cell
print(df.isna())

       0      1      2
0  False   True  False
1  False  False  False
2   True  False  False


In [8]:
# Drop incomplete rows: a row is removed if any one of its cells is NA
# (axis=0 and how="any" are the defaults, written out here for clarity)
print(df.dropna(axis=0, how="any"))

     0    1  2
1  2.0  3.0  5


In [9]:
# Drop incomplete columns instead: any column holding at least one NA is removed
print(df.dropna(axis="columns"))

   2
0  2
1  5
2  6


In [10]:
# how="all" removes a column only when every cell in it is NA;
# no column here is entirely NA, so the frame is printed unchanged
print(df.dropna(axis="columns", how="all"))

     0    1  2
0  1.0  NaN  2
1  2.0  3.0  5
2  NaN  4.0  6


In [11]:
# thresh sets the minimum count of non-NA cells a row needs in order to be kept;
# every row here has at least two, so nothing is dropped
print(df.dropna(axis="index", thresh=2))

     0    1  2
0  1.0  NaN  2
1  2.0  3.0  5
2  NaN  4.0  6


## Filling missing (NA) cells

In [12]:
# Replace every NA cell with the constant 0
print(df.fillna(value=0))

     0    1  2
0  1.0  0.0  2
1  2.0  3.0  5
2  0.0  4.0  6


In [13]:
# Forward-fill down each column: an NA takes the last valid value above it.
# An NA in the first row has nothing above it and therefore stays NaN.
# DataFrame.ffill() is used because fillna(method="ffill") is deprecated
# (it emits a FutureWarning since pandas 2.1).
print(df.ffill())

     0    1  2
0  1.0  NaN  2
1  2.0  3.0  5
2  2.0  4.0  6


In [14]:
# Forward-fill across the columns (axis=1): within each row, an NA takes the
# last valid value to its left; an NA in the first column stays NaN.
# DataFrame.ffill() is used because fillna(method="ffill") is deprecated
# (it emits a FutureWarning since pandas 2.1).
print(df.ffill(axis=1))

     0    1    2
0  1.0  1.0  2.0
1  2.0  3.0  5.0
2  NaN  4.0  6.0


In [15]:
# Back-fill up each column: an NA takes the next valid value below it.
# An NA in the last row has nothing below it and therefore stays NaN.
# DataFrame.bfill() is used because fillna(method="bfill") is deprecated
# (it emits a FutureWarning since pandas 2.1).
print(df.bfill())

     0    1  2
0  1.0  3.0  2
1  2.0  3.0  5
2  NaN  4.0  6
