# HANDLING MISSING DATA in PANDAS

In [2]:
import numpy as np
import pandas as pd

## Manually creating NaN (missing) values

In [4]:
# First sample row; np.nan marks the missing entry (forces a float array).
arr1 = np.array([1.0, 2.0, np.nan, 4.0, 5.0])

In [5]:
# Second sample row: missing values at the first and the last two positions.
arr2 = np.array([np.nan, 14.0, 15.0, np.nan, np.nan])

In [6]:
# Third sample row: missing values in the two middle-right positions.
arr3 = np.array([10.0, 12.0, np.nan, np.nan, 5.0])

In [10]:
# Stack the three 1-D rows on top of each other into one 2-D array.
arr5 = np.vstack([arr1, arr2, arr3])

In [11]:
arr5

array([[ 1.,  2., nan,  4.,  5.],
       [nan, 14., 15., nan, nan],
       [10., 12., nan, nan,  5.]])

In [12]:
# Wrap the stacked array in a DataFrame with explicit row and column labels.
p1 = pd.DataFrame(
    arr5,
    index=['r1', 'r2', 'r3'],
    columns=['c1', 'c2', 'c3', 'c4', 'c5'],
)

In [13]:
p1

Unnamed: 0,c1,c2,c3,c4,c5
r1,1.0,2.0,,4.0,5.0
r2,,14.0,15.0,,
r3,10.0,12.0,,,5.0


## Check for Missing Values
To make detecting missing values easier (and consistent across different array dtypes), pandas provides the isnull() and notnull() functions, which are also available as methods on Series and DataFrame objects:

In [15]:
# Element-wise mask with the same labels as p1: True where a value is missing.
p1.isnull()

Unnamed: 0,c1,c2,c3,c4,c5
r1,False,False,True,False,False
r2,True,False,False,True,True
r3,False,False,True,True,False


In [16]:
# Inverse mask: True where a value is present, False where it is missing.
p1.notnull()

Unnamed: 0,c1,c2,c3,c4,c5
r1,True,True,False,True,True
r2,False,True,True,False,False
r3,True,True,False,False,True


In [17]:
# Missing-value mask for a single column (returns a boolean Series).
p1['c1'].isnull()

r1    False
r2     True
r3    False
Name: c1, dtype: bool

In [22]:
# Present-value mask for a single row, selected by its label with .loc.
p1.loc['r1'].notnull()

c1     True
c2     True
c3    False
c4     True
c5     True
Name: r1, dtype: bool

## Calculations with Missing Data
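* When summing data, NA values are skipped, i.e. they are effectively treated as zero
* If the data are all NA, the sum is therefore 0 by default (as the example below shows); pass `min_count=1` to `sum()` to get NA instead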

In [27]:
# Build a DataFrame from labels only: no data is given, so every cell is NaN.
p2 = pd.DataFrame(columns=[1, 2, 3], index=['a', 'b', 'c'])

In [28]:
p2

Unnamed: 0,1,2,3
a,,,
b,,,
c,,,


In [31]:
# Every value in column 1 is NaN; sum() skips NaN values, so the result is 0.
p2[1].sum()

0

## Cleaning / Filling Missing Data
Pandas provides various methods for cleaning missing values. The fillna function can “fill in” NA values with non-null data in a couple of ways, as illustrated in the following sections.

## *  Replace NaN with a Scalar Value

In [37]:
p1

Unnamed: 0,c1,c2,c3,c4,c5
r1,1.0,2.0,,4.0,5.0
r2,,14.0,15.0,,
r3,10.0,12.0,,,5.0


In [38]:
# Return a copy of p1 in which every missing value is replaced by the scalar 5.
p1.fillna(value=5)

Unnamed: 0,c1,c2,c3,c4,c5
r1,1.0,2.0,5.0,4.0,5.0
r2,5.0,14.0,15.0,5.0,5.0
r3,10.0,12.0,5.0,5.0,5.0


#### * Pad (forward fill) and Backfill

In [39]:
# Forward fill: each NaN takes the last valid value above it in the same
# column; a NaN with no earlier value in its column stays NaN.
# fillna(method='pad') is deprecated since pandas 2.1; ffill() is the
# equivalent, supported spelling.
p1.ffill()

Unnamed: 0,c1,c2,c3,c4,c5
r1,1.0,2.0,,4.0,5.0
r2,1.0,14.0,15.0,4.0,5.0
r3,10.0,12.0,15.0,4.0,5.0


In [40]:
# Backward fill: each NaN takes the next valid value below it in the same
# column; a NaN with no later value in its column stays NaN.
# fillna(method='backfill') is deprecated since pandas 2.1; bfill() is the
# equivalent, supported spelling.
p1.bfill()

Unnamed: 0,c1,c2,c3,c4,c5
r1,1.0,2.0,15.0,4.0,5.0
r2,10.0,14.0,15.0,,5.0
r3,10.0,12.0,,,5.0


## Drop NaN using dropna()

If you simply want to exclude the missing values, use the dropna function along with the axis argument. By default, axis=0 (row-wise), which means that any row containing at least one NA value is dropped; with axis=1, any column containing at least one NA value is dropped instead.

In [42]:
# Drop every row that contains at least one NaN (the defaults, spelled out).
# Each row of p1 has a missing value, so the result is empty.
p1.dropna(axis=0, how='any')

Unnamed: 0,c1,c2,c3,c4,c5


In [43]:
p1

Unnamed: 0,c1,c2,c3,c4,c5
r1,1.0,2.0,,4.0,5.0
r2,,14.0,15.0,,
r3,10.0,12.0,,,5.0


In [44]:
# Drop every column that contains at least one NaN; only c2 is complete.
p1.dropna(axis=1, how='any')

Unnamed: 0,c2
r1,2.0
r2,14.0
r3,12.0


## Replace Missing (or) Generic Values
Many times, we have to replace a generic value with some specific value. We can achieve this by applying the replace method.

In [47]:
p1

Unnamed: 0,c1,c2,c3,c4,c5
r1,1.0,2.0,,4.0,5.0
r2,,14.0,15.0,,
r3,10.0,12.0,,,5.0


In [49]:
# Keep a filled copy of p1 (NaN -> 5) to demonstrate replace() on it.
p8 = p1.fillna(value=5)

In [50]:
p8

Unnamed: 0,c1,c2,c3,c4,c5
r1,1.0,2.0,5.0,4.0,5.0
r2,5.0,14.0,15.0,5.0,5.0
r3,10.0,12.0,5.0,5.0,5.0


##### replace all 5 with 1000

In [51]:
# Return a copy of p8 with every occurrence of 5.0 replaced by 1000.0.
p8.replace(to_replace=5.0, value=1000.0)

Unnamed: 0,c1,c2,c3,c4,c5
r1,1.0,2.0,1000.0,4.0,1000.0
r2,1000.0,14.0,15.0,1000.0,1000.0
r3,10.0,12.0,1000.0,1000.0,1000.0


# Note: these methods return a new object and leave the original unchanged; to modify the current object, pass inplace=True (or assign the result back)