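# Missing data conventions
   1. mask 2. sentinel value

**mask** : a separate Boolean array (or a reserved bit in the data representation) that globally indicates which values are missing
**sentinel** : a special value stored in the data itself (for example `None` or `NaN`) that indicates a missing entry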

In [2]:
# None: a Python object used here as the sentinel value for missing data.
# Because None is a Python object, NumPy accepts it without raising an error,
# but it has to store the whole array with dtype=object.
import numpy as np
import pandas as pd
vals1 = np.array([1, None, 3, 4])
vals1

array([1, None, 3, 4], dtype=object)

In [3]:
# Aggregation functions (sum(), mean(), min(), max(), ...) raise an error on this
# object array, because Python cannot add an int and None.
# The TypeError is the point of this cell, so it is caught and displayed instead
# of being left to abort a "Restart Kernel -> Run All".
try:
    vals1.sum()
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [7]:
# NaN: missing numerical data.
# NaN is a special floating-point value, so NumPy can keep the array as a native
# float64 array instead of falling back to dtype=object.
import numpy as np
import pandas as pd
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
vals1 = np.array([1, np.nan, 3, 4])
vals1.dtype

dtype('float64')

In [8]:
# Arithmetic and aggregation functions (sum(), mean(), min(), max(), ...) do not
# raise an error with NaN, but the result is not useful because it is always NaN.
1+np.nan

nan

In [10]:
# Multiplication propagates NaN in the same way as addition.
5*np.nan

nan

In [13]:
# Regular aggregation functions (sum(), mean(), min(), max()) propagate NaN:
# a single NaN in the array makes the whole result NaN.
vals1.sum()


nan

In [14]:
# max() also returns NaN when the array contains a NaN.
vals1.max()

nan

In [15]:

# min() also returns NaN when the array contains a NaN.
vals1.min()

nan

In [16]:
# NumPy provides NaN-aware versions of the aggregation functions
# (nansum, nanmax, nanmin, ...) that ignore the NaN entries.
np.nansum(vals1)

8.0

In [17]:
# NaN-aware maximum: the NaN entry is ignored.
np.nanmax(vals1)

4.0

In [18]:
# NaN-aware minimum: the NaN entry is ignored.
np.nanmin(vals1)

1.0

# NaN and None in pandas


In [19]:

# pandas treats None and NaN interchangeably in numeric data: None is converted
# to NaN and the Series is stored as float64.
pd.Series([1, np.nan, 2, None])

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [24]:
# Start from a Series with an integer dtype that contains no missing values.
x=pd.Series(range(5),dtype=int)
x

0    0
1    1
2    2
3    3
4    4
dtype: int32

In [25]:
# Assigning None to an element of the integer Series stores NaN and upcasts the
# whole Series to float64, because an integer dtype cannot represent NaN.
# NOTE(review): newer pandas releases may warn about this implicit upcast -
# confirm against the installed version.
x[0]=None
x

0    NaN
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64

# Operation on NULL values

In [26]:
# Several useful methods on pandas data structures:
# 1. isnull() : generates a boolean mask indicating missing values
# 2. notnull() : the opposite of isnull()
# 3. dropna() : returns a filtered version of the data
# 4. fillna() : returns a copy of the data with missing values filled or imputed

In [3]:
import pandas as pd
import numpy as np
# Example Series mixing numbers, a string, NaN and None; because of the mixed
# types it is stored with dtype=object and None is kept as-is.
data=pd.Series([1,np.nan,'sachin',10,None])
data

0         1
1       NaN
2    sachin
3        10
4      None
dtype: object

In [4]:
# Boolean mask that is True where the entry is missing (both NaN and None count).
data.isnull()

0    False
1     True
2    False
3    False
4     True
dtype: bool

In [5]:
# The boolean mask can be used directly as an index to keep only the
# non-missing entries.
data[data.notnull()]

0         1
2    sachin
3        10
dtype: object

In [6]:
# Inverse mask: True where the entry is present.
data.notnull()

0     True
1    False
2     True
3     True
4    False
dtype: bool

In [7]:
# Dropping null values

In [8]:
data.dropna() # returns the Series without its missing entries (NaN and None)

0         1
2    sachin
3        10
dtype: object

In [10]:
# 3x3 example frame with one NaN in row 0 (column 1) and one in row 2 (column 2).
df=pd.DataFrame([[1,np.nan,2],[2,3,6],[3,4,np.nan]])
df

Unnamed: 0,0,1,2
0,1,,2.0
1,2,3.0,6.0
2,3,4.0,


In [11]:
# By default dropna() works on rows: columns are never removed here.
df.dropna() # drops every row that contains at least one NaN value

Unnamed: 0,0,1,2
1,2,3.0,6.0


In [12]:
df.dropna(axis='columns') # drops every column that contains at least one NaN value

Unnamed: 0,0
0,1
1,2
2,3


In [13]:
# Add a column labelled 3 that contains only NaN, to demonstrate how='all' and
# thresh in the cells below.
df[3]=np.nan
df

Unnamed: 0,0,1,2,3
0,1,,2.0,
1,2,3.0,6.0,
2,3,4.0,,


In [14]:
df.dropna(axis='columns',how='all') # how='all' drops only the rows/columns whose values are all null


Unnamed: 0,0,1,2
0,1,,2.0
1,2,3.0,6.0
2,3,4.0,


In [15]:
# For finer-grained control, the thresh parameter lets you specify a minimum
# number of non-null values a row/column must have in order to be kept.
# Here only row 1 has at least 3 non-null values.
df.dropna(axis='rows',thresh=3)

Unnamed: 0,0,1,2,3
1,2,3.0,6.0,


In [16]:
# Filling null values: recreate the example Series containing NaN and None.
data=pd.Series([1,np.nan,'sachin',10,None])
data

0         1
1       NaN
2    sachin
3        10
4      None
dtype: object

In [18]:
data.fillna(0) # fill every missing value (NaN and None) with 0

0         1
1         0
2    sachin
3        10
4         0
dtype: object

In [19]:
# Forward fill: propagate the previous valid value forward into each gap.
# Series.ffill() replaces fillna(method='ffill'); the `method` argument of
# fillna() is deprecated since pandas 2.1.
data.ffill()

0         1
1         1
2    sachin
3        10
4        10
dtype: object

In [20]:
# Backward fill: propagate the next valid value backward into each gap.
# The last entry stays missing because no value follows it.
# Series.bfill() replaces fillna(method='bfill'); the `method` argument of
# fillna() is deprecated since pandas 2.1.
data.bfill()

0         1
1    sachin
2    sachin
3        10
4      None
dtype: object

In [25]:
# Rebuild the example frame, this time with the named columns a, b and c.
df=pd.DataFrame([[1,np.nan,2],[2,3,6],[3,4,np.nan]],columns=list('abc'))
df

Unnamed: 0,a,b,c
0,1,,2.0
1,2,3.0,6.0
2,3,4.0,


In [29]:
# No column of this frame is entirely NaN, so how='all' keeps a, b and c.
df.dropna(axis='columns',how='all')


Unnamed: 0,a,b,c
0,1,,2.0
1,2,3.0,6.0
2,3,4.0,


In [31]:
# Add an all-NaN column labelled 3 so that the row-wise fill in the following
# cell has a trailing gap to fill. Without this assignment the frame only has
# the columns a, b and c on a fresh kernel.
df[3] = np.nan
df

Unnamed: 0,a,b,c,3
0,1,,2.0,
1,2,3.0,6.0,
2,3,4.0,,


In [34]:
# Forward fill along axis=1: within each row, every NaN takes the value of the
# nearest valid column to its left.
# DataFrame.ffill() replaces fillna(method='ffill'); the `method` argument of
# fillna() is deprecated since pandas 2.1.
df.ffill(axis=1)

Unnamed: 0,a,b,c,3
0,1.0,1.0,2.0,2.0
1,2.0,3.0,6.0,6.0
2,3.0,4.0,4.0,4.0
