# Missing Data: Handling Missing Values

In [None]:
"""
                    HANDLING MISSING VALUES
--> In Python, the 'None' object is used to indicate undefined or missing values in a sequence.
--> IEEE standard for floats also has a concept of an undefined float.
    --> NAN or NaN (Not a Number)
    --> float('nan')
    --> math.nan
    --> np.nan
    --> pd.NA (pandas' own missing-value marker; unlike the entries above it is not a float NaN)
        
                EQUALITY OF NAN: np.isnan()
--> Two NaN values never compare equal, i.e. NaN == NaN always returns False
    --> This is because we cannot compare two undefined values. There is no way to know whether or not they are equal.
        a = math.nan
        b = math.nan 
        a == b returns False
        a is b returns True (both names refer to the same math.nan object), whereas float('nan') is float('nan') returns False
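--> To test for NaN we therefore use isnan() instead of ==:
    --> math.isnan(np.nan) returns True
    --> np.isnan(np.nan) returns True; np.isnan() also works element-wise on arrays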
--> In a numeric Pandas Series, None is converted to NaN, e.g.
    --> pd.Series([1, 2, None, np.nan])
    --> NaN in Pandas is defined as a float, so the resulting Series has dtype float64
--> If the Series has object dtype, for example a Series of strings:
    --> e.g. pd.Series(['a', 'b', None, np.nan]) returns an object-dtype Series. Here, None is not converted to NaN, so we test for missing values in Pandas using .isnull(), which detects both None and NaN.
    
                    TESTING FOR MISSING DATA: df.isnull(), df.isna()
--> pd.isnull() (alias: pd.isna()) is a general-purpose function that operates on Series and DataFrames
    --> It returns True where the value is None or NaN.
--> pd.notnull() (alias: pd.notna()) is the inverse of isnull(): it returns True where the value is not missing.

                    REPLACING SERIES WITH MISSING DATA: df.fillna(value)
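--> We could use loops to iterate over the data and replace missing values, but pandas provides built-in methods for this
--> .fillna(value) replaces every missing value with the given value
--> some_series.fillna(method=...)   (NOTE: the `method` argument is deprecated since pandas 2.1; prefer some_series.ffill() / some_series.bfill())
    --> method = 'ffill' means forward fill: each missing value takes the last valid value before it
                null, 1, null, 2, null, 3, null: 1 fills the second null, 2 fills the third null and 3 fills the last null; the first null stays missing because no value precedes it
                e.g df.fillna(method='ffill', axis=0)   (fills down each column)
                    df.fillna(method='ffill', axis=1)   (fills across each row)
    --> method = 'bfill' means backward fill: each missing value takes the next valid value after it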
    
                    INTERPOLATING MISSING DATA
--> More advanced techniques are:
    --> linear interpolation
    --> splines, etc.
    
                    DROPPING MISSING DATA
--> Firstly, we need to decide whether to drop rows or columns. We can't just drop the individual values themselves.
    --> df.dropna(axis=0) drops every row that contains an NA value; axis=0 is the default if axis is not specified.
    --> df.dropna(axis=1) drops every column that contains an NA value.
    
"""

In [2]:
import pandas as pd
import numpy as np

In [17]:
ser = pd.Series(['a', 'b', None, np.nan])
print(ser)

0       a
1       b
2    None
3     NaN
dtype: object


In [21]:
ser.isnull()
print(ser.isna())

0    False
1    False
2     True
3     True
dtype: bool


In [None]:
# Small 2x5 demo frame: the integers 0..9 laid out row by row,
# with rows labelled r1/r2 and columns labelled A..E.
df = pd.DataFrame(
    np.arange(10).reshape(2, 5),
    index=['r1', 'r2'],
    columns=list('ABCDE'),
)
print(df)

In [24]:
print(ser.isnull() == ser.isna())

0    True
1    True
2    True
3    True
dtype: bool


In [1]:
float(1)

1.0

In [3]:
float('inf')

inf

In [4]:
type(float('inf'))

float

In [5]:
import math

In [6]:
math.inf

inf

In [7]:
np.inf

inf

In [163]:
np.inf

In [11]:
float('Nan'), float('nan'), float('NAN'), float('naN')

(nan, nan, nan, nan)

In [14]:
math.nan, np.nan

(nan, nan)

In [16]:
float('nan') == float('nan') # will return False

False

In [20]:
float('nan') is float('nan')

False

In [21]:
a = math.nan
b = np.nan
a == b

False

In [22]:
math.isnan(a)

True

In [23]:
np.isnan(a)

True

In [24]:
a = np.array([1, 2, np.nan, 3, math.nan])

In [28]:
np.isnan(a)

array([False, False,  True, False,  True])

In [34]:
s = pd.Series([3.14, 2.5, None, 5])
print(s)

0    3.14
1    2.50
2     NaN
3    5.00
dtype: float64


In [35]:
s.loc[2]

nan

In [33]:
type(s.loc[2])

numpy.float64

In [36]:
s.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [39]:
print(pd.Series([1, 2, 3, None]))

0    1.0
1    2.0
2    3.0
3    NaN
dtype: float64


In [41]:
s = pd.Series(['a', 'b', None, np.nan])
print(s)

0       a
1       b
2    None
3     NaN
dtype: object


In [43]:
print(s.isna())

0    False
1    False
2     True
3     True
dtype: bool


In [44]:
s[2] is None

True

In [45]:
s[3] is None

False

In [46]:
try:
    math.isnan(s.loc[2])
except TypeError as ex:
    print(f'Typerror: {ex}')

Typerror: must be real number, not NoneType


In [47]:
math.isnan(s.loc[3])

True

In [49]:
s.loc[s.isna()]

2    None
3     NaN
dtype: object

In [54]:
s = pd.Series(['aaa', 'bbb', None, 'ddd', np.nan],
              index=list('abcde'))
print(s)

a     aaa
b     bbb
c    None
d     ddd
e     NaN
dtype: object


In [55]:
pd.isnull(s)

a    False
b    False
c     True
d    False
e     True
dtype: bool

In [58]:
s[pd.isnull(s)]

c    None
e     NaN
dtype: object

In [59]:
s[~pd.isnull(s)]

a    aaa
b    bbb
d    ddd
dtype: object

In [60]:
pd.notnull(s)

a     True
b     True
c    False
d     True
e    False
dtype: bool

In [61]:
s[pd.notnull(s)]

a    aaa
b    bbb
d    ddd
dtype: object

In [62]:
s.dropna()

a    aaa
b    bbb
d    ddd
dtype: object

In [63]:
print(s)

a     aaa
b     bbb
c    None
d     ddd
e     NaN
dtype: object


In [65]:
print(s.fillna('missing'))

a        aaa
b        bbb
c    missing
d        ddd
e    missing
dtype: object


In [68]:
# Forward fill along the index: each missing entry takes the last valid value
# before it. Series.ffill() replaces the deprecated fillna(method='ffill').
s.ffill(axis=0)

a    aaa
b    bbb
c    bbb
d    ddd
e    ddd
dtype: object

In [69]:
# Forward fill with the default axis; Series.ffill() replaces the deprecated
# fillna(method='ffill') and produces the same result.
s.ffill()

a    aaa
b    bbb
c    bbb
d    ddd
e    ddd
dtype: object

In [70]:
# Backward fill: each missing entry takes the next valid value after it, so a
# trailing missing value stays missing. Series.bfill() replaces the deprecated
# fillna(method='bfill').
s.bfill()

a    aaa
b    bbb
c    ddd
d    ddd
e    NaN
dtype: object

In [71]:
# Backward fill first, then forward fill whatever is still missing at the end.
# bfill()/ffill() replace the deprecated fillna(method=...) spelling.
s.bfill().ffill()

a    aaa
b    bbb
c    ddd
d    ddd
e    ddd
dtype: object

In [74]:
s = pd.Series([1, 2, None, 4, None, 7])
print(s)

0    1.0
1    2.0
2    NaN
3    4.0
4    NaN
5    7.0
dtype: float64


In [76]:
print(s.interpolate(method='linear'))

0    1.0
1    2.0
2    3.0
3    4.0
4    5.5
5    7.0
dtype: float64


In [83]:
# Column-oriented sample data: each outer key is a column name and each inner
# mapping goes from row label to value. None marks a missing entry
# (two in col2, one in col3).
d = {
    'col1': dict(row1=1, row2=10, row3=100, row4=1000, row5=10000),
    'col2': dict(row1=2, row2=None, row3=None, row4=2000, row5=20000),
    'col3': dict(row1=3, row2=30, row3=300, row4=None, row5=40000),
    'col4': dict(row1=4, row2=40, row3=400, row4=4000, row5=40000),
}
# The inner keys become the index and the outer keys the columns.
df = pd.DataFrame(d)
print(df)

       col1     col2     col3   col4
row1      1      2.0      3.0      4
row2     10      NaN     30.0     40
row3    100      NaN    300.0    400
row4   1000   2000.0      NaN   4000
row5  10000  20000.0  40000.0  40000


In [84]:
df.isna()

Unnamed: 0,col1,col2,col3,col4
row1,False,False,False,False
row2,False,True,False,False
row3,False,True,False,False
row4,False,False,True,False
row5,False,False,False,False


In [85]:
df.fillna(0)

Unnamed: 0,col1,col2,col3,col4
row1,1,2.0,3.0,4
row2,10,0.0,30.0,40
row3,100,0.0,300.0,400
row4,1000,2000.0,0.0,4000
row5,10000,20000.0,40000.0,40000


In [88]:
# Show the original frame, then the forward-filled copy (fills down each
# column by default). DataFrame.ffill() replaces the deprecated
# fillna(method='ffill').
print(df)
print(df.ffill())

       col1     col2     col3   col4
row1      1      2.0      3.0      4
row2     10      NaN     30.0     40
row3    100      NaN    300.0    400
row4   1000   2000.0      NaN   4000
row5  10000  20000.0  40000.0  40000
       col1     col2     col3   col4
row1      1      2.0      3.0      4
row2     10      2.0     30.0     40
row3    100      2.0    300.0    400
row4   1000   2000.0    300.0   4000
row5  10000  20000.0  40000.0  40000


In [89]:
# Show the original frame, then forward fill explicitly along axis=0 (down
# each column). DataFrame.ffill() replaces the deprecated
# fillna(method='ffill').
print(df)
print(df.ffill(axis=0))

       col1     col2     col3   col4
row1      1      2.0      3.0      4
row2     10      NaN     30.0     40
row3    100      NaN    300.0    400
row4   1000   2000.0      NaN   4000
row5  10000  20000.0  40000.0  40000
       col1     col2     col3   col4
row1      1      2.0      3.0      4
row2     10      2.0     30.0     40
row3    100      2.0    300.0    400
row4   1000   2000.0    300.0   4000
row5  10000  20000.0  40000.0  40000


In [90]:
# Show the original frame, then forward fill along axis=1 (across each row,
# left to right). DataFrame.ffill() replaces the deprecated
# fillna(method='ffill').
print(df)
print(df.ffill(axis=1))

       col1     col2     col3   col4
row1      1      2.0      3.0      4
row2     10      NaN     30.0     40
row3    100      NaN    300.0    400
row4   1000   2000.0      NaN   4000
row5  10000  20000.0  40000.0  40000
         col1     col2     col3     col4
row1      1.0      2.0      3.0      4.0
row2     10.0     10.0     30.0     40.0
row3    100.0    100.0    300.0    400.0
row4   1000.0   2000.0   2000.0   4000.0
row5  10000.0  20000.0  40000.0  40000.0


In [92]:
df.interpolate(method='linear') # axis=0 by default

Unnamed: 0,col1,col2,col3,col4
row1,1,2.0,3.0,4
row2,10,668.0,30.0,40
row3,100,1334.0,300.0,400
row4,1000,2000.0,20150.0,4000
row5,10000,20000.0,40000.0,40000


In [93]:
print(df)
df.interpolate(method='linear', axis=0)

       col1     col2     col3   col4
row1      1      2.0      3.0      4
row2     10      NaN     30.0     40
row3    100      NaN    300.0    400
row4   1000   2000.0      NaN   4000
row5  10000  20000.0  40000.0  40000


Unnamed: 0,col1,col2,col3,col4
row1,1,2.0,3.0,4
row2,10,668.0,30.0,40
row3,100,1334.0,300.0,400
row4,1000,2000.0,20150.0,4000
row5,10000,20000.0,40000.0,40000


In [94]:
print(df)
df.interpolate(method='linear', axis=1)

       col1     col2     col3   col4
row1      1      2.0      3.0      4
row2     10      NaN     30.0     40
row3    100      NaN    300.0    400
row4   1000   2000.0      NaN   4000
row5  10000  20000.0  40000.0  40000


Unnamed: 0,col1,col2,col3,col4
row1,1.0,2.0,3.0,4.0
row2,10.0,20.0,30.0,40.0
row3,100.0,200.0,300.0,400.0
row4,1000.0,2000.0,3000.0,4000.0
row5,10000.0,20000.0,40000.0,40000.0


In [95]:
df.dropna() #axis=0 by default

Unnamed: 0,col1,col2,col3,col4
row1,1,2.0,3.0,4
row5,10000,20000.0,40000.0,40000


In [96]:
df.dropna(axis=1)

Unnamed: 0,col1,col4
row1,1,4
row2,10,40
row3,100,400
row4,1000,4000
row5,10000,40000
