### Notes on working with `NaN` in `pandas`
File: `pd_04_nan.ipynb` <br>
Xuhua Huang <br>
Last updated: August 21, 2022 <br>
Created on: August 21, 2022

In [1]:
import numpy as np
import pandas as pd

#### Methods `isnull()` and `isna()`

In [2]:
# Build a small Series of strings that contains exactly one missing value (np.nan).
words = ['aardvark', 'artichoke', np.nan, 'avocado']
string_data: pd.Series = pd.Series(words)
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [3]:
# isnull() gives an element-wise boolean Series: True where the value is missing
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [4]:
# isna() yields the same boolean mask as isnull() above
string_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
# the mask returned by isna() is itself a pd.Series
isinstance(string_data.isna(), pd.Series)

True

In [6]:
# count how many entries are present (False) vs. missing (True)
string_data.isna().value_counts()

False    3
True     1
dtype: int64

In [7]:
# Series.all() reduces the boolean mask with a logical AND:
# the result is True only when every element is True,
# i.e. here it answers "is every entry missing?"
string_data.isna().all()

False

In [8]:
# Series.any() reduces the boolean mask with a logical OR:
# the result is True as soon as at least one element is True,
# i.e. here it answers "is any entry missing?"
string_data.isna().any()

True

In [9]:
# sanity check: string_data itself is a pd.Series
isinstance(string_data, pd.Series)

True

#### Filtering out `np.nan` with `dropna()`

In [10]:
from numpy import nan as NA

In [11]:
# NA is only an alias for np.nan (see the import above), so both spellings
# below insert the same missing-value marker into the Series.
values = [1, NA, 3.5, np.nan, 7]
_data: pd.Series = pd.Series(values)
_data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [12]:
# boolean mask: True at positions 1 and 3, where _data holds NaN
_data.isna()

0    False
1     True
2    False
3     True
4    False
dtype: bool

In [13]:
# dropna() returns a new Series without the NaN entries;
# boolean-mask indexing gives the same result:
#     _data[_data.notnull()]
_data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [14]:
# notnull() does not return index labels: it returns a new boolean Series
# (True where the element is not NaN, False where it is).
# Passing that mask to the subscript operator [] keeps only the True positions:
#     series[boolean_mask]  ->  the non-NaN elements
type(_data.notnull())

pandas.core.series.Series

#### More on `dropna()`: parameters `how` and `axis`

In [15]:
# Rebind _data to a 4x3 DataFrame that mixes complete, partially missing and
# fully missing rows; it is used below to demonstrate dropna(how=..., axis=...).
# NOTE: this replaces the Series previously stored under the same name, so the
# annotation is pd.DataFrame (not pd.Series).
_data: pd.DataFrame = pd.DataFrame(
    [
        [1., 6.5, 3.],   # row 0: no NaN
        [1., NA, NA],    # row 1: some NaN
        [NA, NA, NA],    # row 2: all NaN
        [NA, 6.5, 3.],   # row 3: some NaN
    ]
)

In [16]:
# Parameters passed to dropna():
#     how:
#         'any' removes a row/column if ANY of its elements is NaN (OR operator)
#         'all' removes a row/column only if ALL of its elements are NaN (AND operator)
#     axis:
#         0 is the default value, the operation is performed on rows
#         1 performs the operation on columns
#     inplace:
#         False returns the filtered DataFrame to the caller, _data is untouched
#         True modifies _data itself and returns None
# axis=0 and inplace=False are the defaults, so the call below is equivalent to
# the shorter _data.dropna(how='any').
cleaned_data_any: pd.DataFrame = _data.dropna(how='any', axis=0, inplace=False)

In [17]:
# _data is unchanged: dropna() was called with inplace=False
_data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [18]:
# how='any' keeps only row 0, the single row without any NaN
cleaned_data_any

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [19]:
# Only `how` is given here; the two omitted parameters keep their defaults:
#     axis=0
#     inplace=False
cleaned_data_all: pd.DataFrame = _data.dropna(how='all')

In [20]:
# how='all' drops only row 2, the one row in which every value is NaN
cleaned_data_all

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [21]:
# the result of dropna() on a DataFrame is again a DataFrame
type(cleaned_data_all)

pandas.core.frame.DataFrame

In [22]:
print(type(cleaned_data_any))
# == compares the two type objects for equality
print(type(cleaned_data_any) == type(cleaned_data_all))
# `is` checks identity: both type() calls return the very same class object
# (pandas.core.frame.DataFrame), not merely two equal ones
print(type(cleaned_data_any) is type(cleaned_data_all))

<class 'pandas.core.frame.DataFrame'>
True
True
