In [1]:
import numpy as np
import pandas as pd

# Pandas utility functions

In [2]:
# NaN counts as a missing value, so pd.isnull returns True.
pd.isnull(np.nan)

True

In [3]:
# None is also treated as a missing value by pd.isnull.
pd.isnull(None)

True

In [4]:
# pd.isna gives the same result as pd.isnull: True for NaN.
pd.isna(np.nan)

True

In [5]:
# pd.isna also reports None as missing.
pd.isna(None)

True

In [6]:
# pd.notnull is the inverse check: None is missing, so the result is False.
pd.notnull(None)

False

In [7]:
# NaN is missing as well, so pd.notnull returns False.
pd.notnull(np.nan)

False

In [8]:
# An ordinary number is not missing, so pd.notnull returns True.
pd.notnull(3)

True

In [9]:
# Given a Series, the check is applied element-wise and a boolean Series is returned.
pd.isnull(pd.Series([1, np.nan, 7]))

0    False
1     True
2    False
dtype: bool

In [10]:
# Element-wise inverse of isnull: True where a value is present.
pd.notnull(pd.Series([1, np.nan, 7]))

0     True
1    False
2     True
dtype: bool

In [11]:
# Given a DataFrame, the result is a DataFrame of booleans with the same
# row and column labels; each column below has its NaN in a different row.
pd.isnull(pd.DataFrame({
    'Col A': [1, np.nan, 7],
    'Col B': [np.nan, 7, 1],
    'Col C': [1, 7, np.nan]
}))

Unnamed: 0,Col A,Col B,Col C
0,False,True,False
1,True,False,False
2,False,False,True


In [12]:
# Working with Series
# Sample Series with two missing values (at positions 3 and 4); it is reused
# by the cells that follow.
s = pd.Series([1,2,3, np.nan, np.nan, 4])

In [13]:
# Boolean mask: True where s holds a value, False where it is NaN.
pd.notnull(s)

0     True
1     True
2     True
3    False
4    False
5     True
dtype: bool

In [14]:
# Summing the boolean mask counts the non-missing values (each True counts as 1).
pd.notnull(s).sum()

4

In [15]:
# Boolean indexing with the mask keeps only the non-missing values;
# the original index labels are preserved.
s[s.notnull()]

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

In [16]:
# Drop null values
# dropna() returns a new Series without the NaN entries; s itself is not modified.
s.dropna()

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

In [17]:
# Working with DataFrames
# Sample frame reused by the cells that follow. Col D has no missing values,
# and the last row (index 3) is missing everything except Col D.
df = pd.DataFrame({
    'Col A': [1, np.nan, 30, np.nan],
    'Col B': [np.nan,2 , 1, np.nan],
    'Col C': [1, 9, 30, np.nan],
    'Col D': [1, 20, 30, 14]
})

In [18]:
# Display the frame to see where the missing values are.
df

Unnamed: 0,Col A,Col B,Col C,Col D
0,1.0,,1.0,1
1,,2.0,9.0,20
2,30.0,1.0,30.0,30
3,,,,14


In [22]:
# (number of rows, number of columns)
df.shape

(4, 4)

In [23]:
# info() prints the non-null count and dtype of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Col A   2 non-null      float64
 1   Col B   2 non-null      float64
 2   Col C   3 non-null      float64
 3   Col D   4 non-null      int64  
dtypes: float64(3), int64(1)
memory usage: 256.0 bytes


In [24]:
# Number of missing values in each column.
df.isnull().sum()

Col A    2
Col B    2
Col C    1
Col D    0
dtype: int64

In [25]:
# Drops any rows containing null values (the defaults are axis=0, how='any').
# A new frame is returned; df itself is not modified.
df.dropna()

Unnamed: 0,Col A,Col B,Col C,Col D
2,30.0,1.0,30.0,30


In [26]:
# Drops any column that contains at least one null value; only Col D is complete.
df.dropna(axis=1)

Unnamed: 0,Col D
0,1
1,20
2,30
3,14


In [27]:
# Drops only the rows in which every value is null (axis defaults to rows).
# No row qualifies here, since row 3 still has a value in Col D, so nothing is dropped.
df.dropna(how='all')

Unnamed: 0,Col A,Col B,Col C,Col D
0,1.0,,1.0,1
1,,2.0,9.0,20
2,30.0,1.0,30.0,30
3,,,,14


In [28]:
# Drops any row that has at least one null value (how='any' is the default behavior).
df.dropna(how='any')

Unnamed: 0,Col A,Col B,Col C,Col D
2,30.0,1.0,30.0,30


In [34]:
# Specify a threshold: keep only the rows that have at least 2 non-null values.
# Row 3 has a single non-null value (Col D), so it is dropped.
df.dropna(thresh=2)

Unnamed: 0,Col A,Col B,Col C,Col D
0,1.0,,1.0,1
1,,2.0,9.0,20
2,30.0,1.0,30.0,30


In [36]:
# The same threshold applied to columns: keep columns with at least 3 non-null values.
df.dropna(thresh=3, axis='columns')

Unnamed: 0,Col C,Col D
0,1.0,1
1,9.0,20
2,30.0,30
3,,14


In [38]:
# Filling null values
# Show the Series again; it still contains its two NaN entries.
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

In [39]:
# Fill every missing value with a constant; a new Series is returned.
s.fillna(0)

0    1.0
1    2.0
2    3.0
3    0.0
4    0.0
5    4.0
dtype: float64

In [40]:
# Fill with the mean of the existing values; mean() skips NaN, giving 2.5 here.
s.fillna(s.mean())

0    1.0
1    2.0
2    3.0
3    2.5
4    2.5
5    4.0
dtype: float64

In [41]:
# Forward fill: each NaN takes the last valid value that precedes it.
# Series.ffill() replaces fillna(method='ffill'); the `method` argument of
# fillna is deprecated in pandas 2.1+.
s.ffill()

0    1.0
1    2.0
2    3.0
3    3.0
4    3.0
5    4.0
dtype: float64

In [42]:
# Backward fill: each NaN takes the next valid value that follows it.
# Series.bfill() replaces fillna(method='bfill'); the `method` argument of
# fillna is deprecated in pandas 2.1+.
s.bfill()

0    1.0
1    2.0
2    3.0
3    4.0
4    4.0
5    4.0
dtype: float64

In [43]:
# Directional fills leave NaN values at the extremes of a Series/DataFrame:
# a leading NaN has no earlier value to copy, so forward fill keeps it as NaN.
# ffill() is used because fillna(method=...) is deprecated in pandas 2.1+.
pd.Series([np.nan, 3, np.nan, 9]).ffill()

0    NaN
1    3.0
2    3.0
3    9.0
dtype: float64

In [44]:
# Filling in DataFrames
# Show the frame again; it still has its original missing values.
df

Unnamed: 0,Col A,Col B,Col C,Col D
0,1.0,,1.0,1
1,,2.0,9.0,20
2,30.0,1.0,30.0,30
3,,,,14


In [45]:
# A dict maps column name -> fill value, so each column gets its own replacement.
# Columns not listed in the dict (Col D here) are not filled.
df.fillna({'Col A': 0, 'Col B': 99, 'Col C' : df['Col C'].mean()})

Unnamed: 0,Col A,Col B,Col C,Col D
0,1.0,99.0,1.0,1
1,0.0,2.0,9.0,20
2,30.0,1.0,30.0,30
3,0.0,99.0,13.333333,14


In [46]:
# Forward fill down each column (axis=0): a NaN takes the value from the row above.
# Col B's first value stays NaN because nothing precedes it.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument
# is deprecated in pandas 2.1+.
df.ffill(axis=0)

Unnamed: 0,Col A,Col B,Col C,Col D
0,1.0,,1.0,1
1,1.0,2.0,9.0,20
2,30.0,1.0,30.0,30
3,30.0,1.0,30.0,14


In [47]:
# Forward fill across each row (axis=1): a NaN takes the value from the column
# to its left. Col A has no column to its left, so its NaNs remain.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument
# is deprecated in pandas 2.1+.
df.ffill(axis=1)

Unnamed: 0,Col A,Col B,Col C,Col D
0,1.0,1.0,1.0,1.0
1,,2.0,9.0,20.0
2,30.0,1.0,30.0,30.0
3,,,,14.0
