# Hands on

In [1]:
import pandas as pd
import numpy as np


In [None]:
# A float NaN counts as a missing value.
pd.isnull(np.nan)

True

In [None]:
# Python's None is treated as missing too.
pd.isnull(None)

True

In [None]:
# An ordinary number is not missing.
pd.isnull(3)

False

In [None]:
# notnull is the boolean opposite of isnull: NaN is missing, so the answer is False.
pd.notnull(np.nan)

False

In [None]:
# None is missing, so notnull reports False.
pd.notnull(None)

False

In [None]:
# A regular value is "not null".
pd.notnull(4)

True

In [None]:
# pd.isnull also accepts a whole DataFrame and returns a same-shaped boolean mask.
pd.isnull(
    pd.DataFrame(
        {
            "Column 1": ["Tom", "Dick", "Harry"],
            "Column 2": [765, np.nan, 982],
        }
    )
)

Unnamed: 0,Column 1,Column 2
0,False,False
1,False,True
2,False,False


In [None]:
# pd.notnull on a DataFrame: both None and np.nan show up as False in the mask.
pd.notnull(
    pd.DataFrame(
        {
            "Column1": ["Tom", None, "Harry"],
            "Column2": [np.nan, 981, 764],
            "Column3": [876, np.nan, None],
        }
    )
)

Unnamed: 0,Column1,Column2,Column3
0,True,False,True
1,False,True,False
2,True,True,False


In [None]:
# Works element-wise on a Series as well; the trailing None is flagged as missing.
pd.isnull(pd.Series(["Tom", "Dick", None]))

0    False
1    False
2     True
dtype: bool

In [None]:
# In a numeric Series both np.nan and None are reported as missing (False here).
pd.notnull(pd.Series([543, np.nan, 67, None]))

0     True
1    False
2     True
3    False
dtype: bool

# Filtering missing data

In [15]:
# Sample Series with two missing entries (one np.nan, one None); reused by the cells below.
s = pd.Series(
    [1, 2, np.nan, 4, None, 6],
)

In [None]:
# Element-wise mask: True where s holds a missing value.
pd.isnull(s)

0    False
1    False
2     True
3    False
4     True
5    False
dtype: bool

In [None]:
# Complementary mask: True where s holds an actual value.
pd.notnull(s)

0     True
1     True
2    False
3     True
4    False
5     True
dtype: bool

In [None]:
# Method form of pd.isnull(s); it produces the same boolean mask.
s.isnull()

0    False
1    False
2     True
3    False
4     True
5    False
dtype: bool

In [None]:
# Method form of pd.notnull(s); it produces the same boolean mask.
s.notnull()

0     True
1     True
2    False
3     True
4    False
5     True
dtype: bool

In [None]:
# True counts as 1 when summed, so this is the number of missing entries.
s.isnull().sum()

2

In [None]:
# Same trick on the inverse mask: the number of non-missing entries.
s.notnull().sum()

4

In [None]:
# Boolean indexing with the mask keeps only the non-missing entries (original index labels are kept).
s[s.notnull()]

0    1.0
1    2.0
3    4.0
5    6.0
dtype: float64

In [None]:
# Mean computed over the non-missing entries only.
s[s.notnull()].mean()

3.25

# Dropping null values

In [None]:
# Show the Series again before dropping its missing values.
s

0    1.0
1    2.0
2    NaN
3    4.0
4    NaN
5    6.0
dtype: float64

In [None]:
# dropna returns a new Series without the missing entries; the result is not assigned, so s is unchanged.
s.dropna()

0    1.0
1    2.0
3    4.0
5    6.0
dtype: float64

## Dropping null values on DataFrames

In [2]:
# Sample frame for the DataFrame examples: Column1 has two NaNs, Column2 and
# Column3 have one each, and Column4 is complete.
df = pd.DataFrame(
    dict(
        Column1=[1, np.nan, 30, np.nan],
        Column2=[2, 8, 31, np.nan],
        Column3=[np.nan, 9, 32, 100],
        Column4=[5, 8, 34, 110],
    )
)

In [3]:
# (rows, columns) of the sample frame.
df.shape

(4, 4)

In [5]:
# Summary with the non-null count and dtype of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Column1  2 non-null      float64
 1   Column2  3 non-null      float64
 2   Column3  3 non-null      float64
 3   Column4  4 non-null      int64  
dtypes: float64(3), int64(1)
memory usage: 256.0 bytes


In [9]:
# Cell-by-cell mask: True wherever the frame holds a missing value.
df.isnull()

Unnamed: 0,Column1,Column2,Column3,Column4
0,False,False,True,False
1,True,False,False,False
2,False,False,False,False
3,True,True,False,False


In [8]:
# Summing the mask column by column counts the missing values in each column.
df.isnull().sum()

Column1    2
Column2    1
Column3    1
Column4    0
dtype: int64

In [11]:
# By default dropna removes every row that contains at least one missing value.
df.dropna()

Unnamed: 0,Column1,Column2,Column3,Column4
2,30.0,31.0,32.0,34


In [12]:
# axis=1 works on columns instead: any column containing a missing value is removed.
df.dropna(axis=1)

Unnamed: 0,Column4
0,5
1,8
2,34
3,110


In [13]:
 df.dropna(thresh=3)

Unnamed: 0,Column1,Column2,Column3,Column4
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34


In [14]:
# With axis=1 the threshold applies to columns: keep those with at least 3 non-null values.
df.dropna(thresh=3, axis=1)

Unnamed: 0,Column2,Column3,Column4
0,2.0,,5
1,8.0,9.0,8
2,31.0,32.0,34
3,,100.0,110


# Filling null values

Filling null values with arbitrary values.

In [16]:
# s still contains its NaNs here; the earlier dropna call did not modify it.
s

0    1.0
1    2.0
2    NaN
3    4.0
4    NaN
5    6.0
dtype: float64

In [18]:
# Replace every missing value with a constant.
s.fillna(0)

0    1.0
1    2.0
2    0.0
3    4.0
4    0.0
5    6.0
dtype: float64

In [19]:
# Replace missing values with the mean of the non-missing ones (mean skips NaN).
s.fillna(s.mean())

0    1.00
1    2.00
2    3.25
3    4.00
4    3.25
5    6.00
dtype: float64

Filling nulls with contiguous (close) values.

In [21]:
# Forward fill: each missing value takes the last valid value that precedes it.
# Series.ffill() is used because fillna(method='ffill') is deprecated (FutureWarning since pandas 2.1).
s.ffill()

0    1.0
1    2.0
2    2.0
3    4.0
4    4.0
5    6.0
dtype: float64

In [22]:
# Backward fill: each missing value takes the next valid value that follows it.
# Series.bfill() is used because fillna(method='bfill') is deprecated (FutureWarning since pandas 2.1).
s.bfill()

0    1.0
1    2.0
2    4.0
3    4.0
4    6.0
5    6.0
dtype: float64

In [23]:
# Forward fill down each column (axis=0): a NaN takes the value from the row above.
# A NaN in the first row has nothing above it and therefore stays NaN.
# DataFrame.ffill() is used because fillna(method=...) is deprecated (FutureWarning since pandas 2.1).
df.ffill(axis=0)

Unnamed: 0,Column1,Column2,Column3,Column4
0,1.0,2.0,,5
1,1.0,8.0,9.0,8
2,30.0,31.0,32.0,34
3,30.0,31.0,100.0,110


In [24]:
# Forward fill along each row (axis=1): a NaN takes the value from the column to its left.
# A NaN in the first column has nothing to its left and therefore stays NaN.
# DataFrame.ffill() is used because fillna(method=...) is deprecated (FutureWarning since pandas 2.1).
df.ffill(axis=1)

Unnamed: 0,Column1,Column2,Column3,Column4
0,1.0,2.0,2.0,5.0
1,,8.0,9.0,8.0
2,30.0,31.0,32.0,34.0
3,,,100.0,110.0


In [26]:
# Backward fill up each column (axis=0): a NaN takes the value from the row below.
# A NaN in the last row has nothing below it and therefore stays NaN.
# DataFrame.bfill() is used because fillna(method=...) is deprecated (FutureWarning since pandas 2.1).
df.bfill(axis=0)

Unnamed: 0,Column1,Column2,Column3,Column4
0,1.0,2.0,9.0,5
1,30.0,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [27]:
# Backward fill along each row (axis=1): a NaN takes the value from the column to its right.
# Column4 has no missing values, so every NaN in this frame gets filled.
# DataFrame.bfill() is used because fillna(method=...) is deprecated (FutureWarning since pandas 2.1).
df.bfill(axis=1)

Unnamed: 0,Column1,Column2,Column3,Column4
0,1.0,2.0,5.0,5.0
1,8.0,8.0,9.0,8.0
2,30.0,31.0,32.0,34.0
3,100.0,100.0,100.0,110.0
