In [1]:
%%html
<h2>Handling Missing Data (Null Values)</h2>
<h3>isnull(), isna(), notnull(), dropna(), fillna()</h3>

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Small demo dataset: A and D contain a few missing values, B is entirely
# missing, and C is complete. Both None and np.nan are used on purpose.
data_dic = dict(
    A=[1, 2, None, 4, np.nan],
    B=[np.nan] * 5,
    C=list(range(11, 16)),
    D=[16, np.nan, 18, 19, 20],
)

df = pd.DataFrame(data=data_dic)
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [4]:
# Per-column overview: non-null count and dtype of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       3 non-null      float64
 1   B       0 non-null      float64
 2   C       5 non-null      int64  
 3   D       4 non-null      float64
dtypes: float64(3), int64(1)
memory usage: 292.0 bytes


In [5]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [6]:
# Boolean mask with the same shape as df: True wherever a value is missing.
df.isnull()

Unnamed: 0,A,B,C,D
0,False,True,False,False
1,False,True,False,True
2,True,True,False,False
3,False,True,False,False
4,True,True,False,False


In [7]:
df.isnull().sum() # number of missing values in each column

A    2
B    5
C    0
D    1
dtype: int64

In [8]:
df.isnull().sum().sum() # total number of missing values in the whole DataFrame

np.int64(8)

In [9]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [10]:
# Missing-value mask for the single column A.
df['A'].isnull()

0    False
1    False
2     True
3    False
4     True
Name: A, dtype: bool

In [11]:
# Number of missing values in column A.
df['A'].isnull().sum()

np.int64(2)

In [12]:
# Number of missing values in column A.
# On a single column the first sum() already yields a scalar, so the extra
# sum() that is needed for a whole DataFrame is unnecessary here.
df['A'].isnull().sum()

np.int64(2)

In [13]:
# isna() produces the same missing-value mask as isnull().
df.isna()

Unnamed: 0,A,B,C,D
0,False,True,False,False
1,False,True,False,True
2,True,True,False,False
3,False,True,False,False
4,True,True,False,False


In [14]:
# Missing values per column, this time via isna().
df.isna().sum()

A    2
B    5
C    0
D    1
dtype: int64

In [15]:
# Total number of missing values in the DataFrame, via isna().
df.isna().sum().sum()

np.int64(8)

In [16]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [17]:
df.loc[1] # row with index label 1, returned as a Series (values shown as floats)

A     2.0
B     NaN
C    12.0
D     NaN
Name: 1, dtype: float64

In [18]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [19]:
df.loc[0] # row with index label 0, returned as a Series

A     1.0
B     NaN
C    11.0
D    16.0
Name: 0, dtype: float64

In [20]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [21]:
# Missing-value mask for the row labelled 1 (one entry per column).
df.loc[1].isna()

A    False
B     True
C    False
D     True
Name: 1, dtype: bool

In [22]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [23]:
# Number of missing values in the row labelled 0.
df.loc[0].isna().sum()

np.int64(1)

In [24]:
# .loc[1] selects the row whose index label is 1;
# isna().sum() then counts the missing values in that row.
df.loc[1].isna().sum()

np.int64(2)

In [25]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [26]:
# Inverse mask: True wherever a value is present (not missing).
df.notna()

Unnamed: 0,A,B,C,D
0,True,False,True,True
1,True,False,True,False
2,False,False,True,True
3,True,False,True,True
4,False,False,True,True


In [27]:
# Number of non-missing values in each column.
df.notna().sum()

A    3
B    0
C    5
D    4
dtype: int64

In [28]:
# notnull() gives the same per-column counts as notna().
df.notnull().sum()

A    3
B    0
C    5
D    4
dtype: int64

In [29]:
# Total number of non-missing values in the whole DataFrame.
df.notnull().sum().sum()

np.int64(12)

In [30]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [31]:
# Sum of column A; the missing entries are skipped (1 + 2 + 4 = 7).
df['A'].sum()

np.float64(7.0)

In [32]:
# Sum of column B.
# Every value in column B is NaN, and sum() skips NaN values by default,
# so there is nothing left to add up.
# The sum of no values is 0, and because the column is float64
# the result is reported as the float 0.0 (not NaN).
df['B'].sum()

np.float64(0.0)

In [33]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [34]:
# Column C has no missing values and holds integers, so the sum is an integer.
df['C'].sum()

np.int64(65)

In [35]:
# Column A again for comparison: a float result, with the missing entries skipped.
df['A'].sum()

np.float64(7.0)

In [36]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [37]:
# Mean of column A over its non-missing values only: (1 + 2 + 4) / 3.
df['A'].mean() 

np.float64(2.3333333333333335)

In [38]:
# Sum of the all-NaN column B: 0.0, because the NaN values are skipped.
df['B'].sum()

np.float64(0.0)

In [39]:
# print() shows the plain value (0.0) instead of the np.float64(...) repr.
print(df['B'].sum())

0.0


In [40]:
# Unlike the sum, the mean of the all-NaN column B is NaN: there are no values to average.
df['B'].mean()

nan

In [41]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [42]:
# Row-wise sum for the row labelled 3; the NaN in B is skipped (4 + 14 + 19 = 37).
df.loc[3].sum()

np.float64(37.0)

In [43]:
# Mean of column A over its three non-missing values.
df['A'].mean()

np.float64(2.3333333333333335)

In [44]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [46]:
# Drop every row that contains at least one null value.
# axis=0 means rows are the unit being dropped.
# Column B is null in every row, so no rows survive and the result is empty.
# dropna() returns a new DataFrame; df itself is left unchanged.
df.dropna(axis=0)

Unnamed: 0,A,B,C,D


In [47]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [48]:
# Drop every column that contains at least one null value.
# axis=1 means columns are the unit being dropped; only C has no nulls.
# dropna() returns a new DataFrame; df itself is left unchanged.
df.dropna(axis=1)

Unnamed: 0,C
0,11
1,12
2,13
3,14
4,15


In [49]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [50]:
## Drop columns containing nulls and modify df itself (inplace=True).
## Left commented out -- presumably so the cells below keep using the original data.

# df.dropna(axis=1, inplace=True)
# df

In [53]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [54]:
# Keep only the columns that have at least 3 non-missing values;
# B (0 non-missing values) is dropped.
df.dropna(thresh=3, axis=1)

Unnamed: 0,A,C,D
0,1.0,11,16.0
1,2.0,12,
2,,13,18.0
3,4.0,14,19.0
4,,15,20.0


In [55]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [56]:
# thresh=4: a column needs at least 4 non-missing values, so A (3) and B (0) are dropped.
df.dropna(thresh=4, axis=1)

Unnamed: 0,C,D
0,11,16.0
1,12,
2,13,18.0
3,14,19.0
4,15,20.0


In [57]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [58]:
# thresh=5: only the fully populated column C is kept.
df.dropna(thresh=5, axis=1)

Unnamed: 0,C
0,11
1,12
2,13
3,14
4,15


In [59]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [60]:
# thresh=1: a single non-missing value is enough, so only the all-NaN column B is dropped.
df.dropna(thresh=1, axis=1)

Unnamed: 0,A,C,D
0,1.0,11,16.0
1,2.0,12,
2,,13,18.0
3,4.0,14,19.0
4,,15,20.0


In [61]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [62]:
# thresh=2: same result as thresh=1, since A, C and D all have at least 2 non-missing values.
df.dropna(thresh=2, axis=1)

Unnamed: 0,A,C,D
0,1.0,11,16.0
1,2.0,12,
2,,13,18.0
3,4.0,14,19.0
4,,15,20.0


In [63]:
# thresh=3: A has exactly 3 non-missing values, so it is still kept.
df.dropna(thresh=3, axis=1)

Unnamed: 0,A,C,D
0,1.0,11,16.0
1,2.0,12,
2,,13,18.0
3,4.0,14,19.0
4,,15,20.0


In [64]:
# thresh=4: A now falls below the threshold; C and D remain.
df.dropna(thresh=4, axis=1)

Unnamed: 0,C,D
0,11,16.0
1,12,
2,13,18.0
3,14,19.0
4,15,20.0


In [65]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [66]:
# thresh=5: every value must be present, which leaves only column C.
df.dropna(thresh=5, axis=1)

Unnamed: 0,C
0,11
1,12
2,13
3,14
4,15


In [67]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [68]:
# Replace every missing value with the constant 2; returns a new DataFrame.
df.fillna(2)

Unnamed: 0,A,B,C,D
0,1.0,2.0,11,16.0
1,2.0,2.0,12,2.0
2,2.0,2.0,13,18.0
3,4.0,2.0,14,19.0
4,2.0,2.0,15,20.0


In [69]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [70]:
# Same fill as before, passing the constant through the `value` keyword.
df.fillna(value=2)

Unnamed: 0,A,B,C,D
0,1.0,2.0,11,16.0
1,2.0,2.0,12,2.0
2,2.0,2.0,13,18.0
3,4.0,2.0,14,19.0
4,2.0,2.0,15,20.0


In [71]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [72]:
# Replace every missing value with 0.
df.fillna(value=0)

Unnamed: 0,A,B,C,D
0,1.0,0.0,11,16.0
1,2.0,0.0,12,0.0
2,0.0,0.0,13,18.0
3,4.0,0.0,14,19.0
4,0.0,0.0,15,20.0


In [73]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [74]:
# Fill the missing values of column A with A's own mean (about 2.333).
df['A'].fillna(value=df['A'].mean())

0    1.000000
1    2.000000
2    2.333333
3    4.000000
4    2.333333
Name: A, dtype: float64

In [75]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [76]:
# B has no non-missing values, so its mean is NaN and the fill changes nothing.
df['B'].fillna(value=df['B'].mean())

0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
Name: B, dtype: float64

In [77]:
# Filling an all-NaN column with its own (NaN) mean leaves it all NaN.
df['B'].fillna(value=df['B'].mean())

0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
Name: B, dtype: float64

In [78]:
# For column A the mean is a real number, so the missing entries are replaced.
df['A'].fillna(value=df['A'].mean())

0    1.000000
1    2.000000
2    2.333333
3    4.000000
4    2.333333
Name: A, dtype: float64

In [79]:
# Note: this fills the missing values of every column (B and D included)
# with the mean of column A, not with each column's own mean.
df.fillna(value=df['A'].mean())

Unnamed: 0,A,B,C,D
0,1.0,2.333333,11,16.0
1,2.0,2.333333,12,2.333333
2,2.333333,2.333333,13,18.0
3,4.0,2.333333,14,19.0
4,2.333333,2.333333,15,20.0


In [80]:
# Restricting the fill to column A keeps A's mean out of the other columns.
df['A'].fillna(value=df['A'].mean())

0    1.000000
1    2.000000
2    2.333333
3    4.000000
4    2.333333
Name: A, dtype: float64

In [81]:
# Forward fill: each missing value takes the last valid value above it in the
# same column. Column B has no valid values, so it stays entirely NaN.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
df.ffill()

  df.fillna(method='ffill')


Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,16.0
2,2.0,,13,18.0
3,4.0,,14,19.0
4,4.0,,15,20.0


In [82]:
# Backward fill: each missing value takes the next valid value below it in the
# same column. The last row of A has nothing below it, so it stays NaN.
# DataFrame.bfill() replaces the deprecated fillna(method='bfill') spelling.
df.bfill()

  df.fillna(method='bfill')


Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,18.0
2,4.0,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [83]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [84]:
# Replace every missing value with 0 (constant passed positionally).
df.fillna(0)

Unnamed: 0,A,B,C,D
0,1.0,0.0,11,16.0
1,2.0,0.0,12,0.0
2,0.0,0.0,13,18.0
3,4.0,0.0,14,19.0
4,0.0,0.0,15,20.0


In [85]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [86]:
# Fill the missing values of column A with the sum of its non-missing values (7.0).
df['A'].fillna(df['A'].sum())

0    1.0
1    2.0
2    7.0
3    4.0
4    7.0
Name: A, dtype: float64

In [87]:
# Fill the missing values of column A with its mean (fill value passed positionally).
df['A'].fillna(df['A'].mean())

0    1.000000
1    2.000000
2    2.333333
3    4.000000
4    2.333333
Name: A, dtype: float64

In [88]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [89]:
# Forward fill: propagate the last valid value downwards within each column.
# DataFrame.ffill() is used because fillna(method="ffill") is deprecated.
df.ffill()

  df.fillna(method="ffill")


Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,16.0
2,2.0,,13,18.0
3,4.0,,14,19.0
4,4.0,,15,20.0


In [90]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [91]:
# Backward fill: propagate the next valid value upwards within each column.
# DataFrame.bfill() is used because fillna(method='bfill') is deprecated.
df.bfill()

  df.fillna(method='bfill')


Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,18.0
2,4.0,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [92]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [93]:
# Constant fill with 0; compare with the forward/backward fills shown before it.
df.fillna(0)

Unnamed: 0,A,B,C,D
0,1.0,0.0,11,16.0
1,2.0,0.0,12,0.0
2,0.0,0.0,13,18.0
3,4.0,0.0,14,19.0
4,0.0,0.0,15,20.0


In [94]:
  # Replace every missing value with the constant 1.
  df.fillna(1)

Unnamed: 0,A,B,C,D
0,1.0,1.0,11,16.0
1,2.0,1.0,12,1.0
2,1.0,1.0,13,18.0
3,4.0,1.0,14,19.0
4,1.0,1.0,15,20.0


In [95]:
# The fill value does not have to be numeric: here every missing value becomes 'hi'.
# The filled columns now mix numbers and text (presumably object dtype -- verify).
df.fillna('hi')

Unnamed: 0,A,B,C,D
0,1.0,hi,11,16.0
1,2.0,hi,12,hi
2,hi,hi,13,18.0
3,4.0,hi,14,19.0
4,hi,hi,15,20.0


In [96]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [97]:
# Summary statistics per column; `count` reports non-missing values only,
# so the all-NaN column B has count 0 and NaN for the other statistics.
df.describe()

Unnamed: 0,A,B,C,D
count,3.0,0.0,5.0,4.0
mean,2.333333,,13.0,18.25
std,1.527525,,1.581139,1.707825
min,1.0,,11.0,16.0
25%,1.5,,12.0,17.5
50%,2.0,,13.0,18.5
75%,3.0,,14.0,19.25
max,4.0,,15.0,20.0


In [99]:
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0
