Finding Missing Data

In [1]:
import numpy as np
import pandas as pd

In [3]:
data = {
    'a' : [1, 2, np.nan],
    'b' : [4, np.nan, 6],  
    'c' : [7, 8, 9],
    'd' : [np.nan, 11, 12]
}
df = pd.DataFrame(data)
df

Unnamed: 0,a,b,c,d
0,1.0,4.0,7,
1,2.0,,8,11.0
2,,6.0,9,12.0


In [None]:
# Check for missing values
df.isna()

Unnamed: 0,a,b,c,d
0,False,False,False,True
1,False,True,False,False
2,True,False,False,False


In [None]:
# Count of missing values in each column
df.isna().sum()

a    1
b    1
c    0
d    1
dtype: int64

In [None]:
# Check if any column has missing values
df.isna().any

<bound method DataFrame.any of        a      b      c      d
0  False  False  False   True
1  False   True  False  False
2   True  False  False  False>

In [None]:
# Alternative method to check for missing values
df.isnull()

Unnamed: 0,a,b,c,d
0,False,False,False,True
1,False,True,False,False
2,True,False,False,False


Removing Missing data

In [11]:
data = {
    'a' : [1, 2, np.nan],
    'b' : [4, 8, 6],  
    'c' : [7, 8, 9],
    'd' : [np.nan, 11, 12]
}
df = pd.DataFrame(data)
df

Unnamed: 0,a,b,c,d
0,1.0,4,7,
1,2.0,8,8,11.0
2,,6,9,12.0


In [12]:
# Drop rows with missing values
df.dropna()

Unnamed: 0,a,b,c,d
1,2.0,8,8,11.0


In [13]:
df

Unnamed: 0,a,b,c,d
0,1.0,4,7,
1,2.0,8,8,11.0
2,,6,9,12.0


In [None]:
# Drop rows with at least 4 non-NA values
df.dropna(thresh=4)

Unnamed: 0,a,b,c,d
1,2.0,8,8,11.0


Filling the missing data

In [18]:
df

Unnamed: 0,a,b,c,d
0,1.0,4,7,
1,2.0,8,8,11.0
2,,6,9,12.0


In [None]:
# Fill missing values with 0
df.fillna(0)

Unnamed: 0,a,b,c,d
0,1.0,4,7,0.0
1,2.0,8,8,11.0
2,0.0,6,9,12.0


In [None]:
df
#nothing update, for update the data fram use inplace = True
#df.fillna(0, inplace=True)

Unnamed: 0,a,b,c,d
0,1.0,4,7,
1,2.0,8,8,11.0
2,,6,9,12.0


In [None]:
# Fill missing values with different values for each column
values = { 'a': 10, 'b': 20, 'c': 30, 'd': 40 }
df.fillna(value=values)

Unnamed: 0,a,b,c,d
0,1.0,4,7,40.0
1,2.0,8,8,11.0
2,10.0,6,9,12.0


In [None]:
# Fill missing values with the mean of each column
df.fillna(df.mean())

Unnamed: 0,a,b,c,d
0,1.0,4,7,11.5
1,2.0,8,8,11.0
2,1.5,6,9,12.0


In [23]:
#fill missing values with the median of each column
df.fillna(df.median())

Unnamed: 0,a,b,c,d
0,1.0,4,7,11.5
1,2.0,8,8,11.0
2,1.5,6,9,12.0
