In [1]:
import numpy as np
import pandas as pd

In [2]:
# Small object-dtype Series of strings with a single missing value (NaN).
string_data = pd.Series(
    ["aardvark", "artichoke", np.nan, "avocado"],
)

In [3]:
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [5]:
# .isnull() returns a boolean mask that is True wherever a value is missing

In [6]:
# Boolean mask: True marks the missing (NaN) entry at index 2.
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [7]:
# Overwrite the first element with Python's None to compare it with NaN.
string_data[0] = None

In [10]:
string_data  # None is kept as a Python object here but counts as missing, just like NaN

0         None
1    artichoke
2          NaN
3      avocado
dtype: object

In [12]:
# The None stored at index 0 is now flagged as missing alongside the NaN at index 2.
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

**dropna** — Filter axis labels based on whether the values for each label have missing data, with an optional threshold for how much missing data to tolerate.<br>
**fillna** — Fill in missing data with some value or using an interpolation method such as 'ffill' or 'bfill'.<br>
**isnull** — Return boolean values indicating which values are missing/NA.<br>
**notnull** — Negation of isnull: True for values that are not missing.<br>

### Filtering Out Missing Data

In [13]:
# Numeric Series with NaN at positions 1 and 3.
data = pd.Series(data=[1, np.nan, 3.5, np.nan, 7])

In [14]:
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [None]:
# .dropna() returns a new Series containing only the non-missing values

In [15]:
# Drop the NaN entries; the surviving values keep their original index labels (0, 2, 4).
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [16]:
data  # unchanged: dropna() returned a new Series instead of modifying data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [17]:
# 4x3 frame mixing complete, partially missing and fully missing rows.
data = pd.DataFrame(
    [
        [1, 6.5, 3],
        [1, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 6.5, 3],
    ]
)

In [18]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [19]:
# With default arguments, dropna() drops every row that contains at least one NaN.
cleaned = data.dropna()

In [20]:
data  # the original frame is untouched; dropna() returned a new object

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [21]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [23]:
data.dropna(how = "all")  # axis=0 (default): drop only the rows in which every value is NaN

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [28]:
data[4] = np.nan  # add a new column labelled 4 that is entirely NaN (mutates data in place)

In [26]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [29]:
# axis=1 with how="all": drop only the columns that are entirely NaN (column 4 here).
data.dropna(axis = 1, how = "all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [30]:
# 7x3 frame of standard-normal draws. A seeded Generator is used instead of the
# global, unseeded np.random state so that Restart & Run All yields the same values.
df = pd.DataFrame(np.random.default_rng(seed=0).standard_normal((7, 3)))

In [31]:
df

Unnamed: 0,0,1,2
0,0.481929,0.512972,-1.503594
1,-0.118487,0.336035,0.438124
2,-0.032024,-0.8055,-2.094052
3,0.435317,-1.349867,0.861984
4,-0.776697,-1.105998,-0.004852
5,1.826026,0.314589,-0.673601
6,0.847076,-0.243894,-1.173403


In [32]:
# Blank out column 1 in the first four rows (positions 0-3).
df.iloc[:4,1] = np.nan

In [33]:
df

Unnamed: 0,0,1,2
0,0.481929,,-1.503594
1,-0.118487,,0.438124
2,-0.032024,,-2.094052
3,0.435317,,0.861984
4,-0.776697,-1.105998,-0.004852
5,1.826026,0.314589,-0.673601
6,0.847076,-0.243894,-1.173403


In [34]:
# Blank out column 2 in the first two rows (positions 0-1).
df.iloc[:2,2] = np.nan

In [35]:
df

Unnamed: 0,0,1,2
0,0.481929,,
1,-0.118487,,
2,-0.032024,,-2.094052
3,0.435317,,0.861984
4,-0.776697,-1.105998,-0.004852
5,1.826026,0.314589,-0.673601
6,0.847076,-0.243894,-1.173403


In [36]:
# Default how="any": rows 0-3 each contain at least one NaN, so only rows 4-6 remain.
df.dropna()

Unnamed: 0,0,1,2
4,-0.776697,-1.105998,-0.004852
5,1.826026,0.314589,-0.673601
6,0.847076,-0.243894,-1.173403


In [37]:
# thresh = <number>: keep only the rows (or columns) that have at least <number> non-NA values

In [38]:
df.dropna(thresh =2)  # axis=0 (default): keep the rows that have at least 2 non-NaN values

Unnamed: 0,0,1,2
2,-0.032024,,-2.094052
3,0.435317,,0.861984
4,-0.776697,-1.105998,-0.004852
5,1.826026,0.314589,-0.673601
6,0.847076,-0.243894,-1.173403


### Filling In Missing Data

In [39]:
df

Unnamed: 0,0,1,2
0,0.481929,,
1,-0.118487,,
2,-0.032024,,-2.094052
3,0.435317,,0.861984
4,-0.776697,-1.105998,-0.004852
5,1.826026,0.314589,-0.673601
6,0.847076,-0.243894,-1.173403


In [41]:
df.fillna(0)  # inplace=False is the default: a new DataFrame is returned, df itself is unchanged

Unnamed: 0,0,1,2
0,0.481929,0.0,0.0
1,-0.118487,0.0,0.0
2,-0.032024,0.0,-2.094052
3,0.435317,0.0,0.861984
4,-0.776697,-1.105998,-0.004852
5,1.826026,0.314589,-0.673601
6,0.847076,-0.243894,-1.173403


In [42]:
# Pass a dict to fillna to choose a different fill value for each column

In [43]:
# Keys are column labels: NaN in column 1 becomes 0.5, NaN in column 2 becomes 0.
df.fillna({1:0.5,2:0})

Unnamed: 0,0,1,2
0,0.481929,0.5,0.0
1,-0.118487,0.5,0.0
2,-0.032024,0.5,-2.094052
3,0.435317,0.5,0.861984
4,-0.776697,-1.105998,-0.004852
5,1.826026,0.314589,-0.673601
6,0.847076,-0.243894,-1.173403


In [44]:
df  # still contains NaN: the fillna calls above returned new objects

Unnamed: 0,0,1,2
0,0.481929,,
1,-0.118487,,
2,-0.032024,,-2.094052
3,0.435317,,0.861984
4,-0.776697,-1.105998,-0.004852
5,1.826026,0.314589,-0.673601
6,0.847076,-0.243894,-1.173403


In [45]:
# inplace=True mutates df directly, so the call's return value is not needed.
# The result is deliberately not bound to `_`: assigning to `_` shadows IPython's
# last-output variable, which then stops being updated automatically.
df.fillna(0, inplace=True)

In [48]:
df    # fillna returns a new object by default; with inplace=True it modified df itself instead

Unnamed: 0,0,1,2
0,0.481929,0.0,0.0
1,-0.118487,0.0,0.0
2,-0.032024,0.0,-2.094052
3,0.435317,0.0,0.861984
4,-0.776697,-1.105998,-0.004852
5,1.826026,0.314589,-0.673601
6,0.847076,-0.243894,-1.173403


In [49]:
# 6x3 frame of standard-normal draws. A seeded Generator is used instead of the
# global, unseeded np.random state so that Restart & Run All yields the same values.
df = pd.DataFrame(np.random.default_rng(seed=1).standard_normal((6, 3)))

In [50]:
df

Unnamed: 0,0,1,2
0,0.9354,-0.195627,1.640454
1,-1.63174,0.778524,1.024953
2,-0.409209,-0.163891,0.885
3,1.113619,-0.304449,1.228908
4,0.326776,0.285046,-0.19223
5,0.929009,-0.40957,0.418382


In [51]:
# Knock out the tail of two columns to create trailing gaps:
df.iloc[2:,1] = np.nan  # column 1, from row position 2 to the end
df.iloc[4:,2] = np.nan  # column 2, from row position 4 to the end

In [52]:
df

Unnamed: 0,0,1,2
0,0.9354,-0.195627,1.640454
1,-1.63174,0.778524,1.024953
2,-0.409209,,0.885
3,1.113619,,1.228908
4,0.326776,,
5,0.929009,,


In [54]:
# ffill: propagate the last valid observation forward into the NaN slots that follow it.
# DataFrame.ffill() is used because fillna(method="ffill") is deprecated since pandas 2.1.
df.ffill()

Unnamed: 0,0,1,2
0,0.9354,-0.195627,1.640454
1,-1.63174,0.778524,1.024953
2,-0.409209,0.778524,0.885
3,1.113619,0.778524,1.228908
4,0.326776,0.778524,1.228908
5,0.929009,0.778524,1.228908


In [55]:
df  # unchanged: the forward fill above returned a new DataFrame

Unnamed: 0,0,1,2
0,0.9354,-0.195627,1.640454
1,-1.63174,0.778524,1.024953
2,-0.409209,,0.885
3,1.113619,,1.228908
4,0.326776,,
5,0.929009,,


In [56]:
# Forward-fill, but fill at most one consecutive NaN after each valid value (limit=1).
# DataFrame.ffill() is used because fillna(method="ffill") is deprecated since pandas 2.1.
df.ffill(limit=1)

Unnamed: 0,0,1,2
0,0.9354,-0.195627,1.640454
1,-1.63174,0.778524,1.024953
2,-0.409209,0.778524,0.885
3,1.113619,,1.228908
4,0.326776,,1.228908
5,0.929009,,


In [57]:
# Rebuild the numeric Series (NaN at positions 1 and 3) for the mean-fill example.
data = pd.Series(data=[1, np.nan, 3.5, np.nan, 7])

In [58]:
# mean() skips NaN by default: (1 + 3.5 + 7) / 3.
data.mean()

3.8333333333333335

In [59]:
# Replace each NaN with the mean of the non-missing values.
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [60]:
# Filling the gaps with the mean leaves the overall mean unchanged.
data.fillna(data.mean()).mean()

3.8333333333333335