In [1]:
import pandas as pd
import numpy as np

## 过滤缺失值

In [2]:
# A Series of strings with a single np.nan entry, used to demonstrate
# how pandas detects missing values.
string_data = pd.Series(
    data=['aardvark', 'artichoke', np.nan, 'avocado'],
)
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [3]:
# isnull() marks each missing entry with True (here only the np.nan at label 2).
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [4]:
# Python's built-in None is also treated as a missing value in this object Series.
string_data[0] = None

In [5]:
# Label 0 (now None) is reported as missing alongside the NaN at label 2.
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

pandas项目持续改善处理缺失值的内部细节，但是用户API函数，比如pandas.isnull，抽象掉了很多令人厌烦的细节。处理缺失值的相关函数列表如下：

- dropna：根据每个标签的值是否是缺失数据来筛选轴标签，并根据允许丢失的数据量来确定阈值
- fillna：用某些值填充缺失的数据或使用插值方法(如“ffill”或“bfill”)。
- isnull：返回表明哪些值是缺失值的布尔值
- notnull：isnull的逆操作，返回表明哪些值不是缺失值的布尔值

In [6]:
from numpy import nan as NA

In [7]:
# On a Series, dropna() returns only the non-null values together with
# their original index labels.
data = pd.Series(
    data=[1, NA, 3.5, NA, 7],
)
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [8]:
# Equivalent to data.dropna(): index with the boolean mask of non-null entries.
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [9]:
# Four rows: fully populated, partially missing, entirely missing,
# and partially missing again.
data = pd.DataFrame(
    data=[
        [1., 6.5, 3.],
        [1., NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3.],
    ],
)
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [10]:
# With default arguments, dropna() removes every row that contains any NA value.
cleaned = data.dropna()
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [11]:
# Passing how='all' drops only the rows whose values are all NA:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [12]:
# To drop columns in the same way, pass axis=1.
# First add a column (label 4) that is entirely NA so there is something to drop:
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [13]:
# axis=1 with how='all' removes only the columns whose values are all NA (column 4 here).
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


过滤DataFrame的行的相关方法往往涉及时间序列数据。假设你只想保留包含一定数量的观察值的行。你可以用thresh参数来表示：

In [14]:
# Use a seeded generator so the random values are reproducible on
# Restart & Run All (the unseeded global RNG gave different data each run).
rng = np.random.default_rng(12345)
df = pd.DataFrame(rng.standard_normal((7, 3)))
# Blank out the first four rows of column 1 and the first two rows of column 2
# so that rows hold different numbers of observations.
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df

Unnamed: 0,0,1,2
0,-0.420988,,
1,0.407003,,
2,-1.509423,,-0.358767
3,-0.461389,,-0.448598
4,0.851561,-1.344985,-0.079144
5,-3.940764,-0.173783,0.152719
6,1.153031,1.76196,1.008634


In [15]:
# Default dropna(): only rows with no missing values at all are kept (rows 4-6).
df.dropna()

Unnamed: 0,0,1,2
4,0.851561,-1.344985,-0.079144
5,-3.940764,-0.173783,0.152719
6,1.153031,1.76196,1.008634


In [16]:
# thresh=2 keeps rows that have at least two non-NA values, so rows 0 and 1 are dropped.
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-1.509423,,-0.358767
3,-0.461389,,-0.448598
4,0.851561,-1.344985,-0.079144
5,-3.940764,-0.173783,0.152719
6,1.153031,1.76196,1.008634


## 填充缺失值

与其过滤缺失值（这样做可能会连带丢弃其他数据），你可能更希望以多种方式补全“漏洞”。

大多数情况下，主要使用fillna方法来补全缺失值。调用fillna时，可以使用一个常数来替代缺失值：

In [17]:
# Replace every missing value with the constant 0; the result is a new object.
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.420988,0.0,0.0
1,0.407003,0.0,0.0
2,-1.509423,0.0,-0.358767
3,-0.461389,0.0,-0.448598
4,0.851561,-1.344985,-0.079144
5,-3.940764,-0.173783,0.152719
6,1.153031,1.76196,1.008634


In [18]:
# Calling fillna with a dict lets you set a different fill value for each column:
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,-0.420988,0.5,0.0
1,0.407003,0.5,0.0
2,-1.509423,0.5,-0.358767
3,-0.461389,0.5,-0.448598
4,0.851561,-1.344985,-0.079144
5,-3.940764,-0.173783,0.152719
6,1.153031,1.76196,1.008634


In [19]:
# fillna returns a new object by default, but inplace=True modifies the
# existing object instead. The in-place call has no useful return value to
# capture, and binding it to `_` would shadow IPython's last-output variable,
# so the call is made as a plain statement.
df.fillna(0, inplace=True)
df

Unnamed: 0,0,1,2
0,-0.420988,0.0,0.0
1,0.407003,0.0,0.0
2,-1.509423,0.0,-0.358767
3,-0.461389,0.0,-0.448598
4,0.851561,-1.344985,-0.079144
5,-3.940764,-0.173783,0.152719
6,1.153031,1.76196,1.008634


In [20]:
# The same interpolation methods available for reindexing can be used with fillna.
# A seeded generator keeps the example values reproducible across runs.
rng = np.random.default_rng(12345)
df = pd.DataFrame(rng.standard_normal((6, 3)))
# Column 1 is missing from row 2 onward and column 2 from row 4 onward,
# leaving trailing gaps to be forward-filled.
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df

Unnamed: 0,0,1,2
0,0.534866,-1.379581,0.322881
1,-0.588503,0.364621,-1.747663
2,-2.583051,,-0.413324
3,1.113816,,-1.119099
4,0.776692,,
5,-1.204988,,


In [21]:
# Forward-fill each gap with the last valid value above it in the same column.
# ffill() replaces fillna(method='ffill'), whose `method` keyword is deprecated
# in recent pandas releases.
df.ffill()

Unnamed: 0,0,1,2
0,0.534866,-1.379581,0.322881
1,-0.588503,0.364621,-1.747663
2,-2.583051,0.364621,-0.413324
3,1.113816,0.364621,-1.119099
4,0.776692,0.364621,-1.119099
5,-1.204988,0.364621,-1.119099


In [22]:
# Forward-fill at most two consecutive missing values per gap.
# ffill(limit=2) replaces the deprecated fillna(method='ffill', limit=2).
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,0.534866,-1.379581,0.322881
1,-0.588503,0.364621,-1.747663
2,-2.583051,0.364621,-0.413324
3,1.113816,0.364621,-1.119099
4,0.776692,,-1.119099
5,-1.204988,,-1.119099


使用fillna你可以完成很多带有一点创造性的工作。例如，你可以将Series的平均值或中位数用于填充缺失值：

In [23]:
# Fill each missing entry with the mean of the observed (non-missing) values.
data = pd.Series(
    data=[1., NA, 3.5, NA, 7],
)
data.fillna(value=data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

以下是fillna的函数参数。

- value：标量值或字典型对象用于填充缺失值
- method：插值方法（如'ffill'或'bfill'），默认为None；该参数自pandas 2.1起已弃用，建议直接使用ffill()或bfill()方法
- axis：需要填充的轴，默认axis=0
- inplace：直接修改被调用的对象，而不是生成一个副本
- limit：用于前向或后向填充时最大的填充范围