In [1]:
###在许多数据分析工作中，缺失数据是经常发生的。pandas的目标之一就是尽量轻松地处理缺失数据。例如，pandas对象的所有描述性统计默认都不包括缺失数据。

###缺失数据在pandas中呈现的方式有些不完美，但对于大多数用户可以保证功能正常。对于数值数据，pandas使用浮点值NaN（Not a Number）表示缺失数据。我们称其为哨兵值，可以方便地检测出来：

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Example Series of strings whose third entry (position 2) is missing (np.nan).
raw_words = ['aardvark', 'artichoke', np.nan, 'avocado']
string_data = pd.Series(raw_words)

In [4]:
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [5]:
# Boolean mask: True where an entry of string_data is missing (here only position 2).
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [6]:
#在pandas中，我们采用了R语言中的惯用法，即将缺失值表示为NA，它表示不可用（not available）。在统计应用中，NA数据可能是不存在的数据或者虽然存在，但是没有观察到（例如，数据采集中发生了问题）。当进行数据清洗以进行分析时，最好直接对缺失数据进行分析，以判断数据采集的问题或缺失数据可能导致的偏差。

In [7]:
# Python's built-in None is also treated as NA in an object array.
# NOTE: this mutates string_data in place, so the earlier display of it is now stale.
string_data[0] = None

In [8]:
# After assigning None to label 0, both label 0 and label 2 are reported as missing.
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

# 滤除缺失数据

In [9]:
#过滤掉缺失数据的办法有很多种。你可以通过pandas.isnull或布尔索引的手工方法，但dropna可能会更实用一些。对于一个Series，dropna返回一个仅含非空数据和索引值的Series：

In [10]:
from numpy import nan as NA

In [11]:
# Numeric Series with missing values at positions 1 and 3 (NA is numpy's nan).
observations = [1, NA, 3.5, NA, 7]
data = pd.Series(observations)

In [12]:
# dropna returns a Series holding only the non-missing values, keeping their
# original index labels (0, 2, 4 here).
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [13]:
# Equivalent to dropna(): select with the boolean mask of non-missing entries.
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [14]:
#而对于DataFrame对象，事情就有点复杂了。你可能希望丢弃全NA或含有NA的行或列。dropna默认丢弃任何含有缺失值的行：

In [15]:
# 4x3 frame mixing complete, partially missing and fully missing rows:
# row 0 is complete, row 1 has two NAs, row 2 is all NA, row 3 has one NA.
rows = [
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
]
data = pd.DataFrame(rows)

In [16]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [17]:
# With default arguments dropna discards every row containing at least one
# missing value; for this frame only row 0 survives.
cleaned = data.dropna()

In [18]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [19]:
# Passing how='all' drops only the rows in which every value is NA (row 2 here).
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [22]:
# To drop columns the same way, pass axis=1 to dropna. To demonstrate that,
# first add a column labeled 4 that consists entirely of NA.
# NOTE: this mutates `data` in place; earlier displays of it are now stale.
data[4] = NA

In [23]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [24]:
# Column-wise variant: axis=1 applies the how='all' rule to columns, so only
# columns made up entirely of NA (the column labeled 4) are removed.
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [25]:
#另一个滤除DataFrame行的问题涉及时间序列数据。假设你只想留下一部分观测数据，可以用thresh参数实现此目的：

In [27]:
# 7x3 frame of standard-normal draws. A dedicated, explicitly seeded Generator
# is used so that "Restart & Run All" reproduces the same values; the unseeded
# global np.random.randn produced different data on every run.
df = pd.DataFrame(np.random.default_rng(42).standard_normal((7, 3)))

In [28]:
df 

Unnamed: 0,0,1,2
0,0.031624,1.92836,0.718444
1,0.815141,0.284279,1.507302
2,-1.257822,1.609041,0.53251
3,1.088161,-0.849562,0.115133
4,-0.816768,0.445211,0.176197
5,0.979499,0.435645,0.512062
6,0.536993,-1.051725,0.557234


In [29]:
# Blank out the first four rows (positions 0-3) of the column at position 1.
# NOTE: in-place mutation of df; re-running this cell is harmless but earlier
# displays of df no longer reflect its state.
df.iloc[:4,1] =NA

In [30]:
df 

Unnamed: 0,0,1,2
0,0.031624,,0.718444
1,0.815141,,1.507302
2,-1.257822,,0.53251
3,1.088161,,0.115133
4,-0.816768,0.445211,0.176197
5,0.979499,0.435645,0.512062
6,0.536993,-1.051725,0.557234


In [31]:
# Blank out the first two rows (positions 0-1) of the column at position 2,
# so rows 0-1 now keep only one non-missing value each.
df.iloc[:2, 2] = NA

In [32]:
df

Unnamed: 0,0,1,2
0,0.031624,,
1,0.815141,,
2,-1.257822,,0.53251
3,1.088161,,0.115133
4,-0.816768,0.445211,0.176197
5,0.979499,0.435645,0.512062
6,0.536993,-1.051725,0.557234


In [33]:
# Default dropna: keep only rows with no missing value at all (rows 4-6 here).
df.dropna()

Unnamed: 0,0,1,2
4,-0.816768,0.445211,0.176197
5,0.979499,0.435645,0.512062
6,0.536993,-1.051725,0.557234


In [34]:
# dropna(thresh=n) keeps a row when, ignoring its NA entries, it still has at
# least n values; with n=2, rows 0 and 1 (one value each) are dropped.
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-1.257822,,0.53251
3,1.088161,,0.115133
4,-0.816768,0.445211,0.176197
5,0.979499,0.435645,0.512062
6,0.536993,-1.051725,0.557234


# 填充缺失数据

In [35]:
#你可能不想滤除缺失数据（有可能会丢弃跟它有关的其他数据），而是希望通过其他方式填补那些“空洞”。对于大多数情况而言，fillna方法是最主要的函数。通过一个常数调用fillna就会将缺失值替换为那个常数值：

In [36]:
# Calling fillna with a constant replaces every missing value with that
# constant; the result is a new object and df itself is left unchanged.
df.fillna(0)

Unnamed: 0,0,1,2
0,0.031624,0.0,0.0
1,0.815141,0.0,0.0
2,-1.257822,0.0,0.53251
3,1.088161,0.0,0.115133
4,-0.816768,0.445211,0.176197
5,0.979499,0.435645,0.512062
6,0.536993,-1.051725,0.557234


In [37]:
# Calling fillna with a dict fills each column with its own value. The keys
# are column labels: the column labeled 1 is filled with 0.5 and the column
# labeled 2 with 0 (these are the second and third columns, not the first two).
df.fillna({1:0.5,2:0})

Unnamed: 0,0,1,2
0,0.031624,0.5,0.0
1,0.815141,0.5,0.0
2,-1.257822,0.5,0.53251
3,1.088161,0.5,0.115133
4,-0.816768,0.445211,0.176197
5,0.979499,0.435645,0.512062
6,0.536993,-1.051725,0.557234


In [38]:
# fillna returns a new object by default, but it can also modify the existing
# object in place: inplace=True changes df itself (the object, not a file).
# NOTE(review): the return value is bound to `_`, presumably to keep the cell
# from echoing anything; be aware `_` is also IPython's "last output" name.
_ = df.fillna(0, inplace=True)

In [39]:
df

Unnamed: 0,0,1,2
0,0.031624,0.0,0.0
1,0.815141,0.0,0.0
2,-1.257822,0.0,0.53251
3,1.088161,0.0,0.115133
4,-0.816768,0.445211,0.176197
5,0.979499,0.435645,0.512062
6,0.536993,-1.051725,0.557234


In [40]:
#对reindexing有效的那些插值方法也可用于fillna：

In [41]:
# Fresh 6x3 frame of standard-normal draws for the forward-fill examples.
# A seeded Generator makes the values identical on every "Restart & Run All";
# the unseeded global np.random.randn changed the data on each run.
df = pd.DataFrame(np.random.default_rng(7).standard_normal((6, 3)))

In [42]:
df

Unnamed: 0,0,1,2
0,-1.37446,2.041289,-0.258501
1,-0.935046,-0.067952,0.780009
2,-0.014126,1.521997,0.714538
3,-0.303888,0.385858,-0.398666
4,0.848482,0.549963,-1.02017
5,-2.094827,-0.061269,0.663931


In [43]:
# Blank out the column at position 1 from row position 2 to the end, leaving
# a run of four consecutive missing values below the last valid entry.
df.iloc[2:, 1] = NA

In [44]:
df

Unnamed: 0,0,1,2
0,-1.37446,2.041289,-0.258501
1,-0.935046,-0.067952,0.780009
2,-0.014126,,0.714538
3,-0.303888,,-0.398666
4,0.848482,,-1.02017
5,-2.094827,,0.663931


In [45]:
 # Blank out the column at position 2 from row position 4 to the end
 # (the last two rows).
 df.iloc[4:, 2] = NA

In [46]:
df

Unnamed: 0,0,1,2
0,-1.37446,2.041289,-0.258501
1,-0.935046,-0.067952,0.780009
2,-0.014126,,0.714538
3,-0.303888,,-0.398666
4,0.848482,,
5,-2.094827,,


In [47]:
# Forward fill: each missing value takes the last valid value before it.
# The default axis is 0, so the value is taken from the previous ROW of the
# same column, not from the previous column.
# DataFrame.ffill() is used instead of fillna(method='ffill') because the
# `method` keyword of fillna is deprecated in current pandas; bfill() is the
# matching replacement for method='bfill'/'backfill' (fill from the next
# valid value).
df.ffill()

Unnamed: 0,0,1,2
0,-1.37446,2.041289,-0.258501
1,-0.935046,-0.067952,0.780009
2,-0.014126,-0.067952,0.714538
3,-0.303888,-0.067952,-0.398666
4,0.848482,-0.067952,-0.398666
5,-2.094827,-0.067952,-0.398666


In [48]:
 # Forward fill with a cap: `limit` is the maximum number of consecutive
 # missing values to fill and must be an integer greater than 0. With limit=2
 # only the first two NAs of each gap are filled; the rest stay missing.
 # DataFrame.ffill(limit=...) replaces fillna(method='ffill', limit=...),
 # whose `method` keyword is deprecated in current pandas.
 df.ffill(limit=2)

Unnamed: 0,0,1,2
0,-1.37446,2.041289,-0.258501
1,-0.935046,-0.067952,0.780009
2,-0.014126,-0.067952,0.714538
3,-0.303888,-0.067952,-0.398666
4,0.848482,,-0.398666
5,-2.094827,,-0.398666


In [49]:
#只要有些创新，你就可以利用fillna实现许多别的功能。比如说，你可以传入Series的平均值或中位数：

In [50]:
# Float Series with gaps at positions 1 and 3, used to demonstrate filling
# missing values with a statistic of the series itself.
samples = [1., NA, 3.5, NA, 7]
data = pd.Series(samples)
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [51]:
# Impute the gaps with the mean of the observed values (mean() skips NA),
# returning a new Series and leaving `data` unchanged.
observed_mean = data.mean()
data.fillna(observed_mean)

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64