# 处理缺失数据
> pandas 使用浮点值NaN表示浮点和非浮点数组中的缺失数据

In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame,Series

In [2]:
string_data = Series(['aardvark','artichoke',np.nan,'avocado'])

In [3]:
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [4]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

python 中的None值也会被当做NA处理

In [5]:
string_data[0] = None

In [6]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

## 滤除缺失数据


In [8]:
from numpy import nan as NA

In [9]:
data = Series([1,NA,3.5,NA,7])

In [11]:
data.dropna() # 返回一个仅有非空数据和索引值的Series

0    1.0
2    3.5
4    7.0
dtype: float64

In [13]:
data[data.notnull()] # 和上面一样的效果

0    1.0
2    3.5
4    7.0
dtype: float64

In [15]:
data = DataFrame([[1,6.5,3],[1,NA,NA],[NA,NA,NA],[NA,6.5,3]])

In [16]:
cleaned = data.dropna()

In [18]:
# 对于DataFrame对象，dropna默认丢弃任何含有缺失值的行
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [19]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [20]:
# 传入 how = 'all' 表示只丢弃全为NA的行
data.dropna(how = 'all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [21]:
# 丢弃列，只需要传入axis = 1
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [22]:
data.dropna(axis = 1,how = 'all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


## 填充缺失数据


In [23]:
df = DataFrame(np.random.randn(7,3))

In [24]:
df

Unnamed: 0,0,1,2
0,-0.484227,2.706119,-0.519616
1,-0.118581,-0.959936,0.855612
2,1.445263,0.166543,0.121656
3,-0.274037,-0.855873,1.416924
4,-0.847832,0.517986,0.89046
5,0.104517,-0.112302,0.653788
6,-1.119111,-1.026504,-0.310345


In [25]:
df.ix[:4,1] = NA

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


In [26]:
df

Unnamed: 0,0,1,2
0,-0.484227,,-0.519616
1,-0.118581,,0.855612
2,1.445263,,0.121656
3,-0.274037,,1.416924
4,-0.847832,,0.89046
5,0.104517,-0.112302,0.653788
6,-1.119111,-1.026504,-0.310345


In [27]:
df.ix[:2,2] = NA

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


In [28]:
df

Unnamed: 0,0,1,2
0,-0.484227,,
1,-0.118581,,
2,1.445263,,
3,-0.274037,,1.416924
4,-0.847832,,0.89046
5,0.104517,-0.112302,0.653788
6,-1.119111,-1.026504,-0.310345


In [29]:
df.fillna(0) # 将na填充为0

Unnamed: 0,0,1,2
0,-0.484227,0.0,0.0
1,-0.118581,0.0,0.0
2,1.445263,0.0,0.0
3,-0.274037,0.0,1.416924
4,-0.847832,0.0,0.89046
5,0.104517,-0.112302,0.653788
6,-1.119111,-1.026504,-0.310345


In [30]:
# 通过字典调用fillna，实现对不同列填充不同的值
df.fillna({1:0.5,2:-1})

Unnamed: 0,0,1,2
0,-0.484227,0.5,-1.0
1,-0.118581,0.5,-1.0
2,1.445263,0.5,-1.0
3,-0.274037,0.5,1.416924
4,-0.847832,0.5,0.89046
5,0.104517,-0.112302,0.653788
6,-1.119111,-1.026504,-0.310345


In [31]:
# fillna默认会返回新对象，但也可以对现有对象就地修改
_ = df.fillna(0,inplace = True)  #inplace 表示修改调用者对象，不产生副本

In [32]:
df

Unnamed: 0,0,1,2
0,-0.484227,0.0,0.0
1,-0.118581,0.0,0.0
2,1.445263,0.0,0.0
3,-0.274037,0.0,1.416924
4,-0.847832,0.0,0.89046
5,0.104517,-0.112302,0.653788
6,-1.119111,-1.026504,-0.310345


In [33]:
df = DataFrame(np.random.randn(6,3))

In [34]:
df

Unnamed: 0,0,1,2
0,0.093969,1.540453,-0.048432
1,-1.448615,0.578894,1.055817
2,0.970405,-1.403968,1.416503
3,-0.993811,-0.834435,0.565788
4,-0.924828,-0.456131,-0.232714
5,-0.97558,1.438712,0.292764


In [35]:
df.ix[2:,1] = NA;df.ix[4:,2] = NA

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


In [36]:
df

Unnamed: 0,0,1,2
0,0.093969,1.540453,-0.048432
1,-1.448615,0.578894,1.055817
2,0.970405,,1.416503
3,-0.993811,,0.565788
4,-0.924828,,
5,-0.97558,,


In [37]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,0.093969,1.540453,-0.048432
1,-1.448615,0.578894,1.055817
2,0.970405,0.578894,1.416503
3,-0.993811,0.578894,0.565788
4,-0.924828,0.578894,0.565788
5,-0.97558,0.578894,0.565788


In [39]:
df.fillna(method = 'ffill',limit = 2)

Unnamed: 0,0,1,2
0,0.093969,1.540453,-0.048432
1,-1.448615,0.578894,1.055817
2,0.970405,0.578894,1.416503
3,-0.993811,0.578894,0.565788
4,-0.924828,,0.565788
5,-0.97558,,0.565788


In [40]:
# 传入中位数
data = Series([1,NA,3.5,NA,7])

In [41]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64