# 处理缺失数据

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Small Series of strings with one missing entry (np.nan at position 2),
# used below to demonstrate NA detection on non-numeric data.
string_data = pd.Series(
    data=['aardvark', 'artichoke', np.nan, 'avocado'],
)

string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [3]:
# isnull() returns a boolean Series that is True where an element is missing
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

Python内置的None值也会被当做NA处理：

In [4]:
# Overwrite the first element with Python's built-in None
string_data[0] = None

string_data

0         None
1    artichoke
2          NaN
3      avocado
dtype: object

In [5]:
# Both the None at position 0 and the NaN at position 2 are reported as missing
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

## 滤除缺失数据

In [6]:
# Numeric Series with two missing entries; the presence of NaN makes the
# whole Series float64, so the integer literals are shown as floats.
data = pd.Series(
    data=[1, np.nan, 3.5, np.nan, 7],
)

data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [7]:
# dropna() returns a new Series with only the non-missing values; the
# surviving elements keep their original index labels (0, 2, 4)
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [8]:
# The original Series still contains its NaN entries: dropna() did not modify it
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [9]:
# Boolean indexing with the notnull() mask selects the same elements as dropna()
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [10]:
# The opposite mask selects only the missing entries
data[data.isnull()]

1   NaN
3   NaN
dtype: float64

对于DataFrame对象，dropna默认丢弃任何含有缺失值的行：

In [11]:
# 4x3 frame covering each case of interest: a complete row, a partially
# missing row, an all-NaN row, and a row missing only its first value.
data = pd.DataFrame([
    [1., 6.5, 3.],
    [1., np.nan, np.nan],
    [np.nan, np.nan, np.nan],
    [np.nan, 6.5, 3.],
])

data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [12]:
# With default arguments dropna() removes every row that contains at least
# one NaN, so only the fully populated row 0 remains
cleaned = data.dropna()

cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [13]:
# The source frame is untouched; the filtered result lives in `cleaned`
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


传入 how='all' 将只丢弃全为NA的那些行：

In [14]:
# how='all' removes only rows whose values are all NaN (row 2 here);
# partially missing rows 1 and 3 are kept
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


要用这种方式丢弃列，只需传入axis=1即可：

In [15]:
# Add a new column labelled 4 that is NaN in every row, so there is an
# all-NaN column to drop in the next step
data[4] = np.nan

data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [16]:
# axis=1 applies the rule to columns instead of rows: only the all-NaN
# column 4 is removed, and row 2 (all NaN) stays because rows are not filtered
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [17]:
# 7x3 frame of standard-normal draws from NumPy's global random state.
# No seed is set, so the concrete values differ from run to run.
df = pd.DataFrame(np.random.standard_normal(size=(7, 3)))

df

Unnamed: 0,0,1,2
0,-1.127606,-0.902357,-1.874167
1,-1.866241,-1.243738,-0.785742
2,-0.426517,-0.071427,-0.099868
3,-0.842031,0.050004,0.281788
4,0.541495,0.566452,-0.995524
5,0.103469,1.589065,0.24046
6,0.810016,0.670584,-1.445595


In [18]:
# .loc slices by label and includes the end label: rows 0-4 of column 1
# and rows 0-2 of column 2 are overwritten with NaN
df.loc[:4, 1] = np.nan
df.loc[:2, 2] = np.nan

df

Unnamed: 0,0,1,2
0,-1.127606,,
1,-1.866241,,
2,-0.426517,,
3,-0.842031,,0.281788
4,0.541495,,-0.995524
5,0.103469,1.589065,0.24046
6,0.810016,0.670584,-1.445595


In [19]:
# Only rows 5 and 6 have no NaN in any column, so only they survive
df.dropna()

Unnamed: 0,0,1,2
5,0.103469,1.589065,0.24046
6,0.810016,0.670584,-1.445595


In [20]:
# thresh=3 keeps rows that have at least 3 non-NA values. The frame has only
# three columns, so this selects exactly the rows plain dropna() keeps.
# NOTE(review): a smaller threshold such as thresh=2 would also keep rows 3
# and 4 (two non-NA values each) and would show the difference - confirm
# whether that was the intended demonstration.
df.dropna(thresh=3)

Unnamed: 0,0,1,2
5,0.103469,1.589065,0.24046
6,0.810016,0.670584,-1.445595


## 填充缺失数据

In [21]:
# fillna(0) returns a new frame in which every NaN is replaced by 0
df.fillna(0)

Unnamed: 0,0,1,2
0,-1.127606,0.0,0.0
1,-1.866241,0.0,0.0
2,-0.426517,0.0,0.0
3,-0.842031,0.0,0.281788
4,0.541495,0.0,-0.995524
5,0.103469,1.589065,0.24046
6,0.810016,0.670584,-1.445595


In [22]:
# df itself still holds the NaN values: fillna returned a new object
df

Unnamed: 0,0,1,2
0,-1.127606,,
1,-1.866241,,
2,-0.426517,,
3,-0.842031,,0.281788
4,0.541495,,-0.995524
5,0.103469,1.589065,0.24046
6,0.810016,0.670584,-1.445595


通过一个字典调用fillna，就可以实现对不同的列填充不同的值：

In [23]:
# The dict maps column label -> fill value for that column.
# NOTE(review): df has only columns 0, 1 and 2, so the key 3 matches no
# column; only column 1 is filled and column 2 keeps its NaN values, as the
# output shows. Possibly column 2 was intended - confirm.
df.fillna({1: 0.5, 3: -1})

Unnamed: 0,0,1,2
0,-1.127606,0.5,
1,-1.866241,0.5,
2,-0.426517,0.5,
3,-0.842031,0.5,0.281788
4,0.541495,0.5,-0.995524
5,0.103469,1.589065,0.24046
6,0.810016,0.670584,-1.445595


fillna默认会返回新对象，但也可以对现有对象进行就地修改：

In [24]:
# With inplace=True, fillna modifies df itself and returns None, so there is
# no new object to capture. The return value is bound to a named variable
# rather than `_`: in IPython/Jupyter `_` is the "last output" variable, and
# rebinding it by hand can stop IPython from updating it afterwards.
fill_result = df.fillna(0, inplace=True)

type(fill_result)

NoneType

In [25]:
# This time df itself has changed: the in-place fill replaced its NaN values with 0
df

Unnamed: 0,0,1,2
0,-1.127606,0.0,0.0
1,-1.866241,0.0,0.0
2,-0.426517,0.0,0.0
3,-0.842031,0.0,0.281788
4,0.541495,0.0,-0.995524
5,0.103469,1.589065,0.24046
6,0.810016,0.670584,-1.445595


对reindex有效的那些插值方法也可用于fillna：

In [26]:
# Fresh 6x3 frame of standard-normal draws from NumPy's global random state
# (unseeded, so the values change on every run); rebinds the name df.
df = pd.DataFrame(np.random.standard_normal(size=(6, 3)))

df

Unnamed: 0,0,1,2
0,-1.539443,1.233058,0.448994
1,-0.553848,1.08141,-0.644916
2,0.742478,1.091889,0.826809
3,-1.514546,2.230289,-0.780526
4,0.292536,-0.978296,0.876481
5,-1.699742,-1.333677,-0.823124


In [27]:
# Blank out the tail of two columns: column 1 from row label 2 onward and
# column 2 from row label 4 onward, leaving runs of consecutive NaN to fill
df.loc[2:, 1] = np.nan
df.loc[4:, 2] = np.nan

df

Unnamed: 0,0,1,2
0,-1.539443,1.233058,0.448994
1,-0.553848,1.08141,-0.644916
2,0.742478,,0.826809
3,-1.514546,,-0.780526
4,0.292536,,
5,-1.699742,,


In [28]:
# Forward fill: each NaN takes the last valid value above it in its column.
# ffill() is the supported spelling of fillna(method='ffill'); the `method`
# keyword of fillna is deprecated in recent pandas releases.
df.ffill()

Unnamed: 0,0,1,2
0,-1.539443,1.233058,0.448994
1,-0.553848,1.08141,-0.644916
2,0.742478,1.08141,0.826809
3,-1.514546,1.08141,-0.780526
4,0.292536,1.08141,-0.780526
5,-1.699742,1.08141,-0.780526


In [29]:
# Forward fill, but propagate each valid value across at most 2 consecutive
# NaN entries; longer gaps stay NaN past that point (rows 4-5 of column 1).
# ffill(limit=2) is the supported spelling of fillna(method='ffill', limit=2);
# the `method` keyword of fillna is deprecated in recent pandas releases.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,-1.539443,1.233058,0.448994
1,-0.553848,1.08141,-0.644916
2,0.742478,1.08141,0.826809
3,-1.514546,1.08141,-0.780526
4,0.292536,,-0.780526
5,-1.699742,,-0.780526


In [30]:
# Rebind data to a float Series with two gaps, used to demonstrate filling
# missing values with a statistic computed from the Series itself.
data = pd.Series(
    data=[1., np.nan, 3.5, np.nan, 7],
)

data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [31]:
# Replace each NaN with the mean of the non-missing values:
# (1.0 + 3.5 + 7.0) / 3 = 3.833333 (mean() ignores NaN here)
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64