# 处理缺失数据

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Four-element string Series whose third entry (label 2) is missing (np.nan).
string_data = pd.Series(
    ['aardvark', 'artichoke', np.nan, 'avocado'],
)

string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [3]:
# Element-wise missing-value mask: True marks the NaN entry (label 2).
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

Python内置的None值也会被当做NA处理：

In [4]:
# Overwrite the first element with Python's None; this mutates string_data
# in place, so the Series shown by earlier cells is now stale.
string_data[0] = None

string_data

0         None
1    artichoke
2          NaN
3      avocado
dtype: object

In [5]:
# None is reported as missing too: labels 0 (None) and 2 (NaN) are both True.
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

## 滤除缺失数据

In [6]:
# Float Series with gaps at labels 1 and 3, used by the filtering examples.
data = pd.Series(
    [1.0, np.nan, 3.5, np.nan, 7.0],
)

data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [7]:
# Return a Series without the NaN entries; surviving rows keep their
# original labels (0, 2, 4).
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [8]:
# Re-display: data still holds its NaN entries, so dropna() above produced a
# separate result instead of modifying data.
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [9]:
# Boolean-mask filtering: keep the entries where notnull() is True.
# Gives the same rows as data.dropna() above.
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [10]:
# Inverse selection: keep only the missing entries (labels 1 and 3).
data[data.isnull()]

1   NaN
3   NaN
dtype: float64

对于DataFrame对象，dropna默认丢弃任何含有缺失值的行：

In [11]:
# 4x3 frame: row 0 is complete, rows 1 and 3 are partially missing,
# and row 2 is entirely NaN. Note this rebinds `data` (previously a Series).
data = pd.DataFrame([
    [1., 6.5, 3.],
    [1., np.nan, np.nan],
    [np.nan, np.nan, np.nan],
    [np.nan, 6.5, 3.],
])

data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [12]:
# Default dropna() on a DataFrame drops every row containing at least one
# NaN; only row 0 is complete, so it is the only row kept.
cleaned = data.dropna()

cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [13]:
# Re-display: the source frame still has all four rows, so dropna() did not
# modify it.
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


传入 how='all' 将只丢弃全为NA的那些行：

In [14]:
# how='all' drops a row only when every value in it is NaN (row 2 here);
# partially filled rows 1 and 3 are kept.
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


要用这种方式丢弃列，只需传入axis=1即可：

In [15]:
# Add a new column labelled 4 that is NaN in every row. This mutates data in
# place, so frames displayed by earlier cells no longer match it.
data[4] = np.nan

data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [16]:
# axis=1 applies the rule to columns: drop the columns that are entirely NaN
# (only column 4 here). Row 2, although all-NaN, is kept.
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [17]:
# 7x3 frame of standard-normal samples.
# NOTE(review): no random seed is set in the visible cells, so re-running
# will not reproduce the exact numbers in the recorded output.
df = pd.DataFrame(np.random.randn(7, 3))

df

Unnamed: 0,0,1,2
0,-0.327563,-0.171471,0.150622
1,-1.152294,0.323409,-0.06454
2,-0.115431,0.600262,-0.836076
3,-1.440348,0.428689,0.695415
4,1.072534,-0.437569,-0.486193
5,-0.018466,0.433036,0.637517
6,0.76521,0.461896,-0.121529


In [18]:
# .loc slices are label-based and include the end label, so this blanks
# rows 0-4 of column 1 and rows 0-2 of column 2. Mutates df in place.
df.loc[:4, 1] = np.nan
df.loc[:2, 2] = np.nan

df

Unnamed: 0,0,1,2
0,-0.327563,,
1,-1.152294,,
2,-0.115431,,
3,-1.440348,,0.695415
4,1.072534,,-0.486193
5,-0.018466,0.433036,0.637517
6,0.76521,0.461896,-0.121529


In [19]:
# Keep only the rows without any NaN: rows 5 and 6.
df.dropna()

Unnamed: 0,0,1,2
5,-0.018466,0.433036,0.637517
6,0.76521,0.461896,-0.121529


In [24]:
# thresh=3 keeps rows that have at least 3 non-NaN values.
# NOTE(review): df has only three columns, so thresh=3 selects the same rows
# as plain dropna() above. A smaller threshold (e.g. thresh=2) would also
# keep rows 3 and 4 and would show the difference.
df.dropna(thresh=3)

Unnamed: 0,0,1,2
5,-0.018466,0.433036,0.637517
6,0.76521,0.461896,-0.121529


## 填充缺失数据

In [25]:
# Return a frame in which every NaN is replaced by the constant 0.
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.327563,0.0,0.0
1,-1.152294,0.0,0.0
2,-0.115431,0.0,0.0
3,-1.440348,0.0,0.695415
4,1.072534,0.0,-0.486193
5,-0.018466,0.433036,0.637517
6,0.76521,0.461896,-0.121529


In [26]:
# Re-display: df still contains NaN, so fillna(0) above did not modify it.
df

Unnamed: 0,0,1,2
0,-0.327563,,
1,-1.152294,,
2,-0.115431,,
3,-1.440348,,0.695415
4,1.072534,,-0.486193
5,-0.018466,0.433036,0.637517
6,0.76521,0.461896,-0.121529


In [27]:
# Fill NaN with 0 in place: df itself is modified (see the next cell).
# The return value is bound to an ordinary name rather than `_`, because
# assigning to `_` overrides IPython's "last output" variable and stops
# IPython from updating it for the rest of the session.
fill_result = df.fillna(0, inplace=True)

# Nothing is displayed in the recorded run: the in-place call returned None.
fill_result

In [28]:
# df now shows 0.0 where NaN used to be: the in-place fill changed df itself.
df

Unnamed: 0,0,1,2
0,-0.327563,0.0,0.0
1,-1.152294,0.0,0.0
2,-0.115431,0.0,0.0
3,-1.440348,0.0,0.695415
4,1.072534,0.0,-0.486193
5,-0.018466,0.433036,0.637517
6,0.76521,0.461896,-0.121529


对reindex有效的那些插值方法也可用于fillna：

In [29]:
# Rebind df to a fresh 6x3 frame of standard-normal samples.
# NOTE(review): no random seed is set in the visible cells, so re-running
# will not reproduce the exact numbers in the recorded output.
df = pd.DataFrame(np.random.randn(6, 3))

df

Unnamed: 0,0,1,2
0,0.973701,-0.480763,0.540987
1,-0.709756,-0.963358,0.197819
2,1.337591,0.470625,0.346395
3,-1.918172,-0.85975,-1.062132
4,0.730289,0.415545,-0.483521
5,0.153469,-1.337287,1.288648


In [31]:
# Label-based open-ended slices: blank column 1 from row 2 to the end and
# column 2 from row 4 to the end. Mutates df in place.
df.loc[2:, 1] = np.nan
df.loc[4:, 2] = np.nan

df

Unnamed: 0,0,1,2
0,0.973701,-0.480763,0.540987
1,-0.709756,-0.963358,0.197819
2,1.337591,,0.346395
3,-1.918172,,-1.062132
4,0.730289,,
5,0.153469,,


In [32]:
# Forward fill: propagate the last valid value down each column.
# DataFrame.ffill() is used instead of fillna(method='ffill') because the
# `method` keyword of fillna is deprecated in pandas 2.1 and later.
df.ffill()

Unnamed: 0,0,1,2
0,0.973701,-0.480763,0.540987
1,-0.709756,-0.963358,0.197819
2,1.337591,-0.963358,0.346395
3,-1.918172,-0.963358,-1.062132
4,0.730289,-0.963358,-1.062132
5,0.153469,-0.963358,-1.062132


In [33]:
# Forward fill, but fill at most 2 consecutive NaN values per gap, so the
# last two rows of column 1 stay NaN.
# DataFrame.ffill() is used instead of fillna(method='ffill') because the
# `method` keyword of fillna is deprecated in pandas 2.1 and later.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,0.973701,-0.480763,0.540987
1,-0.709756,-0.963358,0.197819
2,1.337591,-0.963358,0.346395
3,-1.918172,-0.963358,-1.062132
4,0.730289,,-1.062132
5,0.153469,,-1.062132


In [35]:
# Rebind data to a float Series with gaps at labels 1 and 3, used by the
# mean-fill example.
data = pd.Series(
    [1.0, np.nan, 3.5, np.nan, 7.0],
)

data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [36]:
# Fill the gaps with the mean of the Series. The recorded fill value,
# 3.833333, equals (1 + 3.5 + 7) / 3, i.e. the mean of the non-missing values.
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64