# 处理缺失数据

In [1]:
import numpy as np
import pandas as pd

In [3]:
# Series of strings in which position 2 holds np.nan as the missing-value marker.
string_data = pd.Series(
    data=['aardvark', 'artichoke', np.nan, 'avocado'],
)
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [4]:
# Overwrite the first element with Python's None. pandas treats None as a
# missing value as well (isnull() reports True for it, just like for NaN).
# Note: this mutates `string_data` in place.
string_data[0] = None
string_data

0         None
1    artichoke
2          NaN
3      avocado
dtype: object

In [5]:
# Boolean mask, True wherever the value is missing (both None and NaN count).
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

## 滤除缺失数据

In [11]:
# Numeric Series with two missing entries (positions 1 and 3); the NaNs force
# a float64 dtype even though 1 and 7 are written as ints.
data = pd.Series(
    data=[1, np.nan, 3.5, np.nan, 7],
)
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [16]:
# dropna() on a Series returns a new Series with only the non-null values;
# the surviving entries keep their original index labels (0, 2, 4).
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [17]:
# `data` itself still contains its NaNs: dropna() returned a new object
# rather than modifying the Series in place.
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [20]:
# Boolean-mask selection of the non-null entries; gives the same result as
# data.dropna().
data.loc[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [22]:
# 4x3 frame mixing complete, partially missing and fully missing rows:
#   row 0 - complete, row 1 - two NaNs, row 2 - all NaN, row 3 - one NaN.
data = pd.DataFrame([
    [1., 6.5, 3.],
    [1., np.nan, np.nan],
    [np.nan, np.nan, np.nan],
    [np.nan, 6.5, 3.],
])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [23]:
# Defaults spelled out: work along rows (axis=0) and drop a row as soon as it
# contains any NaN (how='any'). The result is a new frame; `data` is untouched.
cleaned = data.dropna(axis=0, how='any')

In [24]:
# The source frame is unchanged: dropna() produced a new object (bound to
# `cleaned`) instead of editing `data` in place.
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [25]:
# Only row 0 survives, because by default dropna() discards every row that
# contains at least one NaN.
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [26]:
# how='all' drops a row only when every value in it is NaN, so just row 2 is
# removed; partially filled rows 1 and 3 are kept.
data.dropna(axis=0, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [27]:
# Add a new column labelled 4 that is entirely NaN (the scalar is broadcast to
# every row). Note: this mutates `data` in place.
data[4] = np.nan
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [28]:
# Same idea as how='all' on rows, but applied to columns: only column 4, which
# is NaN in every row, is removed.
data.dropna(how='all', axis='columns')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [29]:
# Seed NumPy's global RNG so that Restart & Run All regenerates the same
# frame; without a seed the values (and every output derived from `df`)
# change on each execution. Re-run the notebook to refresh recorded outputs.
np.random.seed(12345)
# 7 rows x 3 columns of standard-normal draws.
df = pd.DataFrame(np.random.randn(7, 3))
df

Unnamed: 0,0,1,2
0,-1.786221,2.402091,0.452227
1,0.390032,-0.558616,-0.320584
2,-1.141997,1.121584,-0.813264
3,-0.189844,0.28562,1.343882
4,-0.973073,-0.051664,0.179609
5,-0.421125,0.973266,-0.255898
6,-0.162587,-1.288397,-0.564176


In [34]:
# .loc slices are label-based and include both endpoints, so these blank out
# column 1 for row labels 0-4 and column 2 for row labels 0-2.
# Note: this mutates `df` in place.
df.loc[:4, 1] = np.nan
df.loc[:2, 2] = np.nan
df

Unnamed: 0,0,1,2
0,-1.786221,,
1,0.390032,,
2,-1.141997,,
3,-0.189844,,1.343882
4,-0.973073,,0.179609
5,-0.421125,0.973266,-0.255898
6,-0.162587,-1.288397,-0.564176


In [36]:
# Default dropna(): discard every row that has any NaN. Only rows 5 and 6 are
# fully populated, so only they remain.
df.dropna()

Unnamed: 0,0,1,2
5,-0.421125,0.973266,-0.255898
6,-0.162587,-1.288397,-0.564176


In [42]:
# thresh=3 keeps only rows with at least three non-NA values. `df` has three
# columns, so this demands fully populated rows and returns the same rows as
# plain df.dropna() (labels 5 and 6).
# NOTE(review): a smaller thresh (e.g. 2) would actually show how thresh
# differs from the default - confirm the intended value.
df.dropna(thresh=3)

Unnamed: 0,0,1,2
5,-0.421125,0.973266,-0.255898
6,-0.162587,-1.288397,-0.564176


## 填充缺失数据

In [43]:
# Replace every NaN with 0. This returns a new frame; `df` keeps its NaNs.
df.fillna(value=0)

Unnamed: 0,0,1,2
0,-1.786221,0.0,0.0
1,0.390032,0.0,0.0
2,-1.141997,0.0,0.0
3,-0.189844,0.0,1.343882
4,-0.973073,0.0,0.179609
5,-0.421125,0.973266,-0.255898
6,-0.162587,-1.288397,-0.564176


In [44]:
# `df` still shows its NaNs: fillna(0) returned a new frame and left the
# original untouched.
df

Unnamed: 0,0,1,2
0,-1.786221,,
1,0.390032,,
2,-1.141997,,
3,-0.189844,,1.343882
4,-0.973073,,0.179609
5,-0.421125,0.973266,-0.255898
6,-0.162587,-1.288397,-0.564176


In [47]:
# A dict maps column label -> fill value, so each column can get its own
# replacement. Only column 1 is affected here: `df` has no column labelled 3,
# so that entry matches nothing and column 2 keeps its NaNs.
# NOTE(review): key 3 was probably meant to be 2 - confirm.
df.fillna(value={1: 0.5, 3: -1})

Unnamed: 0,0,1,2
0,-1.786221,0.5,
1,0.390032,0.5,
2,-1.141997,0.5,
3,-0.189844,0.5,1.343882
4,-0.973073,0.5,0.179609
5,-0.421125,0.973266,-0.255898
6,-0.162587,-1.288397,-0.564176


In [48]:
# inplace=True modifies `df` itself instead of returning a new frame.
# The call's return value is deliberately not bound: it is None, and assigning
# it to `_` would shadow IPython's last-output variable.
df.fillna(0, inplace=True)
df

Unnamed: 0,0,1,2
0,-1.786221,0.0,0.0
1,0.390032,0.0,0.0
2,-1.141997,0.0,0.0
3,-0.189844,0.0,1.343882
4,-0.973073,0.0,0.179609
5,-0.421125,0.973266,-0.255898
6,-0.162587,-1.288397,-0.564176


In [49]:
# Display the frame by name rather than through IPython's `_`, whose value
# depends on execution history and does not survive Restart & Run All.
df

Unnamed: 0,0,1,2
0,-1.786221,0.0,0.0
1,0.390032,0.0,0.0
2,-1.141997,0.0,0.0
3,-0.189844,0.0,1.343882
4,-0.973073,0.0,0.179609
5,-0.421125,0.973266,-0.255898
6,-0.162587,-1.288397,-0.564176


In [50]:
# Fresh 6x3 frame of standard-normal draws, with the tail of two columns
# blanked out so the fill methods have gaps to work on:
# column 1 from row label 2 onward, column 2 from row label 4 onward.
df = pd.DataFrame(np.random.randn(6, 3))
df.loc[df.index >= 2, 1] = np.nan
df.loc[df.index >= 4, 2] = np.nan
df

Unnamed: 0,0,1,2
0,0.524632,0.250371,0.618978
1,-0.916691,-0.612769,1.383619
2,0.060633,,-1.246755
3,0.4718,,-0.803812
4,0.973065,,
5,-2.380847,,


In [51]:
# Forward-fill: carry the last valid value in each column down into the NaNs
# that follow it. DataFrame.ffill() is used because the `method` argument of
# fillna() is deprecated (since pandas 2.1).
df.ffill()

Unnamed: 0,0,1,2
0,0.524632,0.250371,0.618978
1,-0.916691,-0.612769,1.383619
2,0.060633,-0.612769,-1.246755
3,0.4718,-0.612769,-0.803812
4,0.973065,-0.612769,-0.803812
5,-2.380847,-0.612769,-0.803812


In [52]:
# Forward-fill, but fill at most two consecutive NaNs per gap (limit=2); any
# further NaNs in a longer run are left as-is. DataFrame.ffill() is used
# because the `method` argument of fillna() is deprecated (since pandas 2.1).
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,0.524632,0.250371,0.618978
1,-0.916691,-0.612769,1.383619
2,0.060633,-0.612769,-1.246755
3,0.4718,-0.612769,-0.803812
4,0.973065,,-0.803812
5,-2.380847,,-0.803812


In [54]:
# Rebind `data` to a small float Series with NaNs at positions 1 and 3; it is
# used to demonstrate filling with a statistic.
data = pd.Series(
    data=[1., np.nan, 3.5, np.nan, 7],
)
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [55]:
# Fill the gaps with the Series mean. mean() skips NaN, so the fill value is
# (1 + 3.5 + 7) / 3, i.e. about 3.833333.
data.fillna(value=data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64