In [1]:
import numpy as np
import pandas as pd
from numpy import nan as NA
from pandas import DataFrame
from pandas import Series

In [2]:
# pandas marks missing data with the floating-point value NaN (Not a Number).
# NaN is only a sentinel: it exists so that missing entries can be detected.
string_data = pd.Series(
    data=["aardvark", "artichoke", np.nan, "avocado"],
)
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [3]:
# Boolean mask: True exactly where string_data holds a missing value.
string_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [4]:
# -------------- 1. Filtering Out Missing Data --------------

In [5]:
# NA is the alias for numpy's NaN, imported together with the other imports
# in the first cell of the notebook.
data = pd.Series([1, NA, 3.5, NA, 7])
# dropna() returns a new Series that keeps only the non-missing entries;
# the surviving rows keep their original index labels.
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [6]:
# Same result as dropna(), written as boolean indexing with a not-missing mask.
data[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [7]:
# Sample frame for the DataFrame.dropna examples: by default dropna removes
# every row that contains at least one missing value.
data = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],
        [1.0, NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3.0],
    ]
)
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [8]:
# Default behaviour spelled out: a row is dropped as soon as it has any NA.
cleaned = data.dropna(how="any")
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [9]:
# With how="all" only the rows whose values are all NA are removed.
data.dropna(axis=0, how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [10]:
# Sample frame for the `thresh` example: 7x3 standard-normal draws with the
# first four values of column 1 and the first two values of column 2 blanked
# out, so rows carry one, two or three non-NA values.
# NOTE(review): no random seed is set in the visible cells, so the numbers
# shown differ from run to run.
df = pd.DataFrame(np.random.standard_normal(size=(7, 3)))
df.iloc[:2, 2] = NA
df.iloc[:4, 1] = NA
df

Unnamed: 0,0,1,2
0,0.756224,,
1,1.369152,,
2,0.186491,,0.289922
3,-1.062089,,-0.835359
4,-1.582916,-1.986572,-0.50918
5,-0.795622,1.272695,0.432698
6,-1.729617,-1.100878,0.537771


In [14]:
# thresh=2 keeps only the rows that have at least 2 non-NA values.
df.dropna(axis=0, thresh=2)

Unnamed: 0,0,1,2
2,0.186491,,0.289922
3,-1.062089,,-0.835359
4,-1.582916,-1.986572,-0.50918
5,-0.795622,1.272695,0.432698
6,-1.729617,-1.100878,0.537771


In [15]:
# -------------- 2. Filling In Missing Data --------------

In [16]:
# Replace every missing value with one constant; a new frame is returned.
df.fillna(value=0)

Unnamed: 0,0,1,2
0,0.756224,0.0,0.0
1,1.369152,0.0,0.0
2,0.186491,0.0,0.289922
3,-1.062089,0.0,-0.835359
4,-1.582916,-1.986572,-0.50918
5,-0.795622,1.272695,0.432698
6,-1.729617,-1.100878,0.537771


In [17]:
# A dict passed to fillna maps column label -> fill value, so each column
# can get its own replacement (column 1 -> 0.5, column 2 -> 0).
df.fillna(value={1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,0.756224,0.5,0.0
1,1.369152,0.5,0.0
2,0.186491,0.5,0.289922
3,-1.062089,0.5,-0.835359
4,-1.582916,-1.986572,-0.50918
5,-0.795622,1.272695,0.432698
6,-1.729617,-1.100878,0.537771


In [18]:
# fillna normally returns a new object; inplace=True modifies the existing
# frame instead. The call's return value is bound to the throwaway name `_`.
# Caution: from here on `df` no longer contains any NA, so re-running the
# earlier cells that display or fill `df` would give different results.
_ = df.fillna(value=100, inplace=True)
df

Unnamed: 0,0,1,2
0,0.756224,100.0,100.0
1,1.369152,100.0,100.0
2,0.186491,100.0,0.289922
3,-1.062089,100.0,-0.835359
4,-1.582916,-1.986572,-0.50918
5,-0.795622,1.272695,0.432698
6,-1.729617,-1.100878,0.537771


In [19]:
# Fresh 6x3 frame of standard-normal draws for the forward-fill examples:
# column 1 is missing from row 2 onward, column 2 from row 4 onward.
# NOTE(review): no random seed is set in the visible cells, so the numbers
# shown differ from run to run.
df = pd.DataFrame(np.random.standard_normal(size=(6, 3)))
df.iloc[4:, 2] = NA
df.iloc[2:, 1] = NA
df

Unnamed: 0,0,1,2
0,-0.86228,-0.747455,-0.189901
1,0.831789,1.132886,0.736785
2,-0.194958,,-2.883704
3,0.639222,,1.359501
4,-0.531625,,
5,0.278751,,


In [20]:
# Forward fill: each missing value takes the last valid value above it in the
# same column. DataFrame.ffill() is used instead of fillna(method='ffill')
# because the `method` keyword of fillna is deprecated in recent pandas
# releases; the result is the same.
df.ffill()

Unnamed: 0,0,1,2
0,-0.86228,-0.747455,-0.189901
1,0.831789,1.132886,0.736785
2,-0.194958,1.132886,-2.883704
3,0.639222,1.132886,1.359501
4,-0.531625,1.132886,1.359501
5,0.278751,1.132886,1.359501


In [26]:
# limit=2 caps the number of consecutive missing values that are forward
# filled; anything further down the same gap stays NA.
# ffill(limit=...) replaces the deprecated fillna(method='ffill', limit=...).
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,-0.86228,-0.747455,-0.189901
1,0.831789,1.132886,0.736785
2,-0.194958,1.132886,-2.883704
3,0.639222,1.132886,1.359501
4,-0.531625,,1.359501
5,0.278751,,1.359501


In [24]:
data = pd.Series([1.0, NA, 3.5, NA, 7])
# Fill the gaps with a statistic of the Series itself; here its mean, which
# is computed over the non-missing values (the median could be used instead).
data.fillna(value=data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64