# 缺失值的判断与处理

In [1]:
import pandas as pd
import numpy as np

In [4]:
# What is a missing value, and what is its type?
# Use the canonical lowercase `np.nan`: the `np.NaN` alias was removed in
# NumPy 2.0 and raises AttributeError there.
print(np.nan)
print(type(np.nan))

nan
<class 'float'>


In [10]:
# Load the dataset from CSV and preview its first 10 rows.
# The preview shows that the pm2.5 column contains missing values.
csv_path = './DataFolder/PRSA_data.csv'
data = pd.read_csv(csv_path)
data.head(n=10)

Unnamed: 0,No,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir
0,1,2010,1,1,0,,-21,-11.0,1021.0,NW,1.79,0,0
1,2,2010,1,1,1,,-21,-12.0,1020.0,NW,4.92,0,0
2,3,2010,1,1,2,,-21,-11.0,1019.0,NW,6.71,0,0
3,4,2010,1,1,3,,-21,-14.0,1019.0,NW,9.84,0,0
4,5,2010,1,1,4,,-20,-12.0,1018.0,NW,12.97,0,0
5,6,2010,1,1,5,,-19,-10.0,1017.0,NW,16.1,0,0
6,7,2010,1,1,6,,-19,-9.0,1017.0,NW,19.23,0,0
7,8,2010,1,1,7,,-19,-9.0,1017.0,NW,21.02,0,0
8,9,2010,1,1,8,,-19,-9.0,1017.0,NW,24.15,0,0
9,10,2010,1,1,9,,-20,-8.0,1017.0,NW,27.28,0,0


## 判断数据中是否有缺失值
* 使用isnull()函数和notnull()函数

In [18]:
# isnull() marks every missing cell with True (notnull() is its exact inverse).
# Build the boolean mask once and reuse it instead of recomputing it per call.
null_mask = pd.isnull(data)
# np.any is True when at least one element of the mask is True,
# i.e. the data contains at least one missing value.
print(np.any(null_mask.values))
# np.all is True only when every element of the mask is True,
# i.e. every single value is missing; False means some values are present.
print(np.all(null_mask.values))
null_mask

True
False


Unnamed: 0,No,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir
0,False,False,False,False,False,True,False,False,False,False,False,False,False
1,False,False,False,False,False,True,False,False,False,False,False,False,False
2,False,False,False,False,False,True,False,False,False,False,False,False,False
3,False,False,False,False,False,True,False,False,False,False,False,False,False
4,False,False,False,False,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
43819,False,False,False,False,False,False,False,False,False,False,False,False,False
43820,False,False,False,False,False,False,False,False,False,False,False,False,False
43821,False,False,False,False,False,False,False,False,False,False,False,False,False
43822,False,False,False,False,False,False,False,False,False,False,False,False,False


## 处理缺失值
* 删除缺失值dropna()
* 替换缺失值fillna(value, inplace=True)<br />
第一个参数value表示用来填充缺失值的内容；第二个参数inplace表示是否直接在原对象上修改（True：原地修改并返回None；False：保持原对象不变，返回填充后的新对象）

### 删除缺失值dropna()

In [21]:
# Drop every row that has at least one missing value; the result is a new
# DataFrame bound to data_noNan.
data_noNan = data.dropna(axis=0, how="any")
# Check again whether any missing value remains after dropping.
still_missing = np.any(data_noNan.isnull())
print(still_missing)
data_noNan

False


Unnamed: 0,No,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir
24,25,2010,1,2,0,129.0,-16,-4.0,1020.0,SE,1.79,0,0
25,26,2010,1,2,1,148.0,-15,-4.0,1020.0,SE,2.68,0,0
26,27,2010,1,2,2,159.0,-11,-5.0,1021.0,SE,3.57,0,0
27,28,2010,1,2,3,181.0,-7,-5.0,1022.0,SE,5.36,1,0
28,29,2010,1,2,4,138.0,-7,-5.0,1022.0,SE,6.25,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
43819,43820,2014,12,31,19,8.0,-23,-2.0,1034.0,NW,231.97,0,0
43820,43821,2014,12,31,20,10.0,-22,-3.0,1034.0,NW,237.78,0,0
43821,43822,2014,12,31,21,10.0,-22,-3.0,1034.0,NW,242.70,0,0
43822,43823,2014,12,31,22,8.0,-22,-4.0,1034.0,NW,246.72,0,0


### 替换缺失值fillna()

In [23]:
# Preview the first five rows: the pm2.5 column contains missing values.
data.head(n=5)

Unnamed: 0,No,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir
0,1,2010,1,1,0,,-21,-11.0,1021.0,NW,1.79,0,0
1,2,2010,1,1,1,,-21,-12.0,1020.0,NW,4.92,0,0
2,3,2010,1,1,2,,-21,-11.0,1019.0,NW,6.71,0,0
3,4,2010,1,1,3,,-21,-14.0,1019.0,NW,9.84,0,0
4,5,2010,1,1,4,,-20,-12.0,1018.0,NW,12.97,0,0


In [25]:
# Replace the missing pm2.5 values with the column mean.
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selection data["pm2.5"]: in-place mutation of a selected column is chained
# assignment, which warns in pandas 2.x and no longer updates `data` under
# Copy-on-Write.
pm25_mean = data["pm2.5"].mean()
data["pm2.5"] = data["pm2.5"].fillna(pm25_mean)
data.head(10)

Unnamed: 0,No,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir
0,1,2010,1,1,0,98.613215,-21,-11.0,1021.0,NW,1.79,0,0
1,2,2010,1,1,1,98.613215,-21,-12.0,1020.0,NW,4.92,0,0
2,3,2010,1,1,2,98.613215,-21,-11.0,1019.0,NW,6.71,0,0
3,4,2010,1,1,3,98.613215,-21,-14.0,1019.0,NW,9.84,0,0
4,5,2010,1,1,4,98.613215,-20,-12.0,1018.0,NW,12.97,0,0
5,6,2010,1,1,5,98.613215,-19,-10.0,1017.0,NW,16.1,0,0
6,7,2010,1,1,6,98.613215,-19,-9.0,1017.0,NW,19.23,0,0
7,8,2010,1,1,7,98.613215,-19,-9.0,1017.0,NW,21.02,0,0
8,9,2010,1,1,8,98.613215,-19,-9.0,1017.0,NW,24.15,0,0
9,10,2010,1,1,9,98.613215,-20,-8.0,1017.0,NW,27.28,0,0


### 综合判断每一列是否有缺失值

In [28]:
# Reload the raw data, then check every column for missing values and fill
# them where found.
data = pd.read_csv('./DataFolder/PRSA_data.csv')
for col in data.columns:
    if data[col].isnull().any():
        print(col, "有缺失值")
        if pd.api.types.is_numeric_dtype(data[col]):
            # Numeric column: fill with the column mean.
            fill_value = data[col].mean()
        else:
            # A mean is undefined for non-numeric columns (calling .mean()
            # would raise), so fall back to the most frequent value.
            modes = data[col].mode()
            if modes.empty:
                # Column is entirely missing: nothing sensible to fill with.
                continue
            fill_value = modes.iloc[0]
        # Assign back rather than fillna(inplace=True) on the selection, which
        # is chained assignment and does not update `data` under Copy-on-Write.
        data[col] = data[col].fillna(fill_value)
    else:
        print("Nothing happened!")

Nothing happened!
Nothing happened!
Nothing happened!
Nothing happened!
Nothing happened!
pm2.5 有缺失值
Nothing happened!
Nothing happened!
Nothing happened!
Nothing happened!
Nothing happened!
Nothing happened!
Nothing happened!


In [29]:
# Show the first 10 pm2.5 values to confirm the missing entries were filled.
pm25_column = data["pm2.5"]
pm25_column.head(n=10)

0    98.613215
1    98.613215
2    98.613215
3    98.613215
4    98.613215
5    98.613215
6    98.613215
7    98.613215
8    98.613215
9    98.613215
Name: pm2.5, dtype: float64