In [1]:
import numpy as np
import pandas as pd

******
## 目录


**一、缺失值的统计和删除**

* 1. 缺失信息的统计
* 2. 缺失信息的删除


**二、缺失值的填充和插值**

* 1. 利用fillna进行填充
* 2. 插值函数

**三、Nullable类型**

* 1. 缺失记号及其缺陷
* 2. Nullable类型的性质
* 3. 缺失数据的计算和分组


**四、练习**

* Ex1：缺失值与类别的相关性检验
* Ex2：用回归模型解决分类问题

## 正式学习内容

******

In [3]:

# Load only the columns used in this chapter from the course dataset.
df = pd.read_csv(
    '../data/learn_pandas.csv',
    usecols=['Grade', 'Name', 'Gender', 'Height', 'Weight', 'Transfer'],
)
df.head()

Unnamed: 0,Grade,Name,Gender,Height,Weight,Transfer
0,Freshman,Gaopeng Yang,Female,158.9,46.0,N
1,Freshman,Changqiang You,Male,166.5,70.0,N
2,Senior,Mei Sun,Male,188.9,89.0,N
3,Sophomore,Xiaojuan Sun,Female,,41.0,N
4,Sophomore,Gaojuan You,Male,174.0,74.0,N


### 一、缺失值的统计和删除

#### 1.1. 缺失信息的统计

* 【例1】缺失数据查看 isna、isnull
* 【例2】统计每列缺失数据比例 mean+isna
* 【例3】结合isna（notna）和布尔索引查看某一列（非）缺失行
* 【例4】同时多列查看缺失信息 isna, notna和any, all



In [6]:
# Example 1: element-wise missing-value mask (True marks a missing cell).
# `isnull` is an alias of `isna`, so either spelling gives the same frame.
na_mask = df.isna()
na_mask.head()

Unnamed: 0,Grade,Name,Gender,Height,Weight,Transfer
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,True,False,False
4,False,False,False,False,False,False


In [8]:
# Example 2: fraction of missing values per column.
# The mean of a boolean mask equals the share of True entries.
missing_ratio = df.isna().mean()
missing_ratio

Grade       0.000
Name        0.000
Gender      0.000
Height      0.085
Weight      0.055
Transfer    0.060
dtype: float64

In [23]:
# Example 3: combine isna with boolean indexing to show the rows
# whose Height is missing.
height_is_missing = df['Height'].isna()
df.loc[height_is_missing].head()

Unnamed: 0,Grade,Name,Gender,Height,Weight,Transfer
3,Sophomore,Xiaojuan Sun,Female,,41.0,N
12,Senior,Peng You,Female,,48.0,
26,Junior,Yanli You,Female,,48.0,N
36,Freshman,Xiaojuan Qin,Male,,79.0,Y
60,Freshman,Yanpeng Lv,Male,,65.0,N


In [22]:
# Example 3 (continued): combine notna with boolean indexing to show the rows
# whose Transfer is NOT missing.
transfer_is_present = df['Transfer'].notna()
df.loc[transfer_is_present].head()

Unnamed: 0,Grade,Name,Gender,Height,Weight,Transfer
0,Freshman,Gaopeng Yang,Female,158.9,46.0,N
1,Freshman,Changqiang You,Male,166.5,70.0,N
2,Senior,Mei Sun,Male,188.9,89.0,N
3,Sophomore,Xiaojuan Sun,Female,,41.0,N
4,Sophomore,Gaojuan You,Male,174.0,74.0,N


In [38]:
# Example 4: rows in which Height, Weight and Transfer are ALL missing
# (isna + all). The axis is passed by keyword so the reduction direction
# (across the columns of each row) is explicit.
allna_idx = df[['Height', 'Weight', 'Transfer']].isna().all(axis=1)

# Number of such rows; Series.sum() counts the True entries without
# iterating the mask in Python like the builtin sum() does.
allna_idx.sum()

1

In [36]:
# Rows where 'Height', 'Weight' and 'Transfer' are all missing.
df.loc[allna_idx]

Unnamed: 0,Grade,Name,Gender,Height,Weight,Transfer
102,Junior,Chengli Zhao,Male,,,


In [39]:
# Example 4 (continued): rows in which AT LEAST ONE of Height, Weight and
# Transfer is missing (isna + any).
# `axis` must be passed by keyword: DataFrame.any accepts it keyword-only in
# pandas >= 2.0, so the positional form `any(1)` raises a TypeError there.
anyna_idx = df[['Height', 'Weight', 'Transfer']].isna().any(axis=1)

# Number of such rows, counted with the vectorized Series.sum().
anyna_idx.sum()

35

In [40]:
# Rows where at least one of 'Height', 'Weight', 'Transfer' is missing.
df.loc[anyna_idx]

Unnamed: 0,Grade,Name,Gender,Height,Weight,Transfer
3,Sophomore,Xiaojuan Sun,Female,,41.0,N
9,Junior,Juan Xu,Female,164.8,,N
12,Senior,Peng You,Female,,48.0,
21,Senior,Xiaopeng Shen,Male,166.0,62.0,
26,Junior,Yanli You,Female,,48.0,N
36,Freshman,Xiaojuan Qin,Male,,79.0,Y
60,Freshman,Yanpeng Lv,Male,,65.0,N
61,Sophomore,Xiaopeng Qin,Male,172.8,,N
69,Junior,Chunquan Xu,Female,162.1,54.0,
76,Sophomore,Yanquan Lv,Male,174.6,,N


notna的使用和isna相同，不重复举例

#### 1.2. 缺失信息的删除


丢弃缺失信息的pandas函数：

* **df.dropna**(axis=0, how='any', thresh=None, subset=None, inplace=False)

    
    参数说明  

> * axis：{0 or 'index', 1 or 'columns'} 丢弃空值的维度    
> * how ：default any 丢弃数据的方式，any or all
> * thresh :  int, optional 只保留至少含有 thresh 个非空值的行（或列），非空值个数不足 thresh 的会被丢弃   
> * subset ： 作为参考的子集，默认dropna是考虑整个数据的，举例，如果是删除行的话，参考子集可以设置为列名的list
> * inplace ：bool, default False，是否替换原数据


In [49]:
# Small demo frame with missing entries in an object column (NaN)
# and in a datetime column (NaT).
df_tmp = pd.DataFrame(
    {
        "name": ['Alfred', 'Batman', 'Catwoman'],
        "toy": [np.nan, 'Batmobile', 'Bullwhip'],
        "born": [pd.NaT, pd.Timestamp("1940-04-25"), pd.NaT],
    }
)
df_tmp.head()

Unnamed: 0,name,toy,born
0,Alfred,,NaT
1,Batman,Batmobile,1940-04-25
2,Catwoman,Bullwhip,NaT


In [50]:
# Example 1: dropna with its defaults spelled out — drop every row
# (axis=0) that contains any missing value (how='any').
df_tmp.dropna(axis=0, how='any')

Unnamed: 0,name,toy,born
1,Batman,Batmobile,1940-04-25


In [51]:
# Example 2: axis=1 (alias 'columns') — drop every column that contains
# a missing value.
df_tmp.dropna(axis='columns')

Unnamed: 0,name
0,Alfred
1,Batman
2,Catwoman


In [52]:
# Example 3: how='all' drops only the rows in which every value is missing.
# Rebuild the demo frame with a fourth row that is entirely missing.
df_tmp = pd.DataFrame(
    {
        "name": ['Alfred', 'Batman', 'Catwoman', np.nan],
        "toy": [np.nan, 'Batmobile', 'Bullwhip', np.nan],
        "born": [pd.NaT, pd.Timestamp("1940-04-25"), pd.NaT, pd.NaT],
    }
)
df_tmp.head()

Unnamed: 0,name,toy,born
0,Alfred,,NaT
1,Batman,Batmobile,1940-04-25
2,Catwoman,Bullwhip,NaT
3,,,NaT


In [53]:
# Only the row whose values are all missing is removed.
df_tmp.dropna(axis=0, how='all')

Unnamed: 0,name,toy,born
0,Alfred,,NaT
1,Batman,Batmobile,1940-04-25
2,Catwoman,Bullwhip,NaT


In [60]:
# Example 4: thresh — keep a row only if it has at least `thresh`
# non-missing values; rows with fewer are dropped.
df_tmp.dropna(axis=0, thresh=2)

Unnamed: 0,name,toy,born
1,Batman,Batmobile,1940-04-25
2,Catwoman,Bullwhip,NaT


In [61]:
# With thresh=1 a row survives as soon as it has one non-missing value.
df_tmp.dropna(axis='index', thresh=1)

Unnamed: 0,name,toy,born
0,Alfred,,NaT
1,Batman,Batmobile,1940-04-25
2,Catwoman,Bullwhip,NaT


In [64]:
# Column-wise: keep the columns that have at least 2 non-missing values.
df_tmp.dropna(axis='columns', thresh=2)

Unnamed: 0,name,toy
0,Alfred,
1,Batman,Batmobile
2,Catwoman,Bullwhip
3,,


In [65]:
# Column-wise: keep the columns that have at least 1 non-missing value.
df_tmp.dropna(axis='columns', thresh=1)

Unnamed: 0,name,toy,born
0,Alfred,,NaT
1,Batman,Batmobile,1940-04-25
2,Catwoman,Bullwhip,NaT
3,,,NaT


In [66]:
# Example 5: subset — only the listed columns are inspected when deciding
# which rows to drop; here rows with a missing 'toy' are removed.
df_tmp.dropna(axis=0, subset=['toy'])

Unnamed: 0,name,toy,born
1,Batman,Batmobile,1940-04-25
2,Catwoman,Bullwhip,NaT


【注】dropna的一些操作也可以直接通过 布尔索引 来完成

In [69]:
# Drop the rows where at least one of 'toy' and 'born' is missing, i.e. keep
# the rows where both are present.
toy_and_born_present = df_tmp[['toy', 'born']].notna().all(axis=1)
df_tmp.loc[toy_and_born_present]


Unnamed: 0,name,toy,born
1,Batman,Batmobile,1940-04-25


In [77]:
# Boolean-indexing counterpart of df_tmp.dropna(thresh=2): keep the rows
# that have at least 2 non-missing values.
non_null_per_row = df_tmp.notna().sum(axis=1)
df_tmp.loc[non_null_per_row >= 2]

Unnamed: 0,name,toy,born
1,Batman,Batmobile,1940-04-25
2,Catwoman,Bullwhip,NaT


### 二、缺失值的填充和插值

####  2.1. 利用fillna进行填充

pandas中的填充函数

**df.fillna**(value=None,method=None,axis=None,inplace=False,limit=None,downcast=None,)

    参数说明
> * value=None,标量或者字典索引，填充空值的 值
> * method=None, 填充方法，有用前面的元素填充ffill和用后面的元素填充bfill两种类型  
> * axis=None,  
> * inplace=False,  是否替换原数据  
> * limit=None, 指定method时，limit表示连续缺失值的最大填充次数；只用value填充时，limit表示沿填充轴每列（或每行）最多填充的缺失值个数  
> * downcast=None, 

    

In [78]:
# Demo frame taken from the official pandas documentation.
df_f = pd.DataFrame(
    [
        [np.nan, 2, np.nan, 0],
        [3, 4, np.nan, 1],
        [np.nan, np.nan, np.nan, 5],
        [np.nan, 3, np.nan, 4],
    ],
    columns=['A', 'B', 'C', 'D'],
)
df_f.head()

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5
3,,3.0,,4


In [80]:
# Example 1: value — replace every missing entry with the scalar 0.
df_f.fillna(value=0)

Unnamed: 0,A,B,C,D
0,0.0,2.0,0.0,0
1,3.0,4.0,0.0,1
2,0.0,0.0,0.0,5
3,0.0,3.0,0.0,4


In [82]:
# Display df_f again after the fillna call above.
df_f.head(5)

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5
3,,3.0,,4


In [86]:
# Example 2: value — a dict maps each column name to its own fill value
# (here the lower-case column letter).
values = {column: column.lower() for column in 'ABCD'}
df_f.fillna(value=values)

Unnamed: 0,A,B,C,D
0,a,2,c,0
1,3,4,c,1
2,a,b,c,5
3,a,3,c,4


In [83]:
# Example 3: forward fill along each row (axis=1) — a missing cell takes the
# nearest non-missing value to its left.
# DataFrame.ffill is used because the `method` argument of fillna
# (fillna(method='ffill')) is deprecated in recent pandas releases.
df_f.ffill(axis=1)

Unnamed: 0,A,B,C,D
0,,2.0,2.0,0.0
1,3.0,4.0,4.0,1.0
2,,,,5.0
3,,3.0,3.0,4.0


In [84]:
# Forward fill down each column (axis=0) — a missing cell takes the nearest
# non-missing value above it. DataFrame.ffill replaces the deprecated
# fillna(method='ffill') spelling.
df_f.ffill(axis=0)

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,3.0,4.0,,5
3,3.0,3.0,,4


In [89]:
# Example 4: limit=1 with a scalar value — at most one missing entry per
# column is filled; the remaining missing entries in that column stay NaN.
df_f.fillna(value=0, limit=1)

Unnamed: 0,A,B,C,D
0,0.0,2.0,0.0,0
1,3.0,4.0,,1
2,,0.0,,5
3,,3.0,,4


####  2.2. 插值函数