In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame,Series

### 处理丢失数据
- 有两种丢失数据：
    - None
    - np.nan(NaN)

- 两种丢失数据的区别
    - 数据分析中使用的空值 np.nan 是 float 类型，而 None 是对象类型（NoneType）
    - 对象类型的空值（None）不可以直接参与运算（会抛出 TypeError），而 nan 可以直接参与运算（运算结果仍为 nan）

In [3]:
# np.nan is a regular float value, which is why it can take part in arithmetic.
type(np.nan)

float

In [4]:
# None has its own type (NoneType); it is a plain Python object, not a number.
type(None)

NoneType

In [5]:
# Arithmetic with NaN does not raise; the result is simply NaN again.
np.nan + 1

nan

In [6]:
# None is an object, not a number, so arithmetic on it raises TypeError.
# The error is the point of this cell, so catch it and show its message;
# an uncaught exception would stop "Restart & Run All" at this cell.
try:
    None + 1
except TypeError as exc:
    none_add_error = f"{type(exc).__name__}: {exc}"
    print(none_add_error)

TypeError: unsupported operand type(s) for +: 'NoneType' and 'int'

### pandas处理空值操作
- isnull
- notnull
- any
- all
- dropna
- fillna

In [8]:
# Build a 6x7 frame of random integers in [0, 100), then plant three missing values.
df = DataFrame(data=np.random.randint(0,100,size=(6,7)))
# Columns 2 and 5 are about to receive missing values. An integer column cannot
# hold NaN, so cast them to float explicitly instead of relying on the silent
# int -> float upcasting on assignment, which newer pandas versions deprecate.
df = df.astype({2: float, 5: float})
df.iloc[1,2] = np.nan
df.iloc[4,2] = None  # None assigned into a float column is stored as NaN
df.iloc[5,5] = None
df

Unnamed: 0,0,1,2,3,4,5,6
0,72,73,2.0,31,4,45.0,85
1,95,62,,71,98,17.0,50
2,20,72,62.0,52,75,63.0,59
3,36,9,38.0,68,15,56.0,16
4,62,72,,68,55,54.0,82
5,93,3,80.0,93,54,,86


In [9]:
# dropna removes every row (or column) that contains a missing value.
df.dropna(axis='index')  # 'index' is the named form of axis 0: drop rows

Unnamed: 0,0,1,2,3,4,5,6
0,72,73,2.0,31,4,45.0,85
2,20,72,62.0,52,75,63.0,59
3,36,9,38.0,68,15,56.0,16


In [10]:
# Drop every column that contains a missing value ('columns' is the named form of axis 1).
df.dropna(axis='columns')

Unnamed: 0,0,1,3,4,6
0,72,73,31,4,85
1,95,62,71,98,50
2,20,72,52,75,59
3,36,9,68,15,16
4,62,72,68,55,82
5,93,3,93,54,86


- 对缺失值进行覆盖
    - fillna

In [12]:
# Replace every missing value with a fixed sentinel value; the result is a new frame.
df.fillna(-999)

Unnamed: 0,0,1,2,3,4,5,6
0,72,73,2.0,31,4,45.0,85
1,95,62,-999.0,71,98,17.0,50
2,20,72,62.0,52,75,63.0,59
3,36,9,38.0,68,15,56.0,16
4,62,72,-999.0,68,55,54.0,82
5,93,3,80.0,93,54,-999.0,86


In [13]:
# df still contains its missing values: the fillna call above returned a new frame.
df

Unnamed: 0,0,1,2,3,4,5,6
0,72,73,2.0,31,4,45.0,85
1,95,62,,71,98,17.0,50
2,20,72,62.0,52,75,63.0,59
3,36,9,38.0,68,15,56.0,16
4,62,72,,68,55,54.0,82
5,93,3,80.0,93,54,,86


In [15]:
# Fill each missing value with the next valid value below it in the same column
# (backward fill along the rows). DataFrame.bfill replaces the deprecated
# fillna(method='bfill') spelling. A missing value with no valid value below it
# stays missing.
df.bfill(axis=0)

Unnamed: 0,0,1,2,3,4,5,6
0,72,73,2.0,31,4,45.0,85
1,95,62,62.0,71,98,17.0,50
2,20,72,62.0,52,75,63.0,59
3,36,9,38.0,68,15,56.0,16
4,62,72,80.0,68,55,54.0,82
5,93,3,80.0,93,54,,86


- 什么时候用dropna什么时候用fillna
    - 尽量使用dropna；如果删除的成本比较高（例如会丢失大量数据），则使用fillna

- 使用空值对应列的均值或者中位数进行空值填充

In [24]:
# Fill the missing values of every column with that column's mean.
# df.mean() yields one mean per column (missing values are skipped), and fillna
# matches those values to the columns by label, so columns without missing
# values are left untouched.
# Assigning the result back replaces the chained `df[col].fillna(..., inplace=True)`
# pattern, which operates on a temporary Series and is not guaranteed to update df.
df = df.fillna(value=df.mean(numeric_only=True))

In [26]:
# Rebuild a fresh 6x7 frame of random integers in [0, 100) with three missing values.
df = DataFrame(data=np.random.randint(0,100,size=(6,7)))
# Columns 2 and 5 are about to receive missing values. An integer column cannot
# hold NaN, so cast them to float explicitly instead of relying on the silent
# int -> float upcasting on assignment, which newer pandas versions deprecate.
df = df.astype({2: float, 5: float})
df.iloc[1,2] = np.nan
df.iloc[4,2] = None  # None assigned into a float column is stored as NaN
df.iloc[5,5] = None
df

Unnamed: 0,0,1,2,3,4,5,6
0,46,68,35.0,15,51,90.0,39
1,95,52,,77,6,54.0,67
2,52,11,43.0,62,32,80.0,92
3,96,76,13.0,80,32,26.0,70
4,25,81,,87,13,57.0,35
5,36,85,94.0,20,68,,19


In [37]:
# Caveat: np.median does not skip missing values, so a Series that contains
# a missing value yields NaN as its median.
s = Series([1, 2, 3, None, 4, 5, 6])
np.median(s)

nan

In [47]:
# Fill the missing values of every column with that column's median.
# df.median() skips missing values, so it gives the same number as taking
# np.median over only the non-null entries of each column.
# Assigning the result back replaces the chained `df[col].fillna(..., inplace=True)`
# pattern, which operates on a temporary Series and is not guaranteed to update df.
df = df.fillna(value=df.median(numeric_only=True))
        

### 处理重复数据

In [50]:
# 6x4 frame of random integers in [0, 100); rows 1, 3 and 5 are then overwritten
# with all ones so that the frame contains three identical rows.
df = DataFrame(np.random.randint(0, 100, size=(6, 4)))
df.iloc[[1, 3, 5]] = 1
df

Unnamed: 0,0,1,2,3
0,51,12,33,75
1,1,1,1,1
2,92,83,11,83
3,1,1,1,1
4,62,94,94,18
5,1,1,1,1


In [53]:
# Drop fully duplicated rows; keep='last' retains the last occurrence of each duplicated row.
df.drop_duplicates(keep='last')

Unnamed: 0,0,1,2,3
0,51,12,33,75
2,92,83,11,83
4,62,94,94,18
5,1,1,1,1


### 处理异常数据
- 自定义一个1000行3列（A，B，C）、取值范围为[0, 1)的数据源，然后将C列中大于该列两倍标准差的值视为异常值并进行清洗

In [55]:
# 1000 rows x 3 columns (A, B, C) of random floats drawn from [0, 1).
df = DataFrame(np.random.random((1000, 3)), columns=list('ABC'))
df.head()

Unnamed: 0,A,B,C
0,0.118582,0.872855,0.259485
1,0.846661,0.050873,0.475173
2,0.608949,0.006414,0.652965
3,0.28983,0.364659,0.253954
4,0.535705,0.907904,0.651121


In [57]:
# Outlier threshold: twice the standard deviation of column C.
# NOTE(review): the threshold is compared against the raw C values further down,
# not against their distance from the column mean — confirm that this is intended.
std_twice = 2 * df['C'].std()
std_twice

0.5973257466810663

In [60]:
# The rows to remove are those whose C value exceeds the threshold;
# select them first so they can be inspected.
df[df['C'].gt(std_twice)]

Unnamed: 0,A,B,C
2,0.608949,0.006414,0.652965
4,0.535705,0.907904,0.651121
5,0.008901,0.000220,0.680910
6,0.065221,0.644572,0.864630
8,0.131324,0.448420,0.965676
14,0.013003,0.517938,0.732737
15,0.318996,0.908338,0.908482
18,0.313953,0.020498,0.710764
22,0.480828,0.747365,0.817816
29,0.370843,0.667232,0.615675


In [63]:
# Row labels of the outlier rows (C above the threshold).
drop_indexs = df[df['C'].gt(std_twice)].index
drop_indexs

Int64Index([  2,   4,   5,   6,   8,  14,  15,  18,  22,  29,
            ...
            981, 982, 984, 986, 990, 991, 994, 995, 996, 999],
           dtype='int64', length=427)

In [64]:
# Remove the outlier rows by their row labels.
# Rebinding df to the result (instead of drop(..., inplace=True)) together with
# errors='ignore' keeps this cell re-runnable: on a second run the labels are
# already gone, which would otherwise raise KeyError.
df = df.drop(index=drop_indexs, errors='ignore')

In [65]:
# Show the rows that remain after the outlier rows were dropped.
df

Unnamed: 0,A,B,C
0,0.118582,0.872855,0.259485
1,0.846661,0.050873,0.475173
3,0.289830,0.364659,0.253954
7,0.499094,0.607221,0.398592
9,0.989946,0.748047,0.421730
10,0.302302,0.813178,0.306542
11,0.914834,0.715107,0.155623
12,0.714053,0.301937,0.076535
13,0.916246,0.561781,0.155916
16,0.184803,0.853697,0.044904
