### 处理缺失值的方法：
- 删掉有缺失值的行或者列(如果按列进行删除操作，训练数据和测试数据操作务必同步)
- 使用标准统计量（均值，分位数，众数...）填充法
- 使用简单的模型对缺失值进行填充（可能带入噪音）

In [1]:
import numpy as np
import pandas as pd

try:
    from sklearn.preprocessing import Imputer
except ImportError:
    # sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
    # SimpleImputer (default strategy "mean") is its replacement, so bind it
    # to the same name to keep the rest of the notebook working.
    from sklearn.impute import SimpleImputer as Imputer

In [2]:
# Fix the global NumPy seed so the toy dataset is identical on every run.
np.random.seed(333)
# 5 rows x 4 columns (A-D) of integers drawn from -10..10 (upper bound 11 is exclusive).
original_data = pd.DataFrame(
    data=np.random.randint(low=-10, high=11, size=(5, 4)),
    columns=['A', 'B', 'C', 'D'],
)

In [3]:
# Display the freshly generated frame (the notebook renders a bare last expression).
original_data

Unnamed: 0,A,B,C,D
0,2,2,3,-7
1,9,4,4,9
2,-3,10,2,10
3,-1,3,-5,-5
4,-2,-2,-3,9


In [4]:
# Baseline summary of the complete data, for comparison with the imputed
# versions later on: per-column means, column dtypes, then describe().
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# unlike the print statement, which is a syntax error on Python 3.
print(original_data.mean())
print(original_data.dtypes)
print('')  # blank separator line
print(original_data.describe())

A    1.0
B    3.4
C    0.2
D    3.2
dtype: float64
A    int32
B    int32
C    int32
D    int32
dtype: object

             A          B         C          D
count  5.00000   5.000000  5.000000   5.000000
mean   1.00000   3.400000  0.200000   3.200000
std    4.84768   4.335897  3.962323   8.438009
min   -3.00000  -2.000000 -5.000000  -7.000000
25%   -2.00000   2.000000 -3.000000  -5.000000
50%   -1.00000   3.000000  2.000000   9.000000
75%    2.00000   4.000000  3.000000   9.000000
max    9.00000  10.000000  4.000000  10.000000


### Set some values to NaN

In [5]:
# Work on an independent (deep) copy so original_data stays intact as the reference.
original_data_copy1 = original_data.copy(deep=True)


In [6]:
# Show the copy; at this point it still holds the same values as original_data.
original_data_copy1

Unnamed: 0,A,B,C,D
0,2,2,3,-7
1,9,4,4,9
2,-3,10,2,10
3,-1,3,-5,-5
4,-2,-2,-3,9


In [7]:
# (row label, column label) positions to turn into missing values.
cells_to_blank = [(0, 'A'), (1, 'C'), (3, 'D')]
for row, col in cells_to_blank:
    # Integer columns cannot hold NaN. Cast to float explicitly instead of
    # relying on pandas' silent upcast on assignment, which newer pandas
    # versions deprecate; the resulting frame is the same (A, C, D become
    # float64, B is untouched).
    original_data_copy1[col] = original_data_copy1[col].astype('float64')
    original_data_copy1.at[row, col] = np.nan

In [8]:
# Show the frame after blanking three cells; the affected columns now display as floats.
original_data_copy1

Unnamed: 0,A,B,C,D
0,,2,3.0,-7.0
1,9.0,4,,9.0
2,-3.0,10,2.0,10.0
3,-1.0,3,-5.0,
4,-2.0,-2,-3.0,9.0


In [9]:
# Summary statistics with the gaps in place: the count row drops to 4 for
# A, C and D, i.e. the missing entries are left out of the statistics.
original_data_copy1.describe()

Unnamed: 0,A,B,C,D
count,4.0,5.0,4.0,4.0
mean,0.75,3.4,-0.75,5.25
std,5.560276,4.335897,3.86221,8.180261
min,-3.0,-2.0,-5.0,-7.0
25%,-2.25,2.0,-3.5,5.0
50%,-1.5,3.0,-0.5,9.0
75%,1.5,4.0,2.25,9.25
max,9.0,10.0,3.0,10.0


### sklearn填充

In [10]:
# Mean imputation (the imputer's default strategy, spelled out here):
# learn each column's mean from the observed values, then fill the gaps.
imp = Imputer(strategy='mean')
after_imp = imp.fit(original_data_copy1).transform(original_data_copy1)

In [11]:
# Element-wise difference between the true values and the imputed matrix;
# it is non-zero only at the three cells that were blanked and then filled.
original_data.values-after_imp

array([[  1.25,   0.  ,   0.  ,   0.  ],
       [  0.  ,   0.  ,   4.75,   0.  ],
       [  0.  ,   0.  ,   0.  ,   0.  ],
       [  0.  ,   0.  ,   0.  , -10.25],
       [  0.  ,   0.  ,   0.  ,   0.  ]])

### 删除有空值的列或行

In [12]:
# Names of the columns that contain at least one missing value.
cols_with_missing = [col for col in original_data_copy1.columns
                     if original_data_copy1[col].isnull().any()]
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# unlike the print statement, which is a syntax error on Python 3.
print(cols_with_missing)
# Drop those columns entirely. drop() returns a new frame, so
# original_data_copy1 itself is left unchanged.
redued_original_data = original_data_copy1.drop(cols_with_missing, axis=1)

['A', 'C', 'D']


In [13]:
# Show what remains after dropping every column that had a gap (only B here).
redued_original_data

Unnamed: 0,B
0,2
1,4
2,10
3,3
4,-2


In [14]:
# Row-wise alternative: drop every row that has at least one missing value.
# The result is only displayed, not assigned to anything.
original_data_copy1.dropna()

Unnamed: 0,A,B,C,D
2,-3.0,10,2.0,10.0
4,-2.0,-2,-3.0,9.0


In [15]:
# Confirm that original_data_copy1 still holds all rows, columns and NaNs.
original_data_copy1

Unnamed: 0,A,B,C,D
0,,2,3.0,-7.0
1,9.0,4,,9.0
2,-3.0,10,2.0,10.0
3,-1.0,3,-5.0,
4,-2.0,-2,-3.0,9.0


In [16]:
# Work on a copy so the indicator columns added below do not end up in
# original_data_copy1.
original_data_copy2 = original_data_copy1.copy()
# Build the column list eagerly. A generator here would run its isnull()
# checks lazily while the loop below is adding columns to the same frame,
# and would be left exhausted afterwards.
cols_with_missing = [col for col in original_data_copy2.columns
                     if original_data_copy2[col].isnull().any()]
for col in cols_with_missing:
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(col)
    # Boolean indicator column: True where the value was missing before imputation.
    original_data_copy2[col + '_was_missing'] = original_data_copy2[col].isnull()

# Imputation
print(original_data_copy2)
print(original_data_copy2.describe())
# Fill the remaining NaNs using the imputer's default settings.
my_imputer = Imputer()
new_data = my_imputer.fit_transform(original_data_copy2)

A
C
D
     A   B    C     D A_was_missing C_was_missing D_was_missing
0  NaN   2  3.0  -7.0          True         False         False
1  9.0   4  NaN   9.0         False          True         False
2 -3.0  10  2.0  10.0         False         False         False
3 -1.0   3 -5.0   NaN         False         False          True
4 -2.0  -2 -3.0   9.0         False         False         False
              A          B        C          D
count  4.000000   5.000000  4.00000   4.000000
mean   0.750000   3.400000 -0.75000   5.250000
std    5.560276   4.335897  3.86221   8.180261
min   -3.000000  -2.000000 -5.00000  -7.000000
25%   -2.250000   2.000000 -3.50000   5.000000
50%   -1.500000   3.000000 -0.50000   9.000000
75%    1.500000   4.000000  2.25000   9.250000
max    9.000000  10.000000  3.00000  10.000000


In [17]:
# Show the imputed array: the NaNs are replaced by their column means and the
# boolean indicator columns appear as 0/1 floats.
new_data

array([[  0.75,   2.  ,   3.  ,  -7.  ,   1.  ,   0.  ,   0.  ],
       [  9.  ,   4.  ,  -0.75,   9.  ,   0.  ,   1.  ,   0.  ],
       [ -3.  ,  10.  ,   2.  ,  10.  ,   0.  ,   0.  ,   0.  ],
       [ -1.  ,   3.  ,  -5.  ,   5.25,   0.  ,   0.  ,   1.  ],
       [ -2.  ,  -2.  ,  -3.  ,   9.  ,   0.  ,   0.  ,   0.  ]])