In [2]:
from io import StringIO
import pandas as pd
import numpy as np

# Small in-memory CSV used to demonstrate missing-value handling:
# row 2 has an empty field in column C and row 3 has no value for column d,
# so both are read as NaN.
cvs_data = '''A,B,C,d
              1.0,2.0,3.0,4.0
              5.0,6.0,,8.0
              0.0,11.0,12.0'''
# The data rows are indented inside the triple-quoted string, so every record
# starts with spaces. skipinitialspace=True strips them explicitly instead of
# relying on the numeric converter tolerating leading whitespace.
df = pd.read_csv(StringIO(cvs_data), skipinitialspace=True)
df

Unnamed: 0,A,B,C,d
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,0.0,11.0,12.0,


In [3]:
# Count the missing (NaN) entries in each column.
df.isna().sum()

A    0
B    0
C    1
d    1
dtype: int64

In [4]:
# `.values` exposes the DataFrame's data as a NumPy array (matrix/array form),
# which is the form that can then be passed to scikit-learn.
df.values

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [ 0., 11., 12., nan]])

In [5]:
# Drop every sample (row) that contains at least one NaN.
df.dropna(axis=0, how='any')

Unnamed: 0,A,B,C,d
0,1.0,2.0,3.0,4.0


In [6]:
# Switching the axis drops the columns (features) that contain NaN instead.
df.dropna(axis='columns')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,0.0,11.0


In [7]:
# Drop only the samples (rows) in which every feature is NaN.
df.dropna(axis=0, how='all')

Unnamed: 0,A,B,C,d
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,0.0,11.0,12.0,


In [8]:
# Drop samples by threshold: keep only rows with at least 4 non-NaN values.
df.dropna(axis=0, thresh=4)

Unnamed: 0,A,B,C,d
0,1.0,2.0,3.0,4.0


In [9]:
# Drop only the samples (rows) that have NaN in specific columns (here: 'C').
df.dropna(axis=0, subset=['C'])

Unnamed: 0,A,B,C,d
0,1.0,2.0,3.0,4.0
2,0.0,11.0,12.0,


In [11]:
# Imputation: fill in the missing values according to a chosen strategy.
#
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22. Its replacement, `sklearn.impute.SimpleImputer`, always
# imputes column-wise (the old `axis=0` behaviour), so it takes no `axis`
# argument, and the missing-value marker is `np.nan` rather than the string 'NaN'.
from sklearn.impute import SimpleImputer

# Replace each NaN with the mean of its feature (column). The side effect is
# that the data no longer follows the original sample distribution; a commonly
# used alternative strategy is 'most_frequent'.
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
# Fit on the plain array so that fitting and the later `transform(df.values)`
# see the same input type (avoids a feature-name mismatch warning).
imr = imr.fit(df.values)

In [12]:
# Apply the imputer `imr` (created in an earlier cell) to the raw array behind
# `df`, then print the resulting array.
imputed_data = imr.transform(df.values)
print(imputed_data)

[[ 1.   2.   3.   4. ]
 [ 5.   6.   7.5  8. ]
 [ 0.  11.  12.   6. ]]


In [16]:
# Handling categorical data: a toy frame with string-valued features
# (color, size), a numeric price and a class label.
# NOTE: this rebinds `df`, replacing the missing-value example frame.
df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class3'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class3


In [17]:
# 1. Mapping an ordinal feature: a feature such as `size` is non-numeric but
#    has an inherent order; here we assume XL = L + 1 = M + 2.
size_mapping = {
    'XL': 3,
    'L': 2,
    'M': 1
}
# Values that are not keys of size_mapping (including integers produced by a
# previous run of this cell) are passed through unchanged, so re-running the
# cell is a no-op instead of turning the whole column into NaN.
df['size'] = df['size'].map(lambda label: size_mapping.get(label, label))
# To restore the original labels, invert the mapping:
# inv_size_mapping = {v: k for k, v in size_mapping.items()}
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class3
