In [359]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd

In [360]:
data = [[-1,2],[-0.5,6],[0,10],[1,18]]

pd.DataFrame(data)

Unnamed: 0,0,1
0,-1.0,2
1,-0.5,6
2,0.0,10
3,1.0,18


#### 实现归一化，收敛到[0,1]之间

In [361]:
scaler = MinMaxScaler()
scaler = scaler.fit(data)   #  这里本质是生成了 min(x), 和 max(x)
result = scaler.transform(data)
result

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [362]:
result_ = scaler.fit_transform(data)
result_

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [363]:
scaler.inverse_transform(result_)   # 将归一化后的数据进行反转

array([[-1. ,  2. ],
       [-0.5,  6. ],
       [ 0. , 10. ],
       [ 1. , 18. ]])

##### 使用MinMaxScaler的参数feature_range实现将数据归一化到[0,1]以外的范围中

In [364]:
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
scaler = MinMaxScaler(feature_range=[5,10])
scaler.fit_transform(data)


array([[ 5.  ,  5.  ],
       [ 6.25,  6.25],
       [ 7.5 ,  7.5 ],
       [10.  , 10.  ]])

#当X中的特征数量非常多的时候，fit会报错并表示，数据量太大了我计算不了
#此时使用partial_fit作为训练接口
#scaler = scaler.partial_fit(data)



###  标准化

In [365]:
from sklearn.preprocessing import StandardScaler
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]


In [366]:
scaler = StandardScaler()
scaler = scaler.fit(data)
scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [367]:
scaler.mean_

array([-0.125,  9.   ])

In [368]:
scaler.var_

array([ 0.546875, 35.      ])

In [369]:
x_ = scaler.transform(data)


In [370]:
x_.mean()

0.0

In [371]:
x_.std()

1.0

In [372]:
scaler.inverse_transform(x_)

array([[-1. ,  2. ],
       [-0.5,  6. ],
       [ 0. , 10. ],
       [ 1. , 18. ]])

## 缺失值的处理

In [373]:
pwd

'C:\\Users\\Sundasheng\\code\\caicai_ketang'

In [374]:
data = pd.read_csv("./cai_data/Narrativedata.csv",index_col = 0)
data.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,No
1,38.0,female,C,Yes
2,26.0,female,S,Yes
3,35.0,female,S,Yes
4,35.0,male,S,No


In [375]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
Age         714 non-null float64
Sex         891 non-null object
Embarked    889 non-null object
Survived    891 non-null object
dtypes: float64(1), object(3)
memory usage: 34.8+ KB


In [376]:
data.describe()

Unnamed: 0,Age
count,714.0
mean,29.699118
std,14.526497
min,0.42
25%,20.125
50%,28.0
75%,38.0
max,80.0


###  填充缺失值的类

class sklearn.impute.SimpleImputer (missing_values=nan, strategy=’mean’, fill_value=None, verbose=0,
copy=True)

#####   对年龄的缺失值 进行处理

In [377]:
data.loc[:,"Age"].values.shape

(891,)

In [378]:
data.loc[:,"Age"].values.reshape(-1,1).shape

(891, 1)

In [379]:
Age = data.loc[:,"Age"].values.reshape(-1,1)
type(Age)

numpy.ndarray

In [380]:
Age = data.loc[:,"Age"].values.reshape(-1,1) #sklearn当中特征矩阵必须是二维
Age[:20]

array([[22.],
       [38.],
       [26.],
       [35.],
       [35.],
       [nan],
       [54.],
       [ 2.],
       [27.],
       [14.],
       [ 4.],
       [58.],
       [20.],
       [39.],
       [14.],
       [55.],
       [ 2.],
       [nan],
       [31.],
       [nan]])

实例化 填充对象

In [381]:
from sklearn.impute import SimpleImputer

imp_mean = SimpleImputer()
imp_median = SimpleImputer(strategy="median")
imp_0 = SimpleImputer(strategy="constant",fill_value=0)


In [382]:
imp_mean_ = imp_mean.fit_transform(Age) #fit_transform一步完成调取结果
imp_mean_[:20]

array([[22.        ],
       [38.        ],
       [26.        ],
       [35.        ],
       [35.        ],
       [29.69911765],
       [54.        ],
       [ 2.        ],
       [27.        ],
       [14.        ],
       [ 4.        ],
       [58.        ],
       [20.        ],
       [39.        ],
       [14.        ],
       [55.        ],
       [ 2.        ],
       [29.69911765],
       [31.        ],
       [29.69911765]])

In [383]:
imp_median_ = imp_median.fit_transform(Age)
imp_median_[:20]

array([[22.],
       [38.],
       [26.],
       [35.],
       [35.],
       [28.],
       [54.],
       [ 2.],
       [27.],
       [14.],
       [ 4.],
       [58.],
       [20.],
       [39.],
       [14.],
       [55.],
       [ 2.],
       [28.],
       [31.],
       [28.]])

In [384]:
imp_0_ = imp_0.fit_transform(Age)
imp_0_[:20]

array([[22.],
       [38.],
       [26.],
       [35.],
       [35.],
       [ 0.],
       [54.],
       [ 2.],
       [27.],
       [14.],
       [ 4.],
       [58.],
       [20.],
       [39.],
       [14.],
       [55.],
       [ 2.],
       [ 0.],
       [31.],
       [ 0.]])

In [385]:
##  在这里我们使用  中位数 填充Age
data.loc[:,"Age"] = imp_median_
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
Age         891 non-null float64
Sex         891 non-null object
Embarked    889 non-null object
Survived    891 non-null object
dtypes: float64(1), object(3)
memory usage: 34.8+ KB


In [386]:
#使用众数填补Embarked
Embarked = data.loc[:,"Embarked"].values.reshape(-1,1)
imp_mode = SimpleImputer(strategy = "most_frequent") #  可以填充文字型数据
data.loc[:,"Embarked"] = imp_mode.fit_transform(Embarked)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
Age         891 non-null float64
Sex         891 non-null object
Embarked    891 non-null object
Survived    891 non-null object
dtypes: float64(1), object(3)
memory usage: 34.8+ KB


In [387]:
###  在这里 使用Pandas 中的填充 fillna  更为方便

In [388]:
# import pandas as pd
# data = pd.read_csv(r"C:\work\learnbetter\micro-class\week 3Preprocessing\Narrativedata.csv",index_col=0)
# data.head()
# data.loc[:,"Age"] = data.loc[:,"Age"].fillna(data.loc[:,"Age"].median())
# #.fillna 在DataFrame里面直接进行填补



# data.dropna(axis=0,inplace=True)
# #.dropna(axis=0)删除所有有缺失值的行，.dropna(axis=1)删除所有有缺失值的列
# #参数inplace，为True表示在原数据集上进行修改，为False表示生成一个复制对象，不修改原数据，默认False
#  

#  data_ = data_.dropna(axis=0, inplace=False)

## 2.3 处理分类型特征：编码与哑变量

### preprocessing.LabelEncoder：标签专用，能够将分类转换为分类数值


In [389]:
from sklearn.preprocessing import LabelEncoder
y = data.iloc[:, -1]  #  要输入的是标签 不是特征矩阵  所以 允许一维

In [390]:
le = LabelEncoder()
le = le.fit(y)
label = le.transform(y)
label[:30]

array([0, 2, 2, 2, 0, 0, 0, 0, 2, 2, 1, 2, 0, 0, 0, 1, 0, 2, 0, 2, 1, 2,
       2, 2, 0, 1, 0, 0, 2, 0])

In [391]:
le.classes_  #  属性.class_查看比标签中的多少类别

array(['No', 'Unknown', 'Yes'], dtype=object)

In [392]:
le.inverse_transform(label)[:20]

array(['No', 'Yes', 'Yes', 'Yes', 'No', 'No', 'No', 'No', 'Yes', 'Yes',
       'Unknown', 'Yes', 'No', 'No', 'No', 'Unknown', 'No', 'Yes', 'No',
       'Yes'], dtype=object)

In [393]:
data.iloc[:,-1] = label  # 让标签中的值 替换原来的数据

In [394]:
data.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,0
1,38.0,female,C,2
2,26.0,female,S,2
3,35.0,female,S,2
4,35.0,male,S,0


 ##  preprocessing.OrdinalEncoder：特征专用，能够将分类特征转换为分类数值

In [395]:
from sklearn.preprocessing import OrdinalEncoder
#接口categories_对应LabelEncoder的接口classes_，一模一样的功能
data_ = data.copy()

data_.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,0
1,38.0,female,C,2
2,26.0,female,S,2
3,35.0,female,S,2
4,35.0,male,S,0


In [396]:
OrdinalEncoder().fit(data_.iloc[:,1:-1]).categories_

[array(['female', 'male'], dtype=object), array(['C', 'Q', 'S'], dtype=object)]

In [397]:
data_.iloc[:,1:-1] = OrdinalEncoder().fit_transform(data_.iloc[:,1:-1])
data_.head(20)

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,1.0,2.0,0
1,38.0,0.0,0.0,2
2,26.0,0.0,2.0,2
3,35.0,0.0,2.0,2
4,35.0,1.0,2.0,0
5,28.0,1.0,1.0,0
6,54.0,1.0,2.0,0
7,2.0,1.0,2.0,0
8,27.0,0.0,2.0,2
9,14.0,0.0,0.0,2


##  preprocessing.OneHotEncoder：独热编码，创建哑变量

In [398]:
data.head()


Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,0
1,38.0,female,C,2
2,26.0,female,S,2
3,35.0,female,S,2
4,35.0,male,S,0


In [399]:
from sklearn.preprocessing import OneHotEncoder
X = data.iloc[:, 1:-1]
X.head()

Unnamed: 0,Sex,Embarked
0,male,S
1,female,C
2,female,S
3,female,S
4,male,S


In [400]:
enc = OneHotEncoder(categories="auto").fit(X)
enc.transform(X).toarray()

array([[0., 1., 0., 0., 1.],
       [1., 0., 1., 0., 0.],
       [1., 0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0., 1.],
       [0., 1., 1., 0., 0.],
       [0., 1., 0., 1., 0.]])

In [401]:
result = enc.transform(X).toarray()
result.shape

(891, 5)

In [402]:
##  依然可以还原
pd.DataFrame(enc.inverse_transform(result)).head()

Unnamed: 0,0,1
0,male,S
1,female,C
2,female,S
3,female,S
4,male,S


In [403]:
enc.get_feature_names()

array(['x0_female', 'x0_male', 'x1_C', 'x1_Q', 'x1_S'], dtype=object)

In [404]:
#axis=1,表示跨行进行合并，也就是将量表左右相连，如果是axis=0，就是将量表上下相连
newdata = pd.concat([data,pd.DataFrame(result)],axis=1)

In [405]:
newdata.head()

Unnamed: 0,Age,Sex,Embarked,Survived,0,1,2,3,4
0,22.0,male,S,0,0.0,1.0,0.0,0.0,1.0
1,38.0,female,C,2,1.0,0.0,1.0,0.0,0.0
2,26.0,female,S,2,1.0,0.0,0.0,0.0,1.0
3,35.0,female,S,2,1.0,0.0,0.0,0.0,1.0
4,35.0,male,S,0,0.0,1.0,0.0,0.0,1.0


In [406]:
newdata.drop(["Sex","Embarked"],axis=1,inplace=True)
newdata.head()

Unnamed: 0,Age,Survived,0,1,2,3,4
0,22.0,0,0.0,1.0,0.0,0.0,1.0
1,38.0,2,1.0,0.0,1.0,0.0,0.0
2,26.0,2,1.0,0.0,0.0,0.0,1.0
3,35.0,2,1.0,0.0,0.0,0.0,1.0
4,35.0,0,0.0,1.0,0.0,0.0,1.0


In [407]:
newdata.columns =["Age","Survived","Female","Male","Embarked_C","Embarked_Q","Embarked_S"]
newdata.head()

Unnamed: 0,Age,Survived,Female,Male,Embarked_C,Embarked_Q,Embarked_S
0,22.0,0,0.0,1.0,0.0,0.0,1.0
1,38.0,2,1.0,0.0,1.0,0.0,0.0
2,26.0,2,1.0,0.0,0.0,0.0,1.0
3,35.0,2,1.0,0.0,0.0,0.0,1.0
4,35.0,0,0.0,1.0,0.0,0.0,1.0


##  2.4 处理连续型特征：二值化与分段

##将年龄进行二值化

In [408]:
data_2 = data.copy()
data_2.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,0
1,38.0,female,C,2
2,26.0,female,S,2
3,35.0,female,S,2
4,35.0,male,S,0


In [409]:
from sklearn.preprocessing import Binarizer

X = data_2.iloc[:,0].values.reshape(-1,1)  #  类为特征专用，所以不能使用一维数组


In [410]:
#  把  年龄 变成二值化
transformer = Binarizer(threshold=30).fit_transform(X)
transformer[:20]

array([[0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.]])

In [411]:
data_2.iloc[:,0] = pd.DataFrame(transformer)

In [412]:
data2_.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,0
1,38.0,female,C,2
2,26.0,female,S,2
3,35.0,female,S,2
4,35.0,male,S,0


## 分箱preprocessing.KBinsDiscretizer

In [413]:
from sklearn.preprocessing import KBinsDiscretizer

X = data.iloc[:,0].values.reshape(-1,1)
est = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy="uniform")
est.fit_transform(X)[:20]

array([[0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [2.],
       [0.],
       [1.],
       [0.],
       [0.],
       [2.],
       [0.],
       [1.],
       [0.],
       [2.],
       [0.],
       [1.],
       [1.],
       [1.]])

In [414]:
#  查看转换之后的箱数：  变成了一列中的三箱
set(est.fit_transform(X).ravel())

{0.0, 1.0, 2.0}

In [415]:
est = KBinsDiscretizer(n_bins=3, encode="onehot", strategy="uniform")
#  查看转换之后的箱， 变成了呀编码
est.fit_transform(X)

<891x3 sparse matrix of type '<class 'numpy.float64'>'
	with 891 stored elements in Compressed Sparse Row format>

# 特征选择 feature_selection