# Scikit-Learn Cheat Sheet

## Loading The Data Preset

In [1]:
from sklearn import datasets

# Load the bundled iris dataset and unpack the feature matrix and the
# target vector into the names used by the following cells.
iris = datasets.load_iris()
x_iris, y_iris = iris.data, iris.target

## Split The Data

In [2]:
from sklearn.model_selection import train_test_split

# Hold out a test split; the fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x_iris,
    y_iris,
    random_state=7,
)

## Preprocessing The Data

### Standardization
标准化假设特征值近似服从正态分布；标准化后，每个特征的均值为 0、方差为 1（若原特征服从正态分布，则转换为标准正态分布）。

In [3]:
from sklearn.preprocessing import StandardScaler

# Fit on the training split only, then apply the same fitted scaler to
# both splits so the test data never influences the fit.
scaler = StandardScaler()
scaler.fit(x_train)
x_std, x_test_std = scaler.transform(x_train), scaler.transform(x_test)

### MinMaxScaler
区间缩放法利用了边界值信息，将特征的取值区间缩放到某个特定的范围。

In [4]:
from sklearn.preprocessing import MinMaxScaler

# Fit the min-max scaler on the training split only, then transform both splits.
scaler = MinMaxScaler().fit(x_train)
x_minmax = scaler.transform(x_train)
x_test_minmax = scaler.transform(x_test)
# Backward-compatible alias: the original cell used this misspelled name,
# which breaks the `x_test_<suffix>` convention of the other cells.
x_test_x_minmax = x_test_minmax

### Normalization
归一化是依照特征矩阵的行处理数据，目的是样本向量在点乘运算或其他核函数计算相似性时，拥有统一的标准，也就是都转化为“单位向量”

In [5]:
from sklearn.preprocessing import Normalizer

# Normalizer rescales each sample (row) on its own; the fit call is kept so
# the cell follows the same fit/transform pattern as the other transformers.
scaler = Normalizer()
scaler.fit(x_train)
x_nor = scaler.transform(x_train)
x_test_nor = scaler.transform(x_test)

### Binarization
设定一个阈值，大于阈值的赋值为1，小于等于阈值的赋值为0

In [6]:
from sklearn.preprocessing import Binarizer

# Values strictly greater than the threshold become 1; all others become 0.
binarizer = Binarizer(threshold=0.5)
binarizer.fit(x_train)
x_bin, x_test_bin = binarizer.transform(x_train), binarizer.transform(x_test)

### LabelEncode Categorical Features
把定性特征编码成定量特征

In [7]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training labels, then encode
# both label vectors with that same mapping.
enc = LabelEncoder()
enc.fit(y_train)
y_enc, y_test_enc = enc.transform(y_train), enc.transform(y_test)

### Imputing Missing Values
填充缺失值

In [8]:
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22; `sklearn.impute.SimpleImputer` is its replacement.
from sklearn.impute import SimpleImputer

# SimpleImputer always imputes column-wise, which matches the old `axis=0`.
# The per-column means are learned from the training split only.
imp = SimpleImputer(strategy='mean').fit(x_train)
x_imp = imp.transform(x_train)
x_test_imp = imp.transform(x_test)

### Generating Polynomial Features
数据变换

In [9]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the features with all polynomial combinations up to degree 2.
poly = PolynomialFeatures(degree=2).fit(x_train)
x_poly = poly.transform(x_train)
x_test_poly = poly.transform(x_test)
# Backward-compatible alias: the original cell used this capitalised name,
# unlike the lowercase `x_test_*` names of the other cells.
X_test_poly = x_test_poly

### OneHotEncode Categorical Features
把定性特征编码成DummyCoding

In [10]:
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder expects 2-D input, so the 1-D label vectors are reshaped
# into single-column matrices before fitting and transforming.
# NOTE: this cell rebinds `enc`, `y_enc` and `y_test_enc`, which the
# LabelEncoder cell above also assigns.
enc = OneHotEncoder()
enc.fit(y_train.reshape(-1, 1))
y_enc, y_test_enc = (
    enc.transform(y_train.reshape(-1, 1)),
    enc.transform(y_test.reshape(-1, 1)),
)

### FunctionTransformer Features
对特征进行函数转换

In [11]:
from numpy import log1p
from sklearn.preprocessing import FunctionTransformer

# Wrap numpy's log1p so it can be used through the fit/transform interface
# like the other transformers.
loger = FunctionTransformer(func=log1p)
loger.fit(x_train)
x_log, x_test_log = loger.transform(x_train), loger.transform(x_test)

## Features Select