# 1. scikit-learn
```
sklearn.preprocessing.Imputer    "处理丢失的数据"     
sklearn.preprocessing.LabelEncoder    "将标签转为0-n_classes-1的连续编号"    
sklearn.preprocessing.OneHotEncoder    "不仅对label可以进行编码，还可对categorical feature(非数值类型)进行编码"   
sklearn.preprocessing.StandardScaler    "去均值和方差归一化。且是针对每一个特征维度来做的，而不是针对样本"    
sklearn.model_selection.train_test_split    "将矩阵随机划分为训练子集和测试子集，并返回划分好的训练集测试集样本和训练集测试集标签"   
```
[sklearn preprocessing 数据预处理（OneHotEncoder）](https://blog.csdn.net/lanchunhui/article/details/72794317)    
[sklearn的train_test_split，果然很好用啊！](https://www.cnblogs.com/bonelee/p/8036024.html)

In [1]:
import numpy as np
import pandas as pd

In [3]:
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[ : , :-1].values
Y = dataset.iloc[ : , 3].values

In [4]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [5]:
Y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'], dtype=object)

In [6]:
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values = "NaN", strategy = "mean", axis = 0)
imputer = imputer.fit(X[ : , 1:3])
X[ : , 1:3] = imputer.transform(X[ : , 1:3])

In [7]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## Step 4: Encoding categorical data

In [9]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[ : , 0] = labelencoder_X.fit_transform(X[ : , 0])

In [11]:
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [12]:
# Creating a dummy variable
onehotencoder = OneHotEncoder(categorical_features = [0])
X = onehotencoder.fit_transform(X).toarray()
labelencoder_Y = LabelEncoder()
Y =  labelencoder_Y.fit_transform(Y)

In [15]:
X

array([[  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.40000000e+01,   7.20000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          2.70000000e+01,   4.80000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          3.00000000e+01,   5.40000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          3.80000000e+01,   6.10000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          4.00000000e+01,   6.37777778e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          3.50000000e+01,   5.80000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          3.87777778e+01,   5.20000000e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.80000000e+01,   7.90000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          5.00000000e+01,   8.30000000e+04],
       [  1.00000000e+00,   0.0000000

In [16]:
Y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

## Step 5: Splitting the datasets into training sets and Test sets

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split( X , Y , test_size = 0.2, random_state = 0)

In [19]:
X_train, X_test

(array([[  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
           4.00000000e+01,   6.37777778e+04],
        [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           3.70000000e+01,   6.70000000e+04],
        [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
           2.70000000e+01,   4.80000000e+04],
        [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
           3.87777778e+01,   5.20000000e+04],
        [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           4.80000000e+01,   7.90000000e+04],
        [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
           3.80000000e+01,   6.10000000e+04],
        [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           4.40000000e+01,   7.20000000e+04],
        [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
           3.50000000e+01,   5.80000000e+04]]),
 array([[  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
           3.00000000e+01,   5.40000000e+04],
        [  0.0000

In [20]:
Y_train, Y_test

(array([1, 1, 1, 0, 1, 0, 0, 1]), array([0, 0]))

## Step 6: Feature Scaling

In [21]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)

In [22]:
X_train, X_test

(array([[-1.        ,  2.64575131, -0.77459667,  0.26306757,  0.12381479],
        [ 1.        , -0.37796447, -0.77459667, -0.25350148,  0.46175632],
        [-1.        , -0.37796447,  1.29099445, -1.97539832, -1.53093341],
        [-1.        , -0.37796447,  1.29099445,  0.05261351, -1.11141978],
        [ 1.        , -0.37796447, -0.77459667,  1.64058505,  1.7202972 ],
        [-1.        , -0.37796447,  1.29099445, -0.0813118 , -0.16751412],
        [ 1.        , -0.37796447, -0.77459667,  0.95182631,  0.98614835],
        [ 1.        , -0.37796447, -0.77459667, -0.59788085, -0.48214934]]),
 array([[ 0.,  0.,  0., -1., -1.],
        [ 0.,  0.,  0.,  1.,  1.]]))

# Done