In [None]:
###################
# Standardization #
###################

In [1]:
from sklearn import preprocessing
import numpy as np

X = np.array([[1., -1., 2.],
              [2., 0., 0.],
              [0., 1., -1.]])
X_scaled = preprocessing.scale(X)

X_scaled  

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [9]:
scaler = preprocessing.StandardScaler().fit(X)

scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [6]:
scaler.mean_

array([1.        , 0.        , 0.33333333])

In [7]:
scaler.scale_

array([0.81649658, 0.81649658, 1.24721913])

In [8]:
scaler.transform(X)

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [12]:
X_test = [[-1., 0., 1.]]
scaler.transform(X_test)

array([[-2.44948974,  0.        ,  0.53452248]])

In [15]:
X = np.array([[1., -1., 2.],
             [2., 0., 0.], 
             [0., 1., -1.]])
min_max_scaler = preprocessing.MinMaxScaler()
X_minmax = min_max_scaler.fit_transform(X)
X_minmax

array([[0.5       , 0.        , 1.        ],
       [1.        , 0.5       , 0.33333333],
       [0.        , 1.        , 0.        ]])

In [17]:
X_test = np.array([[-3., -1., 4.]])
X_test_minmax = min_max_scaler.transform(X_test)
X_test_minmax

array([[-1.5       ,  0.        ,  1.66666667]])

In [None]:
#################
# Normalization #
#################

In [21]:
X = [[1., -1., 0.],
     [-1., 1., 0.],
     [0., -1., 1.]]
X_normalized = preprocessing.normalize(X, norm='l2')

X_normalized

array([[ 0.70710678, -0.70710678,  0.        ],
       [-0.70710678,  0.70710678,  0.        ],
       [ 0.        , -0.70710678,  0.70710678]])

In [23]:
normalizer = preprocessing.Normalizer().fit(X)
normalizer

Normalizer(copy=True, norm='l2')

In [24]:
normalizer.transform(X)

array([[ 0.70710678, -0.70710678,  0.        ],
       [-0.70710678,  0.70710678,  0.        ],
       [ 0.        , -0.70710678,  0.70710678]])

In [27]:
normalizer.transform([[-1., 1., 0.]])

array([[-0.70710678,  0.70710678,  0.        ]])

In [None]:
########################
# Encoding Categorical #
########################

In [28]:
enc = preprocessing.OrdinalEncoder()
X = [['male', 'from US', 'uses Safari'],
     ['female', 'from Europe', 'uses Firefox']]
enc.fit(X)

OrdinalEncoder(categories='auto', dtype=<class 'numpy.float64'>)

In [29]:
enc.transform([['female', 'from US', 'uses Safari']])

array([[0., 1., 1.]])

In [30]:
enc = preprocessing.OneHotEncoder()
X = [['male', 'from US', 'uses Safari'],
     ['female', 'from Europe', 'uses Firefox']]
enc.fit(X)

OneHotEncoder(categorical_features=None, categories=None, drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=True)

In [34]:
enc.transform([['female', 'from US', 'uses Safari'],
               ['male', 'from Europe', 'uses Safari']]).toarray()

array([[1., 0., 0., 1., 0., 1.],
       [0., 1., 1., 0., 0., 1.]])

In [35]:
enc.categories_

[array(['female', 'male'], dtype=object),
 array(['from Europe', 'from US'], dtype=object),
 array(['uses Firefox', 'uses Safari'], dtype=object)]

In [36]:
genders = ['female', 'male']
locations = ['from Africa', 'from Asia', 'from Europe', 'from US']
browsers = ['uses Chrome', 'uses Firefox', 'uses IE', 'uses Safari']
enc = preprocessing.OneHotEncoder(categories=[genders,
                                              locations,
                                              browsers])
X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
enc.fit(X)

OneHotEncoder(categorical_features=None,
              categories=[['female', 'male'],
                          ['from Africa', 'from Asia', 'from Europe',
                           'from US'],
                          ['uses Chrome', 'uses Firefox', 'uses IE',
                           'uses Safari']],
              drop=None, dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=True)

In [None]:
########################
# Feature Binarization #
########################

In [37]:
X = [[1., -1., 2.],
     [2., 0., 1.],
     [1., 2., -1.]]
binarizer = preprocessing.Binarizer().fit(X)
binarizer

Binarizer(copy=True, threshold=0.0)

In [38]:
binarizer.transform(X)

array([[1., 0., 1.],
       [1., 0., 1.],
       [1., 1., 0.]])

In [39]:
binarizer = preprocessing.Binarizer(threshold=1.1)
binarizer.transform(X)

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [None]:
##################################
# Generating Polynomial Features #
##################################

In [40]:
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
X = np.arange(6).reshape(3, 2)
X

array([[0, 1],
       [2, 3],
       [4, 5]])

In [44]:
X = np.arange(9).reshape(3, 3)
X

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [42]:
poly = PolynomialFeatures(degree=3, interaction_only=True)
poly.fit_transform(X)

array([[  1.,   0.,   1.,   2.,   0.,   0.,   2.,   0.],
       [  1.,   3.,   4.,   5.,  12.,  15.,  20.,  60.],
       [  1.,   6.,   7.,   8.,  42.,  48.,  56., 336.]])

In [45]:
#######################
# Custom Transformers #
#######################

In [48]:
import numpy as np
from sklearn.preprocessing import FunctionTransformer
transformer = FunctionTransformer(np.log1p, validate=True)
X = np.array([[0, 1], [2, 3]])
transformer.transform(X)

array([[0.        , 0.69314718],
       [1.09861229, 1.38629436]])

In [49]:
################################
# Inputation of Missing Values #
################################

In [52]:
import numpy as np
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit([[1, 2], [np.nan, 3], [7, 6]])

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)

In [55]:
X = [[np.nan, 2], [6, np.nan], [7, 6]]
print(imp.transform(X))

[[4.         2.        ]
 [6.         3.66666667]
 [7.         6.        ]]


In [54]:
#############################
# Non-linear Transformation #
#############################

In [57]:
#映射到统一分布
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train,y_test = train_test_split(X, y, random_state=0)
quantile_transformer = preprocessing.QuantileTransformer(random_state=0)
X_train_trans = quantile_transformer.fit_transform(X_train)
X_test_trans = quantile_transformer.transform(X_test)
np.percentile(X_train[:, 0], [0, 25, 50, 75, 100])

  % (self.n_quantiles, n_samples))


array([4.3, 5.1, 5.8, 6.5, 7.9])

In [58]:
############
# Exercise #
############

In [79]:
from sklearn import preprocessing
a = preprocessing.OneHotEncoder()
b = preprocessing.LabelEncoder()
b.fit([123, 456, 789])
temp = b.transform([123, 456, 789])
a.fit(temp.reshape(-1, 1))
print(a.transform(b.transform([123, 789]).reshape(-1, 1)))

  (0, 0)	1.0
  (1, 2)	1.0


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


TypeError: 'method' object is not subscriptable

In [78]:
a = preprocessing.OneHotEncoder()
b = preprocessing.LabelEncoder()

######
#
#  分析题目，含义应当是用[123,456,789]来训练OneHotEncoder和LabelEncoder类，
#  产生响应的编码，所以，先用所给数据训练，再将[123,789]进行输入得到最终编码结果
#
######
#b.fit([123,456,789])
#temp = b.transform([123,456,789]).reshape(-1,1)   此过程可以省略temp 
#a.fit(temp)
#print (temp)
a.fit(b.transform(b.fit([123,456,789]).classes_).reshape(-1,1)) #进行训练
k=a.transform(  b.transform([123,789]).reshape(-1,1)  )#.reshape(-1,1)  #进行实际转化
print(k)

  (0, 0)	1.0
  (1, 2)	1.0


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
