# Pre-processing

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

In [2]:
# Toy target vector: one animal label per sample.
y = np.array([
    'Chat',
    'Chien',
    'Chat',
    'Oiseau',
])

# Les transformateurs LabelEncoder et OrdinalEncoder (non recommandés)

In [3]:
# Method 1: learn the label -> integer mapping first; the encoding itself is a separate step.
# fit() returns the fitted encoder, so construction and fitting can be chained.
encoder = LabelEncoder().fit(y)
encoder

LabelEncoder()

In [4]:
# Encode y with the mapping the encoder learned during fit().
encoder.transform(y)

array([0, 1, 0, 2], dtype=int64)

In [5]:
# Method 2 (more efficient): fit the encoder and encode y in a single call.

encoder.fit_transform(y)

array([0, 1, 0, 2], dtype=int64)

In [6]:
# From numbers back to strings (for y): decode integer codes into the original labels.
encoded = np.array([0, 1, 0, 2])
encoder.inverse_transform(encoded)

array(['Chat', 'Chien', 'Chat', 'Oiseau'], dtype='<U6')

In [7]:
# From strings to numbers (for X): a categorical feature matrix,
# one row per sample, columns = (animal, body covering).
X = np.array([
    ['Chat', 'Poils'],
    ['Chien', 'Poils'],
    ['Chat', 'Poils'],
    ['Oiseau', 'Plumes'],
])

In [8]:

# Encode each column of X as ordinal codes (categories are numbered per column).
encoder = OrdinalEncoder().fit(X)
encoder.transform(X)

array([[0., 1.],
       [1., 1.],
       [0., 1.],
       [2., 0.]])

# Les meilleurs transformateurs: LabelBinarizer, MultiLabelBinarizer et OneHotEncoder

In [9]:
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder

In [10]:
# Reset the toy target vector: one animal label per sample.
y = np.array([
    'Chat',
    'Chien',
    'Chat',
    'Oiseau',
])

In [11]:
# Binarize the labels (one column per class); sparse_output=True makes transform
# return a sparse matrix instead of a dense array.
encoder = LabelBinarizer(sparse_output=True)
encoder.fit(y)
encoder.transform(y)

<4x3 sparse matrix of type '<class 'numpy.int32'>'
	with 4 stored elements in Compressed Sparse Row format>

In [12]:
# Categorical feature matrix: one row per sample, columns = (animal, body covering).
X = np.array([
    ['Chat', 'Poils'],
    ['Chien', 'Poils'],
    ['Chat', 'Poils'],
    ['Oiseau', 'Plumes'],
])

In [13]:
# One-hot encode every column of X; the result is a sparse matrix with one
# indicator column per (feature, category) pair.
encoder = OneHotEncoder().fit(X)
encoder.transform(X)

<4x5 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

# Normalisation MinMax (Valeurs comprises entre 0 et 1)

In [14]:
from sklearn.preprocessing import MinMaxScaler

In [15]:
# Three samples of a single numeric feature, shaped as a (3, 1) column.
X = np.array([70, 80, 120]).reshape(-1, 1)

In [16]:
# Rescale the feature linearly so its minimum maps to 0 and its maximum to 1.
scaler = MinMaxScaler()
scaler.fit(X).transform(X)

array([[0. ],
       [0.2],
       [1. ]])

# La standardisation : StandardScaler (chaque variable X a une moyenne = 0 et un écart type = 1)

In [2]:
from sklearn.preprocessing import StandardScaler

In [3]:
# Three samples of a single numeric feature, shaped as a (3, 1) column.
X = np.array([70, 80, 120]).reshape(-1, 1)

# Standardize the feature to zero mean and unit standard deviation.
scaler = StandardScaler().fit(X)
scaler.transform(X)

array([[-0.9258201 ],
       [-0.46291005],
       [ 1.38873015]])

# RobustScaler (Transforme chaque valeur X en étant peu sensible aux outliers)

In [4]:
from sklearn.preprocessing import RobustScaler

In [5]:
# Three samples of a single numeric feature, shaped as a (3, 1) column.
X = np.array([70, 80, 120]).reshape(-1, 1)

# RobustScaler centres on the median and scales by the interquartile range,
# which makes it less sensitive to outliers than mean/variance scaling.
scaler = RobustScaler().fit(X)
scaler.transform(X)

array([[-0.4],
       [ 0. ],
       [ 1.6]])

# Pipeline : pipeline et estimateur composite (chaîne de transformations)

In [43]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_iris

In [44]:
# Load the iris dataset and split it into the feature matrix and the target vector.
iris = load_iris()
X, y = iris.data, iris.target

In [42]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=5,
)

In [38]:
# Chain polynomial feature expansion, standardization and a linear SGD classifier into a
# single estimator, so the grid search tunes preprocessing and model together.
model = make_pipeline(PolynomialFeatures(), StandardScaler(), SGDClassifier(random_state=0))

# Grid keys follow the '<step name>__<parameter>' convention. make_pipeline names each step
# after its lowercased class name ('polynomialfeatures', 'standardscaler', 'sgdclassifier'),
# so a capitalised key such as 'polynomialFeatures__degree' is rejected with
# "ValueError: Invalid parameter polynomialFeatures for estimator Pipeline(...)".
# The penalties are the strings 'l1' / 'l2' (lowercase letter "l"), not the digits '11' / '12'.
params = {
    'polynomialfeatures__degree': [2, 3, 4],
    'sgdclassifier__penalty': ['l1', 'l2'],
}

# 4-fold cross-validated search over every (degree, penalty) combination.
grid = GridSearchCV(model, param_grid=params, cv=4)
grid.fit(X_train, y_train)