# Engineering features for machine learning

## Engineering categorical features

In [1]:
# Toy categorical column: each inner list is one sample with a single feature.
data = [
    ['Bleach'],
    ['Cereal'],
    ['Toilet Roll'],
]

In [2]:
from sklearn import preprocessing

In [3]:
# Build the encoder and fit it in one step (fit() returns the encoder itself),
# then print the numeric code assigned to each row of `data`.
ordinal_enc = preprocessing.OrdinalEncoder().fit(data)
encoded = ordinal_enc.transform(data)
print(encoded)

[[0.]
 [1.]
 [2.]]


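Strongly Disagree, Disagree, Neither Disagree nor Agree, Agree, and Strongly Agree are the ordered categories of an ordinal feature. Because their order carries meaning, we pass it to the encoder explicitly instead of letting it infer one from the data.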

In [7]:
# Explicit category order for the single feature, from lowest to highest
# agreement. A category's position in this list becomes its encoded value
# (Strongly Disagree -> 0, ..., Strongly Agree -> 4).
ordinal_categories = [
    ['Strongly Disagree', 'Disagree', 'Neither Disagree nor Agree', 'Agree', 'Strongly Agree']
]
# Ten survey answers, one single-feature sample per row.
data = [['Strongly Disagree'], ['Disagree'], ['Neither Disagree nor Agree'], ['Agree'], ['Strongly Agree'],
        ['Neither Disagree nor Agree'], ['Agree'], ['Strongly Agree'], ['Strongly Disagree'], ['Disagree']
        ]
# Reuse the list defined above rather than repeating it inline, so the order
# is declared in exactly one place.
ordinal_enc = preprocessing.OrdinalEncoder(categories=ordinal_categories)
ordinal_enc.fit(data)
print(ordinal_enc.transform(data))

[[0.]
 [1.]
 [2.]
 [3.]
 [4.]
 [2.]
 [3.]
 [4.]
 [0.]
 [1.]]


## Engineering numerical features

In [7]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeClassifier
from sklearn import metrics
from sklearn.datasets import load_wine
from sklearn.pipeline import make_pipeline

In [2]:
# Load the wine dataset directly as a (features, target) pair instead of a
# dataset object; X holds the measurements and y the class labels.
X, y = load_wine(return_X_y=True)

In [3]:
# Display the feature matrix; note how different the column magnitudes are
# (some values are below 1 while the last column is in the hundreds or more).
X

array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
        1.065e+03],
       [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
        1.050e+03],
       [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
        1.185e+03],
       ...,
       [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
        8.350e+02],
       [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
        8.400e+02],
       [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
        5.600e+02]])

In [4]:
# Display the target vector of integer class labels.
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [8]:
# Confirm the container types of the features/targets and list the distinct
# class labels, one item per output line.
print(type(X), type(y), np.unique(y), sep='\n')

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
[0 1 2]


In [9]:
# Hold out 30% of the samples for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)


### Training without standardization

In [10]:
# Baseline: a ridge classifier fitted on the raw, unscaled features.
# The "sag" solver is stochastic (it shuffles the data), so random_state is
# pinned to make the fitted model, and every score derived from it,
# reproducible on a fresh kernel.
no_scale_clf = make_pipeline(
    RidgeClassifier(tol=1e-2, solver="sag", random_state=42)
)
no_scale_clf.fit(X_train, y_train)
y_pred_no_scale = no_scale_clf.predict(X_test)

In [11]:
# Test-set performance of the unscaled model: headline accuracy first, then
# the per-class precision / recall / F1 table.
print(
    '\nAccuracy [no scaling]',
    f'{metrics.accuracy_score(y_test, y_pred_no_scale):.2%}\n',
    '\nClassification Report [no scaling]',
    metrics.classification_report(y_test, y_pred_no_scale),
    sep='\n',
)


Accuracy [no scaling]
75.93%


Classification Report [no scaling]
              precision    recall  f1-score   support

           0       0.90      1.00      0.95        19
           1       0.66      1.00      0.79        21
           2       1.00      0.07      0.13        14

    accuracy                           0.76        54
   macro avg       0.85      0.69      0.63        54
weighted avg       0.83      0.76      0.68        54



### Training with standardization

In [12]:
# Same classifier as the unscaled baseline, but preceded by StandardScaler.
# Because the scaler is a pipeline step, it is fitted on the training data
# during fit() and merely applied to the test data during predict().
# random_state pins the shuffling done by the stochastic "sag" solver so the
# result is reproducible on a fresh kernel.
std_scale_clf = make_pipeline(
    StandardScaler(),
    RidgeClassifier(tol=1e-2, solver="sag", random_state=42),
)
std_scale_clf.fit(X_train, y_train)
y_pred_std_scale = std_scale_clf.predict(X_test)


In [13]:
# Test-set performance of the standardized model: headline accuracy first,
# then the per-class precision / recall / F1 table.
print(
    '\nAccuracy [scaling]',
    f'{metrics.accuracy_score(y_test, y_pred_std_scale):.2%}\n',
    '\nClassification Report [scaling]',
    metrics.classification_report(y_test, y_pred_std_scale),
    sep='\n',
)



Accuracy [scaling]
98.15%


Classification Report [scaling]
              precision    recall  f1-score   support

           0       0.95      1.00      0.97        19
           1       1.00      0.95      0.98        21
           2       1.00      1.00      1.00        14

    accuracy                           0.98        54
   macro avg       0.98      0.98      0.98        54
weighted avg       0.98      0.98      0.98        54

