## Инструменты сериализации объектов Python

### 1. Обучение модели

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_diabetes

X, y = load_diabetes(return_X_y=True) #Загрузка датасета
regressor = LinearRegression()

regressor.fit(X, y) #Веса модели были сформированы во время обучения

LinearRegression()

### 2. Сериализация модели -> Поток байтов

In [7]:
import pickle
model = pickle.dumps(regressor) #Сериализация обученной модели

print(type(model))
print(type(regressor))

<class 'bytes'>
<class 'sklearn.linear_model._base.LinearRegression'>


### 3. Десериализация объекта -> Восстановление объекта Python

In [13]:
regressor_from_bytes = pickle.loads(model)

In [14]:
regressor_from_bytes

LinearRegression()

### 4. Сохранение сериализованного объекта

In [10]:
with open('myfile.pkl', 'wb') as output:
    pickle.dump(regressor, output)
    
# Можем передать этот файл ML-инженерам для деплоя модели на сервер

### 5. Десериализация модели из файла

In [11]:
with open('myfile.pkl', 'rb') as pkl_file:
    regressor_from_file = pickle.load(pkl_file)

In [12]:
regressor_from_file

LinearRegression()

### 6. Проверка результатов предсказания

In [15]:
# all(iter) - проверяет, что все элементы последовательности = True
all(regressor.predict(X) == regressor_from_bytes.predict(X))

True

In [16]:
all(regressor.predict(X) == regressor_from_file.predict(X))

True

### 7. Ограничения pickle

In [17]:
my_lambda = lambda x: x*2
with open('my_lambda.pkl', 'wb') as output:
    pickle.dump(my_lambda, output)
    
# В таких случаях лучше пользоваться пакетом dill

PicklingError: Can't pickle <function <lambda> at 0x7be9806ae840>: attribute lookup <lambda> on __main__ failed

## Сохранение пайплайнов и моделей

### 1. Пайплайны

In [18]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler



In [19]:
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split

In [24]:
#data = fetch_california_housing()
#df = pd.DataFrame(data['data'], columns=data['feature_names'])
#df.loc[:, 'target'] = data['target']

data = pd.read_csv('california.csv')
data.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [34]:
data.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


In [36]:
def rmse(y_hat, y):
    return mean_squared_error(y_hat, y, squared = False)

X = data.drop('target', axis=1)
Y = data['target']
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state = 42)

In [37]:
X_train.shape #Размер обучающей выборки

(15480, 8)

In [38]:
X_test.shape #Размер тестовой выборки

(5160, 8)

In [40]:
#PIPELINE <- Список из кортежей (сокращение, преобразование)
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestRegressor())
])

pipeline.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()), ('rf', RandomForestRegressor())])

In [42]:
y_pred = pipeline.predict(X_test)

r2_score(y_test, y_pred) #R2

0.8086956212854606

In [45]:
rmse(y_test, y_pred) #RMSE

0.5031276353612809

In [46]:
pipeline.get_params()

{'memory': None,
 'steps': [('scaler', StandardScaler()), ('rf', RandomForestRegressor())],
 'verbose': False,
 'scaler': StandardScaler(),
 'rf': RandomForestRegressor(),
 'scaler__copy': True,
 'scaler__with_mean': True,
 'scaler__with_std': True,
 'rf__bootstrap': True,
 'rf__ccp_alpha': 0.0,
 'rf__criterion': 'squared_error',
 'rf__max_depth': None,
 'rf__max_features': 'auto',
 'rf__max_leaf_nodes': None,
 'rf__max_samples': None,
 'rf__min_impurity_decrease': 0.0,
 'rf__min_samples_leaf': 1,
 'rf__min_samples_split': 2,
 'rf__min_weight_fraction_leaf': 0.0,
 'rf__n_estimators': 100,
 'rf__n_jobs': None,
 'rf__oob_score': False,
 'rf__random_state': None,
 'rf__verbose': 0,
 'rf__warm_start': False}

In [47]:
pipeline #СПИСОК

Pipeline(steps=[('scaler', StandardScaler()), ('rf', RandomForestRegressor())])

In [48]:
pipeline[1]

RandomForestRegressor()

In [49]:
pipeline['rf']

RandomForestRegressor()

In [51]:
#ИЗМЕНЕНИЕ ПАРАМЕТРОВ МОДЕЛИ
pipeline.set_params(rf__n_estimators = 200) #обращение через два нижних подчёркивания

Pipeline(steps=[('scaler', StandardScaler()),
                ('rf', RandomForestRegressor(n_estimators=200))])

In [54]:
#ТАКОЙ ФОРМАТ ОБРАЩЕНИЯ НЕОБХОДИМ И ДЛЯ GridSearch кросс-валидации

from sklearn.model_selection import GridSearchCV

param_grid = {'scaler__with_mean': [True, False],
             'rf__n_estimators': [100, 200, 500]}

grid_search = GridSearchCV(pipeline, param_grid = param_grid, verbose = True)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('rf',
                                        RandomForestRegressor(n_estimators=200))]),
             param_grid={'rf__n_estimators': [100, 200, 500],
                         'scaler__with_mean': [True, False]},
             verbose=True)

In [55]:
grid_search.best_estimator_

Pipeline(steps=[('scaler', StandardScaler(with_mean=False)),
                ('rf', RandomForestRegressor(n_estimators=500))])

In [56]:
y_pred = grid_search.best_estimator_.predict(X_test)

In [57]:
r2_score(y_test, y_pred) #R2

0.8104727632082045

In [58]:
rmse(y_test, y_pred) #RMSE

0.5007852544724727

### 2. Предобработка данных в пайплайнах. ColumnTransformer

In [60]:
df_wine = pd.read_csv('Red.csv')
df_wine.head()

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year
0,Pomerol 2011,France,Pomerol,Château La Providence,4.2,100,95.0,2011
1,Lirac 2017,France,Lirac,Château Mont-Redon,4.3,100,15.5,2017
2,Erta e China Rosso di Toscana 2015,Italy,Toscana,Renzo Masi,3.9,100,7.45,2015
3,Bardolino 2019,Italy,Bardolino,Cavalchina,3.5,100,8.72,2019
4,Ried Scheibner Pinot Noir 2016,Austria,Carnuntum,Markowitsch,3.9,100,29.15,2016


In [61]:
df_wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8666 entries, 0 to 8665
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             8666 non-null   object 
 1   Country          8666 non-null   object 
 2   Region           8666 non-null   object 
 3   Winery           8666 non-null   object 
 4   Rating           8666 non-null   float64
 5   NumberOfRatings  8666 non-null   int64  
 6   Price            8666 non-null   float64
 7   Year             8666 non-null   object 
dtypes: float64(2), int64(1), object(5)
memory usage: 541.8+ KB


In [62]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer

In [64]:
#Ин-ция объекта make_column_transformer: <- на Вход подаются кортежи (Преобразование, Список колонок)

ct = make_column_transformer(
    (StandardScaler(), ['Price']),
    (OneHotEncoder(), ['Country'])
)

ct

ColumnTransformer(transformers=[('standardscaler', StandardScaler(), ['Price']),
                                ('onehotencoder', OneHotEncoder(),
                                 ['Country'])])

In [65]:
#Уже ColumnTransformer можно добавить в Pipeline
pipeline = Pipeline([
    ('ct', ct),
    ('rf', RandomForestRegressor())
])

In [66]:
X = df_wine[['Country', 'Price']]
X.head(3)

Unnamed: 0,Country,Price
0,France,95.0
1,France,15.5
2,Italy,7.45


In [67]:
y = df_wine['Rating']
y.head(3)

0    4.2
1    4.3
2    3.9
Name: Rating, dtype: float64

In [68]:
#Обучим наш пайплайн
pipeline.fit(X, y)

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(), ['Price']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['Country'])])),
                ('rf', RandomForestRegressor())])

In [70]:
#Можем посмотреть на преобразования ColumnTransformera
pipeline['ct']

ColumnTransformer(transformers=[('standardscaler', StandardScaler(), ['Price']),
                                ('onehotencoder', OneHotEncoder(),
                                 ['Country'])])

In [71]:
X.head(3)

Unnamed: 0,Country,Price
0,France,95.0
1,France,15.5
2,Italy,7.45


In [73]:
pipeline['ct'].transform(X).toarray()

array([[ 0.6576476 ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.2784019 ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.37318427,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.17890984,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.38778428,  0.        ,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.36812136,  1.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [75]:
pipeline['ct'].transformers_

[('standardscaler', StandardScaler(), ['Price']),
 ('onehotencoder', OneHotEncoder(), ['Country'])]

In [77]:
pipeline['ct'].transformers_[1][1]

OneHotEncoder()

In [78]:
pipeline['ct'].transformers_[1][1].get_feature_names_out()

array(['Country_Argentina', 'Country_Australia', 'Country_Austria',
       'Country_Brazil', 'Country_Bulgaria', 'Country_Canada',
       'Country_Chile', 'Country_China', 'Country_Croatia',
       'Country_France', 'Country_Georgia', 'Country_Germany',
       'Country_Greece', 'Country_Hungary', 'Country_Israel',
       'Country_Italy', 'Country_Lebanon', 'Country_Mexico',
       'Country_Moldova', 'Country_New Zealand', 'Country_Portugal',
       'Country_Romania', 'Country_Slovakia', 'Country_Slovenia',
       'Country_South Africa', 'Country_Spain', 'Country_Switzerland',
       'Country_Turkey', 'Country_United States', 'Country_Uruguay'],
      dtype=object)

In [79]:
pd.DataFrame(
    pipeline['ct'].transform(X).toarray(),
    columns = ['Price'] + pipeline['ct'].transformers_[1][1].get_feature_names_out().tolist()
)

Unnamed: 0,Price,Country_Argentina,Country_Australia,Country_Austria,Country_Brazil,Country_Bulgaria,Country_Canada,Country_Chile,Country_China,Country_Croatia,...,Country_Portugal,Country_Romania,Country_Slovakia,Country_Slovenia,Country_South Africa,Country_Spain,Country_Switzerland,Country_Turkey,Country_United States,Country_Uruguay
0,0.657648,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.278402,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.373184,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.358231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.117684,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8661,-0.266981,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8662,-0.224358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8663,-0.178910,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8664,-0.387784,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [80]:
#Сохраним pipeline методом сериализации
import joblib
joblib.dump(pipeline, 'pipeline.pkl')

['pipeline.pkl']

In [81]:
pipeline_loaded = joblib.load('pipeline.pkl')

In [82]:
pipeline_loaded #тот же pipeline

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(), ['Price']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['Country'])])),
                ('rf', RandomForestRegressor())])

### 3. Сохранение пайплайна. Кастомные трансформеры

Хотим сериализовать пайплайн, который включает в себя min-max нормализацию и отбор пяти наиболее важных признаков на основе корреляции Пирсона

In [116]:
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_diabetes
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

X, y = load_diabetes(return_X_y=True)

pipe = Pipeline([
    ('Scaling', MinMaxScaler()),
    ('FeatureSelection', SelectKBest(f_regression, k=5)),
    ('Linear', LinearRegression())
])

pipe.fit(X, y)

Pipeline(steps=[('Scaling', MinMaxScaler()),
                ('FeatureSelection',
                 SelectKBest(k=5,
                             score_func=<function f_regression at 0x7be979c5c950>)),
                ('Linear', LinearRegression())])

In [92]:
#Сериализуем pipeline и запишем результат в файл
with open('my_pipeline.pkl', 'wb') as output:
    pickle.dump(pipe, output)

In [93]:
#Десериализуем pipeline из файла
with open('my_pipeline.pkl', 'rb') as pkl_file:
    loaded_pipe = pickle.load(pkl_file)

In [94]:
all(pipe.predict(X) == loaded_pipe.predict(X))

True

Если хотим сохранить сериализованные пайплайны в виде потока байтов, нужно использовать функции dumps() и loads(), но не dump() и load()

Для использования нестандартных методов sklearn (вроде feature engineering) организованы кастомные трансформеры.
</br>Такой трансформер должен наследоваться от двух классов: **TransformerMixin** и **BaseEstimator**

In [97]:
from sklearn.base import TransformerMixin, BaseEstimator

class MyTransformer(TransformerMixin, BaseEstimator):
    '''Шаблон кастомного трансформера'''
    
    def __init__(self):
        '''Инициализация исходных параметров, не зависящих от данных'''
        pass
    
    def fit(self, X, y=None):
        '''Здесь прописывается обучение трансформера. Возвращает ссылку на сам объект'''
        return self
    
    def transform(self, X):
        '''Здесь прописываются действия с данными. Возвращает преобразованный массив данных'''

Предположим, что мы хотим сгенерировать новый признак, который является простым произведением первых трёх столбцов таблицы. Пропишем эти действия в **transform()**

In [99]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068330,-0.092204
2,0.085299,0.050680,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018118,0.044485
439,0.041708,0.050680,-0.015906,0.017282,-0.037344,-0.013840,-0.024993,-0.011080,-0.046879,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044528,-0.025930


In [103]:
new_column = X[:, 0] * X[:, 1] * X[:, 2]

In [104]:
new_column

array([ 1.19054643e-04, -4.32466005e-06,  1.92160759e-04, -4.61007963e-05,
        8.74355534e-06, -1.68402969e-04,  1.08689088e-04, -6.09787167e-06,
        1.30412759e-04,  1.23635740e-04, -3.60396359e-04,  2.41126253e-05,
        2.09608025e-05, -5.16902542e-07,  5.18301511e-05,  4.82748278e-05,
        1.04122757e-05,  4.34579199e-05, -1.79385707e-05, -2.20202132e-05,
       -1.24651258e-04,  9.68672378e-05, -1.54469578e-05,  1.39294185e-04,
        1.01781435e-04,  4.32033825e-05, -3.70212942e-04,  6.29338635e-05,
        5.00103760e-05, -2.11155808e-05,  1.19067717e-04, -6.92177133e-05,
        2.18700440e-04,  6.93173119e-05,  4.60279869e-05, -7.69306592e-05,
       -1.29272692e-05,  4.50769467e-06, -6.80985824e-06, -1.36132041e-06,
       -2.28115678e-06, -3.01842466e-04,  3.19821868e-05,  2.08469182e-05,
        1.56631013e-04, -4.86316621e-05, -2.91783115e-05, -2.54833982e-04,
       -1.42133994e-04, -3.02641144e-05,  1.11995795e-05,  4.98473934e-05,
       -2.22230694e-05, -

In [108]:
new_column.shape

(442,)

In [106]:
X.shape #442 строки и 10 признаков

(442, 10)

In [107]:
new_column.reshape(X.shape[0], 1) #Массив -> Столбец

array([[ 1.19054643e-04],
       [-4.32466005e-06],
       [ 1.92160759e-04],
       [-4.61007963e-05],
       [ 8.74355534e-06],
       [-1.68402969e-04],
       [ 1.08689088e-04],
       [-6.09787167e-06],
       [ 1.30412759e-04],
       [ 1.23635740e-04],
       [-3.60396359e-04],
       [ 2.41126253e-05],
       [ 2.09608025e-05],
       [-5.16902542e-07],
       [ 5.18301511e-05],
       [ 4.82748278e-05],
       [ 1.04122757e-05],
       [ 4.34579199e-05],
       [-1.79385707e-05],
       [-2.20202132e-05],
       [-1.24651258e-04],
       [ 9.68672378e-05],
       [-1.54469578e-05],
       [ 1.39294185e-04],
       [ 1.01781435e-04],
       [ 4.32033825e-05],
       [-3.70212942e-04],
       [ 6.29338635e-05],
       [ 5.00103760e-05],
       [-2.11155808e-05],
       [ 1.19067717e-04],
       [-6.92177133e-05],
       [ 2.18700440e-04],
       [ 6.93173119e-05],
       [ 4.60279869e-05],
       [-7.69306592e-05],
       [-1.29272692e-05],
       [ 4.50769467e-06],
       [-6.8

In [110]:
new_column = new_column.reshape(X.shape[0], 1)

In [111]:
X = np.append(X, new_column, axis=1)

In [113]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,0.000119
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068330,-0.092204,-0.000004
2,0.085299,0.050680,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.025930,0.000192
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,-0.000046
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,0.000009
...,...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207,0.000042
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018118,0.044485,0.000004
439,0.041708,0.050680,-0.015906,0.017282,-0.037344,-0.013840,-0.024993,-0.011080,-0.046879,0.015491,-0.000034
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044528,-0.025930,0.000079


In [115]:
class MyTransformer(TransformerMixin, BaseEstimator):
    
    def __init__(self):
        
        pass
    
    def fit(self, X, y=None):
        
        return self
    
    def transform(self, X):
        #Создаём новый столбец как произведение первых трёх
        new_column = X[:, 0] * X[:, 1] * X[:, 2]
        new_column = new_column.reshape(X.shape[0], 1) #Массив -> Столбец
        X = np.append(X, new_column, axis=1)
        return X

In [117]:
#Создадим объект класса MyTransformer
custom_transformer = MyTransformer() #ин-ция методом __init__
custom_transformer.fit(X) #пока вызываем fit чисто формально
X_transformed = custom_transformer.transform(X) #трансформируем исходные данные

In [118]:
X.shape #before transform

(442, 10)

In [120]:
X_transformed.shape #after transform

(442, 11)

Встроим трансформе в сам пайплайн. Теперь пайплан будет включать в себя Feature Engineering, нормализацию, отбор признаков и обучение модели

In [121]:
pipe = Pipeline([
    ('FeatureEngineering', MyTransformer()),
    ('Scaling', MinMaxScaler()),
    ('FeatureSelection', SelectKBest(f_regression, k=5)),
    ('Linear', LinearRegression())
])

In [122]:
pipe.fit(X, y)

Pipeline(steps=[('FeatureEngineering', MyTransformer()),
                ('Scaling', MinMaxScaler()),
                ('FeatureSelection',
                 SelectKBest(k=5,
                             score_func=<function f_regression at 0x7be979c5c950>)),
                ('Linear', LinearRegression())])

In [123]:
with open('my_new_pipeline.pkl', 'wb') as output:
    pickle.dump(pipe, output)

Десереализируем полученный пайплайн и попробуем сделать предсказание

In [124]:
features = np.array([[ 0.00538306, -0.04464164,  0.05954058, -0.05616605,  0.02457414, 0.05286081, -0.04340085,  0.05091436, -0.00421986, -0.03007245]])

In [125]:
#Десериализуем pipeline из файла
with open('my_new_pipeline.pkl', 'rb') as pkl_file:
    loaded_pipe = pickle.load(pkl_file)

In [126]:
loaded_pipe.predict(features)

array([173.01938844])

### 4. Библиотека JOBLIB

Модуль **joblib** более надёжен для работы с объектами, которые содержат большие массивы данных. Минус этого модуля заключается в том, что он может консервировать только в файл, поэтому мы не сможем получить объект в виде бинарной строки и работать с ним. Сериализация происходит с помощьб функции dump(), а десериализация - с помощью load()

In [127]:
import joblib

X, y = load_diabetes(return_X_y=True)

regressor = LinearRegression()
regressor.fit(X, y)

joblib.dump(regressor, 'regr.joblib')

['regr.joblib']

In [128]:
regr_from_joblib = joblib.load('regr.joblib')

In [131]:
all(regressor.predict(X) == regr_from_joblib.predict(X))

True

## Практика

In [5]:
import pickle
import numpy as np

#model.pkl
with open('model.pkl', 'rb') as pkl_file:
    model = pickle.load(pkl_file)

secret word: skillfactory
how is this possible? answer is here: https://youtu.be/xm-A-h9QkXg


In [3]:
model

LinearRegression(normalize=False, positive=True)

In [8]:
features = np.array([[1, 1, 1, 0.661212487096872]])

In [9]:
test = model.predict(features)
test

array([0.666])