In [1]:
import numpy as np
import pandas as pd

from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split

from sklearn import metrics
from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.metrics import r2_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import category_encoders as ce

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

import dill

from sklearn.base import BaseEstimator, TransformerMixin

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Загружаем данные Сбербанка по недвижимости: https://www.kaggle.com/c/sberbank-russian-housing-market/data
path_name = "train.csv"
df = pd.read_csv(path_name, sep=",")


In [3]:
# смотрим сведения о них
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30471 entries, 0 to 30470
Columns: 292 entries, id to price_doc
dtypes: float64(119), int64(157), object(16)
memory usage: 67.9+ MB


In [4]:
# выделяем целевую переменную
y = df["price_doc"]

In [5]:
# убираем "id", "price_doc" из набора признаков
X = df.drop(['id', 'price_doc'], axis=1)

In [6]:
# Обзор номинативных признаков
cat_names = []
for cat_colname in X.select_dtypes(include='object').columns:
    cat_names.append(cat_colname)
    #print(f"{cat_colname} \n\n{X[cat_colname].value_counts()} \n{'*' * 70} \n")

In [7]:
# Обзор численных признаков
num_names = []
for num_colname in X.select_dtypes(include='number').columns:
    num_names.append(num_colname)
    #print(f"{cat_colname} \n\n{X[cat_colname].value_counts()} \n{'*' * 70} \n")

In [8]:
# делаем класс удаления тех колонок, в которых пропусков больше 20%

class empty_drop(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        pass
    
    def fit(self, x, y=None):
        dat = x.copy()
        self.lstRemCols = self._delemptCols(dat)
        return self
    
    def transform(self, x):        
        dat = x.copy()
        lstcols = list(set(dat.columns) - set(self.lstRemCols))
        return dat.loc[:, lstcols]
    
    def _delemptCols(self, df):
        emp_arr = []
        for i in df:
            pct_missing = np.mean(df[i].isnull())
            if (pct_missing != 0) and (pct_missing >= 0.2):
                emp_arr.append(i)        
        return emp_arr


In [9]:
# делаем класс устранения выбросов
class outlier_replace(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        pass
    
    def fit(self, x, y=None):
        dat = x.copy()
        self.lstRemCols = self._indicies_of_outliers(dat)
        return self
    
    def transform(self, x):        
        dat = x.copy()        
        for cat_colname in dat.columns:
            for array_of_index in self.lstRemCols:
                for i in array_of_index:
                    dat.loc[dat[cat_colname] == dat[cat_colname][i], cat_colname] = dat[cat_colname].median()
        return dat
    
    def _indicies_of_outliers(self, df):
        """Возвращает индекс выбросов, используя межквартильный размах
        """
        q1, q3 = np.percentile(df, [25,75])
        iqr = q3 - q1
        lower_bound = q1 - (iqr * 1.5)
        upper_bound = q3 + (iqr * 1.5)
        return np.where((df > upper_bound)|(df < lower_bound))    


In [10]:
# формируем пайплайн для численных признаков
numeric_transformer = Pipeline(steps=[
    ('emptdr',empty_drop(list(X.columns))),
    ('outlier',outlier_replace(list(X.columns))),
    ('imputer', SimpleImputer(strategy='median'))
    ])

In [11]:
# формируем пайплайн для категориальных признаков
categorical_transformer = ce.CatBoostEncoder(return_df=True)

In [12]:
# формируем пайплайн для всех признаков
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, cat_names),
        ('num', numeric_transformer, num_names)
        ])

In [13]:
# формируем результирующий пайплайн
mdl = Pipeline(steps=[('preprocessor', preprocessor),
                     ('scaler', StandardScaler()),
                      ('pca', PCA(n_components=0.98, whiten=True)),
                      ('catboost', CatBoostRegressor(learning_rate=0.1,depth=5,l2_leaf_reg=11,silent=True))])

In [14]:
#разделяем данные на тестовые и тренировочные
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
# тренируем модель и смотрим ее R^2
mdl.fit(x_train, y_train)
print(f"model score R^2: {mdl.score(x_test, y_test):.3f}")

model score R^2: 0.652


In [16]:
# Предсказываем значения
y_predict_ = mdl.predict(x_test)

In [17]:
# смотрим R^2
print(f"R^2: {r2_score(y_test, y_predict_):.3f}")

R^2: 0.652


In [18]:
# смотрим cреднеквадратичную логарифмическую ошибку регрессии
print(f"Средкв лог ошибка рег: {metrics.mean_squared_log_error(y_test,y_predict_):.3f}")

Средкв лог ошибка рег: 0.233


In [19]:
# задаем набор для отбора
param_grid = {
    'preprocessor__num__imputer__strategy': ['median'],    
'catboost__learning_rate': [0.1],
        'catboost__depth': [4, 6],
        'catboost__l2_leaf_reg': [11]}

grid_search = GridSearchCV(mdl, param_grid, cv=10)
grid_search

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('cat',
                                                                         CatBoostEncoder(),
                                                                         ['timestamp',
                                                                          'product_type',
                                                                          'sub_area',
                                                                          'culture_objects_top_25',
                                                                          'thermal_power_plant_raion',
                                                                          'incineration_raion',
                                                                          'oil_chemistry_raion',
                                                                          'radiation_raion',


In [20]:
# начинаем отбор лучших параметров модели
grid_search.fit(x_train, y_train)
print(f"Best params:")
print(grid_search.best_params_)

Best params:
{'catboost__depth': 6, 'catboost__l2_leaf_reg': 11, 'catboost__learning_rate': 0.1, 'preprocessor__num__imputer__strategy': 'median'}


In [21]:
# с учетом лучших отобранных параметров составляем пайплайн

numeric_transformer = Pipeline(steps=[
    ('emptdr',empty_drop(list(X.columns))),
    ('outlier',outlier_replace(list(X.columns))),
    ('imputer', SimpleImputer(strategy=grid_search.best_params_.get('preprocessor__num__imputer__strategy')))
    ])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_names),
        ('cat', categorical_transformer, cat_names)])

mdl2 = Pipeline(steps=[
                      ('preprocessor', preprocessor),
                    ('scaler', StandardScaler()),
                      ('pca', PCA(n_components=0.95, whiten=True)),
                      ('catboost', CatBoostRegressor(learning_rate=grid_search.best_params_.get('catboost__learning_rate'),
                                                     depth=grid_search.best_params_.get('catboost__depth'),
                                                     l2_leaf_reg=grid_search.best_params_.get('catboost__l2_leaf_reg'),
                                                    silent=True))])

In [22]:
# тренируем модель на лучших параметрах
mdl2.fit(x_train, y_train)
print(f"model score R^2: {mdl.score(x_test, y_test):.3f}")


model score R^2: 0.652


In [23]:
# Предсказываем значения
y_predict_2 = mdl2.predict(x_test)

In [24]:
# смотрим R^2
print(f"R^2: {r2_score(y_test, y_predict_2):.3f}")

R^2: 0.650


In [25]:
# смотрим cреднеквадратичную логарифмическую ошибку регрессии
print(f"Средкв лог ошибка рег: {metrics.mean_squared_log_error(y_test, y_predict_2):.3f}")

Средкв лог ошибка рег: 0.232


In [26]:
# смотрим на пайплайн
mdl2.steps

[('preprocessor',
  ColumnTransformer(transformers=[('num',
                                   Pipeline(steps=[('emptdr',
                                                    empty_drop(attribute_names=None)),
                                                   ('outlier',
                                                    outlier_replace(attribute_names=None)),
                                                   ('imputer',
                                                    SimpleImputer(strategy='median'))]),
                                   ['full_sq', 'life_sq', 'floor', 'max_floor',
                                    'material', 'build_year', 'num_room',
                                    'kitch_sq', 'state', 'area_m', 'raion_popul',
                                    'green_zone_part', 'indust_part',
                                    'chi...
                                    'full_all', 'male_f', ...]),
                                  ('cat', CatBoostEncoder(),
        

In [27]:
# сериализуем пайплайн
with open("catboost_pipeline2.dill", "wb") as f:
    dill.dump(mdl2, f)
    