In [42]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
from matplotlib import pyplot as plt

import click
import logging
import pandas as pd

from pathlib import Path
from dotenv import find_dotenv, load_dotenv
from src.utils import save_as_pickle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

from catboost import CatBoostRegressor
from category_encoders.count import CountEncoder

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [24]:
raw_path = '../data/raw/'
train = pd.read_csv(os.path.join(raw_path, 'train.csv'))
test = pd.read_csv(os.path.join(raw_path, 'test.csv'))

Препроцессинг

In [25]:
TARGET_COL = 'SalePrice'
ID_COL = 'Id'
CAT_COLS = [
    'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 
    'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
    'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
    'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish',
    'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition'
]
REAL_COLS = [
    'MSSubClass', 'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
    'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
    'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars',
    'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold',
    'YrSold'
]


RS = 7

In [26]:
def fill_fields(df: pd.DataFrame) -> pd.DataFrame:
    df['FireplaceQu'] = df['FireplaceQu'].fillna('No Fireplace')
    df['BsmtQual'] = df['BsmtQual'].fillna('No Basement')
    df['BsmtCond'] = df['BsmtCond'].fillna('No Basement')
    df['BsmtExposure'] = df['BsmtExposure'].fillna('No Basement')
    df['BsmtFinType1'] = df['BsmtFinType1'].fillna('No Basement')
    df['BsmtFinType2'] = df['BsmtFinType2'].fillna('No Basement')
    df['MasVnrArea'] = df['MasVnrArea'].fillna(0)
    df['MasVnrType'] = df['MasVnrType'].fillna('None')
    df['Alley'] = df['Alley'].fillna('No alley access')
    most_freq = df['LotFrontage'].value_counts().index[0]
    df['LotFrontage'] = df['LotFrontage'].fillna(most_freq)
    df['GarageType'] = df['GarageType'].fillna('No Garage')
    df['GarageFinish'] = df['GarageFinish'].fillna('No Garage')
    df['GarageQual'] = df['GarageQual'].fillna('No Garage')
    df['GarageCond'] = df['GarageCond'].fillna('No Garage')
    df['PoolQC'] = df['PoolQC'].fillna('No Pool')
    df['Fence'] = df['Fence'].fillna('No Fence')
    df['MiscFeature'] = df['MiscFeature'].fillna('None')
    return df


def cast_types(df: pd.DataFrame) -> pd.DataFrame:
    df[ID_COL] = df[ID_COL].astype(np.int8)
    df[CAT_COLS] = df[CAT_COLS].astype('category')
    df[REAL_COLS] = df[REAL_COLS].astype(np.float32)
    return df


def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    df = fill_fields(df)
    df = cast_types(df)
    return df


def preprocess_target(df: pd.DataFrame) -> pd.DataFrame:
    df[TARGET_COL] = df[TARGET_COL].astype(np.int32)
    return df


def extract_target(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    df, target = df.drop(TARGET_COL, axis=1), df[TARGET_COL].copy()
    return df, target

In [27]:
train_data, train_target =  extract_target(train)

In [28]:
train_data = preprocess_data(train)

MODELING

In [29]:
from sklearn.model_selection import train_test_split
train_data, val_data, train_target, val_target = train_test_split(train_data, train_target, test_size=0.4, random_state=7)

In [30]:
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='NA')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))
    ])

real_pipeline = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())
    ])

sklearn classical linear regressors

In [39]:
sk_model = LinearRegression(positive=True)

sk_preprocess_pipeline = ColumnTransformer(transformers=[
    ('real_cols', real_pipeline, REAL_COLS),
    ('cat_cols', cat_pipeline, CAT_COLS),
    ])

sk_linear_regression_model = Pipeline([
    ('preprocess', sk_preprocess_pipeline),
    ('model', sk_model)
    ])

In [40]:
sk_linear_regression_model.fit(train_data, train_target)

In [43]:
res_predict = sk_linear_regression_model.predict(val_data)

In [44]:
train_score = sk_linear_regression_model.score(train_data, train_target)
test_score = sk_linear_regression_model.score(val_data, val_target)
print(train_score, test_score)

0.9516951217925221 -2.2010491852541612e+24


In [45]:
mean_absolute_error(val_target, res_predict)

1.1616360467623412e+16

In [46]:
mean_squared_error(val_target, res_predict)

1.475419709401162e+34

In [47]:
r2_score(val_target, res_predict)

-2.2010491852541612e+24

catboost

In [50]:
catboost_model = CatBoostRegressor(iterations=1000,
                                   learning_rate=1,
                                   depth=2)

catboost_preprocess_pipeline = ColumnTransformer(transformers=[
    ('real_cols', real_pipeline, REAL_COLS),
    ('cat_cols', cat_pipeline, CAT_COLS),
    ('cat_bost_cols', CountEncoder(), CAT_COLS)
    ])

rscv = GridSearchCV(
    estimator=catboost_model,
    param_grid={'learning_rate': [0.03, 0.1],
                'depth': [2, 4],
                'l2_leaf_reg': [0.2, 0.5],
                'model_size_reg': [0.5, 1]},
    scoring='explained_variance',
    cv=5,
    refit=True
)

catboost_regression_model = Pipeline([
    ('preprocess', catboost_preprocess_pipeline),
    ('model', rscv)
    ])

In [51]:
catboost_regression_model.fit(train_data, train_target)

0:	learn: 77218.0891564	total: 154ms	remaining: 2m 34s
1:	learn: 75958.2201208	total: 158ms	remaining: 1m 18s
2:	learn: 74724.1266812	total: 160ms	remaining: 53.1s
3:	learn: 73522.7697696	total: 162ms	remaining: 40.3s
4:	learn: 72467.0382870	total: 164ms	remaining: 32.6s
5:	learn: 71407.6170677	total: 166ms	remaining: 27.5s
6:	learn: 70328.9718076	total: 168ms	remaining: 23.8s
7:	learn: 69246.9899297	total: 170ms	remaining: 21s
8:	learn: 68149.4366635	total: 173ms	remaining: 19.1s
9:	learn: 67320.0016316	total: 175ms	remaining: 17.3s
10:	learn: 66297.9346501	total: 177ms	remaining: 15.9s
11:	learn: 65349.4472083	total: 179ms	remaining: 14.7s
12:	learn: 64427.0592160	total: 181ms	remaining: 13.7s
13:	learn: 63641.3092352	total: 183ms	remaining: 12.9s
14:	learn: 62765.6151562	total: 189ms	remaining: 12.4s
15:	learn: 61979.9243567	total: 192ms	remaining: 11.8s
16:	learn: 61132.6065948	total: 194ms	remaining: 11.2s
17:	learn: 60319.8014694	total: 196ms	remaining: 10.7s
18:	learn: 59555.507

1 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\TiNa\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\TiNa\AppData\Local\Programs\Python\Python39\lib\site-packages\catboost\core.py", line 5730, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline,
  File "c:\Users\TiNa\AppData\Local\Programs\Python\Python39\lib\site-packages\catboost\core.py", line 2355, in _fit
    self._train(
  File "c

0:	learn: 72990.5605582	total: 4.55ms	remaining: 4.55s
1:	learn: 68884.7346632	total: 8.07ms	remaining: 4.03s
2:	learn: 65292.3551668	total: 14.2ms	remaining: 4.71s
3:	learn: 61771.0731162	total: 17.9ms	remaining: 4.46s
4:	learn: 58241.5165513	total: 21.2ms	remaining: 4.21s
5:	learn: 55541.3244348	total: 24.5ms	remaining: 4.05s
6:	learn: 52889.0341290	total: 29.6ms	remaining: 4.2s
7:	learn: 50241.8078454	total: 32.8ms	remaining: 4.07s
8:	learn: 48058.6473839	total: 36.1ms	remaining: 3.97s
9:	learn: 46112.5227316	total: 39.2ms	remaining: 3.88s
10:	learn: 44081.9790163	total: 43.9ms	remaining: 3.95s
11:	learn: 42348.1479992	total: 47.5ms	remaining: 3.91s
12:	learn: 40832.5038040	total: 50.7ms	remaining: 3.85s
13:	learn: 39509.9232781	total: 54ms	remaining: 3.8s
14:	learn: 38335.3405971	total: 57.2ms	remaining: 3.75s
15:	learn: 37216.1068318	total: 97.6ms	remaining: 6s
16:	learn: 36047.5081527	total: 105ms	remaining: 6.08s
17:	learn: 34731.7513897	total: 113ms	remaining: 6.16s
18:	learn: 

In [52]:
res_predict = catboost_regression_model.predict(val_data)

In [53]:
train_score = catboost_regression_model.score(train_data, train_target)
test_score = catboost_regression_model.score(val_data, val_target)
print(train_score, test_score)

0.9994369287442121 0.8815518297012481


In [54]:
mean_absolute_error(val_target, res_predict)

15849.295104626088

In [55]:
mean_squared_error(val_target, res_predict)

795090378.6454457

In [56]:
r2_score(val_target, res_predict)

0.8813874439273095