In [24]:
import os

# Ask child Python processes started from this kernel to use UTF-8 for their
# standard streams. `!set VAR=value` only modified the environment of a
# short-lived shell, so the setting never reached the kernel; writing to
# os.environ changes the kernel's own environment, which children inherit.
# NOTE: this does not re-encode the streams of the already-running kernel.
# The former `!jupyter notebook` line was removed: it launched a second
# notebook server from inside a cell and blocked until it was interrupted.
os.environ["PYTHONIOENCODING"] = "utf-8"

^C


In [26]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import config
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

In [2]:
# Read the table referenced by the `train_with_folds` config path, then pull the
# target column name and the number of folds from the validation settings.
_paths_cfg = config.CONFIG['paths']
df = pd.read_csv(_paths_cfg['train_with_folds'])

_validation_cfg = config.CONFIG['validation']
TARGET_COL = _validation_cfg['target_column']
N_SPLITS = _validation_cfg['n_splits']

print('Таргет', TARGET_COL, 'Фолды', N_SPLITS)

Таргет SalePrice Фолды 5


In [3]:
# Every numeric column is a feature, except the target and the fold label.
_non_feature_cols = (TARGET_COL, 'fold')

feature_cols = []
for _col in df.columns:
    if _col in _non_feature_cols:
        continue
    if pd.api.types.is_numeric_dtype(df[_col]):
        feature_cols.append(_col)

print('Число фич', len(feature_cols))

Число фич 288


# считаем RMSE

In [4]:
def rmse(y_true, y_pred):
    """Return the root of ``mean_squared_error(y_true, y_pred)``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [14]:
def run_cv_regression(model_cls, model_params, df, target_col=TARGET_COL, n_splits=N_SPLITS, features=None):
    """Cross-validate a regressor on the folds already stored in ``df['fold']``.

    For every fold id in ``range(n_splits)`` a fresh ``model_cls(**model_params)``
    is fitted on the rows whose ``fold`` value differs from that id, used to
    predict the rows whose ``fold`` value equals it, and scored with ``rmse``.
    Per-fold scores and their mean / standard deviation are printed.

    Parameters
    ----------
    model_cls : callable
        Called as ``model_cls(**model_params)``; the result must provide
        ``fit(X, y)`` and ``predict(X)``.
    model_params : dict or None
        Keyword arguments for ``model_cls``. ``None`` means no arguments.
    df : pandas.DataFrame
        Must contain a ``fold`` column, ``target_col`` and all feature columns.
    target_col : str
        Name of the target column.
    n_splits : int
        Number of fold ids to iterate over (``0 .. n_splits - 1``).
    features : sequence of str, optional
        Feature column names. When omitted, the notebook-level ``feature_cols``
        list is used, which keeps existing calls working unchanged.

    Returns
    -------
    oof : numpy.ndarray
        Out-of-fold predictions, one per row of ``df``. Rows whose fold value
        is not in ``range(n_splits)`` keep the initial ``0.0``.
    scores : list
        RMSE of each fold, in fold order.

    Raises
    ------
    ValueError
        If some fold has no validation rows or leaves no rows for training.
    """
    columns = feature_cols if features is None else list(features)
    init_kwargs = {} if model_params is None else model_params

    fold_labels = df['fold']
    oof = np.zeros(len(df))
    scores = []

    for fold in range(n_splits):
        # Plain boolean ndarray: valid for .loc row selection and for indexing `oof`.
        val_mask = (fold_labels == fold).to_numpy()
        train_mask = ~val_mask

        # Fail with a clear message instead of deep inside the estimator or metric.
        if not val_mask.any():
            raise ValueError(f"fold {fold} has no validation rows")
        if not train_mask.any():
            raise ValueError(f"fold {fold} leaves no rows for training")

        X_train = df.loc[train_mask, columns]
        y_train = df.loc[train_mask, target_col]
        X_val = df.loc[val_mask, columns]
        y_val = df.loc[val_mask, target_col]

        # A new model per fold so nothing fitted on one split leaks into the next.
        model = model_cls(**init_kwargs)
        model.fit(X_train, y_train)
        preds = model.predict(X_val)

        oof[val_mask] = preds
        sc = rmse(y_val, preds)
        scores.append(sc)
        print(f"fold {fold} : RMSE = {sc:.4f}")

    print(f"mean RMSE: {np.mean(scores):.4f} +- {np.std(scores):.4f}")
    return oof, scores

# Линейная регрессия

In [15]:
# Accumulator for one summary row (model name, mean RMSE, std RMSE) per evaluated model.
results = list()

In [16]:
# Cross-validate LinearRegression with the parameters taken from the project config.
_, s = run_cv_regression(LinearRegression, config.CONFIG['models']['linear_regression'], df)
# Replace any row left by an earlier run of this cell so that re-running it
# does not add a duplicate entry to `results`.
results[:] = [row for row in results if row['model'] != 'LinearRegression']
results.append({'model': 'LinearRegression', 'mean_rmse': np.mean(s), 'std_rmse': np.std(s)})

fold 0 : RMSE = 0.1279
fold 1 : RMSE = 0.1253
fold 2 : RMSE = 0.2403
fold 3 : RMSE = 0.1555
fold 4 : RMSE = 0.1131
mean RMSE: 0.1524 +- 0.0461


# Ridge, Lasso, ElasticNet

In [17]:
# Cross-validate Ridge with the parameters taken from the project config.
_, s = run_cv_regression(Ridge, config.CONFIG['models']['ridge'], df)
# Replace any row left by an earlier run of this cell so that re-running it
# does not add a duplicate entry to `results`.
results[:] = [row for row in results if row['model'] != 'Ridge']
results.append({'model': 'Ridge', 'mean_rmse': np.mean(s), 'std_rmse': np.std(s)})

fold 0 : RMSE = 0.1358
fold 1 : RMSE = 0.1303
fold 2 : RMSE = 0.2242
fold 3 : RMSE = 0.1275
fold 4 : RMSE = 0.1163
mean RMSE: 0.1468 +- 0.0392


In [18]:
# Cross-validate Lasso with the parameters taken from the project config.
_, s = run_cv_regression(Lasso, config.CONFIG['models']['lasso'], df)
# Replace any row left by an earlier run of this cell so that re-running it
# does not add a duplicate entry to `results`.
results[:] = [row for row in results if row['model'] != 'Lasso']
results.append({'model': 'Lasso', 'mean_rmse': np.mean(s), 'std_rmse': np.std(s)})

fold 0 : RMSE = 0.1276
fold 1 : RMSE = 0.1219
fold 2 : RMSE = 0.2256
fold 3 : RMSE = 0.1246
fold 4 : RMSE = 0.1066
mean RMSE: 0.1412 +- 0.0428


In [19]:
# Cross-validate ElasticNet with the parameters taken from the project config.
_, s = run_cv_regression(ElasticNet, config.CONFIG['models']['elasticnet'], df)
# Replace any row left by an earlier run of this cell so that re-running it
# does not add a duplicate entry to `results`.
results[:] = [row for row in results if row['model'] != 'ElasticNet']
results.append({'model': 'ElasticNet', 'mean_rmse': np.mean(s), 'std_rmse': np.std(s)})

fold 0 : RMSE = 0.1257
fold 1 : RMSE = 0.1246
fold 2 : RMSE = 0.2256
fold 3 : RMSE = 0.1276
fold 4 : RMSE = 0.1093
mean RMSE: 0.1426 +- 0.0420


# KNN

In [20]:
# Cross-validate KNeighborsRegressor with the parameters taken from the project config.
_, s = run_cv_regression(KNeighborsRegressor, config.CONFIG['models']['knn'], df)
# Replace any row left by an earlier run of this cell so that re-running it
# does not add a duplicate entry to `results`.
results[:] = [row for row in results if row['model'] != 'KNN']
results.append({'model': 'KNN', 'mean_rmse': np.mean(s), 'std_rmse': np.std(s)})

Exception in thread Thread-275 (_readerthread):
Traceback (most recent call last):
  File "c:\newTry2\condaR\envs\ml_final\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "c:\newTry2\condaR\envs\ml_final\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "c:\newTry2\condaR\envs\ml_final\lib\subprocess.py", line 1515, in _readerthread
    buffer.append(fh.read())
  File "c:\newTry2\condaR\envs\ml_final\lib\codecs.py", line 322, in decode
    (result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8d in position 4: invalid start byte


fold 0 : RMSE = 0.3723
fold 1 : RMSE = 0.3301
fold 2 : RMSE = 0.3078
fold 3 : RMSE = 0.3420
fold 4 : RMSE = 0.2956
mean RMSE: 0.3296 +- 0.0269


# Tree

In [21]:
# Cross-validate DecisionTreeRegressor with the parameters taken from the project config.
_, s = run_cv_regression(DecisionTreeRegressor, config.CONFIG['models']['decision_tree'], df)
# Replace any row left by an earlier run of this cell so that re-running it
# does not add a duplicate entry to `results`.
results[:] = [row for row in results if row['model'] != 'DecisionTree']
results.append({'model': 'DecisionTree', 'mean_rmse': np.mean(s), 'std_rmse': np.std(s)})

fold 0 : RMSE = 0.1915
fold 1 : RMSE = 0.2105
fold 2 : RMSE = 0.2136
fold 3 : RMSE = 0.1932
fold 4 : RMSE = 0.1984
mean RMSE: 0.2014 +- 0.0090


# Random forest

In [25]:
# Cross-validate RandomForestRegressor with the parameters taken from the project config.
_, s = run_cv_regression(RandomForestRegressor, config.CONFIG['models']['random_forest'], df)
# Replace any row left by an earlier run of this cell so that re-running it
# does not add a duplicate entry to `results`.
results[:] = [row for row in results if row['model'] != 'RandomForest']
results.append({'model': 'RandomForest', 'mean_rmse': np.mean(s), 'std_rmse': np.std(s)})

fold 0 : RMSE = 0.1474
fold 1 : RMSE = 0.1275
fold 2 : RMSE = 0.1773
fold 3 : RMSE = 0.1503
fold 4 : RMSE = 0.1245
mean RMSE: 0.1454 +- 0.0190


# Бустинги

In [27]:
# XGBoost parameters from the project config, with the random state pinned to the configured seed.
params = {**config.CONFIG['models']['xgboost'], 'random_state': config.CONFIG['seed']}
_, s = run_cv_regression(xgb.XGBRegressor, params, df)
# Replace any row left by an earlier run of this cell so that re-running it
# does not add a duplicate entry to `results`.
results[:] = [row for row in results if row['model'] != 'XGBoost']
results.append({'model': 'XGBoost', 'mean_rmse': np.mean(s), 'std_rmse': np.std(s)})

fold 0 : RMSE = 0.1292
fold 1 : RMSE = 0.1136
fold 2 : RMSE = 0.1558
fold 3 : RMSE = 0.1248
fold 4 : RMSE = 0.1046
mean RMSE: 0.1256 +- 0.0174


In [28]:
# Cross-validate LGBMRegressor with the parameters taken from the project config.
_, s = run_cv_regression(lgb.LGBMRegressor, config.CONFIG['models']['lightgbm'], df)
# Replace any row left by an earlier run of this cell so that re-running it
# does not add a duplicate entry to `results`.
results[:] = [row for row in results if row['model'] != 'LightGBM']
results.append({'model': 'LightGBM', 'mean_rmse': np.mean(s), 'std_rmse': np.std(s)})

fold 0 : RMSE = 0.1410
fold 1 : RMSE = 0.1205
fold 2 : RMSE = 0.1662
fold 3 : RMSE = 0.1314
fold 4 : RMSE = 0.1137
mean RMSE: 0.1345 +- 0.0184


In [29]:
# Cross-validate CatBoostRegressor with the parameters taken from the project config.
_, s = run_cv_regression(CatBoostRegressor, config.CONFIG['models']['catboost'], df)
# Replace any row left by an earlier run of this cell so that re-running it
# does not add a duplicate entry to `results`.
results[:] = [row for row in results if row['model'] != 'CatBoost']
results.append({'model': 'CatBoost', 'mean_rmse': np.mean(s), 'std_rmse': np.std(s)})

0:	learn: 0.3856104	total: 153ms	remaining: 12m 45s
200:	learn: 0.1197089	total: 716ms	remaining: 17.1s
400:	learn: 0.0918064	total: 1.24s	remaining: 14.2s
600:	learn: 0.0788432	total: 1.72s	remaining: 12.6s
800:	learn: 0.0687025	total: 2.23s	remaining: 11.7s
1000:	learn: 0.0601990	total: 2.74s	remaining: 10.9s
1200:	learn: 0.0537837	total: 3.24s	remaining: 10.2s
1400:	learn: 0.0477912	total: 3.76s	remaining: 9.65s
1600:	learn: 0.0427435	total: 4.26s	remaining: 9.05s
1800:	learn: 0.0385172	total: 4.78s	remaining: 8.49s
2000:	learn: 0.0348256	total: 5.27s	remaining: 7.9s
2200:	learn: 0.0315820	total: 5.78s	remaining: 7.35s
2400:	learn: 0.0286791	total: 6.29s	remaining: 6.81s
2600:	learn: 0.0261069	total: 6.79s	remaining: 6.26s
2800:	learn: 0.0237328	total: 7.31s	remaining: 5.74s
3000:	learn: 0.0215937	total: 7.82s	remaining: 5.21s
3200:	learn: 0.0196647	total: 8.38s	remaining: 4.71s
3400:	learn: 0.0179663	total: 8.94s	remaining: 4.2s
3600:	learn: 0.0164693	total: 9.46s	remaining: 3.67s


In [33]:
# Build the summary table from the collected rows, ordered by mean RMSE in
# descending order, then write it to the configured metrics path and show it.
# NOTE(review): descending order puts the highest-error model first - confirm
# this is the intended ranking direction.
res_df = (
    pd.DataFrame(results)
    .sort_values('mean_rmse', ascending=False)
    .reset_index(drop=True)
)

path_metrics = config.CONFIG['paths']['metrics_results']
res_df.to_csv(path_metrics, index=False)

print('Метрики сохранены:', path_metrics)
print(res_df)


Метрики сохранены: C:\newTry2\classicMLpractice\ProjectKaggle\HousePrices\checkpoints\metrics_results.csv
              model  mean_rmse  std_rmse
0               KNN   0.329556  0.026867
1      DecisionTree   0.201436  0.009022
2  LinearRegression   0.152411  0.046069
3             Ridge   0.146807  0.039202
4      RandomForest   0.145413  0.018973
5        ElasticNet   0.142566  0.042012
6             Lasso   0.141237  0.042777
7          LightGBM   0.134539  0.018364
8           XGBoost   0.125615  0.017377
9          CatBoost   0.123610  0.017683
