## Импорт необходимого функционала

In [1]:
import pickle
import warnings

import numpy as np
import pandas as pd

import optuna
import xgboost
import catboost as cb
from lightgbm import LGBMRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import (
    RandomForestRegressor, ExtraTreesRegressor, BaggingRegressor, VotingRegressor
)

warnings.filterwarnings("ignore")

## Загрузка данных

In [2]:
# Read the training table from the data directory next to the notebook folder.
train = pd.read_csv("../data/mars-train-regr.csv")

# 'Доля сигнала в ВП' is the regression target; it and the two phase columns
# are excluded from the feature matrix.
target = train['Доля сигнала в ВП']
data = train.drop(columns=['Доля сигнала в ВП', 'Фаза Hor', 'Фаза Ver'])

## Подбор гиперпараметров для базовых регрессоров

### LGBM

In [3]:
def objective(trial,data=data,target=target):
    """Optuna objective: hold-out R^2 of an LGBMRegressor for one sampled configuration.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        data: Feature matrix; split 80/20 with a fixed seed on every call.
        target: Regression target aligned with ``data``.

    Returns:
        R^2 of the fitted model on the 20% hold-out part (higher is better).

    Note:
        The same hold-out part drives early stopping and produces the returned
        score, so the value is an optimistic estimate.
    """
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2,random_state=42)
    param = {
        'metric': 'l2',
        'random_state': 48,
        # Upper bound only: early stopping in fit() decides how many trees are kept.
        'n_estimators': 20000,
        # suggest_float(log=True) replaces the deprecated suggest_loguniform and
        # matches the style of the other objectives in this notebook.
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        # NOTE(review): LightGBM applies `subsample` only when `subsample_freq` > 0,
        # which is not set here, so this dimension may have no effect - confirm.
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        # LightGBM requires num_leaves > 1, so the search starts at 2.
        'num_leaves' : trial.suggest_int('num_leaves', 2, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        # The trial name equals the real LightGBM parameter name, so
        # study.best_params can be passed straight to LGBMRegressor.
        'cat_smooth' : trial.suggest_int('cat_smooth', 1, 100)
    }

    model = LGBMRegressor(**param)
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=100,verbose=False)
    preds = model.predict(test_x)

    return r2_score(test_y, preds)

In [4]:
# Search for the LGBM configuration with the highest hold-out R^2 (333 trials),
# using the most recently defined `objective`.
study = optuna.create_study(direction="maximize")
study.optimize(func=objective, n_trials=333, show_progress_bar=True)

Trial 37 finished with value: 0.726211271267521 and parameters:  {'reg_alpha': 0.0016989292739716326, 'reg_lambda': 0.12308815195780395, 'colsample_bytree': 0.7, 'subsample': 1.0, 'learning_rate': 0.017, 'max_depth': 100, 'num_leaves': 998, 'min_child_samples': 1, 'min_data_per_groups': 72}  

Best is trial 37 with value: 0.726211271267521.

### CatBoost

In [5]:
def objective(trial,data=data,target=target):
    """Optuna objective: hold-out R^2 of a CatBoostRegressor for one sampled configuration.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        data: Feature matrix; split 80/20 with a fixed seed on every call.
        target: Regression target aligned with ``data``.

    Returns:
        R^2 of the fitted model on the 20% hold-out part (higher is better).

    Note:
        The same hold-out part drives early stopping and produces the returned
        score, so the value is an optimistic estimate.
    """
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2,random_state=42)

    params = {
        "iterations": 1000,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 10, log=True),
        "random_strength": trial.suggest_float("random_strength", 1e-3, 10, log=True),
        "border_count": trial.suggest_int("border_count", 1, 255),
        "grow_policy":trial.suggest_categorical("grow_policy", ["Lossguide", "Depthwise"])
    }

    model = cb.CatBoostRegressor(**params, silent=True)
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=100,verbose=False)
    preds = model.predict(test_x)
    # The original stored the metric in a misspelled variable and then returned
    # an undefined name `score`, which raised NameError on the first trial.
    score = r2_score(test_y, preds)

    return score

In [6]:
# Search for the CatBoost configuration with the highest hold-out R^2 (150 trials).
study = optuna.create_study(direction='maximize')
# Keyword fixed: `show_pogress_bar` is not an argument of Study.optimize and raised TypeError.
study.optimize(objective, n_trials=150, show_progress_bar=True)

{'learning_rate': 0.0528980351340532,  
 'depth': 9,  
 'subsample': 0.551594034976896,  
 'colsample_bylevel': 0.8850522337727407,  
 'min_data_in_leaf': 1,  
 'l2_leaf_reg': 0.08640360417488033,  
 'random_strength': 0.7384089227664642,  
 'border_count': 222,  
 'grow_policy': 'Depthwise'}  

### XGBoost

In [7]:
def objective(trial,data=data,target=target):
    """Optuna objective: hold-out R^2 of an XGBRegressor for one sampled configuration.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        data: Feature matrix; split 80/20 with a fixed seed on every call.
        target: Regression target aligned with ``data``.

    Returns:
        R^2 of the fitted model on the 20% hold-out part (higher is better).
    """
    # Fixed seed: every trial is evaluated on the same hold-out rows.
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, test_size=0.2, random_state=42
    )

    # Sampling order is kept as-is, because it determines what the sampler draws.
    xgb_params = dict(
        max_depth=trial.suggest_int('max_depth', 1, 10),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 1.0),
        n_estimators=trial.suggest_int('n_estimators', 50, 1000),
        min_child_weight=trial.suggest_int('min_child_weight', 1, 10),
        gamma=trial.suggest_float('gamma', 0.01, 1.0),
        subsample=trial.suggest_float('subsample', 0.01, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.01, 1.0),
        reg_alpha=trial.suggest_float('reg_alpha', 0.01, 1.0),
        reg_lambda=trial.suggest_float('reg_lambda', 0.01, 1.0),
        scale_pos_weight=trial.suggest_float('scale_pos_weight', 0.1, 10),
        booster=trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
        grow_policy=trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
    )

    regressor = xgboost.XGBRegressor(**xgb_params)
    # The hold-out part is used both for early stopping and for the returned score.
    regressor.fit(
        train_x,
        train_y,
        eval_set=[(test_x, test_y)],
        early_stopping_rounds=100,
        verbose=False,
    )

    return r2_score(test_y, regressor.predict(test_x))

In [8]:
# Search for the XGBoost configuration with the highest hold-out R^2 (150 trials),
# using the most recently defined `objective`.
study = optuna.create_study(direction="maximize")
study.optimize(func=objective, n_trials=150, show_progress_bar=True)

{'max_depth': 8,  
 'learning_rate': 0.10313141111769379,  
 'n_estimators': 812,  
 'min_child_weight': 5,  
 'gamma': 0.011317523973762036,  
 'subsample': 0.8394922128769058,  
 'colsample_bytree': 0.7968044965438489,  
 'reg_alpha': 0.5652719437700845,  
 'reg_lambda': 0.17657257431955833,  
 'scale_pos_weight': 1.7063203107667038,  
 'booster': 'gbtree',  
 'grow_policy': 'depthwise'}  

## Обучение моделей

In [9]:
# 90/10 split with a fixed seed; the models in the following cells are fitted on
# the 90% part and scored on the 10% part.
train_x, test_x, train_y, test_y = train_test_split(
    data,
    target,
    test_size=0.1,
    random_state=42,
)

In [10]:
# Baseline random forest: 100 trees, fixed seed.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(train_x, train_y)
rf_preds = rf_model.predict(test_x)
rf_r2 = r2_score(y_true=test_y, y_pred=rf_preds)
print(f'Random Forest R^2 Score: {rf_r2}')

# Baseline extra-trees ensemble with the same size and seed.
et_model = ExtraTreesRegressor(n_estimators=100, random_state=42).fit(train_x, train_y)
et_preds = et_model.predict(test_x)
et_r2 = r2_score(y_true=test_y, y_pred=et_preds)
print(f'Extra Trees R^2 Score: {et_r2}')

Random Forest R^2 Score: 0.7345881713066735
Extra Trees R^2 Score: 0.7413464130778715


In [11]:
# Second random forest: more trees, with depth capped at 15 and a larger split threshold.
rf_model_2 = RandomForestRegressor(
    n_estimators=200,
    max_depth=15,
    min_samples_split=4,
    random_state=42,
).fit(train_x, train_y)
rf_preds_2 = rf_model_2.predict(test_x)
rf_r2_2 = r2_score(y_true=test_y, y_pred=rf_preds_2)
print(f'Random Forest 2 R^2 Score: {rf_r2_2}')

# Second extra-trees ensemble: same idea, depth capped at 20.
et_model_2 = ExtraTreesRegressor(
    n_estimators=200,
    max_depth=20,
    min_samples_split=4,
    random_state=42,
).fit(train_x, train_y)
et_preds_2 = et_model_2.predict(test_x)
et_r2_2 = r2_score(y_true=test_y, y_pred=et_preds_2)
print(f'Extra Trees 2 R^2 Score: {et_r2_2}')

Random Forest 2 R^2 Score: 0.6963753869353264
Extra Trees 2 R^2 Score: 0.7377258356408415


In [12]:
# Exhaustive search over the bagging settings: 3 * 3 * 3 * 2 * 2 = 108 candidates.
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_samples': [0.5, 0.8, 1.0],
    'max_features': [0.5, 0.8, 1.0],
    'bootstrap': [True, False],
    'bootstrap_features': [True, False]
}

grid_search = GridSearchCV(
    # The base learner is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2, so the positional
    # form works on both sides of the rename.
    estimator=BaggingRegressor(DecisionTreeRegressor(), random_state=42),
    param_grid=param_grid,
    scoring=make_scorer(r2_score),
    cv=5,
    n_jobs=-1,
    verbose=2
)

grid_search.fit(train_x, train_y)

# best_score_ is the mean cross-validated R^2 of the winning candidate.
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best R^2 Score: {grid_search.best_score_}')

# Score the refitted best estimator on the 10% hold-out part.
best_bagging_model = grid_search.best_estimator_
test_preds = best_bagging_model.predict(test_x)
test_r2 = r2_score(test_y, test_preds)
print(f'Test R^2 Score: {test_r2}')

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'bootstrap': False, 'bootstrap_features': True, 'max_features': 1.0, 'max_samples': 0.8, 'n_estimators': 100}
Best R^2 Score: 0.7462266488924996
Test R^2 Score: 0.7485079360433033


{'bootstrap': False,  
'bootstrap_features': True,  
'max_features': 1.0,  
'max_samples': 0.8,  
'n_estimators': 100}

In [13]:
# Rebuild the bagging ensemble with the grid-search winner hard-coded, so this
# cell can run without repeating the search.
best_bagging_model = BaggingRegressor(
    # Passed positionally: the keyword was renamed from `base_estimator` to
    # `estimator` in scikit-learn 1.2, and the positional form works with both.
    DecisionTreeRegressor(),
    random_state=42,
    bootstrap=False,
    bootstrap_features=True,
    max_features=1.0,
    max_samples=0.8,
    n_estimators=100
)

best_bagging_model.fit(train_x, train_y)
test_preds = best_bagging_model.predict(test_x)
test_r2 = r2_score(test_y, test_preds)
print(f'Test R^2 Score: {test_r2}')

Test R^2 Score: 0.7485079360433033


In [14]:
# Bagging over extra-trees ensembles. The base learner is passed positionally
# because its keyword was renamed from `base_estimator` to `estimator` in
# scikit-learn 1.2; the positional form works with both.
bag_ext = BaggingRegressor(ExtraTreesRegressor(), random_state=5432)
bag_ext.fit(train_x, train_y)

r2_score(test_y, bag_ext.predict(test_x))

0.73877049801193

In [16]:
# Bagging over random forests. The base learner is passed positionally because
# its keyword was renamed from `base_estimator` to `estimator` in scikit-learn
# 1.2; the positional form works with both.
bag_rf = BaggingRegressor(
    RandomForestRegressor(n_estimators=100, random_state=42),
    random_state=5432
)
bag_rf.fit(train_x, train_y)

r2_score(test_y, bag_rf.predict(test_x))

0.724217420411354

In [17]:
# Bagging over default extra-trees ensembles (same configuration as `bag_ext`).
bag_grid = BaggingRegressor(ExtraTreesRegressor(), random_state=5432).fit(train_x, train_y)

r2_score(y_true=test_y, y_pred=bag_grid.predict(test_x))

0.73877049801193

In [18]:
np.random.seed(5432)

# Voting ensemble of three tree-based learners and the three tuned boosting
# models; the boosting hyperparameters are the best values found by the Optuna
# studies in the tuning section.
vot_reg = VotingRegressor(
        estimators=[
            ("dt", DecisionTreeRegressor(random_state=5432)),
            ("ex_dt", ExtraTreesRegressor(random_state=5432)),
            ("rf", RandomForestRegressor(random_state=5432)),
            ("cb", cb.CatBoostRegressor(
                learning_rate = 0.0528980351340532,
                depth = 9, # best value from the study
                subsample = 0.551594034976896,
                colsample_bylevel = 0.8850522337727407,
                min_data_in_leaf = 1,
                l2_leaf_reg = 0.08640360417488033,
                random_strength = 0.7384089227664642,
                border_count = 222,
                grow_policy = 'Depthwise')),
            ("xgb", xgboost.XGBRegressor(max_depth = 8,
                learning_rate = 0.10313141111769379,
                n_estimators = 812, # best value from the study
                min_child_weight = 5,
                gamma = 0.011317523973762036,
                subsample = 0.8394922128769058,
                colsample_bytree = 0.7968044965438489,
                reg_alpha = 0.5652719437700845,
                reg_lambda = 0.17657257431955833,
                scale_pos_weight = 1.7063203107667038,
                booster = 'gbtree',
                grow_policy = 'depthwise')),
            # NOTE(review): the LGBM study ran with n_estimators=20000 plus early
            # stopping; n_estimators is not set here - confirm this is intended.
            ("lgbm", LGBMRegressor(reg_alpha = 0.0016989292739716326, 
                           reg_lambda = 0.12308815195780395, 
                           colsample_bytree = 0.7, 
                           subsample = 1.0, 
                           learning_rate = 0.017, 
                           max_depth = 100, 
                           num_leaves = 998, 
                           min_child_samples = 1, 
                           # The study sampled this value for `cat_smooth` (under the
                           # trial name 'min_data_per_groups', which is not a
                           # LightGBM parameter).
                           cat_smooth = 72
                          ))
            
        ],
        n_jobs=-1,
        verbose=1
    )

vot_reg.fit(train_x, train_y)

vot_reg_r2 = r2_score(test_y, vot_reg.predict(test_x))
# Label corrected: this is the voting ensemble's score, not Extra Trees'.
print(f'Voting Regressor R^2 Score: {vot_reg_r2}')

Extra Trees R^2 Score: 0.7371878268423608


In [19]:
# Stand-alone boosting models configured with the best hyperparameters found by
# the Optuna studies in the tuning section.

# NOTE(review): the LGBM study ran with n_estimators=20000 plus early stopping;
# n_estimators is not set here - confirm this is intended.
lgbm_model = LGBMRegressor(
    reg_alpha = 0.0016989292739716326, 
    reg_lambda = 0.12308815195780395, 
    colsample_bytree = 0.7, 
    subsample = 1.0, 
    learning_rate = 0.017, 
    max_depth = 100, 
    num_leaves = 998, 
    min_child_samples = 1, 
    # The study sampled this value for `cat_smooth` (under the trial name
    # 'min_data_per_groups', which is not a LightGBM parameter).
    cat_smooth = 72
)


catboost_model = cb.CatBoostRegressor(
    learning_rate = 0.0528980351340532,
    depth = 9,
    subsample = 0.551594034976896,
    colsample_bylevel = 0.8850522337727407,
    min_data_in_leaf = 1,
    l2_leaf_reg = 0.08640360417488033,
    random_strength = 0.7384089227664642,
    border_count = 222,
    grow_policy = 'Depthwise'
)

xgb_model = xgboost.XGBRegressor(
    max_depth = 8,
    learning_rate = 0.10313141111769379,
    n_estimators = 812,
    min_child_weight = 5,
    gamma = 0.011317523973762036,
    subsample = 0.8394922128769058,
    colsample_bytree = 0.7968044965438489,
    reg_alpha = 0.5652719437700845,
    reg_lambda = 0.17657257431955833,
    scale_pos_weight = 1.7063203107667038,
    booster = 'gbtree',
    grow_policy = 'depthwise'
)

In [20]:
# Same arguments and seed as the earlier train_x/test_x split, so X_val holds
# the same rows as test_x.
X_train, X_val, y_train, y_val = train_test_split(
    data,
    target,
    test_size=0.1,
    random_state=42,
)

In [21]:
# Fit the tuned LightGBM model and show its R^2 on the validation part.
lgbm_model.fit(X_train, y_train)

r2_score(y_true=y_val, y_pred=lgbm_model.predict(X_val))



0.6943370984086179

In [22]:
# Fit the tuned CatBoost model (training log suppressed) and show its R^2 on the validation part.
catboost_model.fit(X_train, y_train, silent=True)

r2_score(y_true=y_val, y_pred=catboost_model.predict(X_val))

0.7354686736948759

In [23]:
# Fit the tuned XGBoost model and show its R^2 on the validation part.
xgb_model.fit(X_train, y_train)

r2_score(y_true=y_val, y_pred=xgb_model.predict(X_val))

0.7255611845952585

In [24]:
# Validation-set predictions of every fitted base model. Each vector becomes one
# column of the meta-feature matrix, in the order used for X_meta below.

# Tuned boosting models and the baseline forests.
catboost_preds = catboost_model.predict(X_val)
lgbm_preds = lgbm_model.predict(X_val)
xgb_preds = xgb_model.predict(X_val)
rf_preds = rf_model.predict(X_val)
et_preds = et_model.predict(X_val)

# Bagging and voting ensembles.
breg_preds = best_bagging_model.predict(X_val)
vot_preds = vot_reg.predict(X_val)
bag_ext_preds = bag_ext.predict(X_val)
bag_rf_preds = bag_rf.predict(X_val)

# Second forest configurations.
rf_2_preds = rf_model_2.predict(X_val)
et_2_preds = et_model_2.predict(X_val)

X_meta = np.column_stack([
    catboost_preds,
    lgbm_preds,
    xgb_preds,
    rf_preds,
    et_preds,
    breg_preds,
    vot_preds,
    bag_ext_preds,
    bag_rf_preds,
    rf_2_preds,
    et_2_preds,
])

# Linear blender over the base-model predictions.
# NOTE(review): the blender is fitted and scored on the same X_val rows, so the
# number printed below is an in-sample estimate.
meta_model = LinearRegression().fit(X_meta, y_val)
meta_preds = meta_model.predict(X_meta)

r2 = r2_score(y_true=y_val, y_pred=meta_preds)
# NOTE(review): the printed value is 70 * R^2, not R^2 itself; the meaning of
# the factor 70 is not visible in this notebook - confirm.
print(f"R^2 Score for Blended Model on Validation Set: {70 * r2}")

R^2 Score for Blended Model on Validation Set: 53.39263177710193


## Сохраним веса моделей

In [25]:
# (file stem, fitted estimator) pairs; each estimator is pickled to ../models/<stem>.pkl.
models = [
    ("cb", catboost_model),
    ("lgbm", lgbm_model),
    ("xgb", xgb_model),
    ("rf", rf_model),
    ("et", et_model),
    ("best_bagging_model", best_bagging_model),
    ("vot_reg", vot_reg),
    ("bag_ext", bag_ext),
    ("bag_rf", bag_rf),
    ("rf2", rf_model_2),
    ("et2", et_model_2),
    ("meta_lr", meta_model),
]

for stem, estimator in models:
    with open(f"../models/{stem}.pkl", "wb") as fh:
        pickle.dump(estimator, fh)

---