In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error, mean_absolute_error
import lightgbm as lgb
import mlflow
import mlflow.lightgbm
import optuna

In [2]:
data = np.load("../data/processed/max_inputs.npy")
data = data.reshape((33133,33*438))
data.shape

(33133, 14454)

In [3]:
output = np.load("../data/processed/max_outputs.npy")
output.shape

(33133, 10)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(data, output, test_size=0.66, random_state=1)

X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.20, random_state=1)

In [5]:
print("Shape of train: ", y_train.shape)
print("Shape of test: ", y_test.shape)
print("Shape of val: ", y_val.shape)

Shape of train:  (11265, 10)
Shape of test:  (17494, 10)
Shape of val:  (4374, 10)


In [6]:
mlflow.set_tracking_uri("http://localhost:5000")

In [7]:
mlflow.set_experiment("/GradientBoosting")

<Experiment: artifact_location='mlflow-artifacts:/511671160559185878', creation_time=1721899131124, experiment_id='511671160559185878', last_update_time=1721899131124, lifecycle_stage='active', name='/GradientBoosting', tags={}>

In [8]:
target_idx = 0
train_data = lgb.Dataset(X_train, label=y_train[:,target_idx])
valid_data = lgb.Dataset(X_val, label=y_val[:,target_idx], reference=train_data)
def objective(trial):
    params = {
        'objective': 'regression_l2',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 20, 1500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0, log=True),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0, log=True),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100)
    }
    
    with mlflow.start_run():
        model = lgb.train(
            params,
            train_data,
            num_boost_round=1000,
            valid_sets=[valid_data],
            callbacks=[lgb.early_stopping(stopping_rounds=50)],
        )
        
        mlflow.lightgbm.log_model(model, "model")
        mlflow.log_params(params)
        
        y_pred = model.predict(X_test, num_iteration=model.best_iteration)
        
        mse = mean_squared_error(y_test[:,target_idx], y_pred)
        mae = mean_absolute_error(y_test[:,target_idx], y_pred)
        mape = mean_absolute_percentage_error(y_test[:,target_idx], y_pred)
        r2 = r2_score(y_test[:,target_idx], y_pred)
        
        # Логгирование метрик
        mlflow.log_metric("mse", mse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mape", mape)
    
        return mse

In [None]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

[I 2024-07-25 12:54:05,672] A new study created in memory with name: no-name-ac07d1a5-4988-4d49-b4c1-ad71a62d70b9


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.564521 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2966656
[LightGBM] [Info] Number of data points in the train set: 11265, number of used features: 11748
[LightGBM] [Info] Start training from score 224.158117
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[33]	valid_0's rmse: 20.2481


[I 2024-07-25 12:59:18,399] Trial 0 finished with value: 430.1767741307378 and parameters: {'num_leaves': 241, 'learning_rate': 0.13835915756533473, 'feature_fraction': 0.8742576853466161, 'bagging_fraction': 0.8777820413914257, 'bagging_freq': 5, 'min_child_samples': 11}. Best is trial 0 with value: 430.1767741307378.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.057269 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2966656
