In [1]:
import numpy as np
import xgboost as xgb
import pandas as pd
from sklearn.metrics import r2_score, mean_squared_error
import optuna
import pickle
from datetime import timedelta
import time
import os 
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
import gc
import tensorflow as tf

2024-05-05 17:55:45.069975: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# The test table is read here to obtain its header: every column after the
# first one is treated as a tabular feature name.
# NOTE(review): presumably the first column is the sample id -- confirm.
feat = pd.read_csv('./data/test.csv')
FEATURE_COLS = list(feat.columns[1:])

In [3]:
# Identifier shared by everything in this notebook: it is interpolated into
# the train/test pickle paths, the 'model_features_<study_name>' column name,
# the sampler pickle file names and the Optuna study name.
study_name = '426_convnextbase_003_998_1'

In [4]:
# Load the pickled test and train frames for this study (test first, then
# train, so `pickle_file_path` ends up pointing at the train file).
# NOTE: pickle.load can execute arbitrary code; only load files you produced.
frames = {}
for split in ('test', 'train'):
    pickle_file_path = f'./data/{split}_{study_name}.pickle'
    with open(pickle_file_path, 'rb') as f:
        frames[split] = pickle.load(f)

test_df, train_df = frames['test'], frames['train']

In [5]:
import warnings

# Ignore warnings of a specific type: every UserWarning raised from here on
# is suppressed for the rest of the session.
warnings.filterwarnings('ignore', category=UserWarning)

def get_combined_data(df):
    """Assemble the 2-D feature matrix for the rows of ``df``.

    The matrix holds one column per name in the module-level ``FEATURE_COLS``
    list (which must already be defined), followed by the columns obtained by
    stacking the per-row vectors stored in ``df['combined_features']``.

    Args:
        df: DataFrame containing every ``FEATURE_COLS`` column plus a
            'combined_features' column of per-row vectors.

    Returns:
        numpy array with one row per row of ``df``.
    """
    # Tabular features: one 1-D array per configured column.
    tabular_columns = list(map(lambda name: df[name].values, FEATURE_COLS))
    # Model features: per-row vectors stacked into a (rows, dims) block.
    embedding_block = np.vstack(df['combined_features'].values)
    return np.column_stack([*tabular_columns, embedding_block])

def objective(trial, df, target, fold_train, fold_validation):
    """Optuna objective: train XGBoost on one fold and score it on another.

    Every hyperparameter (booster settings, the optional SelectKBest step,
    number of boosting rounds and early-stopping patience) is sampled from
    ``trial``. The model is trained on the rows of ``fold_train`` with the
    rows of ``fold_validation`` used both for early stopping and for the
    returned score.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        df: DataFrame holding a 'fold' column, the ``target`` column and the
            columns read by ``get_combined_data``.
        target: Name of the column to regress on.
        fold_train: Value of ``df['fold']`` selecting the training rows.
        fold_validation: Value of ``df['fold']`` selecting the validation rows.

    Returns:
        Mean squared error of the trained booster on the validation rows.

    Raises:
        ValueError: If either fold selects no rows.
    """
    param = {
        'objective': 'reg:squarederror',
        'device': 'cuda',
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.9),
        'subsample': trial.suggest_float('subsample', 0.1, 0.9),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'max_depth': trial.suggest_int('max_depth', 2, 20),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 100),
        'feature_selector': trial.suggest_categorical('feature_selector', ['shuffle', 'greedy', 'thrifty', 'cyclic', 'random']),
        # NOTE(review): XGBoost documents this key as 'booster'; under the
        # name 'boosting' the sampled value is presumably ignored. Confirm
        # before renaming -- activating gblinear/dart changes which of the
        # other keys are valid.
        'boosting': trial.suggest_categorical('boosting', ['gbtree', 'gblinear', 'dart']),
    }

    train_data = df[df['fold'] == fold_train]
    valid_data = df[df['fold'] == fold_validation]
    if train_data.empty or valid_data.empty:
        # Fail with a clear message instead of an opaque stacking error.
        raise ValueError(
            f'Fold {fold_train!r} has {len(train_data)} rows and fold '
            f'{fold_validation!r} has {len(valid_data)} rows; both must be non-empty'
        )

    # Build the matrices once; their width is the real number of candidate
    # features available to SelectKBest.
    X_train = get_combined_data(train_data)
    X_valid = get_combined_data(valid_data)
    y_train = train_data[target]
    y_valid = valid_data[target]

    selector = trial.suggest_categorical('selector', ['f_regression', 'mutual_info_regression', 'none'])
    # Bound k by the feature-matrix width, not by the DataFrame column count
    # (which also counts ids, targets, fold labels and list-valued columns).
    # 'num_selected' is sampled even for selector == 'none' so the search
    # space keeps the same set of parameters in every trial.
    num_selected = trial.suggest_int('num_selected', 1, X_train.shape[1])

    if selector == 'f_regression':
        select = SelectKBest(f_regression, k=num_selected)
    elif selector == 'mutual_info_regression':
        select = SelectKBest(mutual_info_regression, k=num_selected)
    else:
        select = None

    if select is not None:
        # Fit the selector on the training fold only, then reuse it as-is.
        X_train = select.fit_transform(X_train, y_train)
        X_valid = select.transform(X_valid)

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)

    num_boost_round = trial.suggest_int('n_estimators', 10, 1000, log=True)
    early_stopping_rounds = trial.suggest_int('early_stopping_rounds', 5, 50)

    # The last entry of the watchlist ('eval') drives early stopping.
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    model = xgb.train(param, dtrain, num_boost_round=num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=False)
    preds = model.predict(dvalid)
    mse = mean_squared_error(y_valid, preds)

    return mse

# Trial parameters that steer the pipeline around XGBoost (feature selection,
# round count, early stopping) and must not be forwarded as booster params.
_SEARCH_ONLY_KEYS = ('selector', 'num_selected', 'n_estimators', 'early_stopping_rounds')


def _load_or_create_sampler(path, label, factory):
    """Return the sampler pickled at ``path``, or a fresh one from ``factory``."""
    if os.path.exists(path):
        # NOTE: pickle.load can execute arbitrary code; these files are only
        # expected to be sampler pickles written by this function's caller.
        with open(path, 'rb') as f:
            print(f'Loading {label} sampler from file {path}')
            return pickle.load(f)
    print(f'Creating new {label} sampler')
    return factory()


def _make_selector(name, k):
    """Build the SelectKBest step for a trial's 'selector' value (None for 'none')."""
    if name == 'f_regression':
        return SelectKBest(f_regression, k=k)
    if name == 'mutual_info_regression':
        return SelectKBest(mutual_info_regression, k=k)
    if name == 'none':
        return None
    raise ValueError(f'Unknown selector {name!r}')


def optimize_model(df, target, fold_train, fold_validation, n_trials=50):
    """Search hyperparameters for ``target``, cross-validate and fit a final model.

    Steps:
      1. Load (or create) three Optuna samplers whose state is pickled under
         ./NN_search, keyed by study name, fold pair and target.
      2. Run ``n_trials`` trials of ``objective`` with each sampler in turn
         (QMC, NSGA-II, TPE) on a study stored in a per-target SQLite file,
         re-pickling each sampler after its stage.
      3. Cross-validate the best trial's configuration over the folds of ``df``.
      4. Fit an ``XGBRegressor`` with that configuration on all rows of ``df``.

    Cross-validation and the final fit rebuild the parameter dict that
    ``objective`` scored the trial with: the sampled booster keys plus the
    fixed 'objective' and 'device' entries, without the search-only keys.

    Args:
        df: DataFrame with a 'fold' column, the ``target`` column and the
            columns read by ``get_combined_data``.
        target: Name of the column to regress on.
        fold_train: Fold value used for training inside ``objective``.
        fold_validation: Fold value used for validation inside ``objective``.
        n_trials: Number of trials to run with each of the three samplers.

    Returns:
        The fitted ``XGBRegressor``. The SelectKBest instance fitted on the
        full data (or None when the best trial used no selection) is attached
        as ``model.selector_``; inputs must be passed through it before
        calling ``model.predict``.
    """
    sampler_dir = './NN_search'
    # Make sure the sampler pickles can be written before spending time on trials.
    os.makedirs(sampler_dir, exist_ok=True)
    prefix = f'{sampler_dir}/{study_name}_{fold_train}_{fold_validation}_{target}'
    gene_path = f'{prefix}_genesampler.pickle'
    qmc_path = f'{prefix}_qmc_sampler.pickle'
    tpe_path = f'{prefix}_tpe_sampler.pickle'

    genemachine = _load_or_create_sampler(
        gene_path, 'gene',
        lambda: optuna.samplers.NSGAIISampler(crossover=optuna.samplers.nsgaii.VSBXCrossover()),
    )
    qmc_sampler = _load_or_create_sampler(
        qmc_path, 'QMC',
        lambda: optuna.samplers.QMCSampler(warn_independent_sampling=False),
    )
    tpe_sampler = _load_or_create_sampler(
        tpe_path, 'TPE',
        lambda: optuna.samplers.TPESampler(n_startup_trials=0, multivariate=True, warn_independent_sampling=False),
    )

    start_time = time.time()
    study = optuna.create_study(direction='minimize',
                                study_name=study_name,
                                storage=f'sqlite:///502_xgboost_{target}.db',
                                load_if_exists=True)

    def run_trial(trial):
        return objective(trial, df, target, fold_train, fold_validation)

    # (sampler, pickle path, label in the "Starting" line, label in the
    #  "finished" line, label in the "Saving" line) -- labels reproduce the
    #  existing log wording verbatim.
    stages = (
        (qmc_sampler, qmc_path, 'qmc', 'QCM', 'QMC'),
        (genemachine, gene_path, 'gene', 'Gene', 'gene'),
        (tpe_sampler, tpe_path, 'TPE', 'TPE', 'TPE'),
    )
    for sampler, path, start_label, done_label, save_label in stages:
        print(f'Starting optimization for {target} with {start_label} sampler')
        stage_start = time.time()
        study.sampler = sampler
        study.optimize(run_trial, n_trials=n_trials)
        print(f'{done_label} optimization finished in {timedelta(seconds=time.time() - stage_start)}')

        print(f'Saving {save_label} sampler to file {path}')
        with open(path, 'wb') as f:
            pickle.dump(sampler, f)

    print(f'Optimization finished in {timedelta(seconds=time.time() - start_time)}')

    best_params = study.best_trial.params
    selector_name = best_params['selector']
    num_selected = best_params['num_selected']
    num_boost_round = best_params['n_estimators']
    early_stopping_rounds = best_params['early_stopping_rounds']

    # Rebuild the dict objective() handed to xgb.train for this trial. The
    # two fixed entries must mirror the ones hard-coded in objective().
    booster_params = {key: value for key, value in best_params.items() if key not in _SEARCH_ONLY_KEYS}
    booster_params['objective'] = 'reg:squarederror'
    booster_params['device'] = 'cuda'

    mse_scores = []

    cross_time = time.time()

    tf.keras.backend.clear_session()
    gc.collect()

    # Leave-one-fold-out cross-validation. NaN fold labels are skipped as
    # validation folds because `== NaN` would select no rows.
    for fold in df['fold'].dropna().unique():
        train_data = df[df['fold'] != fold]
        valid_data = df[df['fold'] == fold]

        X_train = get_combined_data(train_data)
        X_valid = get_combined_data(valid_data)
        y_train = train_data[target]
        y_valid = valid_data[target]

        select = _make_selector(selector_name, num_selected)
        if select is not None:
            # Refit the selector per split so the held-out fold never leaks in.
            X_train = select.fit_transform(X_train, y_train)
            X_valid = select.transform(X_valid)

        dtrain = xgb.DMatrix(X_train, label=y_train)
        dvalid = xgb.DMatrix(X_valid, label=y_valid)

        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
        model = xgb.train(booster_params, dtrain, num_boost_round=num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=False)
        preds = model.predict(dvalid)
        mse_scores.append(mean_squared_error(y_valid, preds))

    average_mse = np.mean(mse_scores)

    print(f'Cross-validated MSE: {average_mse} for {target}')

    print(f"Best parameters for {target}: ", best_params)

    # Column count of the DataFrame itself (row subsets share it), i.e. not
    # the width of the feature matrix.
    print(f'Train data shape without selector {df.shape[1]}')

    print(f'Time taken for crosvalidation: {timedelta(seconds=time.time() - cross_time)}')

    print(f'Fitting model for {target} with best parameters')

    tf.keras.backend.clear_session()
    gc.collect()

    endfit_time = time.time()

    X = get_combined_data(df)
    y = df[target]

    # Handle feature selection on the full data
    select = _make_selector(selector_name, num_selected)
    if select is not None:
        print(f'Selector is {selector_name} with num selected {num_selected}')
        X_selected = select.fit_transform(X, y)
    else:
        X_selected = X
        print(f'None selector')

    # Create and train the final model; there is no held-out set here, so the
    # tuned number of rounds is used without early stopping.
    model = xgb.XGBRegressor(n_estimators=num_boost_round, **booster_params)
    model.fit(X_selected, y)
    # Keep the fitted selector with the model so new data can be transformed
    # identically before prediction.
    model.selector_ = select

    print(f'Time taken for fitting whole data: {timedelta(seconds=time.time() - endfit_time)}')

    return model



In [6]:
# Display the first rows of the training frame as a quick sanity check.
train_df.head()

Unnamed: 0,id,WORLDCLIM_BIO1_annual_mean_temperature,WORLDCLIM_BIO12_annual_precipitation,WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month,WORLDCLIM_BIO15_precipitation_seasonality,WORLDCLIM_BIO4_temperature_seasonality,WORLDCLIM_BIO7_temperature_annual_range,SOIL_bdod_0.5cm_mean_0.01_deg,SOIL_bdod_100.200cm_mean_0.01_deg,SOIL_bdod_15.30cm_mean_0.01_deg,...,bin_0,bin_1,bin_2,bin_3,bin_4,bin_5,final_bin,fold,features_avg,model_features_426_convnextbase_003_998_1
0,192027691,12.235703,374.466675,62.524445,72.256844,773.592041,33.277779,125,149,136,...,2,2,1,4,2,1,221421,2.0,"[-0.14304829, -0.28004962, 0.886131, -0.183084...","[0.9996191263198853, 0.999675989151001, -0.269..."
1,195542235,17.270555,90.239998,10.351111,38.22094,859.193298,40.009777,124,144,138,...,3,3,2,2,2,3,332223,4.0,"[0.16563013, -1.4509088, 0.46862665, 0.2771467...","[0.9998477697372437, 0.9998577237129211, 0.975..."
2,196639184,14.254504,902.071411,49.642857,17.873655,387.977753,22.807142,107,133,119,...,5,1,5,5,2,3,515523,2.0,"[-0.034171782, -0.13625506, 0.5971655, -0.6490...","[0.9998796582221985, 0.9999839067459106, 0.979..."
3,195728812,18.680834,1473.93335,163.100006,45.009758,381.053986,20.436666,120,131,125,...,3,2,3,2,1,3,323213,0.0,"[-0.881118, -0.16722384, 1.3514438, 0.34779415...","[0.9997825622558594, 0.9996305108070374, 0.995..."
4,195251545,0.673204,530.088867,50.857777,38.230709,1323.526855,45.891998,91,146,120,...,2,3,3,5,4,4,233544,4.0,"[-0.3382826, 0.3086146, 0.58881557, 0.24890125...","[0.9998947978019714, 0.9997898936271667, 0.789..."


In [7]:
def prepare_features(row):
    """Return the row's 'model_features_<study_name>' entry as a numpy array.

    Args:
        row: One DataFrame row (as passed by ``DataFrame.apply(axis=1)``).

    Returns:
        numpy array built from the stored per-row feature vector.
    """
    embedding_column = f'model_features_{study_name}'
    return np.array(row[embedding_column])


# Add the 'combined_features' column consumed by get_combined_data to both
# frames (train first, then test).
train_df['combined_features'], test_df['combined_features'] = (
    frame.apply(prepare_features, axis=1) for frame in (train_df, test_df)
)



In [8]:
target_columns = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean']
train_fold, validation_fold = 1, 2

# Train a model for each target variable; results are keyed by target name.
models = {}

time_search_start = time.time()
time_taken = 0

# Keep making full passes over the targets until at least 60 seconds have
# elapsed. The budget is only tested between passes, so a pass that has
# started always processes every target.
while time_taken < 60:
    for target in target_columns:
        print(
            f'\n\nOptimizing model for {target} using train fold {train_fold} '
            f'and validation fold {validation_fold}\n\n'
        )
        models[target] = optimize_model(train_df, target, train_fold, validation_fold)
        time_taken = time.time() - time_search_start
        print(f'Time taken: {timedelta(seconds=time_taken)}')



Optimizing model for X4_mean using train fold 1 and validation fold 2


Loading gene sampler from file <_io.BufferedReader name='./NN_search/426_convnextbase_003_998_1_1_2_X4_mean_genesampler.pickle'>
Loading QMC sampler from file <_io.BufferedReader name='./NN_search/426_convnextbase_003_998_1_1_2_X4_mean_qmc_sampler.pickle'>
Loading TPE sampler from file <_io.BufferedReader name='./NN_search/426_convnextbase_003_998_1_1_2_X4_mean_tpe_sampler.pickle'>


[I 2024-05-05 17:55:48,331] Using an existing study with name '426_convnextbase_003_998_1' instead of creating a new one.


Starting optimization for X4_mean with qmc sampler


[I 2024-05-05 17:55:50,225] Trial 30 finished with value: 0.005458857455936615 and parameters: {'lambda': 0.0031622776601683764, 'alpha': 0.0316227766016838, 'colsample_bytree': 0.45000000000000007, 'subsample': 0.85, 'learning_rate': 0.071875, 'max_depth': 17, 'min_child_weight': 94, 'feature_selector': 'shuffle', 'boosting': 'gbtree', 'selector': 'none', 'num_selected': 80, 'n_estimators': 73, 'early_stopping_rounds': 42}. Best is trial 21 with value: 0.005340167057966362.
[I 2024-05-05 17:55:50,899] Trial 31 finished with value: 0.009757438862756884 and parameters: {'lambda': 0.31622776601683833, 'alpha': 3.16227766016837e-08, 'colsample_bytree': 0.65, 'subsample': 0.25, 'learning_rate': 0.319375, 'max_depth': 12, 'min_child_weight': 19, 'feature_selector': 'cyclic', 'boosting': 'gblinear', 'selector': 'none', 'num_selected': 34, 'n_estimators': 23, 'early_stopping_rounds': 30}. Best is trial 21 with value: 0.005340167057966362.
[I 2024-05-05 17:55:51,187] Trial 32 finished with val

QCM optimization finished in 0:01:08.273258
Saving QMC sampler to file ./NN_search/426_convnextbase_003_998_1_1_2_X4_mean_qmc_sampler.pickle
Starting optimization for X4_mean with gene sampler


[I 2024-05-05 17:56:57,246] Trial 40 finished with value: 0.012719271123391197 and parameters: {'lambda': 4.2498823591800484e-05, 'alpha': 1.0492936586153866e-08, 'colsample_bytree': 0.11563079465127198, 'subsample': 0.4576783617274871, 'learning_rate': 0.8725350841882131, 'max_depth': 17, 'min_child_weight': 49, 'feature_selector': 'greedy', 'boosting': 'dart', 'selector': 'f_regression', 'num_selected': 32, 'n_estimators': 487, 'early_stopping_rounds': 24}. Best is trial 21 with value: 0.005340167057966362.
[I 2024-05-05 17:56:57,737] Trial 41 finished with value: 0.005889072513273662 and parameters: {'lambda': 8.576350798591015e-08, 'alpha': 0.11220639241553058, 'colsample_bytree': 0.19042746246364653, 'subsample': 0.6644485359606103, 'learning_rate': 0.4775635118331565, 'max_depth': 4, 'min_child_weight': 59, 'feature_selector': 'random', 'boosting': 'gbtree', 'selector': 'f_regression', 'num_selected': 27, 'n_estimators': 967, 'early_stopping_rounds': 26}. Best is trial 21 with va

Gene optimization finished in 0:00:38.380932
Saving gene sampler to file ./NN_search/426_convnextbase_003_998_1_1_2_X4_mean_genesampler.pickle
Starting optimization for X4_mean with TPE sampler


[I 2024-05-05 17:57:35,758] Trial 50 finished with value: 0.005359784507792078 and parameters: {'lambda': 0.001237218877774996, 'alpha': 2.5119664347921785e-08, 'colsample_bytree': 0.38555164530036623, 'subsample': 0.6335506472014392, 'learning_rate': 0.1069535561759053, 'max_depth': 4, 'min_child_weight': 99, 'feature_selector': 'shuffle', 'boosting': 'gblinear', 'selector': 'f_regression', 'num_selected': 149, 'n_estimators': 950, 'early_stopping_rounds': 28}. Best is trial 21 with value: 0.005340167057966362.
[I 2024-05-05 17:57:50,873] Trial 51 finished with value: 0.005481788818519207 and parameters: {'lambda': 0.7037531734794105, 'alpha': 5.545745527833352e-06, 'colsample_bytree': 0.6577205581858975, 'subsample': 0.6794587362609297, 'learning_rate': 0.2697555226204822, 'max_depth': 2, 'min_child_weight': 98, 'feature_selector': 'cyclic', 'boosting': 'gbtree', 'selector': 'mutual_info_regression', 'num_selected': 71, 'n_estimators': 856, 'early_stopping_rounds': 15}. Best is trial

TPE optimization finished in 0:01:07.052412
Saving TPE sampler to file ./NN_search/426_convnextbase_003_998_1_1_2_X4_mean_tpe_sampler.pickle
Optimization finished in 0:02:53.927336
Cross-validated MSE: 0.005799572228717827 for X4_mean
Best parameters for X4_mean:  {'lambda': 6.54428600324243e-07, 'alpha': 1.4269242343350145e-06, 'colsample_bytree': 0.8470215547037809, 'subsample': 0.8676309178164017, 'learning_rate': 0.03938488615077708, 'max_depth': 3, 'min_child_weight': 99, 'feature_selector': 'thrifty', 'boosting': 'gblinear', 'selector': 'mutual_info_regression', 'num_selected': 105, 'n_estimators': 250, 'early_stopping_rounds': 9}
Train data shape without selector 182
Time taken for crosvalidation: 0:05:50.294007
Fitting model for X4_mean with best parameters
Selector is mutual_info_regression with num selected 105


[I 2024-05-05 18:06:06,827] Using an existing study with name '426_convnextbase_003_998_1' instead of creating a new one.


Time taken for fitting whole data: 0:01:34.139886
Time taken: 0:10:18.691775


Optimizing model for X11_mean using train fold 1 and validation fold 2


Loading gene sampler from file <_io.BufferedReader name='./NN_search/426_convnextbase_003_998_1_1_2_X11_mean_genesampler.pickle'>
Loading QMC sampler from file <_io.BufferedReader name='./NN_search/426_convnextbase_003_998_1_1_2_X11_mean_qmc_sampler.pickle'>
Loading TPE sampler from file <_io.BufferedReader name='./NN_search/426_convnextbase_003_998_1_1_2_X11_mean_tpe_sampler.pickle'>
Starting optimization for X11_mean with qmc sampler


[I 2024-05-05 18:06:08,473] Trial 30 finished with value: 18.02338038206304 and parameters: {'lambda': 0.0031622776601683764, 'alpha': 0.0316227766016838, 'colsample_bytree': 0.45000000000000007, 'subsample': 0.85, 'learning_rate': 0.071875, 'max_depth': 17, 'min_child_weight': 94, 'feature_selector': 'shuffle', 'boosting': 'gblinear', 'selector': 'f_regression', 'num_selected': 80, 'n_estimators': 73, 'early_stopping_rounds': 42}. Best is trial 26 with value: 16.734624746503638.
[I 2024-05-05 18:06:09,523] Trial 31 finished with value: 30.531034950484234 and parameters: {'lambda': 0.31622776601683833, 'alpha': 3.16227766016837e-08, 'colsample_bytree': 0.65, 'subsample': 0.25, 'learning_rate': 0.319375, 'max_depth': 12, 'min_child_weight': 19, 'feature_selector': 'cyclic', 'boosting': 'gblinear', 'selector': 'none', 'num_selected': 34, 'n_estimators': 23, 'early_stopping_rounds': 30}. Best is trial 26 with value: 16.734624746503638.
[I 2024-05-05 18:06:24,476] Trial 32 finished with va

QCM optimization finished in 0:01:21.623417
Saving QMC sampler to file ./NN_search/426_convnextbase_003_998_1_1_2_X11_mean_qmc_sampler.pickle
Starting optimization for X11_mean with gene sampler


[I 2024-05-05 18:07:28,962] Trial 40 finished with value: 20.005922983165572 and parameters: {'lambda': 0.00016672547771566718, 'alpha': 0.005906736019114186, 'colsample_bytree': 0.7497930419052727, 'subsample': 0.4547019463524856, 'learning_rate': 0.5529079129893028, 'max_depth': 2, 'min_child_weight': 97, 'feature_selector': 'thrifty', 'boosting': 'gblinear', 'selector': 'f_regression', 'num_selected': 82, 'n_estimators': 246, 'early_stopping_rounds': 16}. Best is trial 26 with value: 16.734624746503638.
[I 2024-05-05 18:07:44,085] Trial 41 finished with value: 36.78236753078171 and parameters: {'lambda': 4.2230714704160367e-07, 'alpha': 7.764432359755265e-07, 'colsample_bytree': 0.20295067093288913, 'subsample': 0.6547960822510221, 'learning_rate': 0.6720193415700016, 'max_depth': 20, 'min_child_weight': 19, 'feature_selector': 'greedy', 'boosting': 'dart', 'selector': 'mutual_info_regression', 'num_selected': 132, 'n_estimators': 10, 'early_stopping_rounds': 27}. Best is trial 26 w

Gene optimization finished in 0:01:51.504862
Saving gene sampler to file ./NN_search/426_convnextbase_003_998_1_1_2_X11_mean_genesampler.pickle
Starting optimization for X11_mean with TPE sampler


[I 2024-05-05 18:09:20,561] Trial 50 finished with value: 17.540012019265777 and parameters: {'lambda': 2.2796341721498006e-08, 'alpha': 0.006991800440140998, 'colsample_bytree': 0.8618311861479878, 'subsample': 0.7240850382359543, 'learning_rate': 0.20497137383195987, 'max_depth': 6, 'min_child_weight': 37, 'feature_selector': 'cyclic', 'boosting': 'gblinear', 'selector': 'f_regression', 'num_selected': 35, 'n_estimators': 103, 'early_stopping_rounds': 11}. Best is trial 26 with value: 16.734624746503638.
[I 2024-05-05 18:09:21,223] Trial 51 finished with value: 18.411663468623658 and parameters: {'lambda': 2.505344672296442e-07, 'alpha': 0.3019665877850606, 'colsample_bytree': 0.8061623333337632, 'subsample': 0.6494907006731959, 'learning_rate': 0.22899528170684988, 'max_depth': 8, 'min_child_weight': 44, 'feature_selector': 'cyclic', 'boosting': 'gblinear', 'selector': 'f_regression', 'num_selected': 81, 'n_estimators': 80, 'early_stopping_rounds': 8}. Best is trial 26 with value: 1

TPE optimization finished in 0:00:06.632936
Saving TPE sampler to file ./NN_search/426_convnextbase_003_998_1_1_2_X11_mean_tpe_sampler.pickle
Optimization finished in 0:03:19.789392
Cross-validated MSE: 18.226619487158406 for X11_mean
Best parameters for X11_mean:  {'lambda': 1.5430235778155575e-08, 'alpha': 0.0029855762290906147, 'colsample_bytree': 0.7765525971815931, 'subsample': 0.7763679889102095, 'learning_rate': 0.1262360744430131, 'max_depth': 4, 'min_child_weight': 65, 'feature_selector': 'cyclic', 'boosting': 'gblinear', 'selector': 'f_regression', 'num_selected': 28, 'n_estimators': 148, 'early_stopping_rounds': 18}
Train data shape without selector 182
Time taken for crosvalidation: 0:00:09.175508
Fitting model for X11_mean with best parameters
Selector is f_regression with num selected 28


[I 2024-05-05 18:09:37,803] Using an existing study with name '426_convnextbase_003_998_1' instead of creating a new one.


Time taken for fitting whole data: 0:00:01.788543
Time taken: 0:13:49.667464


Optimizing model for X18_mean using train fold 1 and validation fold 2


Loading gene sampler from file <_io.BufferedReader name='./NN_search/426_convnextbase_003_998_1_1_2_X18_mean_genesampler.pickle'>
Loading QMC sampler from file <_io.BufferedReader name='./NN_search/426_convnextbase_003_998_1_1_2_X18_mean_qmc_sampler.pickle'>
Loading TPE sampler from file <_io.BufferedReader name='./NN_search/426_convnextbase_003_998_1_1_2_X18_mean_tpe_sampler.pickle'>
Starting optimization for X18_mean with qmc sampler


[I 2024-05-05 18:09:39,922] Trial 30 finished with value: 8.747147238887496 and parameters: {'lambda': 0.0031622776601683764, 'alpha': 0.0316227766016838, 'colsample_bytree': 0.45000000000000007, 'subsample': 0.85, 'learning_rate': 0.071875, 'max_depth': 17, 'min_child_weight': 94, 'feature_selector': 'random', 'boosting': 'gblinear', 'selector': 'none', 'num_selected': 80, 'n_estimators': 73, 'early_stopping_rounds': 42}. Best is trial 30 with value: 8.747147238887496.
[I 2024-05-05 18:09:40,526] Trial 31 finished with value: 12.409154848032996 and parameters: {'lambda': 0.31622776601683833, 'alpha': 3.16227766016837e-08, 'colsample_bytree': 0.65, 'subsample': 0.25, 'learning_rate': 0.319375, 'max_depth': 12, 'min_child_weight': 19, 'feature_selector': 'greedy', 'boosting': 'gblinear', 'selector': 'f_regression', 'num_selected': 34, 'n_estimators': 23, 'early_stopping_rounds': 30}. Best is trial 30 with value: 8.747147238887496.
[I 2024-05-05 18:09:40,858] Trial 32 finished with value

QCM optimization finished in 0:00:51.390442
Saving QMC sampler to file ./NN_search/426_convnextbase_003_998_1_1_2_X18_mean_qmc_sampler.pickle
Starting optimization for X18_mean with gene sampler


[I 2024-05-05 18:10:30,662] Trial 40 finished with value: 11.492588449979538 and parameters: {'lambda': 9.371082090029149e-07, 'alpha': 6.447379047293763e-06, 'colsample_bytree': 0.6563612751960839, 'subsample': 0.7799545295589877, 'learning_rate': 0.32838830117116735, 'max_depth': 19, 'min_child_weight': 34, 'feature_selector': 'cyclic', 'boosting': 'gblinear', 'selector': 'f_regression', 'num_selected': 124, 'n_estimators': 120, 'early_stopping_rounds': 39}. Best is trial 30 with value: 8.747147238887496.
[I 2024-05-05 18:10:31,332] Trial 41 finished with value: 40.26296862296515 and parameters: {'lambda': 0.3361433660472676, 'alpha': 0.024856855395751207, 'colsample_bytree': 0.5557790700525106, 'subsample': 0.3087021008145894, 'learning_rate': 0.6465067144674166, 'max_depth': 19, 'min_child_weight': 8, 'feature_selector': 'shuffle', 'boosting': 'gbtree', 'selector': 'f_regression', 'num_selected': 87, 'n_estimators': 242, 'early_stopping_rounds': 16}. Best is trial 30 with value: 8.

Gene optimization finished in 0:00:37.614607
Saving gene sampler to file ./NN_search/426_convnextbase_003_998_1_1_2_X18_mean_genesampler.pickle
Starting optimization for X18_mean with TPE sampler


[I 2024-05-05 18:11:08,274] Trial 50 finished with value: 8.773899051985813 and parameters: {'lambda': 0.00030351778454801293, 'alpha': 0.01769077114083086, 'colsample_bytree': 0.24246296442982157, 'subsample': 0.8705131873772601, 'learning_rate': 0.06416189680388838, 'max_depth': 9, 'min_child_weight': 80, 'feature_selector': 'random', 'boosting': 'gblinear', 'selector': 'none', 'num_selected': 127, 'n_estimators': 95, 'early_stopping_rounds': 43}. Best is trial 30 with value: 8.747147238887496.
[I 2024-05-05 18:11:10,677] Trial 51 finished with value: 8.724843135504448 and parameters: {'lambda': 0.0658344823633622, 'alpha': 0.01255338645303623, 'colsample_bytree': 0.4025980908501509, 'subsample': 0.6946391173948826, 'learning_rate': 0.06119178796011024, 'max_depth': 17, 'min_child_weight': 93, 'feature_selector': 'random', 'boosting': 'gblinear', 'selector': 'none', 'num_selected': 118, 'n_estimators': 232, 'early_stopping_rounds': 35}. Best is trial 51 with value: 8.724843135504448.

TPE optimization finished in 0:00:32.923162
Saving TPE sampler to file ./NN_search/426_convnextbase_003_998_1_1_2_X18_mean_tpe_sampler.pickle
Optimization finished in 0:02:01.956042
Cross-validated MSE: 8.7034639164638 for X18_mean
Best parameters for X18_mean:  {'lambda': 0.014675504783417191, 'alpha': 0.02073609173114297, 'colsample_bytree': 0.4965955628584916, 'subsample': 0.7733613122132487, 'learning_rate': 0.024105082134949996, 'max_depth': 20, 'min_child_weight': 79, 'feature_selector': 'cyclic', 'boosting': 'dart', 'selector': 'none', 'num_selected': 107, 'n_estimators': 268, 'early_stopping_rounds': 25}
Train data shape without selector 182
Time taken for crosvalidation: 0:02:42.718064
Fitting model for X18_mean with best parameters
None selector


  genemachine = optuna.samplers.NSGAIISampler(crossover = optuna.samplers.nsgaii.VSBXCrossover())
  qmc_sampler = optuna.samplers.QMCSampler(warn_independent_sampling = False)
[I 2024-05-05 18:15:01,064] A new study created in RDB with name: 426_convnextbase_003_998_1


Time taken for fitting whole data: 0:00:38.209346
Time taken: 0:19:12.821934


Optimizing model for X50_mean using train fold 1 and validation fold 2


Creating new gene sampler
Creating new QMC sampler
Creating new TPE sampler
Starting optimization for X50_mean with qmc sampler


[I 2024-05-05 18:15:02,323] Trial 0 finished with value: 0.14785715420898068 and parameters: {'lambda': 4.327884108144229e-08, 'alpha': 0.003357026635271244, 'colsample_bytree': 0.6776516932054872, 'subsample': 0.595545008419595, 'learning_rate': 0.17130712532836093, 'max_depth': 20, 'min_child_weight': 25, 'feature_selector': 'cyclic', 'boosting': 'gbtree', 'selector': 'none', 'num_selected': 171, 'n_estimators': 11, 'early_stopping_rounds': 49}. Best is trial 0 with value: 0.14785715420898068.
[I 2024-05-05 18:15:02,598] Trial 1 finished with value: 0.42715746215373596 and parameters: {'lambda': 0.0031330069386680264, 'alpha': 4.321875103377964e-05, 'colsample_bytree': 0.1, 'subsample': 0.1, 'learning_rate': 0.01, 'max_depth': 2, 'min_child_weight': 1, 'feature_selector': 'random', 'boosting': 'dart', 'selector': 'f_regression', 'num_selected': 1, 'n_estimators': 10, 'early_stopping_rounds': 5}. Best is trial 0 with value: 0.14785715420898068.
[I 2024-05-05 18:15:18,086] Trial 2 fini

QCM optimization finished in 0:01:35.555244
Saving QMC sampler to file ./NN_search/426_convnextbase_003_998_1_1_2_X50_mean_qmc_sampler.pickle
Starting optimization for X50_mean with gene sampler


[I 2024-05-05 18:16:51,600] Trial 10 finished with value: 0.17806802395639112 and parameters: {'lambda': 0.4972331550972564, 'alpha': 1.2022609523820401e-05, 'colsample_bytree': 0.4339237836522487, 'subsample': 0.1827041380326831, 'learning_rate': 0.6854085180947131, 'max_depth': 12, 'min_child_weight': 87, 'feature_selector': 'greedy', 'boosting': 'gblinear', 'selector': 'mutual_info_regression', 'num_selected': 11, 'n_estimators': 282, 'early_stopping_rounds': 23}. Best is trial 4 with value: 0.12231334098932448.
[I 2024-05-05 18:16:52,313] Trial 11 finished with value: 0.28634539423521826 and parameters: {'lambda': 2.691808485597488e-08, 'alpha': 1.458765048831049e-08, 'colsample_bytree': 0.10653137126147111, 'subsample': 0.21369060565374864, 'learning_rate': 0.6356079096968569, 'max_depth': 13, 'min_child_weight': 87, 'feature_selector': 'thrifty', 'boosting': 'gblinear', 'selector': 'none', 'num_selected': 129, 'n_estimators': 363, 'early_stopping_rounds': 24}. Best is trial 4 wit

Gene optimization finished in 0:01:36.691635
Saving gene sampler to file ./NN_search/426_convnextbase_003_998_1_1_2_X50_mean_genesampler.pickle
Starting optimization for X50_mean with TPE sampler


[I 2024-05-05 18:18:28,271] Trial 20 finished with value: 0.11978468841405948 and parameters: {'lambda': 1.0463999477086027e-06, 'alpha': 1.0409865444685902e-05, 'colsample_bytree': 0.4370989533218633, 'subsample': 0.7379718342920095, 'learning_rate': 0.2528470677312082, 'max_depth': 4, 'min_child_weight': 93, 'feature_selector': 'greedy', 'boosting': 'gbtree', 'selector': 'mutual_info_regression', 'num_selected': 150, 'n_estimators': 29, 'early_stopping_rounds': 23}. Best is trial 20 with value: 0.11978468841405948.
[I 2024-05-05 18:18:43,326] Trial 21 finished with value: 0.12153160519081341 and parameters: {'lambda': 3.062880076168353e-07, 'alpha': 0.002346618840954737, 'colsample_bytree': 0.706987498530418, 'subsample': 0.6354504803386416, 'learning_rate': 0.23589845060837986, 'max_depth': 7, 'min_child_weight': 92, 'feature_selector': 'shuffle', 'boosting': 'gblinear', 'selector': 'mutual_info_regression', 'num_selected': 69, 'n_estimators': 62, 'early_stopping_rounds': 18}. Best 

TPE optimization finished in 0:02:02.556506
Saving TPE sampler to file ./NN_search/426_convnextbase_003_998_1_1_2_X50_mean_tpe_sampler.pickle
Optimization finished in 0:05:14.935817
Cross-validated MSE: 0.13211833771455167 for X50_mean
Best parameters for X50_mean:  {'lambda': 1.1408444114319381e-07, 'alpha': 5.142462062173059e-07, 'colsample_bytree': 0.3641872045731693, 'subsample': 0.7429419102054463, 'learning_rate': 0.1663199295572259, 'max_depth': 5, 'min_child_weight': 63, 'feature_selector': 'greedy', 'boosting': 'dart', 'selector': 'mutual_info_regression', 'num_selected': 81, 'n_estimators': 61, 'early_stopping_rounds': 14}
Train data shape without selector 182
Time taken for crosvalidation: 0:05:36.078275
Fitting model for X50_mean with best parameters
Selector is mutual_info_regression with num selected 81


  genemachine = optuna.samplers.NSGAIISampler(crossover = optuna.samplers.nsgaii.VSBXCrossover())
  qmc_sampler = optuna.samplers.QMCSampler(warn_independent_sampling = False)
[I 2024-05-05 18:27:25,427] A new study created in RDB with name: 426_convnextbase_003_998_1


Time taken for fitting whole data: 0:01:33.007990
Time taken: 0:31:37.178081


Optimizing model for X26_mean using train fold 1 and validation fold 2


Creating new gene sampler
Creating new QMC sampler
Creating new TPE sampler
Starting optimization for X26_mean with qmc sampler


[I 2024-05-05 18:27:40,327] Trial 0 finished with value: 141759.78799048415 and parameters: {'lambda': 0.41545719044863394, 'alpha': 0.14660715436933877, 'colsample_bytree': 0.33428557679661824, 'subsample': 0.22285966625754627, 'learning_rate': 0.5935371319912967, 'max_depth': 6, 'min_child_weight': 9, 'feature_selector': 'cyclic', 'boosting': 'gblinear', 'selector': 'mutual_info_regression', 'num_selected': 89, 'n_estimators': 106, 'early_stopping_rounds': 42}. Best is trial 0 with value: 141759.78799048415.
[I 2024-05-05 18:27:40,695] Trial 1 finished with value: 65266.04993754402 and parameters: {'lambda': 0.004454292321452891, 'alpha': 0.0014736943859280987, 'colsample_bytree': 0.1, 'subsample': 0.1, 'learning_rate': 0.01, 'max_depth': 2, 'min_child_weight': 1, 'feature_selector': 'greedy', 'boosting': 'dart', 'selector': 'none', 'num_selected': 1, 'n_estimators': 10, 'early_stopping_rounds': 5}. Best is trial 1 with value: 65266.04993754402.
[I 2024-05-05 18:27:55,503] Trial 2 fi

QCM optimization finished in 0:01:04.478511
Saving QMC sampler to file ./NN_search/426_convnextbase_003_998_1_1_2_X26_mean_qmc_sampler.pickle
Starting optimization for X26_mean with gene sampler


[I 2024-05-05 18:28:30,431] Trial 10 finished with value: 83767.72491574234 and parameters: {'lambda': 0.29846832478206786, 'alpha': 0.0021269543257719146, 'colsample_bytree': 0.8046340710265917, 'subsample': 0.36400630194072203, 'learning_rate': 0.9365074624137575, 'max_depth': 19, 'min_child_weight': 39, 'feature_selector': 'greedy', 'boosting': 'gblinear', 'selector': 'f_regression', 'num_selected': 52, 'n_estimators': 17, 'early_stopping_rounds': 45}. Best is trial 4 with value: 44553.95654018811.
[I 2024-05-05 18:28:45,359] Trial 11 finished with value: 50705.55220232113 and parameters: {'lambda': 9.748072817801005e-08, 'alpha': 1.8800492908710473e-05, 'colsample_bytree': 0.7488340570724618, 'subsample': 0.7580535153567873, 'learning_rate': 0.5127263424464967, 'max_depth': 8, 'min_child_weight': 80, 'feature_selector': 'cyclic', 'boosting': 'gbtree', 'selector': 'mutual_info_regression', 'num_selected': 114, 'n_estimators': 355, 'early_stopping_rounds': 24}. Best is trial 4 with v

Gene optimization finished in 0:01:32.059157
Saving gene sampler to file ./NN_search/426_convnextbase_003_998_1_1_2_X26_mean_genesampler.pickle
Starting optimization for X26_mean with TPE sampler


[I 2024-05-05 18:30:02,481] Trial 20 finished with value: 44555.70173047969 and parameters: {'lambda': 0.9530703238222433, 'alpha': 3.667445909914517e-05, 'colsample_bytree': 0.8732436787593901, 'subsample': 0.6773436950729914, 'learning_rate': 0.21026365034468264, 'max_depth': 3, 'min_child_weight': 78, 'feature_selector': 'shuffle', 'boosting': 'gbtree', 'selector': 'none', 'num_selected': 169, 'n_estimators': 22, 'early_stopping_rounds': 26}. Best is trial 19 with value: 44271.758813630615.
[I 2024-05-05 18:30:17,133] Trial 21 finished with value: 47129.26796607737 and parameters: {'lambda': 0.13702077164439158, 'alpha': 7.780765694431643e-08, 'colsample_bytree': 0.8582018172282827, 'subsample': 0.4967399270749242, 'learning_rate': 0.37401210156359055, 'max_depth': 7, 'min_child_weight': 38, 'feature_selector': 'shuffle', 'boosting': 'gbtree', 'selector': 'mutual_info_regression', 'num_selected': 142, 'n_estimators': 12, 'early_stopping_rounds': 23}. Best is trial 19 with value: 442

TPE optimization finished in 0:00:20.268989
Saving TPE sampler to file ./NN_search/426_convnextbase_003_998_1_1_2_X26_mean_tpe_sampler.pickle
Optimization finished in 0:02:56.946923
Cross-validated MSE: 38855.12066830857 for X26_mean
Best parameters for X26_mean:  {'lambda': 0.06406141435145717, 'alpha': 5.394766767007468e-05, 'colsample_bytree': 0.7940225466372881, 'subsample': 0.7647281163676138, 'learning_rate': 0.10922388471575302, 'max_depth': 9, 'min_child_weight': 81, 'feature_selector': 'shuffle', 'boosting': 'gbtree', 'selector': 'none', 'num_selected': 135, 'n_estimators': 38, 'early_stopping_rounds': 33}
Train data shape without selector 182
Time taken for crosvalidation: 0:00:13.799965
Fitting model for X26_mean with best parameters
None selector


  genemachine = optuna.samplers.NSGAIISampler(crossover = optuna.samplers.nsgaii.VSBXCrossover())
  qmc_sampler = optuna.samplers.QMCSampler(warn_independent_sampling = False)
[I 2024-05-05 18:30:38,975] A new study created in RDB with name: 426_convnextbase_003_998_1


Time taken for fitting whole data: 0:00:02.590558
Time taken: 0:34:50.730684


Optimizing model for X3112_mean using train fold 1 and validation fold 2


Creating new gene sampler
Creating new QMC sampler
Loading TPE sampler from file <_io.BufferedReader name='./NN_search/426_convnextbase_003_998_1_1_2_X3112_mean_tpe_sampler.pickle'>
Starting optimization for X3112_mean with qmc sampler


[I 2024-05-05 18:30:54,562] Trial 0 finished with value: 17870859.061547503 and parameters: {'lambda': 3.3672757852564163e-05, 'alpha': 0.48298839537332755, 'colsample_bytree': 0.11276003511203508, 'subsample': 0.3537851989764109, 'learning_rate': 0.65977935632663, 'max_depth': 15, 'min_child_weight': 13, 'feature_selector': 'cyclic', 'boosting': 'gblinear', 'selector': 'mutual_info_regression', 'num_selected': 125, 'n_estimators': 392, 'early_stopping_rounds': 29}. Best is trial 0 with value: 17870859.061547503.
[I 2024-05-05 18:30:54,868] Trial 1 finished with value: 13160057.231803507 and parameters: {'lambda': 0.855224875651823, 'alpha': 0.01405722792569848, 'colsample_bytree': 0.1, 'subsample': 0.1, 'learning_rate': 0.01, 'max_depth': 2, 'min_child_weight': 1, 'feature_selector': 'shuffle', 'boosting': 'dart', 'selector': 'none', 'num_selected': 1, 'n_estimators': 10, 'early_stopping_rounds': 5}. Best is trial 1 with value: 13160057.231803507.
[I 2024-05-05 18:30:55,635] Trial 2 f

QCM optimization finished in 0:00:36.921843
Saving QMC sampler to file ./NN_search/426_convnextbase_003_998_1_1_2_X3112_mean_qmc_sampler.pickle
Starting optimization for X3112_mean with gene sampler


[I 2024-05-05 18:31:30,939] Trial 10 finished with value: 3414535.5461593964 and parameters: {'lambda': 2.0695938060970025e-05, 'alpha': 0.0026870821606643056, 'colsample_bytree': 0.4719519145789307, 'subsample': 0.586288312697555, 'learning_rate': 0.2186992507164754, 'max_depth': 10, 'min_child_weight': 28, 'feature_selector': 'cyclic', 'boosting': 'dart', 'selector': 'mutual_info_regression', 'num_selected': 36, 'n_estimators': 25, 'early_stopping_rounds': 14}. Best is trial 10 with value: 3414535.5461593964.
[I 2024-05-05 18:31:31,876] Trial 11 finished with value: 5494706.401296452 and parameters: {'lambda': 0.006422704017926952, 'alpha': 8.347326274165704e-08, 'colsample_bytree': 0.2517570552102464, 'subsample': 0.42735376348664267, 'learning_rate': 0.6033776691605403, 'max_depth': 13, 'min_child_weight': 98, 'feature_selector': 'shuffle', 'boosting': 'gbtree', 'selector': 'none', 'num_selected': 117, 'n_estimators': 353, 'early_stopping_rounds': 19}. Best is trial 10 with value: 

Gene optimization finished in 0:00:52.142819
Saving gene sampler to file ./NN_search/426_convnextbase_003_998_1_1_2_X3112_mean_genesampler.pickle
Starting optimization for X3112_mean with TPE sampler


[I 2024-05-05 18:32:08,499] Trial 20 finished with value: 3089004.2594992653 and parameters: {'lambda': 0.6377858223973664, 'alpha': 0.00536181398192638, 'colsample_bytree': 0.20129206920088138, 'subsample': 0.7480050267093725, 'learning_rate': 0.2383207222228183, 'max_depth': 6, 'min_child_weight': 53, 'feature_selector': 'random', 'boosting': 'dart', 'selector': 'f_regression', 'num_selected': 29, 'n_estimators': 24, 'early_stopping_rounds': 6}. Best is trial 12 with value: 2924214.4759506476.
[I 2024-05-05 18:32:09,021] Trial 21 finished with value: 3245677.3499918208 and parameters: {'lambda': 0.11494929302107552, 'alpha': 0.03666030443247161, 'colsample_bytree': 0.18550612810307449, 'subsample': 0.8071802136424877, 'learning_rate': 0.23990503617817072, 'max_depth': 9, 'min_child_weight': 37, 'feature_selector': 'random', 'boosting': 'dart', 'selector': 'f_regression', 'num_selected': 42, 'n_estimators': 26, 'early_stopping_rounds': 15}. Best is trial 12 with value: 2924214.4759506

TPE optimization finished in 0:00:24.845976
Saving TPE sampler to file ./NN_search/426_convnextbase_003_998_1_1_2_X3112_mean_tpe_sampler.pickle
Optimization finished in 0:01:54.044254
Cross-validated MSE: 3266541.525374479 for X3112_mean
Best parameters for X3112_mean:  {'lambda': 3.380793656543689e-05, 'alpha': 6.63414198119049e-08, 'colsample_bytree': 0.7488892925355332, 'subsample': 0.5180445842959263, 'learning_rate': 0.04497247283610099, 'max_depth': 11, 'min_child_weight': 22, 'feature_selector': 'random', 'boosting': 'dart', 'selector': 'none', 'num_selected': 100, 'n_estimators': 57, 'early_stopping_rounds': 13}
Train data shape without selector 182
Time taken for crosvalidation: 0:00:38.693569
Fitting model for X3112_mean with best parameters
None selector
Time taken for fitting whole data: 0:00:08.027991
Time taken: 0:37:31.793595


In [13]:
# Build the training design matrix: the tabular FEATURE_COLS columns followed by
# the per-row `combined_features` vectors stacked into a 2-D array.
features_array = np.array(train_df['combined_features'].tolist())
X_combined_train = np.hstack([train_df[FEATURE_COLS].values, features_array])

# One prediction column per target, in `target_columns` order.
train_pred = np.zeros((train_df.shape[0], len(target_columns)))

# Fitted SelectKBest per target, or None when the best trial used no selector.
selectors = {}

for i, target in enumerate(target_columns):

    # load_study only reads an existing study and raises if it is missing.
    # create_study(load_if_exists=True) would instead create an empty study/DB
    # and only fail later on `best_trial` with a less helpful error.
    study = optuna.load_study(
        study_name=study_name,
        storage=f'sqlite:///502_xgboost_{target}.db',
    )

    best_params = study.best_trial.params
    selector_name = best_params['selector']
    print(f'best params {selector_name}')

    if selector_name != 'none':
        print(f'Selector for {target}: {selector_name} with {best_params["num_selected"]} features')

        # Any selector name other than 'f_regression' uses mutual information,
        # matching the original if/else.
        score_func = f_regression if selector_name == 'f_regression' else mutual_info_regression
        select = SelectKBest(score_func, k=best_params['num_selected'])

        # NOTE(review): the selector is re-fitted here rather than reused from model
        # fitting. mutual_info_regression is randomised unless random_state is fixed,
        # so confirm the selected columns match those models[target] was trained on.
        X_selected = select.fit_transform(X_combined_train, train_df[target])
        selectors[target] = select

    else:

        X_selected = X_combined_train
        print(f'X_selected shape {X_selected.shape}')
        print(f'None selector in target {target}')
        selectors[target] = None

    train_pred[:, i] = models[target].predict(X_selected)

# R2 over all targets at once (r2_score's default multi-output aggregation).
train_r2 = r2_score(train_df[target_columns], train_pred)
print(f'Training R2: {train_r2}')

  if best_params['selector'] is not 'none':
[I 2024-05-05 18:46:17,865] Using an existing study with name '426_convnextbase_003_998_1' instead of creating a new one.


best params none
Selector for X18_mean: none with 107 features


In [None]:
# Build the test design matrix with the same column layout as the training one:
# the tabular FEATURE_COLS columns followed by the stacked `combined_features`.
features_array = np.array(test_df['combined_features'].tolist())
X_combined_test = np.hstack([test_df[FEATURE_COLS].values, features_array])

# One prediction column per target, in `target_columns` order.
test_preds = np.zeros((len(test_df), len(target_columns)))

for i, target in enumerate(target_columns):
    print(f'Predicting {target} with model {models[target]}')

    # The Optuna study that used to be opened here was never read; it only
    # created an extra SQLite file per target, so it has been removed.
    # Prediction needs just the fitted selector (if any) and the fitted model.
    selector = selectors[target]
    if selector is not None:
        X_selected = selector.transform(X_combined_test)
    else:
        X_selected = X_combined_test

    test_preds[:, i] = models[target].predict(X_selected)
     

In [None]:
# Column names used in the submission file (no `_mean` suffix). They live in
# their own variable so the global `target_columns` is not overwritten:
# `models` and `selectors` are indexed by the names in `target_columns`, and
# rebinding it would break re-running the prediction cells.
# Assumed to follow the same order as the columns of `test_preds` — TODO confirm.
submission_columns = ['X4', 'X11', 'X18', 'X50', 'X26', 'X3112']

# Copy only the `id` column (no need to duplicate the whole test frame), then
# attach one prediction column per submission name.
submission_df = test_df[['id']].copy()
submission_df[submission_columns] = test_preds

In [None]:
# Display summary statistics for the submission frame.
submission_df.describe()

In [None]:
# Make sure `target_columns` holds the training-frame column names
# (each name carries the `_mean` suffix) before indexing train_df with it.
target_columns = [f'{trait}_mean' for trait in ('X4', 'X11', 'X18', 'X50', 'X26', 'X3112')]

# Display summary statistics of the training target columns.
train_df[target_columns].describe()

In [None]:
# Preview the first rows of the submission frame.
submission_df.head()

In [None]:
# Write the submission to ./data/submission.csv, omitting the DataFrame index.
submission_df.to_csv('./data/submission.csv', index=False)