In [1]:
import numpy as np
import xgboost as xgb
import pandas as pd
from sklearn.metrics import r2_score, mean_squared_error
import optuna
import pickle
from datetime import timedelta
import time

In [2]:
feat = pd.read_csv('./data/test.csv')
FEATURE_COLS = feat.columns[1:].tolist()

In [3]:
# pickle_file_path = './data/test_df.pickle'

# with open(pickle_file_path, 'rb') as f:
#     test_df = pickle.load(f)

pickle_file_path = './data/train_testi_df.pickle'

with open(pickle_file_path, 'rb') as f:
    train_df = pickle.load(f)

In [4]:
testi = train_df[:2000]
train_df = train_df[2000:]

In [5]:
def get_combined_data(df):
    # Oletetaan, että FEATURES_COLS on jo määritelty olemassa oleville piirteille
    data = [df[col].values for col in FEATURE_COLS]
    # Lisää mallin piirteet
    data.append(np.vstack(df['combined_features'].values))
    return np.column_stack(data)

def objective(trial, df, target):
    param = {        
        'objective': 'reg:squarederror',        
        'device' : 'cuda',
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log = True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log = True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3, 0.5, 0.7, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.6, 0.8, 1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01, 0.02, 0.05]),
        'n_estimators': trial.suggest_categorical('n_estimators', [100, 200, 150, 300]),
        'max_depth': trial.suggest_categorical('max_depth', [2, 3, 5, 7, 9]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 100),
        'early_stopping_rounds': 10  }
    
    mse_scores = []

    for fold in df['fold'].unique():
        train_data = df[df['fold'] != fold]
        valid_data = df[df['fold'] == fold]

        X_train = get_combined_data(train_data)
        X_valid = get_combined_data(valid_data)

        y_train = train_data[target]
        y_valid = valid_data[target]

    
        model = xgb.XGBRegressor(**param)
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=0)
        preds = model.predict(X_valid)
        mse = mean_squared_error(y_valid, preds)
        mse_scores.append(mse)

    average_mse = np.mean(mse_scores)
    return average_mse

def optimize_model(df, target):

    start_time = time.time()
    study = optuna.create_study(direction='minimize')
    study.optimize(lambda trial: objective(trial, df, target), n_trials=15)
    print(f'Optimization finished in {timedelta(seconds=time.time() - start_time)}')

    best_params = study.best_trial.params    
    if 'early_stopping_rounds' in best_params:
        del best_params['early_stopping_rounds']
    print(f"Best parameters for {target}: ", best_params)
    
    # Käytä parhaita parametreja koko datan kouluttamiseen
    X = get_combined_data(df)
    y = df[target]
    model = xgb.XGBRegressor(**best_params)
    model.fit(X, y)
    
    return model



In [6]:
def prepare_features(row):
    return np.array(row[f'model_features_423_std_powerlog_3_finetuned'])

train_df['combined_features'] = train_df.apply(prepare_features, axis=1)
# test_df['combined_features'] = test_df.apply(prepare_features, axis=1)
testi['combined_features'] = testi.apply(prepare_features, axis=1)


In [7]:
target_columns = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean']

# Mallien kouluttaminen jokaiselle kohdemuuttujalle
models = {}
for target in target_columns:    
    print(f'\n\nOptimizing model for {target}\n\n')
    models[target] = optimize_model(train_df, target)

[I 2024-04-25 17:34:14,149] A new study created in memory with name: no-name-7d0ecf96-8a8e-4ff6-aaa0-f54ae723be50




Optimizing model for X4_mean




Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


[I 2024-04-25 17:34:47,186] Trial 0 finished with value: 0.014767306798050186 and parameters: {'lambda': 4.45328572879446e-06, 'alpha': 4.8413489204266845e-06, 'colsample_bytree': 1.0, 'subsample': 0.8, 'learning_rate': 0.05, 'n_estimators': 100, 'max_depth': 9, 'min_child_weight': 56}. Best is trial 0 with value: 0.014767306798050186.
[I 2024-04-25 17:35:01,805] Trial 1 finished with value: 0.015395489288075163 and parameters: {'lambda': 0.00013050522763633686, 'alpha': 0.371704356841617, 'colsample_bytree': 0.3, 'subsample': 1.0, 'learning_rate': 0.05, 'n_estimators': 200, 'max_depth': 3, 'min_child_weight': 27}. Best is trial 0 with value: 0.014767306798050186.
[I 2024-04-25 17:35:37,386] Trial 2 finished with value: 0.015818162167329782 and parameters: {'lambda': 0.003071991248201804, 'alpha': 7.026839136556473e-06, 'colsample_by

Optimization finished in 0:02:34.224883
Best parameters for X4_mean:  {'lambda': 4.45328572879446e-06, 'alpha': 4.8413489204266845e-06, 'colsample_bytree': 1.0, 'subsample': 0.8, 'learning_rate': 0.05, 'n_estimators': 100, 'max_depth': 9, 'min_child_weight': 56}


[I 2024-04-25 17:37:15,979] A new study created in memory with name: no-name-b9bc0e33-3669-4dc1-90ed-99bc3758510f




Optimizing model for X11_mean




[I 2024-04-25 17:37:39,334] Trial 0 finished with value: 40.762398859222515 and parameters: {'lambda': 4.565401947671788e-05, 'alpha': 0.006463735410213292, 'colsample_bytree': 0.3, 'subsample': 0.6, 'learning_rate': 0.01, 'n_estimators': 100, 'max_depth': 5, 'min_child_weight': 24}. Best is trial 0 with value: 40.762398859222515.
[I 2024-04-25 17:39:04,457] Trial 1 finished with value: 38.19353471580814 and parameters: {'lambda': 4.4534597828663984e-07, 'alpha': 3.0359528699302198e-05, 'colsample_bytree': 1.0, 'subsample': 0.6, 'learning_rate': 0.01, 'n_estimators': 200, 'max_depth': 7, 'min_child_weight': 26}. Best is trial 1 with value: 38.19353471580814.
[I 2024-04-25 17:39:34,340] Trial 2 finished with value: 37.931249293667484 and parameters: {'lambda': 1.3700545580176212e-08, 'alpha': 8.616392369610164e-08, 'colsample_bytree': 0.3, 'subsample': 1.0, 'learning_rate': 0.05, 'n_estimators': 200, 'max_depth': 5, 'min_child_weight': 24}. Best is trial 2 with value: 37.931249293667484

Optimization finished in 0:02:53.300904
Best parameters for X11_mean:  {'lambda': 1.3700545580176212e-08, 'alpha': 8.616392369610164e-08, 'colsample_bytree': 0.3, 'subsample': 1.0, 'learning_rate': 0.05, 'n_estimators': 200, 'max_depth': 5, 'min_child_weight': 24}


[I 2024-04-25 17:40:29,379] A new study created in memory with name: no-name-6204b0cf-3219-4e1c-a60e-ebea3d9b8daf




Optimizing model for X18_mean




[I 2024-04-25 17:41:20,448] Trial 0 finished with value: 15.035526680914359 and parameters: {'lambda': 0.15659445130096575, 'alpha': 0.002813637554420172, 'colsample_bytree': 0.7, 'subsample': 0.8, 'learning_rate': 0.02, 'n_estimators': 300, 'max_depth': 5, 'min_child_weight': 26}. Best is trial 0 with value: 15.035526680914359.
[I 2024-04-25 17:42:38,800] Trial 1 finished with value: 14.684137451996055 and parameters: {'lambda': 0.031433276806015806, 'alpha': 0.14843686668481, 'colsample_bytree': 1.0, 'subsample': 0.6, 'learning_rate': 0.02, 'n_estimators': 300, 'max_depth': 7, 'min_child_weight': 17}. Best is trial 1 with value: 14.684137451996055.
[I 2024-04-25 17:43:10,061] Trial 2 finished with value: 15.004700130950633 and parameters: {'lambda': 1.0538540518631162e-05, 'alpha': 9.643128339510929e-05, 'colsample_bytree': 1.0, 'subsample': 0.6, 'learning_rate': 0.05, 'n_estimators': 200, 'max_depth': 5, 'min_child_weight': 90}. Best is trial 1 with value: 14.684137451996055.
[I 202

Optimization finished in 0:03:52.897955
Best parameters for X18_mean:  {'lambda': 0.031433276806015806, 'alpha': 0.14843686668481, 'colsample_bytree': 1.0, 'subsample': 0.6, 'learning_rate': 0.02, 'n_estimators': 300, 'max_depth': 7, 'min_child_weight': 17}


[I 2024-04-25 17:45:11,667] A new study created in memory with name: no-name-3fd4e05e-6e00-4c38-a356-ed8c549f7364




Optimizing model for X50_mean




[I 2024-04-25 17:46:34,638] Trial 0 finished with value: 0.2946775349542087 and parameters: {'lambda': 0.010456514269312772, 'alpha': 1.8387765258555765e-08, 'colsample_bytree': 0.7, 'subsample': 0.6, 'learning_rate': 0.02, 'n_estimators': 300, 'max_depth': 7, 'min_child_weight': 64}. Best is trial 0 with value: 0.2946775349542087.
[I 2024-04-25 17:46:59,739] Trial 1 finished with value: 0.2987816952910762 and parameters: {'lambda': 3.537941236501808e-05, 'alpha': 0.5346889894513339, 'colsample_bytree': 0.3, 'subsample': 1.0, 'learning_rate': 0.05, 'n_estimators': 150, 'max_depth': 5, 'min_child_weight': 32}. Best is trial 0 with value: 0.2946775349542087.
[I 2024-04-25 17:47:21,377] Trial 2 finished with value: 0.31456508217408113 and parameters: {'lambda': 6.907886942057413e-07, 'alpha': 1.0102734924888815e-08, 'colsample_bytree': 0.3, 'subsample': 0.6, 'learning_rate': 0.01, 'n_estimators': 150, 'max_depth': 3, 'min_child_weight': 87}. Best is trial 0 with value: 0.2946775349542087.

Optimization finished in 0:03:22.811451
Best parameters for X50_mean:  {'lambda': 0.0018940029791569275, 'alpha': 2.3886845476137073e-07, 'colsample_bytree': 0.5, 'subsample': 1.0, 'learning_rate': 0.05, 'n_estimators': 200, 'max_depth': 7, 'min_child_weight': 75}


[I 2024-04-25 17:49:01,833] A new study created in memory with name: no-name-3923d791-9fb7-4303-8852-8d913d7adea8




Optimizing model for X26_mean




[I 2024-04-25 17:49:32,556] Trial 0 finished with value: 4530.0317574806995 and parameters: {'lambda': 1.2950622468904335e-08, 'alpha': 0.0001709665528726724, 'colsample_bytree': 0.3, 'subsample': 1.0, 'learning_rate': 0.02, 'n_estimators': 100, 'max_depth': 7, 'min_child_weight': 59}. Best is trial 0 with value: 4530.0317574806995.
[I 2024-04-25 17:49:56,299] Trial 1 finished with value: 4507.48282272682 and parameters: {'lambda': 1.89937059558303e-08, 'alpha': 0.03826896768772912, 'colsample_bytree': 0.5, 'subsample': 0.6, 'learning_rate': 0.05, 'n_estimators': 100, 'max_depth': 7, 'min_child_weight': 54}. Best is trial 1 with value: 4507.48282272682.
[I 2024-04-25 17:50:24,884] Trial 2 finished with value: 4542.849951398067 and parameters: {'lambda': 0.0023916125469666907, 'alpha': 0.11270418176111754, 'colsample_bytree': 0.3, 'subsample': 0.6, 'learning_rate': 0.02, 'n_estimators': 100, 'max_depth': 7, 'min_child_weight': 46}. Best is trial 1 with value: 4507.48282272682.
[I 2024-0

Optimization finished in 0:02:38.566113
Best parameters for X26_mean:  {'lambda': 1.89937059558303e-08, 'alpha': 0.03826896768772912, 'colsample_bytree': 0.5, 'subsample': 0.6, 'learning_rate': 0.05, 'n_estimators': 100, 'max_depth': 7, 'min_child_weight': 54}


[I 2024-04-25 17:51:55,509] A new study created in memory with name: no-name-fecefb42-7d24-4b86-89f3-f7eaf1dccd56




Optimizing model for X3112_mean




[I 2024-04-25 17:52:09,480] Trial 0 finished with value: 4038742.4890485955 and parameters: {'lambda': 0.011651712715905014, 'alpha': 1.9834579040302317e-08, 'colsample_bytree': 1.0, 'subsample': 0.6, 'learning_rate': 0.02, 'n_estimators': 100, 'max_depth': 2, 'min_child_weight': 80}. Best is trial 0 with value: 4038742.4890485955.
[I 2024-04-25 17:53:01,823] Trial 1 finished with value: 3742098.718862331 and parameters: {'lambda': 0.28525264165915365, 'alpha': 1.8531908483751415e-06, 'colsample_bytree': 0.7, 'subsample': 0.8, 'learning_rate': 0.02, 'n_estimators': 150, 'max_depth': 7, 'min_child_weight': 58}. Best is trial 1 with value: 3742098.718862331.
[I 2024-04-25 17:53:23,876] Trial 2 finished with value: 3850206.8316072794 and parameters: {'lambda': 0.0013324611247932676, 'alpha': 1.0604149868308294e-06, 'colsample_bytree': 0.5, 'subsample': 1.0, 'learning_rate': 0.02, 'n_estimators': 100, 'max_depth': 5, 'min_child_weight': 19}. Best is trial 1 with value: 3742098.718862331.
[

Optimization finished in 0:03:32.459077
Best parameters for X3112_mean:  {'lambda': 0.02794196288107175, 'alpha': 1.363677972637457e-06, 'colsample_bytree': 0.5, 'subsample': 1.0, 'learning_rate': 0.01, 'n_estimators': 300, 'max_depth': 7, 'min_child_weight': 41}


In [8]:
features_array = np.array(testi['combined_features'].tolist())
X_combined_train = np.hstack([testi[FEATURE_COLS].values, features_array])

train_pred = np.zeros((testi.shape[0], len(target_columns)))

for i, target in enumerate(target_columns):    
    train_pred[:, i] = models[target].predict(X_combined_train)

train_r2 = r2_score(testi[target_columns], train_pred)
print(f'Training R2: {train_r2}')

Training R2: 0.2211687070920861


In [9]:
features_array = np.array(testi['combined_features'].tolist())
X_combined_test = np.hstack([testi[FEATURE_COLS].values, features_array])

test_preds = np.zeros((len(testi), len(target_columns)))

for i, target in enumerate(target_columns):
    print(f'Predicting {target} with model {models[target]}')
    test_preds[:, i] = models[target].predict(X_combined_test)
     

Predicting X4_mean with model XGBRegressor(alpha=4.8413489204266845e-06, base_score=None, booster=None,
             callbacks=None, colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=1.0, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, lambda=4.45328572879446e-06,
             learning_rate=0.05, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=9,
             max_leaves=None, min_child_weight=56, missing=nan,
             monotone_constraints=None, multi_strategy=None, n_estimators=100,
             n_jobs=None, ...)
Predicting X11_mean with model XGBRegressor(alpha=8.616392369610164e-08, base_score=None, booster=None,
             callbacks=None, colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.3,

In [10]:
target_columns = ['X4', 'X11', 'X18', 'X50', 'X26', 'X3112']

test_df_copy = test_df.copy()
submission_df = test_df_copy[['id']].copy()
submission_df[target_columns] = test_preds

NameError: name 'test_df' is not defined

In [None]:
submission_df.describe()

In [None]:
train_df[target_columns].describe()

In [None]:
submission_df.head()

In [None]:
submission_df.to_csv('./data/submission.csv', index=False)