In [None]:
import numpy as np
import xgboost as xgb
import pandas as pd
from sklearn.metrics import r2_score, mean_squared_error
import optuna
import pickle
from datetime import timedelta
import time
import os 
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
import gc
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Tabular feature names come from the header of the test CSV: every column
# except the first one is treated as a feature.
# NOTE(review): presumably the first column is the row id -- confirm.
feat = pd.read_csv('./data/test.csv')
FEATURE_COLS = feat.columns.tolist()[1:]

In [None]:
# Run identifier: it appears in the pickle file names, in the name of the
# `model_features_<study_name>` column and is passed as the Optuna study name.
study_name = '426_convnextbase_003_998_1'

In [None]:
# Load the test frame that belongs to this run from its pickle file.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
pickle_file_path = f'./data/test_{study_name}.pickle'

with open(pickle_file_path, 'rb') as f:
    test_df = pickle.load(f)

# Load the matching training frame (pickle_file_path is rebound to its path).
pickle_file_path = f'./data/train_{study_name}.pickle'

with open(pickle_file_path, 'rb') as f:
    train_df = pickle.load(f)

In [None]:
# Show how many training rows carry each value of the `fold` column.
print(train_df['fold'].value_counts())

In [None]:



def plot_data(df, columns_names, n_cols=6):
    """Plot a kernel density estimate for each requested column.

    The plots are placed on a grid with ``n_cols`` subplots per row. The
    figure is 15 inches wide and 3 inches tall per grid row, so a single-row
    layout has the same 15x3 size as before while additional rows are no
    longer squeezed into the same height.

    Args:
        df: DataFrame that holds the columns to plot.
        columns_names: Column names; one subplot is drawn per name.
        n_cols: Number of subplots per grid row (defaults to 6).
    """
    # Ceiling division: number of rows needed to place every column.
    n_rows = -(-len(columns_names) // n_cols)

    # Keep at least one row of height so an empty list still gives a valid figure.
    plt.figure(figsize=(15, 3 * max(n_rows, 1)))

    for position, col in enumerate(columns_names, start=1):
        plt.subplot(n_rows, n_cols, position)
        sns.kdeplot(df[col], bw_adjust=0.5, fill=False, color='blue')
        plt.title(f'Distribution of {col}')
        plt.xlabel('Value')
        plt.ylabel('Density')

    plt.tight_layout()
    plt.show()
    


In [None]:
mean_columns = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean']

# Distributions before trimming.
plot_data(train_df, mean_columns)

# Trim one column after another: each column's 1% / 97.5% quantiles are
# computed on the rows that survived the previous columns' filters.
# NOTE: this cell rebinds train_df, so running it again trims the data further.
for column in mean_columns:
    lower_quantile = train_df[column].quantile(0.01)
    upper_quantile = train_df[column].quantile(0.975)
    within_bounds = (train_df[column] >= lower_quantile) & (train_df[column] <= upper_quantile)
    train_df = train_df[within_bounds]

# Take an explicit copy: the filtered frame gets a new column assigned later
# in the notebook, and assigning into a boolean-indexed slice is the pattern
# pandas reports with SettingWithCopyWarning.
train_df = train_df.copy()

# Distributions after trimming.
plot_data(train_df, mean_columns)

In [None]:
import warnings

# Ohita tietyn tyyppiset varoitukset
warnings.filterwarnings('ignore', category=UserWarning)

def get_combined_data(df):
    """Build the model input matrix for the rows of ``df``.

    The columns listed in the module-level ``FEATURE_COLS`` come first, in
    that order, followed by the per-row arrays stored in the
    ``combined_features`` column stacked into a 2-D block.

    Args:
        df: Frame with the ``FEATURE_COLS`` columns and a ``combined_features``
            column holding one array per row.

    Returns:
        2-D NumPy array with one row per row of ``df``.
    """
    # FEATURE_COLS is expected to be defined already for the existing features.
    tabular_columns = [df[name].values for name in FEATURE_COLS]
    # Features produced by the model, one row per sample.
    model_block = np.vstack(df['combined_features'].values)
    return np.column_stack([*tabular_columns, model_block])

def objective(trial, df, target, fold_train, fold_validation):
    """Optuna objective: fit one XGBoost model and score it on the held-out fold.

    Rows of ``df`` whose ``fold`` value equals ``fold_validation`` are used for
    validation and every other row for training. The booster parameters, the
    optional ``SelectKBest`` feature selection and the number of boosting
    rounds are all sampled from ``trial``.

    Args:
        trial: Optuna trial that supplies the sampled hyperparameters.
        df: Frame with a ``fold`` column, a ``combined_features`` column, the
            ``FEATURE_COLS`` columns and the ``target`` column.
        target: Name of the column to regress on.
        fold_train: Not read by this function; kept so existing callers work.
        fold_validation: Value of ``fold`` that marks the validation rows.

    Returns:
        R2 score of the predictions on the validation rows.
    """
    # Parameter names and the order of the suggest_* calls are left untouched:
    # the same names are read back from study.best_trial.params elsewhere in
    # this notebook.
    param = {
        'objective': 'reg:squarederror',
        'device': 'cuda',
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 1.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 2, 20),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 142),
        # NOTE(review): XGBoost documents the booster choice under the key
        # `booster`, and `feature_selector` as an option of the linear booster.
        # Confirm whether `boosting` / `feature_selector` have any effect here
        # before relying on them; they are left as-is because changing the key
        # would change which models get trained.
        'feature_selector': trial.suggest_categorical(
            'feature_selector', ['shuffle', 'greedy', 'thrifty', 'cyclic', 'random']),
        'boosting': trial.suggest_categorical('boosting', ['gbtree', 'gblinear', 'dart']),
    }

    # Width of the combined matrix: tabular features plus the length of one
    # `combined_features` array.
    num_total = len(FEATURE_COLS) + df['combined_features'].iloc[0].shape[0]

    selector = trial.suggest_categorical('selector', ['f_regression', 'mutual_info_regression', 'none'])
    # Sampled for every trial, including those where selector is 'none'.
    num_selected = trial.suggest_int('num_selected', 1, num_total - 1)

    train_data = df[df['fold'] != fold_validation]
    valid_data = df[df['fold'] == fold_validation]

    X_train = get_combined_data(train_data)
    X_valid = get_combined_data(valid_data)
    y_train = train_data[target]
    y_valid = valid_data[target]

    # Score function per selector name; 'none' has no entry and skips selection.
    score_funcs = {
        'f_regression': f_regression,
        'mutual_info_regression': mutual_info_regression,
    }
    score_func = score_funcs.get(selector)
    if score_func is not None:
        # Fit the selector on the training rows only, then reuse it for validation.
        select = SelectKBest(score_func, k=num_selected)
        X_train = select.fit_transform(X_train, y_train)
        X_valid = select.transform(X_valid)

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)

    num_boost_round = trial.suggest_int('n_estimators', 10, 1420, log=True)

    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    model = xgb.train(param, dtrain, num_boost_round=num_boost_round,
                      evals=watchlist, verbose_eval=False)

    preds = model.predict(dvalid)
    return r2_score(y_valid, preds)
    

def _sampler_path(kind, target, fold_train, fold_validation):
    """Return the pickle path used to persist the ``kind`` sampler of this run."""
    return f'./NN_search/{study_name}_{fold_train}_{fold_validation}_{target}_{kind}_sampler.pickle'


def _load_or_create_sampler(path, label, factory):
    """Unpickle the sampler stored at ``path``, or build a new one with ``factory``."""
    if os.path.exists(path):
        # NOTE: pickle.load can execute arbitrary code; these files are expected
        # to be the ones written by _save_sampler below.
        with open(path, 'rb') as f:
            print(f'Loading {label} sampler from file {f}')
            return pickle.load(f)
    print(f'Creating new {label} sampler')
    return factory()


def _save_sampler(sampler, path, label):
    """Pickle ``sampler`` to ``path``, creating the target directory if needed."""
    # Without this the first save fails when ./NN_search does not exist yet.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    print(f'Saving {label} sampler to file {path}')
    with open(path, 'wb') as f:
        pickle.dump(sampler, f)


def optimize_model(df, target, fold_train, fold_validation, n_qmc_trials=5, n_tpe_trials=20):
    """Run one round of hyperparameter search for ``target``.

    The round consists of ``n_qmc_trials`` trials with a QMC sampler followed
    by ``n_tpe_trials`` trials with a TPE sampler, both on the same Optuna
    study stored in ``502_xgboost_<target>.db``. Each sampler is loaded from
    its pickle file under ``./NN_search`` when that file exists and is pickled
    again after its trials finish.

    Earlier experiments with an NSGA-II sampler and a median pruner were
    commented out in this function and are not part of the search.

    Args:
        df: Training frame handed to ``objective``.
        target: Name of the target column; also selects the study database.
        fold_train: Used in the sampler pickle file names and passed to ``objective``.
        fold_validation: Fold held out for validation; also part of the pickle
            file names.
        n_qmc_trials: Number of trials run with the QMC sampler (default 5).
        n_tpe_trials: Number of trials run with the TPE sampler (default 20).
    """
    qmc_path = _sampler_path('qmc', target, fold_train, fold_validation)
    tpe_path = _sampler_path('tpe', target, fold_train, fold_validation)

    qmc_sampler = _load_or_create_sampler(
        qmc_path, 'QMC',
        lambda: optuna.samplers.QMCSampler(warn_independent_sampling=False))
    tpe_sampler = _load_or_create_sampler(
        tpe_path, 'TPE',
        lambda: optuna.samplers.TPESampler(n_startup_trials=0, multivariate=True,
                                           warn_independent_sampling=False))

    start_time = time.time()
    study = optuna.create_study(direction='maximize',
                                study_name=study_name,
                                storage=f'sqlite:///502_xgboost_{target}.db',
                                load_if_exists=True)

    def run_trial(trial):
        """Evaluate one trial with the arguments of this search round."""
        return objective(trial, df, target, fold_train, fold_validation)

    print(f'Starting optimization for {target} with qmc sampler')
    random_time = time.time()
    study.sampler = qmc_sampler
    study.optimize(run_trial, n_trials=n_qmc_trials)
    print(f'QCM optimization finished in {timedelta(seconds=time.time() - random_time)}')
    _save_sampler(qmc_sampler, qmc_path, 'QMC')

    print(f'Starting optimization for {target} with TPE sampler')
    tpe_time = time.time()
    study.sampler = tpe_sampler
    study.optimize(run_trial, n_trials=n_tpe_trials)
    print(f'TPE optimization finished in {timedelta(seconds=time.time() - tpe_time)}')
    _save_sampler(tpe_sampler, tpe_path, 'TPE')

    print(f'Optimization finished in {timedelta(seconds=time.time() - start_time)}')

    
    



In [None]:
# Preview the first rows of train_df.
train_df.head()

In [None]:
def prepare_features(row):
    """Return the row's model feature vector as a NumPy array.

    The value is read from the ``model_features_<study_name>`` entry of
    ``row``, where ``study_name`` is the module-level run identifier.
    """
    feature_key = f'model_features_{study_name}'
    return np.array(row[feature_key])

# Add a `combined_features` column (one array per row) to both frames.
for frame in (train_df, test_df):
    frame['combined_features'] = frame.apply(prepare_features, axis=1)



In [None]:
target_columns = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean']
# train_fold only ends up in the sampler pickle file names; validation_fold is
# the `fold` value that is held out during the search.
train_fold = 80
validation_fold = 1

# Wall-clock budget for the whole search: 18 hours, in seconds.
search_budget_seconds = 3600 * 18

time_search_start = time.time()
time_taken = 0

# The budget is only checked between full passes over the targets, so a pass
# that has started always finishes for every target.
while time_taken < search_budget_seconds:
    for target in target_columns:
        print(f'\n\nOptimizing model for {target} using train fold {train_fold} and validation fold {validation_fold}\n\n')
        optimize_model(train_df, target, train_fold, validation_fold)
        time_taken = time.time() - time_search_start
        print(f'Time taken: {timedelta(seconds=time_taken)}')

In [None]:
target_columns = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean']

# Full training matrix: tabular features followed by the model features.
features_array = np.array(train_df['combined_features'].tolist())
X_combined_train = np.hstack([train_df[FEATURE_COLS].values, features_array])

# In-sample predictions, one column per target.
train_pred = np.zeros((train_df.shape[0], len(target_columns)))

# Fitted feature selector (or None) and fitted model per target; both are
# reused by the test prediction cell.
selectors = {}
models = {}

for i, target in enumerate(target_columns):

    tf.keras.backend.clear_session()
    gc.collect()

    # load_if_exists=True re-opens the study that the search wrote for this target.
    study = optuna.create_study(direction='maximize',
                                study_name=study_name,
                                storage=f'sqlite:///502_xgboost_{target}.db',
                                load_if_exists=True)

    # Work on a copy so the trial's own params dict is never modified.
    best_params = dict(study.best_trial.params)

    if best_params['selector'] != 'none':
        if best_params['selector'] == 'f_regression':
            score_func = f_regression
        else:
            score_func = mutual_info_regression
        print(f'Selector for {target}: {best_params["selector"]} with {best_params["num_selected"]} features')
        select = SelectKBest(score_func, k=best_params['num_selected'])
        X_selected = select.fit_transform(X_combined_train, train_df[target])
        selectors[target] = select
    else:
        X_selected = X_combined_train
        print(f'X_selected shape {X_selected.shape}')
        print(f'None selector in target {target}')
        selectors[target] = None

    print(f'Best params for {target}: {best_params}')

    # These keys describe the feature selection / search only and are not
    # handed to the regressor.
    for search_only_key in ('selector', 'num_selected', 'feature_selector'):
        best_params.pop(search_only_key, None)

    # NOTE(review): the sampled `boosting` value is forwarded unchanged;
    # XGBoost documents the booster choice under the key `booster` -- confirm
    # whether this entry has any effect.
    best_params['objective'] = 'reg:squarederror'
    best_params['device'] = 'cuda'

    model = xgb.XGBRegressor(**best_params)
    model.fit(X_selected, train_df[target])
    models[target] = model

    # Predict once and reuse the result for the metrics and for train_pred.
    pred = model.predict(X_selected)
    r2 = r2_score(train_df[target], pred)
    mse = mean_squared_error(train_df[target], pred)
    print(f'Training R2 for {target}: {r2} and MSE: {mse}')

    print(f'Model for {target} {models[target]}')

    train_pred[:, i] = pred


# R2 over all targets, measured on the same rows the models were fitted on.
train_r2 = r2_score(train_df[target_columns], train_pred)
print(f'Training R2: {train_r2}')

In [None]:
# Test matrix with the same column layout as the training matrix: tabular
# features followed by the model features.
features_array = np.array(test_df['combined_features'].tolist())
X_combined_test = np.hstack([test_df[FEATURE_COLS].values, features_array])

# One prediction column per target, in the order of target_columns.
test_preds = np.zeros((len(test_df), len(target_columns)))

# No Optuna study is opened here: prediction only needs the fitted `models`
# and `selectors`. The previous create_study call made a separate, unused
# study database per target.
for i, target in enumerate(target_columns):
    print(f'Predicting {target} with model {models[target]}')

    selector = selectors[target]
    if selector is not None:
        # Apply the column selection that was fitted on the training data.
        X_selected = selector.transform(X_combined_test)
    else:
        X_selected = X_combined_test

    test_preds[:, i] = models[target].predict(X_selected)
     

In [None]:
# Column names of the submission, listed in the same order as the columns of
# test_preds. They get their own name because `target_columns` holds the
# `_mean` names that `models` and `selectors` are keyed by; reusing it for
# these names would break re-running the prediction cell.
submission_columns = ['X4', 'X11', 'X18', 'X50', 'X26', 'X3112']

test_df_copy = test_df.copy()
submission_df = test_df_copy[['id']].copy()
submission_df[submission_columns] = test_preds

In [None]:
# Summary statistics of the id and prediction columns in submission_df.
submission_df.describe()

In [None]:
# Summary statistics of the six training target columns in train_df.
target_columns = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean']
train_df[target_columns].describe()

In [None]:
# Preview the first rows of submission_df.
submission_df.head()

In [None]:
# Write submission_df to CSV without the DataFrame index.
submission_df.to_csv('./data/submission.csv', index=False)