In [None]:
import numpy as np
import xgboost as xgb
import pandas as pd
from sklearn.metrics import r2_score, mean_squared_error
import optuna
import pickle
from datetime import timedelta
import time
import os 
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
import gc
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:

# Reset Keras' global state and run a garbage-collection pass before any data is loaded.
tf.keras.backend.clear_session()
gc.collect()

In [None]:
# The test CSV is read only to discover the names of the feature columns.
feat = pd.read_csv('./data/test.csv')
# Every column after the first is treated as a feature
# (the first column is presumably the row id - TODO confirm).
FEATURE_COLS = list(feat.columns[1:])

In [None]:
# Name of the Optuna study; also used as the prefix of the sampler/pruner pickle file names.
study_name = '525_combined_fold_2_3'

In [None]:


# Load the pickled training DataFrame from disk.
pickle_file_path = './data/train_df.pickle'

with open(pickle_file_path, 'rb') as pickle_file:
    train_df = pickle.load(pickle_file)

In [None]:
# Show an unlimited number of rows and columns when a DataFrame is displayed.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Number of rows per value of the 'fold' column (the column used to split train/validation in objective()).
print(train_df['fold'].value_counts())

In [None]:



def plot_data(df, columns_names):
    """Draw a KDE plot of every requested column on a grid of subplots.

    The grid is 6 subplots wide and gets as many rows as are needed to fit all
    columns. The figure height grows with the number of rows, so a multi-row
    grid is not squeezed into the height of a single row.

    Args:
        df: DataFrame holding the columns to plot.
        columns_names: Names of the columns of ``df`` to plot, one subplot each.
    """
    n_cols = 6
    # Ceiling division: rows needed to place every column on the grid.
    n_rows = (len(columns_names) + n_cols - 1) // n_cols

    # 3 inches per row; a single row keeps the original 15x3 figure size.
    plt.figure(figsize=(15, 3 * max(n_rows, 1)))

    for i, col in enumerate(columns_names):
        plt.subplot(n_rows, n_cols, i + 1)
        sns.kdeplot(df[col], bw_adjust=0.5, fill=False, color='blue')
        plt.title(f'Distribution of {col}')
        plt.xlabel('Value')
        plt.ylabel('Density')

    plt.tight_layout()
    plt.show()
    


In [None]:
mean_columns = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean']

# Distributions before trimming.
plot_data(train_df, mean_columns)

# Trim the tails one column at a time. Each column's quantiles are computed on
# the rows that survived the filters of the columns before it.
for column in mean_columns:
    lower_quantile, upper_quantile = train_df[column].quantile([0.01, 0.975])
    within_range = train_df[column].between(lower_quantile, upper_quantile)
    train_df = train_df.loc[within_range]

# Distributions after trimming.
plot_data(train_df, mean_columns)

In [None]:
import warnings

# Silence every warning of category UserWarning from here on.
warnings.simplefilter('ignore', category=UserWarning)

# def get_combined_data(df):
#     # Assumes FEATURES_COLS is already defined for the existing features
#     data = [df[col].values for col in FEATURE_COLS]
#     # Append the model features
#     data.append(np.vstack(df['combined_features'].values))
#     return np.column_stack(data)

def get_features_array(features_series):
    """Stack a series of per-row feature vectors into a single array.

    Args:
        features_series: Iterable (e.g. a pandas Series) whose elements are
            array-likes, assumed to all have the same length.

    Returns:
        ``np.ndarray`` with one row per element of ``features_series``.
    """
    rows = [row for row in features_series]
    return np.array(rows)

def objective(trial, df, target):
    """Optuna objective: mean cross-validated R2 of an XGBoost regressor.

    For each of the folds 0-4 a model is trained on the rows whose ``fold``
    value differs from the fold and scored on the rows whose ``fold`` equals it.
    The per-fold R2 is reported to the trial so the study's pruner can stop
    unpromising trials early.

    Args:
        trial: ``optuna.Trial`` used to sample hyperparameters and to report
            intermediate scores.
        df: DataFrame with a ``fold`` column, an ``all_features`` column of
            per-row feature vectors and the ``target`` column.
        target: Name of the column of ``df`` to regress on.

    Returns:
        Mean of the R2 scores over the five validation folds.

    Raises:
        optuna.TrialPruned: If the pruner decides to stop the trial after a fold.
    """
    # NOTE(review): XGBoost's booster-type parameter is documented as
    # ``booster`` (not ``boosting``), and ``feature_selector`` is documented for
    # the linear booster only, so both entries are probably ignored by
    # xgb.train here - confirm against the XGBoost parameter docs. They are
    # left in place so the search space of already-stored studies is unchanged.
    param = {
        'objective': 'reg:squarederror',
        'device' : 'cuda',
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log = True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log = True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 1.0, log = True),
        'max_depth': trial.suggest_int('max_depth', 2, 20),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 142),
        'feature_selector': trial.suggest_categorical('feature_selector', ['shuffle', 'greedy', 'thrifty', 'cyclic', 'random']),
        'boosting': trial.suggest_categorical('boosting', ['gbtree', 'gblinear', 'dart'])
    }

    # Sampled once per trial: Optuna returns the same value when a name is
    # suggested again within a trial, so asking inside the fold loop was
    # redundant work.
    num_boost_round = trial.suggest_int('n_estimators', 10, 1420, log=True)

    folds = [0, 1, 2, 3, 4]
    r2_scores = []

    for fold in folds:
        tf.keras.backend.clear_session()
        gc.collect()

        # Hold out the current fold for validation, train on the rest.
        train_data = df[df['fold'] != fold]
        valid_data = df[df['fold'] == fold]

        X_train = get_features_array(train_data['all_features'])
        X_valid = get_features_array(valid_data['all_features'])

        y_train = train_data[target]
        y_valid = valid_data[target]

        dtrain = xgb.DMatrix(X_train, label=y_train)
        dvalid = xgb.DMatrix(X_valid, label=y_valid)

        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
        model = xgb.train(param, dtrain, num_boost_round=num_boost_round, evals=watchlist, verbose_eval=False)
        preds = model.predict(dvalid)
        mse = mean_squared_error(y_valid, preds)
        r2 = r2_score(y_valid, preds)

        # The fold index doubles as the pruning step.
        trial.report(r2, fold)

        if trial.should_prune():
            print(f'Pruned fold {fold} with value {r2} and mse {mse}')
            raise optuna.TrialPruned()

        print(f'Fold {fold} MSE: {mse} R2: {r2}')
        r2_scores.append(r2)

    tf.keras.backend.clear_session()
    gc.collect()

    return np.mean(r2_scores)
    

def _load_or_create(pickle_path, label, factory):
    """Return the object pickled at ``pickle_path``, or ``factory()`` if the file is missing."""
    if os.path.exists(pickle_path):
        # NOTE: pickle.load can execute arbitrary code; only files written by
        # this notebook should ever be placed at this path.
        with open(pickle_path, 'rb') as f:
            print(f'Loading {label} from file {f}')
            return pickle.load(f)
    print(f'Creating new {label}')
    return factory()


def _save_pickle(obj, pickle_path, label):
    """Pickle ``obj`` to ``pickle_path`` and log where it went."""
    print(f'Saving {label} to file {pickle_path}')
    with open(pickle_path, 'wb') as f:
        pickle.dump(obj, f)


def optimize_model(df, target, fold_train, fold_validation):
    """Run one round of hyperparameter search for ``target``.

    A round is 5 trials with a QMC sampler followed by 20 trials with a TPE
    sampler, all in the Optuna study named by the global ``study_name`` and
    stored in ``525_xgboost_{target}.db``. The samplers and the pruner are
    restored from pickle files under ``./NN_search`` when those exist and are
    written back after use, so repeated calls continue from the saved state.

    Args:
        df: DataFrame passed through to ``objective``.
        target: Name of the target column; also selects the SQLite file.
        fold_train: Used only as part of the pickle file names.
        fold_validation: Used only as part of the pickle file names.
    """
    state_dir = './NN_search'
    # Make sure the state directory exists up front; otherwise the pickles
    # could not be written after the (long) optimisation has finished.
    os.makedirs(state_dir, exist_ok=True)

    prefix = f'{state_dir}/{study_name}_{fold_train}_{fold_validation}_{target}'
    qmc_path = f'{prefix}_qmc_sampler.pickle'
    tpe_path = f'{prefix}_tpe_sampler.pickle'
    pruner_path = f'{prefix}_pruner.pickle'

    qmc_sampler = _load_or_create(
        qmc_path, 'QMC sampler',
        lambda: optuna.samplers.QMCSampler(warn_independent_sampling = False))
    tpe_sampler = _load_or_create(
        tpe_path, 'TPE sampler',
        lambda: optuna.samplers.TPESampler(n_startup_trials=0, multivariate=True, warn_independent_sampling = False))
    pruner = _load_or_create(
        pruner_path, 'pruner',
        lambda: optuna.pruners.MedianPruner(n_startup_trials=5))

    start_time = time.time()
    # The pruner is handed to the study explicitly. Previously it was
    # loaded/created and pickled but never attached, so the study always fell
    # back to Optuna's default pruner.
    study = optuna.create_study(direction='maximize',
                            study_name=study_name,
                            storage=f'sqlite:///525_xgboost_{target}.db',
                            load_if_exists=True,
                            pruner=pruner
                            )

    print(f'Starting optimization for {target} with qmc sampler')
    random_time = time.time()
    study.sampler = qmc_sampler
    study.optimize(lambda trial: objective(trial, df, target), n_trials=5)
    print(f'QCM optimization finished in {timedelta(seconds=time.time() - random_time)}')

    _save_pickle(qmc_sampler, qmc_path, 'QMC sampler')

    print(f'Starting optimization for {target} with TPE sampler')
    tpe_time = time.time()
    study.sampler = tpe_sampler
    study.optimize(lambda trial: objective(trial, df, target), n_trials=20)
    print(f'TPE optimization finished in {timedelta(seconds=time.time() - tpe_time)}')

    _save_pickle(tpe_sampler, tpe_path, 'TPE sampler')
    _save_pickle(pruner, pruner_path, 'pruner')

    print(f'Optimization finished in {timedelta(seconds=time.time() - start_time)}')

    
    



In [None]:
# Columns whose per-row arrays prepare_features() stacks into the combined feature vector.
# The commented-out lines are alternative selections.
# feature_columns = ['model_features_514_convnextlarge_maxavg_2', 'model_features_511_convnextlarge_3'] 
feature_columns = ['model_features_511_convnextlarge_3'] 
# feature_columns = ['model_features_514_convnextlarge_maxavg_2'] 

In [None]:
# Show which model-feature columns are currently selected.
print(f'Featu {feature_columns}')

In [None]:
def prepare_features(df, feature_columns):
    """Build one feature vector per row and store it in ``df['all_features']``.

    Each row's vector holds the values of the ``FEATURE_COLS`` columns followed
    by the row's arrays from every column listed in ``feature_columns``.

    Args:
        df: DataFrame containing the ``FEATURE_COLS`` columns and the columns
            named in ``feature_columns``. It is modified in place.
        feature_columns: Names of columns whose cells each hold a list or a
            small NumPy array (assumed to have equal length within a column).

    Returns:
        The same ``df`` object, now carrying an ``all_features`` column whose
        cells are Python lists.
    """
    # One 2-D block per model-feature column, joined side by side.
    model_blocks = [np.vstack(df[name].values) for name in feature_columns]
    combined_features = np.hstack(model_blocks)

    # Plain feature columns first, the stacked model features last.
    plain_columns = [df[name].values for name in FEATURE_COLS]
    feature_matrix = np.column_stack(plain_columns + [combined_features])

    # Store every matrix row as a list in the new DataFrame column.
    df['all_features'] = feature_matrix.tolist()
    return df


In [None]:
# Add the 'all_features' column to the training data. prepare_features() writes the column
# into the DataFrame it receives and returns that same object.
train_df_feat = prepare_features(train_df, feature_columns)

In [None]:
# Preview the training data again, after prepare_features() has been applied to it.
train_df.head()

In [None]:
# Sanity check: shape of the first row's combined feature vector.
testa = np.array(train_df_feat['all_features'].iloc[0])
print(f'{testa.shape}')

In [None]:
# Sanity check: shape of the first row's raw model-feature array.
testi = np.array(train_df_feat['model_features_511_convnextlarge_3'].iloc[0])
print(f'{testi.shape}')

In [None]:
# Number of feature columns taken from the test CSV header.
print(f'Feta len {len(FEATURE_COLS)}')

In [None]:
target_columns = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean']
# These two values only end up in the names of the sampler/pruner pickle files.
train_fold = 80
validation_fold = 1

# Three hours, checked once per full pass over all targets.
search_budget_seconds = 3 * 3600

time_search_start = time.time()
time_taken = 0

# Keep running full passes over the targets until the budget is used up.
# A pass that has started always finishes, so the total can exceed the budget.
while time_taken < search_budget_seconds:
    for target in target_columns:
        print(f'\n\nOptimizing model for {target}\n\n')
        optimize_model(train_df_feat, target, train_fold, validation_fold)
        time_taken = time.time() - time_search_start
        print(f'Time taken: {timedelta(seconds=time_taken)}')

In [None]:
target_columns = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean']

# Best cross-validated R2 of each target's study, in target_columns order.
best_r2 = []

# In-sample predictions, one column per target.
train_pred = np.zeros((train_df_feat.shape[0], len(target_columns)))

selectors = {}
# Fitted model per target name; used later to predict on the test data.
models = {}

# The feature matrix is the same for every target, so it is built once
# instead of once per loop iteration.
X_selected = get_features_array(train_df_feat['all_features'])

for i, target in enumerate(target_columns):

    tf.keras.backend.clear_session()
    gc.collect()

    # load_study raises if the study does not exist, whereas
    # create_study(load_if_exists=True) would silently create an empty study
    # and only fail later when best_trial is read.
    study = optuna.load_study(study_name=study_name,
                              storage=f'sqlite:///525_xgboost_{target}.db')

    # Copy so the additions below do not modify the trial's own params dict.
    best_params = dict(study.best_trial.params)

    print(f'Best value {study.best_value}')
    print(f'Best params for {target}: {best_params}')
    print(f'Num trials {len(study.trials)}')

    best_r2.append(study.best_value)

    # Fixed settings that were not part of the search space.
    best_params['objective'] = 'reg:squarederror'
    best_params['device'] = 'cuda'

    # Every name suggested in objective() (including 'n_estimators') is
    # forwarded to the regressor unchanged.
    model = xgb.XGBRegressor(**best_params)
    model.fit(X_selected, train_df_feat[target])
    models[target] = model

    # Predict once and reuse the result for both the metrics and train_pred.
    pred = model.predict(X_selected)
    r2 = r2_score(train_df_feat[target], pred)
    mse = mean_squared_error(train_df_feat[target], pred)
    print(f'Training R2 for {target}: {r2} and MSE: {mse}')

    print(f'Model for {target} {models[target]}')

    train_pred[:, i] = pred


# In-sample R2 across all targets (not a validation score).
train_r2 = r2_score(train_df_feat[target_columns], train_pred)
print(f'Training R2: {train_r2}')

for idx, name in enumerate(target_columns):
    print(f'Target {name} R2: {best_r2[idx]}')

In [None]:
# Load the pickled test DataFrame from disk.
pickle_file_path = './data/test_df.pickle'

with open(pickle_file_path, 'rb') as pickle_file:
    test_df = pickle.load(pickle_file)

In [None]:
# Remove the name train_df and run a garbage-collection pass.
# NOTE(review): train_df_feat was returned by prepare_features(train_df, ...), which returns
# the object it was given, so the DataFrame itself stays reachable through train_df_feat.
del train_df 
gc.collect()

In [None]:
# Preview the first rows of the test data.
test_df.head()

In [None]:
# Add the 'all_features' column to the test data, using the same feature columns as for training.
test_df_feat = prepare_features(test_df, feature_columns)

In [None]:
# Preview the test data again, after prepare_features() has been applied to it.
test_df.head()

In [None]:


# Test predictions, one column per target in target_columns order.
test_preds = np.zeros((len(test_df_feat), len(target_columns)))

# The feature matrix is the same for every target, so it is built once
# instead of once per loop iteration.
X_selected = get_features_array(test_df_feat['all_features'])

for i, target in enumerate(target_columns):
    print(f'Predicting {target} with model {models[target]}')
    test_preds[:, i] = models[target].predict(X_selected)
     

In [None]:
# Output column names for the submission, in the same order as the columns of
# test_preds. They get their own name so that target_columns (the keys of
# `models`) is not overwritten and the prediction cell can be re-run.
submission_columns = ['X4', 'X11', 'X18', 'X50', 'X26', 'X3112']

# Only the id column is needed, so there is no reason to copy the whole test frame.
submission_df = test_df[['id']].copy()
submission_df[submission_columns] = test_preds

In [None]:
# Summary statistics of the predicted values.
submission_df.describe()

In [None]:
target_columns = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean']
# Summary statistics of the training targets, for comparison with the predictions.
# The name train_df was deleted in an earlier cell, which made this cell fail on a
# fresh run; train_df_feat still refers to the training DataFrame.
train_df_feat[target_columns].describe()

In [None]:
# Preview the first rows of the submission.
submission_df.head()

In [None]:
# Write the submission to CSV without the DataFrame index.
submission_df.to_csv('./data/submission.csv', index=False)