In [None]:
import pandas as pd 
import matplotlib.pyplot as plt
import os 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler,  RobustScaler
import pickle
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetV2M
import numpy as np
import gc


In [None]:
# Load the pickled test frame (presumably a pandas DataFrame, judging by how
# test_df is used later in the notebook).
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
pickle_file_path = './data/test_df.pickle'

with open(pickle_file_path, 'rb') as f:
    test_df = pickle.load(f)

# Load the pickled training frame; the path variable is reused for the second file.
pickle_file_path = './data/train_df.pickle'

with open(pickle_file_path, 'rb') as f:
    train_df = pickle.load(f)
    



In [None]:



# Names of the six columns that later cells use as regression targets (y_train / y_valid).
mean_columns = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean']
# Earlier alternative: derive the feature list from the test frame's column positions.
# FEATURE_COLS = test_df.columns[1:-4].tolist()

# The tabular feature names are read from a pickle file instead.
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
selected_features_pickle_path = './data/selected_features_list.pickle'
with open(selected_features_pickle_path, 'rb') as f:
    FEATURE_COLS = pickle.load(f)

print(FEATURE_COLS)



In [None]:
def plot_data(df, columns_names, n_cols=6):
    """Draw one kernel-density plot per column on a grid of subplots.

    Args:
        df: DataFrame holding the columns to plot.
        columns_names: Names of the columns of ``df`` to plot, in display order.
        n_cols: Number of subplots per grid row (default 6, the value that was
            previously hard-coded).

    The figure is 15 inches wide and 3 inches tall per grid row, so a layout
    that needs several rows is not squeezed into the height of a single row.
    Nothing is drawn when ``columns_names`` is empty.
    """
    n_plots = len(columns_names)
    if n_plots == 0:
        return

    # Ceiling division: a partially filled last row still needs a full row.
    n_rows = -(-n_plots // n_cols)

    plt.figure(figsize=(15, 3 * n_rows))

    for i, col in enumerate(columns_names):
        plt.subplot(n_rows, n_cols, i + 1)
        sns.kdeplot(df[col], bw_adjust=0.5, fill=False, color='blue')
        plt.title(f'Distribution of {col}')
        plt.xlabel('Value')
        plt.ylabel('Density')

    plt.tight_layout()
    plt.show()
    


In [None]:
# Lift pandas' display limits so frames are shown with every row and column.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)

In [None]:
# Summary statistics of the target columns in the training frame.
train_df[mean_columns].describe()

In [None]:
# Density plot of every target column in the training frame.
plot_data(train_df, mean_columns)

In [None]:
# train_df[mean_columns] = np.log10(train_df[mean_columns])

In [None]:
# Same summary as the earlier describe() cell: the log10 transform between the
# two cells is commented out, so the statistics are unchanged.
train_df[mean_columns].describe()

In [None]:
# Same plot as the earlier plot_data() cell: the log10 transform between the
# two cells is commented out, so the distributions are unchanged.
plot_data(train_df, mean_columns)

In [None]:
print(train_df['fold'].value_counts())

# Fold held out for validation; all other folds are used for training.
VALID_FOLD = 3

# RobustScaler is the scaler in use; StandardScaler was the earlier candidate.
scaler = RobustScaler()

sample_df = train_df.copy()
# .copy() turns both splits into independent frames, so the column assignments
# below do not raise SettingWithCopyWarning on a filtered slice of sample_df.
train_df = sample_df[sample_df.fold != VALID_FOLD].copy()
valid_df = sample_df[sample_df.fold == VALID_FOLD].copy()
print(f"# Num Train: {len(train_df)} | Num Valid: {len(valid_df)}")

if len(valid_df) == 0:
    # train_df is rebound above, so a second run of this cell no longer contains
    # the held-out fold (and would also scale the features twice).
    raise ValueError(
        f"No rows found for validation fold {VALID_FOLD}; "
        "reload train_df before re-running this cell."
    )

# Fit the scaler on the training split only, then apply it to the validation split.
train_df[FEATURE_COLS] = scaler.fit_transform(train_df[FEATURE_COLS].values)
valid_df[FEATURE_COLS] = scaler.transform(valid_df[FEATURE_COLS].values)

# Persist the fitted scaler so the inference cell can apply the same scaling.
with open('./data/scaler_tabufeatures_train.pickle', 'wb') as f:
    pickle.dump(scaler, f)




In [None]:
# Training split: the per-row 'features' entries stacked into one array, plus
# the tabular feature columns as a plain NumPy matrix.
X_train_feat = np.stack(train_df['features'].values)
X_train_tab = train_df[FEATURE_COLS].values

# Validation split, built the same way.
X_valid_feat = np.stack(valid_df['features'].values)
X_valid_tab = valid_df[FEATURE_COLS].values

# Targets stay DataFrames (not .values) because objective() addresses them by
# column name when it applies the per-target transforms.
y_train = train_df[mean_columns]
y_valid = valid_df[mean_columns]



In [None]:
# Shape of the tabular training matrix built from train_df[FEATURE_COLS].
print(X_train_tab.shape)

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from optuna.integration import TFKerasPruningCallback
import optuna
from keras import regularizers, layers, optimizers, initializers
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, TerminateOnNaN
from datetime import timedelta
import time
import os
from sklearn.preprocessing import StandardScaler, MinMaxScaler,  RobustScaler

# os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'



def r2_score(y_true, y_pred):
    """Average R^2 over the trailing axes, with each R^2 floored at zero.

    Sums of squares are reduced over axis 0, so one R^2 value is produced per
    remaining position (one per target for 2-D inputs). NaN values are replaced
    by 0, negative values are clipped to 0, and the mean of the result is returned.
    """
    residual_ss = tf.reduce_sum(tf.square(y_true - y_pred), axis=0)
    total_ss = tf.reduce_sum(tf.square(y_true - tf.reduce_mean(y_true, axis=0)), axis=0)
    # epsilon keeps the division finite when a target has zero variance.
    per_target = 1 - residual_ss / (total_ss + tf.keras.backend.epsilon())
    # Replace NaN values with zeros.
    per_target = tf.where(tf.math.is_nan(per_target), tf.zeros_like(per_target), per_target)
    floored = tf.maximum(per_target, 0.0)
    return tf.reduce_mean(floored)


def huber_loss_wrapper(delta):
    """Return a two-argument loss function that applies Keras' Huber loss with ``delta``."""
    def huber_loss(y_true, y_pred):
        # A fresh Huber loss object is built on every call, bound to the captured delta.
        loss_fn = tf.keras.losses.Huber(delta=delta)
        return loss_fn(y_true, y_pred)
    return huber_loss


# Choice lists shared by the three tunable dense stacks. The order is kept
# stable because Optuna records categorical choices per parameter name.
_INITIALIZER_CHOICES = ['glorot_uniform', 'he_normal', 'he_uniform', 'lecun_normal',
                        'lecun_uniform', 'random_normal', 'random_uniform']
_ACTIVATION_CHOICES = ['relu', 'tanh', 'selu', 'LeakyReLU', 'swish', 'elu']


def _build_dense_stack(trial, tensor, *, layers_name, init_name, act_name, drop_name,
                       batchnorm_name, units_prefix, min_units, max_units):
    """Append one or two trial-tuned Dense (+ BatchNorm) + Dropout layers to ``tensor``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        tensor: Keras tensor the stack is attached to.
        layers_name, init_name, act_name, drop_name, batchnorm_name: Optuna
            parameter names for the layer count, kernel initializer, activation,
            dropout rate and batch-norm switch of this stack.
        units_prefix: Prefix of the per-layer unit parameter (``f'{units_prefix}_{i}'``).
        min_units: Lower bound for the unit count of every layer.
        max_units: Upper bound for the unit count of the first layer.

    Returns:
        The output tensor of the last layer in the stack.
    """
    num_layers = trial.suggest_int(layers_name, 1, 2)
    initializer = trial.suggest_categorical(init_name, choices=_INITIALIZER_CHOICES)
    activation = trial.suggest_categorical(act_name, choices=_ACTIVATION_CHOICES)
    dropout_rate = trial.suggest_float(drop_name, 0.2, 0.7, step=0.1)
    batch_norm = trial.suggest_categorical(batchnorm_name, choices=['On', 'Off'])

    for i in range(num_layers):
        units = trial.suggest_int(f'{units_prefix}_{i}', min_units, max_units, log=True)
        tensor = Dense(units, activation=activation, kernel_initializer=initializer)(tensor)
        if batch_norm == 'On':
            tensor = layers.BatchNormalization()(tensor)
        tensor = Dropout(dropout_rate)(tensor)
        # Cap the next layer at this layer's width so the stack never widens.
        max_units = min(max_units, units)

    return tensor


def create_model(trial):
    """Build and compile the two-input regression network for one Optuna trial.

    The image-feature input and the tabular input each pass through their own
    tunable dense stack; the two outputs are concatenated, passed through a
    third tunable stack and mapped to one linear output per entry of
    ``mean_columns``. The optimizer class is also chosen by the trial and is
    created with its default arguments. The model is compiled with MSE loss and
    the metrics ``mse``, ``mae`` and ``r2_score``.

    Reads the module-level ``X_train_feat``, ``X_train_tab`` and ``mean_columns``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The compiled Keras ``Model``.
    """
    image_features_input = Input(shape=(X_train_feat.shape[1],), name='image_features_input')
    tabular_data_input = Input(shape=(X_train_tab.shape[1],), name='tabular_data_input')

    # Parameter names (including the 'Imgage' spelling) are kept exactly as they
    # were so trials already stored in the study database remain comparable.
    img_dense = _build_dense_stack(
        trial, image_features_input,
        layers_name='Imgage layers', init_name='Img_init', act_name='Act_img',
        drop_name='Drop_img', batchnorm_name='Img_BatchN', units_prefix='Num_img',
        min_units=128, max_units=2000)

    tab_dense = _build_dense_stack(
        trial, tabular_data_input,
        layers_name='Tabular layers', init_name='Tab_init', act_name='Act_tab',
        drop_name='Drop_tab', batchnorm_name='Tab_BatchN', units_prefix='Num_tab',
        min_units=64, max_units=1000)

    concatenated = Concatenate()([img_dense, tab_dense])
    concatenated = _build_dense_stack(
        trial, concatenated,
        layers_name='Concat layers', init_name='Con_init', act_name='Act_con',
        drop_name='Drop_con', batchnorm_name='Com_BatchN', units_prefix='Num_con',
        min_units=128, max_units=3000)

    # One linear output per target column (previously the literal 6).
    output = Dense(len(mean_columns), activation='linear')(concatenated)
    model = Model(inputs=[image_features_input, tabular_data_input], outputs=output)

    optimizer_classes = {
        'adam': optimizers.Adam,
        'rmsprop': optimizers.RMSprop,
        'Nadam': optimizers.Nadam,
        'adamax': optimizers.Adamax,
    }
    optimizer_selected = trial.suggest_categorical('optimizer', list(optimizer_classes))
    optimizer = optimizer_classes[optimizer_selected]()

    # An earlier experiment tuned a Huber delta via huber_loss_wrapper; plain MSE
    # is the loss currently in use.
    model.compile(optimizer=optimizer, loss='mse', metrics=['mse', 'mae', r2_score])
    return model



def _suggest_target_transforms(trial):
    """Let the trial choose a log base and a scaler for every target column.

    Returns:
        A ``(log_transforms, scaler_transforms)`` pair of dicts keyed by target
        name. ``log_transforms[target]`` is ``None``, 2 or 10;
        ``scaler_transforms[target]`` is a new, unfitted scaler or ``None``.
    """
    log_base_options = {'none': None, 'log2': 2, 'log10': 10}
    log_transforms = {}
    for target in mean_columns:
        log_base = trial.suggest_categorical(f'Log_{target}', list(log_base_options.keys()))
        log_transforms[target] = log_base_options[log_base]

    # Classes, not instances: every target gets its own scaler object. Sharing
    # one instance between targets would refit it on each target in turn, so the
    # inverse transform of earlier targets would use the wrong statistics.
    scaler_classes = {'Std': StandardScaler, 'Minmax': MinMaxScaler, 'Robust': RobustScaler, 'None': None}
    scaler_transforms = {}
    for target in mean_columns:
        scaler_name = trial.suggest_categorical(f'Scaler_{target}', list(scaler_classes.keys()))
        scaler_class = scaler_classes[scaler_name]
        scaler_transforms[target] = scaler_class() if scaler_class is not None else None

    return log_transforms, scaler_transforms


def _transform_targets(log_transforms, scaler_transforms):
    """Apply the chosen log and scaler transforms to copies of ``y_train`` / ``y_valid``.

    Each scaler is fitted on the training targets only and is left fitted in
    ``scaler_transforms`` so it can be inverted and pickled later.

    Returns:
        ``(y_train_transformed, y_valid_transformed)`` DataFrames.
    """
    y_train_transformed = y_train.copy()
    y_valid_transformed = y_valid.copy()

    for target in mean_columns:
        log_base = log_transforms[target]
        if log_base is not None:
            # Change of base: log_b(x) = ln(x) / ln(b).
            # NOTE(review): assumes the targets are strictly positive - confirm.
            y_train_transformed[target] = np.log(y_train[target]) / np.log(log_base)
            y_valid_transformed[target] = np.log(y_valid[target]) / np.log(log_base)

        target_scaler = scaler_transforms[target]
        if target_scaler is not None:
            y_train_transformed[target] = target_scaler.fit_transform(
                y_train_transformed[target].values.reshape(-1, 1)).flatten()
            y_valid_transformed[target] = target_scaler.transform(
                y_valid_transformed[target].values.reshape(-1, 1)).flatten()

    return y_train_transformed, y_valid_transformed


def _restore_predictions(preds, log_transforms, scaler_transforms):
    """Map model outputs back to the original target units.

    The scaler is inverted first and the log second, i.e. the reverse of the
    order used in ``_transform_targets``. ``preds`` is not modified.
    """
    restored = preds.copy()
    for i, target in enumerate(mean_columns):
        target_scaler = scaler_transforms[target]
        if target_scaler is not None:
            restored[:, i] = target_scaler.inverse_transform(restored[:, i].reshape(-1, 1)).flatten()
        log_base = log_transforms[target]
        if log_base is not None:
            restored[:, i] = np.power(log_base, restored[:, i])
    return restored


def _save_best_trial(model, score, log_transforms, scaler_transforms):
    """Write the model and its per-target transforms under names tagged with ``score``."""
    score_tag = f'{score:.5f}'

    best_filename = f'./data/{study_name}_best_val_{score_tag}_model.h5'
    if os.path.exists(best_filename):
        os.remove(best_filename)

    print(f'Saving model to {best_filename}')
    model.save(best_filename)

    # The full target -> log base mapping is stored (not just the last base), and
    # the file name uses the same 5-decimal tag as the model and scaler files.
    print(f'Saving log transforms to {study_name}_{score_tag}_best_log_transforms.pickle')
    with open(f'./data/{study_name}_{score_tag}_best_log_transforms.pickle', 'wb') as f:
        pickle.dump(log_transforms, f)

    print(f'Saving scaler transforms to {study_name}_{score_tag}_scalers.pickle')
    scaler_filename = f"./data/{study_name}_{score_tag}_scalers.pickle"
    with open(scaler_filename, 'wb') as f:
        pickle.dump(scaler_transforms, f)


def objective(trial):
    """Optuna objective: train one sampled model and score it on the validation fold.

    The trial chooses the network (``create_model``) and, for every target, an
    optional log transform and an optional scaler. The model is trained on the
    transformed targets, the best checkpoint (by ``val_r2_score``) is restored,
    predictions are mapped back to the original units and scored with
    ``r2_score`` against the untransformed ``y_valid``.

    When the score beats the best completed trial of the module-level ``study``
    (or no trial has completed yet), the model, the per-target log bases and the
    fitted scalers are saved under ``./data``.

    Reads the module-level ``study``, ``study_name``, ``mean_columns``, the
    ``X_*`` arrays and ``y_train`` / ``y_valid``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The validation R^2 in original target units, as a Python float.
    """
    checkpoint_path = f'./data/{study_name}_search_model.h5'

    try:
        model = create_model(trial)
        log_transforms, scaler_transforms = _suggest_target_transforms(trial)
        y_train_transformed, y_valid_transformed = _transform_targets(log_transforms, scaler_transforms)

        callbacks = [
            ReduceLROnPlateau('val_r2_score', patience=2, factor=0.7, mode='max'),
            TerminateOnNaN(),
            tf.keras.callbacks.ModelCheckpoint(
                filepath=checkpoint_path,
                monitor='val_r2_score',
                mode='max',
                save_best_only=True,
                save_weights_only=True,
                verbose=0),
            EarlyStopping(monitor='val_r2_score', patience=3, mode='max', verbose=1),
        ]

        history = model.fit(
            [X_train_feat, X_train_tab], y_train_transformed,
            validation_data=([X_valid_feat, X_valid_tab], y_valid_transformed),
            batch_size=256, epochs=50, callbacks=callbacks, verbose=0)

        val_scores = history.history['val_r2_score']
        best_epoch = val_scores.index(max(val_scores)) + 1

        # Restore the best epoch's weights; if no checkpoint was written the
        # model keeps the weights from the end of training.
        if os.path.exists(checkpoint_path):
            model.load_weights(checkpoint_path)

        preds = model.predict([X_valid_feat, X_valid_tab], verbose=0)
        preds_restored = _restore_predictions(preds, log_transforms, scaler_transforms)

        # Score against the untouched targets, with both sides as float64
        # tensors so r2_score never mixes pandas, NumPy and TensorFlow operands.
        y_true = tf.convert_to_tensor(np.asarray(y_valid, dtype=np.float64))
        y_pred = tf.convert_to_tensor(np.asarray(preds_restored, dtype=np.float64))
        score = float(r2_score(y_true, y_pred))

        # study.best_value raises ValueError while no trial has completed.
        try:
            previous_best = study.best_value
        except ValueError:
            previous_best = None

        if previous_best is None or score > previous_best:
            print("*" * 50)
            if previous_best is not None:
                print(f'Old best R2 : {previous_best:.5f}')
            print(f'New best R2 : {score:.5f}')

            mse = float(tf.keras.losses.MeanSquaredError()(y_true, y_pred))
            mae = float(tf.keras.losses.MeanAbsoluteError()(y_true, y_pred))
            print(f'Best epoch power errors R2 : {score:.5f}, MSE : {mse:.5f}, MAE : {mae:.5f}')
            print(f'Best epoch : {best_epoch}')

            _save_best_trial(model, score, log_transforms, scaler_transforms)

            print("*" * 50)

        return score
    finally:
        # Always drop the per-trial checkpoint and release Keras/TF state, even
        # when the trial fails part-way through.
        if os.path.exists(checkpoint_path):
            os.remove(checkpoint_path)

        tf.keras.backend.clear_session()
        gc.collect()


# Search configuration. study_name is also used by objective() to build the
# checkpoint and artifact file names.
study_name = '409_logselect_hyvaayotanukuhyvin_meenukkuupatsilla_scalerselect_fold_3'
num_random_trials = 10       # QMC-sampled trials per round
num_tpe_trial = 3            # TPE-sampled trials per round
search_time_max = 3600 * 18  # time budget in seconds (18 hours)

# load_if_exists=True resumes the study from the SQLite file when it already exists.
study = optuna.create_study(direction='maximize',
                            study_name=study_name,
                            storage='sqlite:///409_logselect_scalerselect_huber.db',
                            load_if_exists=True
                            )

search_time_taken = 0
search_start = time.time()
# Named search_round so the builtin round() is not shadowed for later cells.
search_round = 0

# The budget is checked between rounds only, so the round in progress when the
# budget runs out is always finished.
while search_time_taken < search_time_max:

    round_start = time.time()

    print(f'Starting study with {num_random_trials} random trials, round {search_round}')
    print(f'Search time so far taken : {timedelta(seconds=search_time_taken)}')
    print('-' * 50)
    # Phase 1: quasi-random (QMC) exploration.
    study.sampler = optuna.samplers.QMCSampler(warn_independent_sampling=False)
    study.optimize(objective, n_trials=num_random_trials)
    print(f'Time taken for random trials: {timedelta(seconds=(time.time() - round_start))}')
    print(f'Starting TPE {num_tpe_trial} trials...')
    # Phase 2: TPE trials; n_startup_trials=0 lets TPE use the existing trials immediately.
    study.sampler = optuna.samplers.TPESampler(n_startup_trials=0, multivariate=True, warn_independent_sampling=False)
    study.optimize(objective, n_trials=num_tpe_trial)
    print(f'Time taken for one trial: {timedelta(seconds=(time.time() - round_start) / (num_random_trials + num_tpe_trial))}')
    print(f'Time this round: {timedelta(seconds=time.time() - round_start)}')

    search_time_taken = time.time() - search_start
    search_round += 1

print(f'Search time total : {timedelta(seconds=time.time() - search_start)}')




In [None]:
# Sanity check before inference: the test frame's columns and the selected
# feature names (and how many there are).
print(test_df.columns)
print(FEATURE_COLS)
print(len(FEATURE_COLS))

# Columns and shape of train_df as it stands at this point in the notebook.
print(train_df.columns)
print(train_df.shape)

In [None]:

def r2_score(y_true, y_pred):
    """Average R^2 over the trailing axes, with each R^2 floored at zero.

    Sums of squares are reduced over axis 0, so one R^2 value is produced per
    remaining position (one per target for 2-D inputs). NaN values are replaced
    by 0, negative values are clipped to 0, and the mean of the result is returned.
    """
    residual_ss = tf.reduce_sum(tf.square(y_true - y_pred), axis=0)
    total_ss = tf.reduce_sum(tf.square(y_true - tf.reduce_mean(y_true, axis=0)), axis=0)
    # epsilon keeps the division finite when a target has zero variance.
    per_target = 1 - residual_ss / (total_ss + tf.keras.backend.epsilon())
    # Replace NaN values with zeros.
    per_target = tf.where(tf.math.is_nan(per_target), tf.zeros_like(per_target), per_target)
    floored = tf.maximum(per_target, 0.0)
    return tf.reduce_mean(floored)

def huber_loss_wrapper(delta):
    """Return a two-argument loss function that applies Keras' Huber loss with ``delta``."""
    def huber_loss(y_true, y_pred):
        # A fresh Huber loss object is built on every call, bound to the captured delta.
        loss_fn = tf.keras.losses.Huber(delta=delta)
        return loss_fn(y_true, y_pred)
    return huber_loss




# Custom functions Keras needs by name to deserialize the saved model.
custom_objects = {"r2_score": r2_score, "huber_loss": huber_loss_wrapper(1.5)}

# Scaler fitted on the training split's tabular features.
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
with open('./data/scaler_tabufeatures_train.pickle', 'rb') as f:
    scaler = pickle.load(f)

# NOTE(review): scaler_minmax is loaded but not used anywhere in this cell - confirm
# whether it is still needed.
with open('./data/scaler_minmax.pickle', 'rb') as f:
    scaler_minmax = pickle.load(f)


best_model_name = './data/407_kuten_artikkeli_openheimer_rmsprop_r2pruning_fold_3_best_val_0.27421_model.h5'

best_model = tf.keras.models.load_model(best_model_name, custom_objects=custom_objects)

submission_df = test_df[['id']].copy()

# Scale into a new array instead of writing back into test_df, so re-running
# this cell does not scale the test features a second time.
X_test_tab = scaler.transform(test_df[FEATURE_COLS].values)
X_test_feat = np.stack(test_df['features'].values)

predictions = best_model.predict([X_test_feat, X_test_tab])
# NOTE(review): this assumes the loaded model was trained on log10 targets with
# no per-target scaler; models from the per-target log/scaler search need their
# saved transforms applied instead - confirm against the training run.
predictions = np.power(10, predictions)

target_columns = ['X4', 'X11', 'X18', 'X50', 'X26', 'X3112']

submission_df[target_columns] = predictions

print(submission_df.head())
print(submission_df.shape)
# info() prints its report itself and returns None, so it is not wrapped in print().
submission_df.info()

submission_df.to_csv('./data/submission.csv', index=False)
submission_df.head()




