In [None]:
import pandas as pd 
import matplotlib.pyplot as plt
import os 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pickle
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetV2M
import numpy as np
import gc


In [None]:
# Load the tabular train/test tables that accompany the images.
train_df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')

# Regression targets: the per-trait mean columns the model predicts.
mean_columns = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean']

# Tabular predictors = every test column except the row id and the derived
# image_path column. Selecting by name (instead of the positional slice
# columns[1:-1]) makes the result independent of whether the cell that adds
# 'image_path' has already run: on a fresh kernel the positional slice would
# silently drop the last real column of test.csv.
# NOTE(review): assumes 'id' is the only non-feature column in test.csv -
# confirm against the data schema.
FEATURE_COLS = [col for col in test_df.columns if col not in ('id', 'image_path')]

train_df.info()
train_df.head()
train_df.describe()




In [None]:
# Remove every column whose name ends in "_sd" from the training table.
sd_columns = train_df.columns[train_df.columns.str.endswith('_sd')].tolist()
train_df = train_df.drop(columns=sd_columns)


In [None]:

# Directories holding one "<id>.jpeg" file per row of the train/test tables.
train_images_path = './data/train_images/'
test_images_path = './data/test_images/'

# Derive each row's image file path from its id.
train_df['image_path'] = [
    os.path.join(train_images_path, f'{image_id}.jpeg') for image_id in train_df['id']
]
test_df['image_path'] = [
    os.path.join(test_images_path, f'{image_id}.jpeg') for image_id in test_df['id']
]

train_df.head()


In [None]:



def plot_data(df, columns_names, n_cols=6):
    """Draw a kernel-density plot of each requested column on a subplot grid.

    Args:
        df: DataFrame containing the columns to plot.
        columns_names: Names of the columns to plot, one subplot per column.
        n_cols: Number of subplots per grid row. Defaults to 6, the layout
            that was previously hard-coded.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Ceiling division: enough rows to hold every column.
    n_rows = -(-len(columns_names) // n_cols)

    # 3 inches of height per grid row, so multi-row grids are not squashed
    # into the height of a single row (one row still gives the 15x3 figure).
    plt.figure(figsize=(15, 3 * max(n_rows, 1)))

    for position, col in enumerate(columns_names, start=1):
        plt.subplot(n_rows, n_cols, position)
        sns.kdeplot(df[col], bw_adjust=0.5, fill=False, color='blue')
        plt.title(f'Distribution of {col}')
        plt.xlabel('Value')
        plt.ylabel('Density')

    plt.tight_layout()
    plt.show()
    


In [None]:
# Target distributions before trimming.
plot_data(train_df, mean_columns)

# For each target in turn, keep only rows that are strictly positive and
# strictly below that target's 98th percentile. The percentile is computed on
# the rows that survived the previous targets' filters.
for column in mean_columns:
    upper_quantile = train_df[column].quantile(0.98)
    keep_rows = (train_df[column] < upper_quantile) & (train_df[column] > 0)
    train_df = train_df[keep_rows]

# Target distributions after trimming.
plot_data(train_df, mean_columns)

In [None]:
from sklearn.model_selection import StratifiedKFold

# Five shuffled, seeded folds, stratified on a label built from the targets.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Discretise every target at its 0/20/40/60/80/100th percentiles.
bin_columns = []
for trait_idx, trait in enumerate(mean_columns):
    quantile_edges = np.percentile(train_df[trait], np.linspace(0, 100, 5 + 1))
    bin_column = f"bin_{trait_idx}"
    train_df[bin_column] = np.digitize(train_df[trait], quantile_edges)
    bin_columns.append(bin_column)

# Join the per-target bin ids into one string label per row.
train_df["final_bin"] = train_df[bin_columns].astype(str).agg("".join, axis=1)

# The split yields positional indices, so give the frame a 0..n-1 index
# before writing each row's validation-fold number with .loc.
train_df = train_df.reset_index(drop=True)
fold_splits = skf.split(train_df, train_df["final_bin"])
for fold, (train_idx, valid_idx) in enumerate(fold_splits):
    train_df.loc[valid_idx, "fold"] = fold



In [None]:
# Standardise the target columns and persist the fitted scaler so predictions
# can be mapped back to the original units later.
scaler = StandardScaler()
scaler.fit(train_df[mean_columns])
train_df[mean_columns] = scaler.transform(train_df[mean_columns])

with open('./data/scaler_targets_train.pickle', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# To undo the scaling (e.g. to re-plot the targets in original units), reload
# the scaler saved above and apply the inverse transform:
#
# with open('./data/scaler_targets_train.pickle', 'rb') as f:
#     loaded_scaler = pickle.load(f)
#     train_df[mean_columns] = loaded_scaler.inverse_transform(train_df[mean_columns])
#
# plot_data(train_df, mean_columns)





In [None]:
# Set the allocator choice before the first TensorFlow call in this cell.
# NOTE(review): presumably TensorFlow reads TF_GPU_ALLOCATOR when it first
# initialises the GPU, so the variable has to be in place before that - confirm.
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'
tf.keras.mixed_precision.set_global_policy('mixed_float16')

batch_size = 64

# Frozen ImageNet backbone without its classification head; global average
# pooling gives one feature vector per image.
base_model = EfficientNetV2M(weights='imagenet', include_top=False, pooling='avg')
base_model.trainable = False


def load_and_preprocess_image(img_path):
    """Read a JPEG file and return it as a 480x480 three-channel tensor.

    Args:
        img_path: Path of the JPEG file to load.

    Returns:
        The decoded image resized to 480x480. No further rescaling or
        normalisation is applied here.
    """
    img = tf.io.read_file(img_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, (480, 480))
    return img


def extract_features_batch(image_paths):
    """Run the frozen backbone on one batch of image files.

    Args:
        image_paths: Iterable of image file paths forming a single batch.

    Returns:
        Array of backbone features, one row per input path.
    """
    img_batch = np.stack([load_and_preprocess_image(img_path) for img_path in image_paths])
    # predict_on_batch runs exactly one forward pass. Model.predict() is
    # documented as not intended for loops over small batches, and calling it
    # here once per batch was what made the periodic
    # clear_session()/gc.collect() workaround necessary.
    return base_model.predict_on_batch(img_batch)


image_paths = train_df['image_path'].values

# One forward pass per chunk of `batch_size` paths, in row order.
features_list = [
    extract_features_batch(image_paths[start:start + batch_size])
    for start in range(0, len(image_paths), batch_size)
]

all_features = np.vstack(features_list)
# Store one feature vector per row; list(...) splits the 2-D array row-wise.
train_df['features'] = list(all_features)




In [None]:
# Quick sanity check of the table after feature extraction.
print(train_df.head(10))
print(train_df.describe())
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line.
train_df.info()

In [None]:
import pickle

# Where the training table (including the extracted image features) is cached.
pickle_file_path = './data/train_df.pickle'

# Write the whole DataFrame to disk so feature extraction need not be repeated.
with open(pickle_file_path, 'wb') as pickle_file:
    pickle.dump(train_df, pickle_file)


# To resume from the cached table instead of recomputing it:
# with open(pickle_file_path, 'rb') as f:
#     train_df = pickle.load(f)
    


In [None]:
print(train_df['fold'].value_counts())

# Fold held out for validation; all other folds are used for training.
VALID_FOLD = 3

scaler = StandardScaler()

sample_df = train_df.copy()
# .copy() gives each split its own frame, so the column assignments below
# write to real data rather than to a slice of sample_df (which triggers
# pandas' SettingWithCopyWarning).
train_df = sample_df[sample_df.fold != VALID_FOLD].copy()
valid_df = sample_df[sample_df.fold == VALID_FOLD].copy()
print(f"# Num Train: {len(train_df)} | Num Valid: {len(valid_df)}")

# This cell rebinds train_df to the training subset, so running it a second
# time leaves no validation rows; fail with a clear message in that case.
if valid_df.empty:
    raise ValueError(
        f"No rows found for validation fold {VALID_FOLD}; "
        "reload train_df before re-running this cell."
    )

# Fit the feature scaler on training rows only, then apply it to validation.
train_df[FEATURE_COLS] = scaler.fit_transform(train_df[FEATURE_COLS].values)
valid_df[FEATURE_COLS] = scaler.transform(valid_df[FEATURE_COLS].values)

with open('./data/scaler_tabufeatures_train.pickle', 'wb') as f:
    pickle.dump(scaler, f)


In [None]:
# Training arrays: tabular features, stacked image features, targets.
X_train_tab = train_df[FEATURE_COLS].to_numpy()
X_train_feat = np.stack(train_df['features'].to_numpy())
y_train = train_df[mean_columns].to_numpy()

# Validation arrays, built the same way from the held-out fold.
X_valid_tab = valid_df[FEATURE_COLS].to_numpy()
X_valid_feat = np.stack(valid_df['features'].to_numpy())
y_valid = valid_df[mean_columns].to_numpy()

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from optuna.integration import TFKerasPruningCallback
import optuna
from keras import regularizers, layers, optimizers, initializers
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, TerminateOnNaN
from datetime import timedelta
import time
import os


# Set the allocator choice before the first TensorFlow call in this cell, so
# it is in place if this cell is the first to touch TensorFlow (e.g. when
# resuming from the pickled train_df in a fresh kernel).
# NOTE(review): presumably TF_GPU_ALLOCATOR is only read when the GPU is first
# initialised - confirm.
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'
tf.keras.mixed_precision.set_global_policy('mixed_float16')

def r2_score(y_true, y_pred):
    """Mean per-output R^2 of a batch, with each output's R^2 clipped at zero.

    R^2 is computed separately for every output column (reducing over axis 0),
    NaN values are replaced by 0, negative values are raised to 0, and the
    column scores are averaged.

    Args:
        y_true: Ground-truth values; axis 0 is the axis reduced over.
        y_pred: Predictions with the same shape as ``y_true``.

    Returns:
        Scalar float32 tensor holding the averaged, clipped R^2.
    """
    # Compute in float32 regardless of the input dtypes, so the subtraction
    # never sees mismatched dtypes and the sums of squares are not accumulated
    # in half precision under the mixed_float16 policy.
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(y_pred, tf.float32)

    residual_ss = tf.reduce_sum(tf.square(y_true - y_pred), axis=0)
    total_ss = tf.reduce_sum(tf.square(y_true - tf.reduce_mean(y_true, axis=0)), axis=0)
    # epsilon keeps the division finite when a column is constant.
    r2 = 1 - residual_ss / (total_ss + tf.keras.backend.epsilon())
    r2 = tf.where(tf.math.is_nan(r2), tf.zeros_like(r2), r2)  # replace NaN values with zeros
    return tf.reduce_mean(tf.maximum(r2, 0.0))



def _suggest_dense_stack(trial, tensor, count_name, tag, max_units):
    """Append an Optuna-sampled stack of Dense+Dropout layers to ``tensor``.

    Samples the number of layers (1-3) under ``count_name``; then, for layer
    ``i``, samples ``Num_<tag>_<i>``, ``Act_<tag>_<i>`` and ``Drop_<tag>_<i>``.
    Each layer's width is capped by the width chosen for the previous layer.
    """
    num_layers = trial.suggest_int(count_name, 1, 3)
    for i in range(num_layers):
        units = trial.suggest_int(f'Num_{tag}_{i}', 32, max_units)
        activation = trial.suggest_categorical(
            f'Act_{tag}_{i}', ['relu', 'tanh', 'selu', 'LeakyReLU', 'swish', 'mish'])
        dropout_rate = trial.suggest_float(f'Drop_{tag}_{i}', 0.2, 0.7, step=0.1)

        tensor = Dense(units, activation=activation)(tensor)
        tensor = Dropout(dropout_rate)(tensor)

        # The next layer may be at most as wide as this one.
        max_units = min(max_units, units)
    return tensor


def create_model(trial):
    """Build and compile a two-branch regression model sampled from ``trial``.

    One branch processes the image feature vectors, another the tabular
    features; their outputs are concatenated and passed through a shared stack
    before a linear output layer. Layer counts, widths, activations, dropout
    rates and the optimizer are all suggested by the Optuna trial.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        A compiled Keras model taking ``[image_features, tabular_data]`` and
        trained with MSE loss, reporting MAE and ``r2_score``.
    """
    image_features_input = Input(shape=(X_train_feat.shape[1],), name='image_features_input')
    tabular_data_input = Input(shape=(X_train_tab.shape[1],), name='tabular_data_input')

    # NOTE: the parameter names (including the 'Imgage layers' spelling) are
    # keys in the persisted Optuna study and must not be changed.
    img_dense = _suggest_dense_stack(trial, image_features_input, 'Imgage layers', 'img', 1024)
    tab_dense = _suggest_dense_stack(trial, tabular_data_input, 'Tabular layers', 'tab', 1024)

    concatenated = Concatenate()([img_dense, tab_dense])
    concatenated = _suggest_dense_stack(trial, concatenated, 'Concat layers', 'con', 2048)

    # Linear activation because this is a regression problem. One output per
    # target column; the layer is forced to float32 so predictions and the loss
    # are not computed in half precision under the mixed_float16 policy.
    output = Dense(y_train.shape[1], activation='linear', dtype='float32')(concatenated)
    model = Model(inputs=[image_features_input, tabular_data_input], outputs=output)

    optimizer_options = ['adam', 'rmsprop', 'Nadam', 'adamax', 'Adagrad', 'Adadelta']
    optimizer_selected = trial.suggest_categorical('optimizer', optimizer_options)

    # Every optimizer is created with its default settings.
    optimizer_classes = {
        'adam': optimizers.Adam,
        'rmsprop': optimizers.RMSprop,
        'Nadam': optimizers.Nadam,
        'adamax': optimizers.Adamax,
        'Adagrad': optimizers.Adagrad,
        'Adadelta': optimizers.Adadelta,
    }
    optimizer = optimizer_classes[optimizer_selected]()

    model.compile(optimizer=optimizer, loss='mse', metrics=['mae', r2_score])
    return model



def objective(trial):
    """Optuna objective: train one sampled model and return its best val R^2.

    The model is trained for up to 100 epochs while the best weights (by
    ``val_r2_score``) are checkpointed. If the trial beats the study's best
    completed value so far, or no trial has completed yet, the best weights
    are reloaded and the full model is saved under a file name that embeds
    the score.

    Args:
        trial: Optuna trial supplying the hyperparameters.

    Returns:
        The highest ``val_r2_score`` reached during training.
    """
    model = create_model(trial)

    checkpoint_path = f"./data/{study_name}_search_model.h5"
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath=checkpoint_path,
            monitor='val_r2_score',
            mode='max',
            save_best_only=True,
            save_weights_only=True)

    callbacks = [TFKerasPruningCallback(trial, 'val_r2_score'),
                 ReduceLROnPlateau('val_loss', patience=5, factor=0.5),
                 TerminateOnNaN(),
                 model_checkpoint_callback]

    history = model.fit([X_train_feat, X_train_tab], y_train,
                        validation_data=([X_valid_feat, X_valid_tab], y_valid),
                        batch_size=512, epochs=100, callbacks=callbacks, verbose=0)

    val_r2_history = history.history['val_r2_score']
    best_val_r2 = max(val_r2_history)
    best_epoch = val_r2_history.index(best_val_r2) + 1

    # study.best_value raises ValueError while no trial has completed (first
    # trial, or every earlier trial pruned or failed). Treat that as "no
    # previous best" so the first successful model is also saved.
    try:
        previous_best = study.best_value
    except ValueError:
        previous_best = None

    if previous_best is None or best_val_r2 > previous_best:

        print("*" * 50)
        if previous_best is None:
            print('Old best R2 : none (no completed trial yet)')
        else:
            print(f'Old best R2 : {previous_best:.5f}')
        print(f'New best R2 : {best_val_r2:.5f}')
        mse, mae, r2 = model.evaluate([X_valid_feat, X_valid_tab], y_valid, verbose=0)
        print(f'Last epoch scores : MSE {mse:.5f}, MAE {mae:.5f}, R2 {r2:.5f}')

        # Swap the last-epoch weights for the checkpointed best-epoch weights.
        model.load_weights(checkpoint_path)

        mse, mae, r2 = model.evaluate([X_valid_feat, X_valid_tab], y_valid, verbose=0)
        print(f'Best model scores : MSE {mse:.5f}, MAE {mae:.5f}, R2 {r2:.5f}')
        print(f'Best epoch : {best_epoch}')

        best_filename = f'./data/{study_name}_best_val_{best_val_r2:.5f}_model.h5'
        if os.path.exists(best_filename):
            os.remove(best_filename)

        print(f'Saving model to {best_filename}')
        model.save(best_filename)
        print("*" * 50)

    return best_val_r2


study_name = '404_ekayo_fold_3'
num_random_trials = 10   # quasi-random (QMC) trials per round
num_tpe_trial = 1        # TPE trials per round, run after the random ones
# Search budget in seconds (compared with time.time() differences). It is only
# checked between rounds, so a round that has started always runs to the end.
search_time_max = 60

# Persistent study: re-running this cell resumes the study of the same name
# stored in the SQLite file instead of starting over.
study = optuna.create_study(direction='maximize',
                            pruner=optuna.pruners.MedianPruner(n_startup_trials=10, n_warmup_steps=5),
                            study_name=study_name,
                            storage='sqlite:///kukat404_ajot.db',
                            load_if_exists=True
                            )

search_time_taken = 0
search_start = time.time()
round_idx = 0  # not named `round`, which would shadow the builtin for the rest of the session
while search_time_taken < search_time_max:

    round_start = time.time()

    print(f'Starting study with {num_random_trials} random trials, round {round_idx}')
    print(f'Search time so far taken : {timedelta(seconds=search_time_taken)}')
    print('-' * 50)
    study.sampler = optuna.samplers.QMCSampler(warn_independent_sampling=False)
    study.optimize(objective, n_trials=num_random_trials)
    print(f'Time taken for random trials: {timedelta(seconds= (time.time() - round_start) / num_random_trials)}')

    # Exploit phase: the loop announced these TPE trials and included them in
    # the per-trial average below, but never actually ran them.
    print(f'Starting TPE {num_tpe_trial} trials...')
    study.sampler = optuna.samplers.TPESampler()
    study.optimize(objective, n_trials=num_tpe_trial)
    print(f'Time taken for one trial: {timedelta(seconds= (time.time() - round_start) / (num_random_trials + num_tpe_trial))}')
    print(f'Time this round: {timedelta(seconds= time.time() - round_start)}')

    search_time_taken = time.time() - search_start
    round_idx += 1

print(f'Search time total : {timedelta(seconds=time.time() - search_start)}')


