In [None]:
import gc
import os
import pickle
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.model_selection import train_test_split


In [None]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Test frame first, then train frame, both read from ./data.
test_df = _load_pickle('./data/test_df.pickle')
train_df = _load_pickle('./data/train_df.pickle')

# Identifier interpolated into the file names built under ./NN_search.
study_name = '414_std_3'


In [None]:



# Columns used as regression targets in the rest of the notebook.
mean_columns = [
    'X4_mean', 'X11_mean', 'X18_mean',
    'X50_mean', 'X26_mean', 'X3112_mean',
]

# Names of the tabular feature columns, read back from a pickle file.
selected_features_pickle_path = './data/selected_features_list.pickle'
with open(selected_features_pickle_path, 'rb') as feature_file:
    FEATURE_COLS = pickle.load(feature_file)

print(FEATURE_COLS)



In [None]:
def plot_data(df, columns_names, n_cols=6):
    """Plot a kernel density estimate of each listed column on a grid.

    Parameters
    ----------
    df : DataFrame-like
        Object indexed by column name; ``df[col]`` is handed to
        ``sns.kdeplot``.
    columns_names : sequence of str
        Columns to plot, one subplot each, filled row by row.
    n_cols : int, default 6
        Number of subplots per row.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    n_plots = len(columns_names)
    if n_plots == 0:
        # Nothing to draw; do not create an empty figure.
        return

    # Ceiling division: a partially filled last row still needs its own row.
    n_rows = -(-n_plots // n_cols)

    # 3 inches per row keeps the 15x3 size for a single row and stops
    # multi-row grids from being squeezed into that same height.
    fig = plt.figure(figsize=(15, 3 * n_rows))

    for position, col in enumerate(columns_names, start=1):
        ax = fig.add_subplot(n_rows, n_cols, position)
        sns.kdeplot(df[col], bw_adjust=0.5, fill=False, color='blue', ax=ax)
        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel('Value')
        ax.set_ylabel('Density')

    fig.tight_layout()
    plt.show()
    


In [None]:
# Lift pandas' display limits so frames are rendered without truncating
# rows or columns.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [None]:
# Summary statistics of the untransformed target columns.
train_df.loc[:, mean_columns].describe()

In [None]:
# Density plot of every target column in the loaded training frame.
plot_data(df=train_df, columns_names=mean_columns)

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler,  RobustScaler, PowerTransformer, QuantileTransformer

# Preview of the per-target transforms listed in the search result quoted at
# the bottom of this cell, applied to a copy so `train_df` stays untouched.
# TODO automatize this
train_plot = train_df.copy()

# Log / root transforms; log_b(x) is computed as ln(x) / ln(b).
train_plot['X4_mean'] = np.log(train_plot['X4_mean']) / np.log(2)
train_plot['X11_mean'] = np.log(train_plot['X11_mean']) / np.log(15)
train_plot['X18_mean'] = np.sqrt(train_plot['X18_mean'])
train_plot['X26_mean'] = np.sqrt(train_plot['X26_mean'])
train_plot['X3112_mean'] = np.log(train_plot['X3112_mean']) / np.log(15)

# Scalers are fitted on the already transformed columns.
train_plot['X11_mean'] = StandardScaler().fit_transform(train_plot[['X11_mean']])
train_plot['X26_mean'] = MinMaxScaler().fit_transform(train_plot[['X26_mean']])


# Pasted search output (presumably an Optuna trial log) that the transforms
# above were copied from.  Kept as a comment: as bare text it is not valid
# Python and made the whole cell fail with a SyntaxError.
# {'optimizer': 'rmsprop', 'Log_X4_mean': 'log2', 'Log_X11_mean': 'log15', 'Log_X18_mean': 'sqrt', 'Log_X50_mean': 'none', 'Log_X26_mean': 'sqrt', 'Log_X3112_mean': 'log15', 'Scaler_X4_mean': 'None', 'Scaler_X11_mean': 'Std', 'Scaler_X18_mean': 'None', 'Scaler_X50_mean': 'None', 'Scaler_X26_mean': 'minmax', 'Scaler_X3112_mean': 'None'}. Best is trial 285 with value: 0.2963744779721767.

In [None]:
# Density plot of the target columns after the preview transforms.
plot_data(df=train_plot, columns_names=mean_columns)

In [None]:
# Summary statistics of the transformed target columns.
train_plot.loc[:, mean_columns].describe()

In [None]:
# Keep an untouched copy of the loaded training frame; `train_df` itself is
# split and scaled further down.
train_df_original = train_df.copy()

# Summary statistics of the tabular features.  Placed last so the notebook
# renders it: a bare expression that is not the final statement of a cell is
# evaluated and then discarded.
train_df[FEATURE_COLS].describe()

In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler

# Always start from the untouched copy so that re-running this cell does not
# split (and re-scale) a `train_df` that was already reduced and scaled here.
sample_df = train_df_original.copy()

print(sample_df['fold'].value_counts())

# scaler = StandardScaler() # TODO try RobustScaler
scaler = RobustScaler()

# Fold held out for validation; every other fold is used for training.
valid_fold = 3

# Explicit copies: the column assignments below would otherwise write to a
# filtered slice of `sample_df` and trigger pandas' SettingWithCopyWarning.
train_df = sample_df[sample_df.fold != valid_fold].copy()
valid_df = sample_df[sample_df.fold == valid_fold].copy()
print(f"# Num Train: {len(train_df)} | Num Valid: {len(valid_df)}")


# Fit on the training rows only, then reuse the fitted scaler for validation.
train_df[FEATURE_COLS] = scaler.fit_transform(train_df[FEATURE_COLS].values)
valid_df[FEATURE_COLS] = scaler.transform(valid_df[FEATURE_COLS].values)

scaler_tabufeatures_name = f'./NN_search/scaler_tabufeatures_{study_name}_train.pickle'
print(f"Saving scaler to {scaler_tabufeatures_name}")
# Make sure the target directory exists before writing into it.
os.makedirs(os.path.dirname(scaler_tabufeatures_name), exist_ok=True)
with open(scaler_tabufeatures_name, 'wb') as f:
    pickle.dump(scaler, f)




In [None]:
# Summary statistics of the scaled tabular features (training rows).
train_df.loc[:, FEATURE_COLS].describe()

In [None]:
# Summary statistics of the scaled tabular features (validation rows).
valid_df.loc[:, FEATURE_COLS].describe()

In [None]:
def _model_inputs(frame):
    """Return (tabular matrix, stacked 'features' array, target frame) for `frame`."""
    tabular = frame[FEATURE_COLS].values
    stacked = np.stack(frame['features'].values)
    # Targets are kept as a DataFrame rather than a bare array.
    targets = frame[mean_columns]
    return tabular, stacked, targets


X_train_tab, X_train_feat, y_train = _model_inputs(train_df)
X_valid_tab, X_valid_feat, y_valid = _model_inputs(valid_df)



In [None]:
# NOTE(review): from here on `study_name` differs from the '414_std_3' value
# that was in effect when the tabular-feature scaler was pickled earlier in
# this notebook, so the scaler file loaded further down is not the one written
# by this run — confirm this is intended.
study_name = '415_std_minmax_3'

In [None]:
import glob
import os

directory_path = './NN_search'
pattern = f"{directory_path}/{study_name}_best_val_*.h5"


def _score_from_checkpoint(path):
    """Return the score embedded in a checkpoint file name.

    The score is the token that follows ``best_val_`` in the file name, up to
    the next underscore.  A trailing ``.h5`` on that token is removed first,
    so a name such as ``..._best_val_0.29637.h5`` (which the glob pattern
    matches) parses to ``0.29637`` instead of raising ``ValueError``.
    Only the base name is inspected, so directory names cannot interfere.
    """
    token = os.path.basename(path).split('best_val')[1].split('_')[1]
    if token.endswith('.h5'):
        token = token[:-len('.h5')]
    return float(token)


files = glob.glob(pattern)

max_r2_score = float('-inf')
best_model = None

# Go through every file and find the largest r2_score_inv
for file in files:
    value = _score_from_checkpoint(file)
    if value > max_r2_score:
        max_r2_score = value
        best_model = file


# Print the largest r2_score_inv found and the corresponding file
print(f"Best R2-score: {max_r2_score:.5f}")
if best_model:
    print(f"Best model: {best_model}")
else:
    print("No best model found")

# Companion pickles are addressed by study name and the score rounded to 5 decimals.
best_log_transforms_name =  f'./NN_search/{study_name}_{max_r2_score:.5f}_best_log_transforms.pickle'
best_scalers_name = f'./NN_search/{study_name}_{max_r2_score:.5f}_best_scalers.pickle'

In [None]:


def r2_score_tf(y_true, y_pred):
    """Column-wise R2 score, clipped at zero and averaged.

    The sums run over axis 0, giving one R2 value per remaining column.  NaN
    values are replaced by 0 and negative values are raised to 0 before the
    mean over the columns is returned.
    """
    ss_res = tf.reduce_sum(tf.square(y_true - y_pred), axis=0)
    ss_tot = tf.reduce_sum(tf.square(y_true - tf.reduce_mean(y_true, axis=0)), axis=0)
    # epsilon keeps the denominator non-zero when ss_tot is 0.
    r2 = 1 - ss_res/(ss_tot + tf.keras.backend.epsilon())
    r2 = tf.where(tf.math.is_nan(r2), tf.zeros_like(r2), r2)  # replace NaN values with zeros
    return tf.reduce_mean(tf.maximum(r2, 0.0))

# Handed to load_model so the name `r2_score_tf` can be resolved
# (presumably the saved model was compiled with this metric — TODO confirm).
custom_objects = {"r2_score_tf": r2_score_tf}


with open(f'./NN_search/scaler_tabufeatures_{study_name}_train.pickle', 'rb') as f:
    scaler_tabular = pickle.load(f)

# Report the scaler that was just loaded (previously this printed the
# unrelated `scaler` variable).
print(f'Tabu features scaler: {scaler_tabular}')


# `best_model` holds the checkpoint path chosen by the search cell.  Only load
# when it is still a path, so re-running this cell does not try to load a
# model from the repr of an already loaded model.
if best_model is None:
    raise FileNotFoundError('No checkpoint was selected; cannot load a model.')
if isinstance(best_model, str):
    best_model = tf.keras.models.load_model(best_model, custom_objects=custom_objects)

# Work on a copy so `test_df` keeps its unscaled feature values.
test_df_copy = test_df.copy()

test_df_copy[FEATURE_COLS] = scaler_tabular.transform(test_df_copy[FEATURE_COLS].values)

submission_df = test_df_copy[['id']].copy()

X_test_tab = test_df_copy[FEATURE_COLS].values
X_test_feat = np.stack(test_df_copy['features'].values) 

tf.keras.backend.clear_session()
gc.collect()

predictions = best_model.predict([X_test_feat, X_test_tab])

print(f'Opening log transforms from {best_log_transforms_name}')
with open(best_log_transforms_name, 'rb') as f:
    log_transforms = pickle.load(f)

print(f'Opening scalers from {best_scalers_name}')
with open(best_scalers_name, 'rb') as f:
    scaler_transforms = pickle.load(f)
        

print(log_transforms)
print(scaler_transforms)

# Step 1: undo the per-target scaler.  Column i of `predictions` corresponds
# to mean_columns[i].  A dedicated loop variable is used so the tabular
# `scaler` defined earlier is not overwritten.
for i, target in enumerate(mean_columns):
    print(f'Scaler transforming target : {target} with scaler : {scaler_transforms[target]}')
    target_scaler = scaler_transforms[target]
    if target_scaler is not None:
        predictions[:, i] = target_scaler.inverse_transform(predictions[:, i].reshape(-1, 1)).flatten()


# Step 2: undo the log / root transform.  'sqrt' is inverted by squaring,
# 'cbrt' by cubing; any other non-None entry is used as the base b of b**x.
for i, target in enumerate(mean_columns):
    print(f'Logpot transforming target: : {target}, log transform : {log_transforms[target]}')
    log_base = log_transforms[target]
    if log_base == 'sqrt':
        predictions[:, i] = np.square(predictions[:, i])
    elif log_base == 'cbrt':
        predictions[:, i] = np.power(predictions[:, i], 3)
    elif log_base is not None:
        predictions[:, i] = np.power(log_base, predictions[:, i])





In [None]:

# Output column names, listed in the same order as `mean_columns` but without
# the '_mean' suffix.
target_columns = ['X4', 'X11', 'X18', 'X50', 'X26', 'X3112']

# Column i of `predictions` is stored under target_columns[i].
submission_df[target_columns] = predictions


In [None]:
results_training_name = './data/results_training.pickle'

# One row per study.  A missing comma used to fuse 'Original data Desc' and
# 'Kaggle R2' into the single name 'Original data DescKaggle R2'.
columns = [
    'Train R2', 'Train MSE', 'Train MAE',
    'Valid R2', 'Valid MSE', 'Valid MAE',
    'Train preds Desc', 'Valid preds Desc', 'Test preds Desc',
    'Original data Desc', 'Kaggle R2',
    'Scalers', 'Log/Pot transforms', 'NN Search space', 'Tabular scaler',
]

if os.path.exists(results_training_name):
    results_training = pd.read_pickle(results_training_name)
else:
    results_training = pd.DataFrame(columns = columns)
    results_training.index.name = 'Study name'

# A results file written with the fused column name lacks some of the
# expected columns; add them so later lookups by name do not fail.
for column in columns:
    if column not in results_training.columns:
        results_training[column] = None

if study_name not in results_training.index:    
    results_training.loc[study_name] = [None]*len(results_training.columns)


# Summary statistics are stored as JSON strings so they fit in a single cell.
test_preds_desc = submission_df[target_columns].describe().to_json()
results_training.at[study_name, 'Test preds Desc'] = test_preds_desc 

original_data_desc = train_df_original[mean_columns].describe().to_json()
results_training.at[study_name, 'Original data Desc'] = original_data_desc


In [None]:
# Text forms of the loaded transforms, echoed and then stored with the study.
log_transforms_text = str(log_transforms.items())
tabular_scaler_text = str(scaler_tabular)

print(log_transforms_text)
print(str(scaler_transforms.items()))
print(tabular_scaler_text)

results_training.at[study_name, 'Scalers'] = str(scaler_transforms)
results_training.at[study_name, 'Log/Pot transforms'] = log_transforms_text
results_training.at[study_name, 'Tabular scaler'] = tabular_scaler_text


In [None]:
## SANITY CHECK: the model is evaluated on the train data

import numpy as np
import tensorflow as tf
import optuna
from keras import regularizers, layers, optimizers, initializers
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, TerminateOnNaN
from datetime import timedelta
import time
import os
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer, QuantileTransformer, RobustScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

## TRAIN DATA TEST

tf.keras.backend.clear_session()
gc.collect()

train_pred = best_model.predict([X_train_feat, X_train_tab])

# Step 1: undo the per-target scaler; column k belongs to mean_columns[k].
for col_idx, target in enumerate(mean_columns):
    print(f'Scaler transforming target : {target} with scaler : {scaler_transforms[target]}')
    target_scaler = scaler_transforms[target]
    if target_scaler is None:
        continue
    restored = target_scaler.inverse_transform(train_pred[:, [col_idx]])
    train_pred[:, col_idx] = restored.ravel()


# Step 2: undo the log / root transform ('sqrt' -> square, 'cbrt' -> cube,
# any other non-None entry is the base b of b**x).
for col_idx, target in enumerate(mean_columns):
    print(f'Logpot transforming target : {target}, log transform : {log_transforms[target]}')
    transform = log_transforms[target]
    column = train_pred[:, col_idx]
    if transform == 'sqrt':
        train_pred[:, col_idx] = np.square(column)
    elif transform == 'cbrt':
        train_pred[:, col_idx] = np.power(column, 3)
    elif transform is not None:
        train_pred[:, col_idx] = np.power(transform, column)

R2_train = r2_score(y_train, train_pred)
MSE_train = mean_squared_error(y_train, train_pred)
MAE_train = mean_absolute_error(y_train, train_pred)

print(f'Train scores:\nR2 : {R2_train:.5f}, MSE : {MSE_train:.5f}, MAE : {MAE_train:.5f}')

# Store the three scores under the current study.
for metric_name, metric_value in (
    ('Train R2', R2_train),
    ('Train MSE', MSE_train),
    ('Train MAE', MAE_train),
):
    results_training.at[study_name, metric_name] = metric_value

# Summary statistics of the predictions, kept as a JSON string.
train_pred_frame = pd.DataFrame(train_pred, columns = mean_columns)
results_training.at[study_name, 'Train preds Desc'] = train_pred_frame.describe().to_json()


In [None]:

## VALIDATION DATA TEST

tf.keras.backend.clear_session()
gc.collect()

valid_pred = best_model.predict([X_valid_feat, X_valid_tab])

# Step 1: undo the per-target scaler; column k belongs to mean_columns[k].
for col_idx, target in enumerate(mean_columns):
    print(f'Scaler transforming target : {target} with scaler : {scaler_transforms[target]}')
    target_scaler = scaler_transforms[target]
    if target_scaler is None:
        continue
    restored = target_scaler.inverse_transform(valid_pred[:, [col_idx]])
    valid_pred[:, col_idx] = restored.ravel()


# Step 2: undo the log / root transform ('sqrt' -> square, 'cbrt' -> cube,
# any other non-None entry is the base b of b**x).
for col_idx, target in enumerate(mean_columns):
    transform = log_transforms[target]
    column = valid_pred[:, col_idx]
    if transform == 'sqrt':
        valid_pred[:, col_idx] = np.square(column)
    elif transform == 'cbrt':
        valid_pred[:, col_idx] = np.power(column, 3)
    elif transform is not None:
        valid_pred[:, col_idx] = np.power(transform, column)

R2_valid = r2_score(y_valid, valid_pred)
MSE_valid = mean_squared_error(y_valid, valid_pred)
MAE_valid = mean_absolute_error(y_valid, valid_pred)

print(f'Valid scores:\nR2 : {R2_valid:.5f}, MSE : {MSE_valid:.5f}, MAE : {MAE_valid:.5f}')

# Store the three scores under the current study.
for metric_name, metric_value in (
    ('Valid R2', R2_valid),
    ('Valid MSE', MSE_valid),
    ('Valid MAE', MAE_valid),
):
    results_training.at[study_name, metric_name] = metric_value

# Summary statistics of the predictions, kept as a JSON string.
valid_pred_frame = pd.DataFrame(valid_pred, columns = mean_columns)
valid_preds_desc = valid_pred_frame.describe().to_json()
results_training.at[study_name, 'Valid preds Desc'] = valid_preds_desc

In [None]:

# Render up to 100 recorded studies, then pick the stored transform text of
# the first row.
display(results_training.head(n=100))
logpot = results_training['Log/Pot transforms'].iloc[0]


In [None]:
# First five rows of the submission frame.
submission_df.head(n=5)

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
submission_df.info()

# Write the predictions without the index column.
submission_df.to_csv('./data/submission.csv', index=False)

In [None]:
# Manual step (presumably done once a submission has been scored): uncomment
# and edit the two lines below to record the score for the current study.
# Kaggle = 0.0 
# results_training.at[study_name, 'Kaggle R2'] = Kaggle


def _show_description(label, payload):
    """Print `label`, then render the JSON-encoded describe() table in `payload`.

    `payload` is wrapped in StringIO because passing a literal JSON string to
    ``pd.read_json`` is deprecated.  Entries that were never recorded (not a
    string) are reported instead of raising.
    """
    print(label)
    if isinstance(payload, str):
        display(pd.read_json(StringIO(payload)))
    else:
        print("(not recorded)")


for index, row in results_training.iterrows():
    print(f"Study Name: {index}")
    # .get() tolerates results files that have no 'Kaggle R2' column.
    print(f'Kaggle R2: {row.get("Kaggle R2")}')
    print(f"Train R2: {row['Train R2']}, Train MSE: {row['Train MSE']}, Train MAE : {row['Train MAE']}")
    print(f'Valid R2: {row["Valid R2"]}, Valid MSE: {row["Valid MSE"]}, Valid MAE: {row["Valid MAE"]}')
    print("-" * 50)
    _show_description("Train preds Description:", row.get('Train preds Desc'))
    _show_description("Valid preds Description:", row.get('Valid preds Desc'))
    _show_description("Test preds Description:", row.get('Test preds Desc'))
    _show_description("Original data Description:", row.get('Original data Desc'))
    

In [None]:
# Persist the results table; pandas opens and closes the file itself.
results_training.to_pickle(results_training_name)

In [None]:
# Final look at the stored results (at most 100 studies).
results_training.head(n=100)