In [None]:
import pandas as pd
import numpy as np
# Load the pre-filtered training data from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
df_train_filtered = pd.read_pickle('./data/df_train_filtered.pkl')


In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column ('Hinta') from the features and hold out 10 % of
# the rows as a test set, stratified by the 'Kaupunginosa' column.
y = df_train_filtered['Hinta']
X = df_train_filtered.drop(columns='Hinta')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    stratify=df_train_filtered['Kaupunginosa'],
)





In [None]:
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler 

# Scale the numeric variables to [0, 1].
# NOTE(review): the scaler is fitted on the full data set, i.e. before the
# train/test split that is performed in a later cell.
robust_scaler = RobustScaler()
std_scaler = StandardScaler()
minmax_scaler = MinMaxScaler()
df_train_NN = df_train_filtered.copy()
coordinate_columns = ['Pituusaste', 'Leveysaste']
df_train_NN[coordinate_columns] = minmax_scaler.fit_transform(df_train_NN[coordinate_columns])
for numeric_column in ('Rv', 'm2'):
    # The same scaler object is re-fitted for every column.
    df_train_NN[numeric_column] = minmax_scaler.fit_transform(df_train_NN[[numeric_column]])

# One-hot encode the categorical variables: the indicator columns are appended
# in the order listed here, then the source columns are removed.
categorical_columns = ['Kaupunginosa', 'kerros', 'max_kerros', 'Kunto', 'Hissi', 'Asunnon tyyppi', "Talot."]
for categorical_column in categorical_columns:
    df_hot = pd.get_dummies(df_train_NN[categorical_column], prefix=categorical_column).astype('int')
    df_train_NN = pd.concat([df_train_NN, df_hot], axis=1)

df_train_NN.drop(categorical_columns, axis=1, inplace=True)




In [None]:
# Build X and y as NumPy arrays and split them into training and test sets.
# The split uses the same test_size, random_state and stratification column
# as the earlier DataFrame split.
X = df_train_NN.drop(columns='Hinta').to_numpy()
y = df_train_NN['Hinta'].to_numpy().astype('float32')

X_train_NN, X_test_NN, y_train_NN, y_test_NN = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    stratify=df_train_filtered['Kaupunginosa'],
)

In [None]:
import optuna
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from keras import regularizers, layers, optimizers, initializers
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, TerminateOnNaN
import time 
from optuna.integration import TFKerasPruningCallback
import os 
import pickle 
from datetime import timedelta


# Name of the search (used as the prefix of the per-fold Optuna study names)
study_name = 'rmsle5_random_2503'
# Number of cross-validation folds
folds = 5
# Number of epochs each trial is trained for during the search
epochs_search = 100
# Number of random(-like) sampler trials per fold in each search round
num_random = 42
# Number of TPE trials per fold in each search round
num_tpe = 0

# Time budget for the search, in seconds
max_search_time = 36000
# Maximum number of neurons
# NOTE(review): not referenced by the code visible in this notebook; create_model hard-codes its limits.
max_units = 512

def rmsle_loss(y_true, y_pred):
    """Root-mean-squared-log-error loss with a hard penalty for non-positive predictions.

    Args:
        y_true: Tensor of target values.
        y_pred: Tensor of model predictions.

    Returns:
        A scalar tensor: the RMSLE over the batch when every prediction is
        strictly positive, otherwise the constant penalty 1e5.

    NOTE(review): in the penalty branch the returned value is a constant that
    does not depend on ``y_pred`` -- confirm that a batch with any
    non-positive prediction is meant to provide no gradient signal.
    """
    big_penalty = tf.constant(1e5, dtype=tf.float32)

    # True where the prediction is strictly positive.
    is_positive = tf.math.greater(y_pred, 0.0)

    # Substitute the penalty value for non-positive predictions before log1p.
    guarded_pred = tf.where(is_positive, y_pred, big_penalty)

    log_gap = tf.math.log1p(guarded_pred) - tf.math.log1p(y_true)
    batch_rmsle = tf.sqrt(tf.reduce_mean(tf.square(log_gap)))

    # A single invalid prediction switches the whole batch to the penalty.
    any_invalid = tf.reduce_any(~is_positive)
    return tf.where(any_invalid, big_penalty, batch_rmsle)


def create_model(trial):
    """Build and compile a Keras regression network from an Optuna trial.

    Architecture: input -> two hidden Dense+Dropout blocks -> one small
    Dense+Dropout block -> a single linear output unit. Layer widths, dropout
    rates, L1/L2 strengths, activations, the optimizer and its learning rate
    are all suggested by ``trial``. The input width is read from the
    module-level ``X_train_NN``.

    Args:
        trial: Object exposing Optuna's ``suggest_int`` / ``suggest_float`` /
            ``suggest_categorical`` interface.

    Returns:
        The compiled ``keras.Sequential`` model (loss ``rmsle_loss``, metrics
        ``mse`` and ``mae``).
    """
    activation_choices = ['relu', 'tanh', 'selu', 'linear', 'sigmoid', 'elu']

    net = keras.Sequential()
    net.add(keras.layers.InputLayer(input_shape=(X_train_NN.shape[1],)))

    # Two hidden blocks; the second one has a smaller upper bound on its width.
    # The suggest calls keep the order: units, dropout, l1, l2, activation.
    for layer_idx, upper_units in enumerate((512, 256)):
        width = trial.suggest_int(f'n_units_{layer_idx}', 4, upper_units, log=True)
        drop_rate = trial.suggest_float(f'dropout_{layer_idx}', 0.0, 0.5)
        l1_strength = trial.suggest_float(f'l1_reg_{layer_idx}', 1e-6, 1, log=True)
        l2_strength = trial.suggest_float(f'l2_reg_{layer_idx}', 1e-6, 1, log=True)
        layer_activation = trial.suggest_categorical(f'activation_{layer_idx}', activation_choices)
        net.add(keras.layers.Dense(
            width,
            activation=layer_activation,
            kernel_regularizer=regularizers.l1_l2(l1=l1_strength, l2=l2_strength),
        ))
        net.add(keras.layers.Dropout(rate=drop_rate))

    # Last hidden block (suggest order: units, dropout, activation, l1, l2).
    last_width = trial.suggest_int('n_units_last', 1, 32)
    last_drop_rate = trial.suggest_float('dropout_last', 0.0, 0.5)
    last_activation = trial.suggest_categorical('activation_last', activation_choices)
    last_l1 = trial.suggest_float('l1_reg_last', 1e-6, 1, log=True)
    last_l2 = trial.suggest_float('l2_reg_last', 1e-6, 1, log=True)
    net.add(keras.layers.Dense(
        last_width,
        activation=last_activation,
        kernel_regularizer=regularizers.l1_l2(l1=last_l1, l2=last_l2),
    ))
    net.add(keras.layers.Dropout(rate=last_drop_rate))
    net.add(keras.layers.Dense(1, activation='linear'))

    # Choice of optimizer and learning rate.
    optimizer_name = trial.suggest_categorical(
        'optimizer', ['adam', 'rmsprop', 'Nadam', 'adamax', 'Adagrad', 'Adadelta'])
    step_size = trial.suggest_float('lr', 1e-4, 1.0, log=True)

    optimizer_classes = {
        'adam': optimizers.Adam,
        'rmsprop': optimizers.RMSprop,
        'Nadam': optimizers.Nadam,
        'Adagrad': optimizers.Adagrad,
        'Adadelta': optimizers.Adadelta,
    }
    # Any name without an explicit entry (i.e. 'adamax') uses Adamax.
    optimizer_cls = optimizer_classes.get(optimizer_name, optimizers.Adamax)

    net.compile(optimizer=optimizer_cls(learning_rate=step_size), loss=rmsle_loss, metrics=['mse', 'mae'])

    return net

def objective(trial):
    """Optuna objective: train one candidate network and return its best validation loss.

    Reads the current fold's data from the module-level ``X_train_b``,
    ``y_train_b``, ``X_val_b`` and ``y_val_b`` and trains for
    ``epochs_search`` epochs.

    Args:
        trial: The Optuna trial supplying the hyperparameters.

    Returns:
        The minimum ``val_loss`` recorded in the training history.
    """
    network = create_model(trial)
    batch_size = trial.suggest_int('batch_size', 16, 128, log=True)
    search_callbacks = [
        TFKerasPruningCallback(trial, 'val_loss'),
        ReduceLROnPlateau('val_loss', patience=5, factor=0.7),
        TerminateOnNaN(),
    ]

    fit_history = network.fit(
        X_train_b,
        y_train_b,
        validation_data=(X_val_b, y_val_b),
        epochs=epochs_search,
        batch_size=batch_size,
        callbacks=search_callbacks,
        verbose=0,
    )

    return np.min(fit_history.history['val_loss'])


####

# Hyperparameter search driver: repeats full rounds over all folds until the
# time budget is used up. Each fold has its own Optuna study, stored in a
# local SQLite database and resumed if it already exists.
total_time_start = time.time()  
search_time_start = time.time() 
num_completed_trials = 0
search_rounds = 0
time_taken = 0
# The budget is only checked between rounds, so a round that has started
# always runs through every fold.
while time_taken < max_search_time:
        
    fold = 0 
    kf = KFold(n_splits=folds)
    
    time_fold_start = time.time()    
    for train_index, val_index in kf.split(X_train_NN):

        print('-------------------')
        print(f"Starting fold {fold} search...")
        # These module-level arrays are what objective() trains and validates on.
        X_train_b, X_val_b = X_train_NN[train_index], X_train_NN[val_index]    
        y_train_b, y_val_b = y_train_NN[train_index], y_train_NN[val_index]

        fold_name = f'{study_name}_{fold}'
       
        study = optuna.create_study(direction='minimize',
                                    pruner=optuna.pruners.HyperbandPruner(min_resource=5),
                                    study_name=fold_name,
                                    storage=f'sqlite:///tampere_reg.db',
                                    load_if_exists=True                                 
                                    )

        fold_time = time.time()    

        fold_random = time.time()
        # # study.sampler = optuna.samplers.RandomSampler()
        study.sampler = optuna.samplers.QMCSampler(warn_independent_sampling = False) # TODO: evaluate this; it seemed much better than random
        print(f'Random search for fold {fold}...')
        study.optimize(objective, n_trials=num_random)
        print(f'Time taken for random search: {str(timedelta(seconds=(time.time() - fold_random)))}')

        # fold_tpe = time.time()  
        # study.sampler = optuna.samplers.TPESampler(n_startup_trials=0)
        # print(f'TPE search for fold {fold}...')
        # study.optimize(objective, n_trials=num_tpe)
        # print(f'Time taken for TPE search: {str(timedelta(seconds=(time.time() - fold_tpe)))}')

        # Counts the requested number of trials, not the number Optuna actually finished.
        num_completed_trials += num_random + num_tpe
        print('-------------------')
        print(f'Finished fold {fold} search.')
        print(f"Time taken for this fold: {str(timedelta(seconds=(time.time() - fold_time)))}")                
        print(f'Fold {fold} best value so far: {study.best_value}')
        print(f'Mean time for one trial this fold: {str(timedelta(seconds=(time.time() - fold_time) / (num_random + num_tpe)))}')

        fold += 1
    search_rounds += 1
    
    time_taken = time.time() - search_time_start
    
    print(f'\n# Completed search round: {search_rounds} #')
    print(f'Time taken for all folds this round: {str(timedelta(seconds=(time.time() - time_fold_start)))}')
    print(f'Total time taken for search: {str(timedelta(seconds=(time.time() - search_time_start)))}')
    print(f'Made trials this far: {num_completed_trials}')
    print(f"Current mean time for one trial: {str(timedelta(seconds=(time.time() - search_time_start) / num_completed_trials))}\n")

####

####

# num_tpe = 42
    # num_random = 0
    # max_search_time = 7200
    
# while time_taken < max_search_time:
        
#     fold = 0 
#     kf = KFold(n_splits=folds)
    
#     time_fold_start = time.time()    
#     for train_index, val_index in kf.split(X_train_NN):

#         print('-------------------')
#         print(f"Starting fold {fold} search...")
#         X_train_b, X_val_b = X_train_NN[train_index], X_train_NN[val_index]    
#         y_train_b, y_val_b = y_train_NN[train_index], y_train_NN[val_index]

#         fold_name = f'{study_name}_{fold}'
       
#         study = optuna.create_study(direction='minimize',
#                                     pruner=optuna.pruners.HyperbandPruner(min_resource=20),
#                                     study_name=fold_name,
#                                     storage=f'sqlite:///tampere_reg.db',
#                                     load_if_exists=True                                 
#                                     )

#         fold_time = time.time()    

#         # fold_random = time.time()
#         # study.sampler = optuna.samplers.RandomSampler()
#         # print(f'Random search for fold {fold}...')
#         # study.optimize(objective, n_trials=num_random)
#         # print(f'Time taken for random search: {str(timedelta(seconds=(time.time() - fold_random)))}')

#         fold_tpe = time.time()  
#         study.sampler = optuna.samplers.TPESampler(n_startup_trials=0)
#         print(f'TPE search for fold {fold}...')
#         study.optimize(objective, n_trials=num_tpe)
#         print(f'Time taken for TPE search: {str(timedelta(seconds=(time.time() - fold_tpe)))}')

#         num_completed_trials += num_random + num_tpe
#         print('-------------------')
#         print(f'Finished fold {fold} search.')
#         print(f"Time taken for this fold: {str(timedelta(seconds=(time.time() - fold_time)))}")                
#         print(f'Fold {fold} best value so far: {study.best_value}')
#         print(f'Mean time for one trial this fold: {str(timedelta(seconds=(time.time() - fold_time) / (num_random + num_tpe)))}')

#         fold += 1
#     search_rounds += 1
    
#     time_taken = time.time() - search_time_start
    
#     print(f'\n# Completed search round: {search_rounds} #')
#     print(f'Time taken for all folds this round: {str(timedelta(seconds=(time.time() - time_fold_start)))}')
#     print(f'Total time taken for search: {str(timedelta(seconds=(time.time() - search_time_start)))}')
#     print(f'Made trials this far: {num_completed_trials}')
#     print(f"Current mean time for one trial: {str(timedelta(seconds=(time.time() - search_time_start) / num_completed_trials))}\n")

###

# Final summary of the whole hyperparameter search.
print('=' * 20)
print('Finished search.')
print('Total time taken for all folds: ' + str(timedelta(seconds=(time.time() - search_time_start))))
print('Made ' + str(num_completed_trials) + ' trials in total.')
mean_trial_seconds = (time.time() - search_time_start) / num_completed_trials
print("Mean time for one trial: " + str(timedelta(seconds=mean_trial_seconds)))
print('=' * 20)







In [None]:


# for fold in range(folds):
    
#     study_name_fold = f'{study_name}_{fold}'
#     print('*'*50)
#     print(f'{study_name_fold}')
#     print('*'*50)
#     study = optuna.load_study(study_name=study_name_fold, storage='sqlite:///tampere_reg.db')
    
#     # fig = optuna.visualization.plot_param_importances(study)
#     # fig.update_layout(title=f'Param Importance for Fold {fold}', width=1800, height=1800)
#     # fig.show()

#     # fig = optuna.visualization.plot_slice(study)
#     # fig.update_layout(title=f'Slice for Fold {fold}', width=800, height=800)
#     # fig.show()
    
#     fig = optuna.visualization.plot_parallel_coordinate(study)
#     fig.update_layout(width=2000, height=2000)
#     fig.show()

#     # fig = optuna.visualization.plot_contour(study, params=['n_units_0', 'n_units_1', 'n_units_last'])
#     # fig.update_layout(width=1800, height=1800)
#     # fig.show()

#     # fig = optuna.visualization.plot_contour(study)
#     # fig.update_layout(width=3600, height=3600)
#     # fig.show()

#     # fig = optuna.visualization.plot_edf(study)
#     # fig.update_layout(width=1800, height=1800)
#     # fig.show()

    

In [None]:
from datetime import timedelta
from sklearn.model_selection import train_test_split, KFold
import optuna
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from keras import regularizers, layers, optimizers, initializers
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, TerminateOnNaN
import time 
import os 


from datetime import datetime

folds = 5
# Number of epochs used when refitting each of the best trials
epochs_best_fit = 500
# How many of the best trials are taken from each fold's study
num_best = 6
# How many times each of the best trials is refitted
num_best_fits = 1

best_optuna_models = []
best_val_scores = []
best_optuna_trials = []
# Fold number of every entry in best_optuna_models, so that saved file names
# keep the real fold number even if some fold produced no model.
best_fold_nums = []

checkpoint_filepath = f'./NN_search/optuna_search_checkpoint.h5'
# Make sure the output directory exists before checkpoints/models are written.
os.makedirs('./NN_search', exist_ok=True)


def rmsle_score(y_true, y_pred):
    """Root mean squared logarithmic error computed with NumPy.

    Both inputs are flattened first, so a column vector of predictions (as
    returned by ``model.predict``) is compared element-wise with a 1-D target
    instead of being broadcast to a matrix.

    Args:
        y_true: Array-like of target values.
        y_pred: Array-like of predictions with the same number of elements.

    Returns:
        The RMSLE as a float, or the penalty value 1e6 if any prediction is
        <= 0 (the same convention as the ``rmsle`` helper used for XGBoost).
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if np.any(y_pred <= 0):
        return 1e6
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))


kf = KFold(n_splits=folds)
fitting_search_start = time.time()

for fold_num, (train_index, val_index) in enumerate(kf.split(X_train_NN)):

    best_fitting_time = time.time()
    print(f"Fold {fold_num} Best best trial fitting...")

    X_train_b, X_val_b = X_train_NN[train_index], X_train_NN[val_index]
    y_train_b, y_val_b = y_train_NN[train_index], y_train_NN[val_index]

    fold_name = f'{study_name}_{fold_num}'

    # Re-open the study that the search created for this fold.
    study = optuna.create_study(
        study_name=fold_name,
        storage=f'sqlite:///tampere_reg.db',
        load_if_exists=True
    )

    # Keep only trials that have a value, best (lowest) first.
    valid_trials = [trial for trial in study.trials if trial.value is not None]
    sorted_trials = sorted(valid_trials, key=lambda trial: trial.value)
    best_trials = sorted_trials[:num_best]
    best_val = np.inf
    best_model = None
    best_trial = None
    best_trial_num = None

    print('='*30)
    print(f'Fitting best trials for fold {fold_num}...')
    fitting_fold_best_start = time.time()

    for trial in best_trials:

        for fit_num in range(num_best_fits):

            print('-'*30)
            print(f"Trial ID: {trial.number}, Value: {trial.value}, fit number: {fit_num}")

            # The checkpoint path is shared by all fits: delete any file left
            # by a previous fit so that stale weights can never be loaded.
            if os.path.exists(checkpoint_filepath):
                os.remove(checkpoint_filepath)

            model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
                filepath=checkpoint_filepath,
                save_weights_only=True,
                monitor='val_loss',
                mode='min',
                save_best_only=True)

            best_callback = [model_checkpoint_callback,
                             ReduceLROnPlateau('val_loss', patience=10, factor=0.8),
                             TerminateOnNaN(),
                             EarlyStopping(monitor='val_loss', patience=100, verbose=1)
                             ]

            # Rebuild the network from the stored trial parameters and refit it.
            model = create_model(trial)
            model.fit(X_train_b, y_train_b, epochs=epochs_best_fit, validation_data=(X_val_b, y_val_b), batch_size=trial.params['batch_size'], verbose=0, callbacks=best_callback)

            if not os.path.exists(checkpoint_filepath):
                # No epoch ever improved val_loss (e.g. training diverged).
                print(f'No checkpoint was written for trial {trial.number}; skipping it.')
                continue
            # Restore the weights of the best epoch.
            model.load_weights(checkpoint_filepath)

            predictions = model.predict(X_val_b, verbose=0)
            mse = mean_squared_error(y_val_b, predictions)
            mae = mean_absolute_error(y_val_b, predictions)
            r2 = r2_score(y_val_b, predictions)
            val_rmsle = rmsle_score(y_val_b, predictions)

            print(f'MSE:{mse:.5f}\nMAE:{mae:.5f}\nRMSLE:{val_rmsle:.5f}\nR2:{r2:.5f}')

            if val_rmsle < best_val:
                best_model = model
                best_val = val_rmsle
                best_trial_num = trial.number
                best_trial = trial
                print(f'*** New best model for fold {fold_num} is Trial {best_trial_num} with RMSLE {best_val} ***')
                print(f'Best trial hyperparameters: {trial.params}')

    if best_model is not None:

        best_optuna_models.append(best_model)
        best_val_scores.append(best_val)
        best_optuna_trials.append(best_trial)
        best_fold_nums.append(fold_num)
        print('*'*40)
        print(f"Best model for fold {fold_num} RMSLE: {best_val}\nTrial number: {best_trial_num}\nHyperparameters: {best_trial.params}")
        print(f"Time taken for best fitting in fold {fold_num}: {str(timedelta(seconds=(time.time() - best_fitting_time)) )}")
        print('*'*40)

print('*'*40)
print(f'Best models fitting time total:', str(timedelta(seconds=(time.time() - fitting_search_start))))
print(f"Total time taken for search and fitting best models: {str(timedelta(seconds=(time.time() - total_time_start)))}")
print('*'*40)

# Save every per-fold winner; the file name carries the fold number and score.
for saved_fold, model, score in zip(best_fold_nums, best_optuna_models, best_val_scores):
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    model_path = f"./NN_search/{study_name}_foldmodel{saved_fold}_score_{score:.4f}_{timestamp}.h5"
    print(f"Saving model {saved_fold} with score {score:.4f} to {model_path}")
    model.save(model_path)




In [None]:
# Evaluate every per-fold best network on the held-out test split.
# NOTE(review): rmsle_score is not defined in this cell; it must already exist
# in the kernel namespace when this cell runs.
for idx, model in enumerate(best_optuna_models):
    print(f"\nModel {idx} Summary:")

    # Score the model on the test data.
    predictions = model.predict(X_test_NN, verbose=0)
    mse = mean_squared_error(y_test_NN, predictions)
    mae = mean_absolute_error(y_test_NN, predictions)
    r2 = r2_score(y_test_NN, predictions)
    rmsle = rmsle_score(y_test_NN, predictions)

    report_lines = [
        f"\nModel {idx} Performance on Test Data:",
        f"MSE: {mse:.3f}",
        f"MAE: {mae:.3f}",
        f"R2: {r2:.3f}",
        f"RMSLE: {rmsle:.3f}",
        "*"*40,
    ]
    print("\n".join(report_lines))




In [None]:
import optuna
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
from datetime import timedelta
import time
from tensorflow.keras.models import Model
import glob
import pickle
from tensorflow.keras.models import load_model




# Start time of the whole XGBoost stage (used for the timing prints below).
time_started_xgb = time.time()

# Number of trials for each sampler phase of the XGBoost study.
num_of_trials = 720

# First collect the feature vectors of all models
X_train_features_list = []
X_test_features_list = []
features_names_list = []

# Reset the model list; it is refilled below from the model files saved on disk.
best_optuna_models = []

def rmsle_loss(y_true, y_pred):
    """RMSLE loss that returns a constant penalty (1e5) if any prediction is <= 0.

    Defined here so that saved models compiled with this loss can be loaded
    through ``custom_objects``.
    """
    penalty_value = tf.constant(1e5, dtype=tf.float32)
    positive_mask = tf.math.greater(y_pred, 0.0)
    # Non-positive predictions are replaced before taking log1p.
    clipped_pred = tf.where(positive_mask, y_pred, penalty_value)
    squared_log_error = tf.square(tf.math.log1p(clipped_pred) - tf.math.log1p(y_true))
    loss_value = tf.sqrt(tf.reduce_mean(squared_log_error))
    return tf.where(tf.reduce_any(~positive_mask), penalty_value, loss_value)

# When loading a model, the custom loss function must be supplied via custom_objects
custom_objects = {"rmsle_loss": rmsle_loss}

# Validation score of every loaded model, index-aligned with best_optuna_models
model_best_vals = []

# for fold_num in [1]:
for fold_num in range(folds): # TODO: test with the best models
    pattern = f"./NN_search/{study_name}_foldmodel{fold_num}_score_*.h5"  # Models are assumed to be saved in .h5 format
    model_files = glob.glob(pattern)

    # Find the lowest (best) score among this fold's model files
    best_score = float('inf')
    best_model_file = None
    for model_file in model_files:
        # Split on the LAST '_score_' so a study name containing that text cannot break parsing
        score_part = model_file.rsplit('_score_', 1)[1]
        score = float(score_part.split('_')[0])  # The score is the text up to the next underscore
        if score < best_score:
            best_score = score
            best_model_file = model_file

    # Load the best model file; record its score only when a model was actually
    # loaded, so that the scores stay aligned with the model list.
    if best_model_file:
        best_model = load_model(best_model_file, custom_objects=custom_objects)
        best_optuna_models.append(best_model)
        model_best_vals.append(best_score)
        print(f"Loaded best model for fold {fold_num} from {best_model_file} with score {best_score:.4f}")
    else:
        print(f"No model files found for fold {fold_num} matching pattern {pattern}")


# best_models_per_fold-listas

original_feature_names = list(X_train.columns)

if not best_optuna_models:
    raise ValueError("No neural-network models are loaded; cannot build the combined feature matrix.")

for idx, model in enumerate(best_optuna_models):
    # Use the output of the second-to-last layer as learned features.
    feature_extractor = Model(inputs=model.inputs, outputs=model.layers[-2].output)
    X_train_features = feature_extractor.predict(X_train_NN)
    X_test_features = feature_extractor.predict(X_test_NN)

    X_train_features_list.append(X_train_features)
    X_test_features_list.append(X_test_features)

    print(f'Model train feature shape: {X_train_features.shape}')
    print(f'Model test feature shape: {X_test_features.shape}')

    num_features = X_train_features.shape[1]
    model_feature_names = [f"model_{idx}_feature_{feature_idx}" for feature_idx in range(num_features)]
    features_names_list.extend(model_feature_names)

# The matrices below are laid out as [network features..., original features],
# so the name list has to follow the same order; otherwise every feature
# label (e.g. in the importance plots) points at the wrong column.
combined_feature_names = features_names_list + original_feature_names

# Concatenate the feature vectors
X_train_combined = np.concatenate(X_train_features_list, axis=1)
X_test_combined = np.concatenate(X_test_features_list, axis=1)

# NOTE(review): X_train / X_test come from the DataFrame split of the raw
# columns; presumably these are all numeric -- confirm.
X_train_combined = np.concatenate([X_train_combined, X_train], axis=1)
X_test_combined = np.concatenate([X_test_combined, X_test], axis=1)

assert X_train_combined.shape[1] == len(combined_feature_names), "feature names do not match the combined columns"

y_train = np.array(y_train)


#### No feature selection
X_train_combined_selected = X_train_combined
X_test_combined_selected = X_test_combined
selected_features_names = combined_feature_names


##### mutual_info_regression Valinta

# from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression

# max_feature = X_train_combined.shape[1] // 2

# selector = SelectKBest(mutual_info_regression, k=max_feature)

# X_train_combined_selected = selector.fit_transform(X_train_combined, y_train)
# X_test_combined_selected = selector.transform(X_test_combined)

# selected_indices = selector.get_support(indices=True)
# selected_features_names = np.array(combined_feature_names)[selected_indices]
# selected_features_scores = selector.scores_[selected_indices]

# print("Selected features and their scores:")
# for name, score in zip(selected_features_names, selected_features_scores):
#     print(f"{name}: {score}")

# selected_features_names = selected_features_names.tolist()

# print(f'X_train_combined shape: {X_train_combined.shape}')
# print(f'X_train_combined_selected shape: {X_train_combined_selected.shape}')

# print(f'X_test_combined shape: {X_test_combined.shape}')
# print(f'X_test combined selected shape: {X_test_combined_selected.shape}')





# ##### PCA valinta



# from sklearn.decomposition import PCA
# from sklearn.preprocessing import StandardScaler

# # Oletetaan, että X_train_combined ja X_test_combined ovat datasi
# # Ja combined_feature_names on alkuperäisten featureiden nimilista

# # Skaalataan data
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train_combined)
# X_test_scaled = scaler.transform(X_test_combined)

# # Sovellamme PCA:ta
# pca = PCA(n_components=0.85) # tai voit määrittää n_components arvon eksplisiittisesti
# X_train_combined_selected = pca.fit_transform(X_train_scaled)
# X_test_combined_selected = pca.transform(X_test_scaled)

# # Luodaan uudet feature-nimet pääkomponenteille
# selected_features_names = [f"PC{i+1}" for i in range(X_train_combined_selected.shape[1])]

# print(f"Alkuperäinen featureiden määrä: {X_train_combined.shape[1]}")
# print(f"Featureiden määrä PCA:n jälkeen: {X_train_combined_selected.shape[1]}")

# # Nyt sinulla on X_train_pca ja X_test_pca datat sekä niiden vastaavat feature-nimet
# # Voit jatkaa näiden käyttämistä mallisi koulutukseen



#####

def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error with a penalty for non-positive predictions.

    Inputs are converted to flat float arrays first. This makes plain lists
    usable and prevents a column vector (n, 1) from being silently broadcast
    against a 1-D vector (n,) into an (n, n) matrix.

    Args:
        y_true: Array-like of target values.
        y_pred: Array-like of predictions with the same number of elements.

    Returns:
        The RMSLE, or the penalty value 1e6 if any prediction is <= 0.

    Raises:
        ValueError: If the two inputs do not have the same number of elements.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, got {y_true.size} and {y_pred.size}"
        )
    if np.any(y_pred <= 0):
        return 1e6
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))
from sklearn.model_selection import KFold



# Number of columns in the combined feature matrix.
# NOTE(review): not referenced by the active (non-commented) code visible in this notebook.
max_feature = X_train_combined.shape[1]

def objective(trial):
    """Optuna objective for XGBoost: mean RMSLE over a 5-fold cross-validation.

    Trains on the module-level ``X_train_combined_selected`` / ``y_train``
    and scores each fold with ``rmsle``.

    Args:
        trial: The Optuna trial supplying the hyperparameters.

    Returns:
        The average of the five per-fold RMSLE values.
    """
    # Hyperparameters to optimise.
    booster_params = {
        'tree_method': 'hist',
        'objective': trial.suggest_categorical('objective', ['reg:squarederror', 'reg:absoluteerror']),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 1, 10)
    }
    n_rounds = trial.suggest_int('num_boost_round', 10, 142)

    def _score_fold(fit_rows, holdout_rows):
        """Train on one fold's training rows and return the RMSLE on its validation rows."""
        fold_train = xgb.DMatrix(
            X_train_combined_selected[fit_rows],
            label=y_train[fit_rows],
            feature_names=selected_features_names,
        )
        fold_val = xgb.DMatrix(
            X_train_combined_selected[holdout_rows],
            label=y_train[holdout_rows],
            feature_names=selected_features_names,
        )

        eval_log = {}
        booster = xgb.train(
            booster_params,
            fold_train,
            num_boost_round=n_rounds,
            evals=[(fold_val, 'val')],
            evals_result=eval_log,
            verbose_eval=False,
            early_stopping_rounds=300,
        )
        # Predict with the trees up to and including the best iteration.
        last_tree = booster.best_iteration + 1
        fold_preds = booster.predict(fold_val, iteration_range=(0, last_tree))
        return rmsle(y_train[holdout_rows], fold_preds)

    # K-fold cross-validation
    splitter = KFold(n_splits=5)
    fold_losses = [
        _score_fold(fit_rows, holdout_rows)
        for fit_rows, holdout_rows in splitter.split(X_train_combined_selected)
    ]

    return np.mean(fold_losses)

# Persistent variant of the study (kept for reference):
# study = optuna.create_study(direction='minimize',
#                             storage='sqlite:///tampere_reg.db',
#                             study_name='xgb_combined_selection_2503', # TODO: change the name if needed
#                             load_if_exists=False)

study = optuna.create_study(direction='minimize')

# Phase 1: quasi-random exploration; phase 2: TPE on top of the same study.
# study.sampler = optuna.samplers.RandomSampler()
study.sampler = optuna.samplers.QMCSampler(warn_independent_sampling = False)
print(f'Random sampling {num_of_trials} trials...')
study.optimize(objective, n_trials=num_of_trials)
study.sampler = optuna.samplers.TPESampler()
print(f'TPE sampling {num_of_trials} trials...')
study.optimize(objective, n_trials=num_of_trials)


print(f'Time taken for XGBoost optimization: {str(timedelta(seconds=(time.time() - time_started_xgb)))}')
print(f'Time taken for one trial: {str(timedelta(seconds=(time.time() - time_started_xgb) / (num_of_trials*2)))}')


# Print the best parameters and train the final model
print(f"Best val: {study.best_trial.value}")
print(f'Best params: {study.best_params}')

# Rebuild the booster parameters from the best trial. 'num_boost_round' is an
# argument of xgb.train rather than a booster parameter, and 'tree_method' is
# fixed in the objective, so it is not part of study.best_params.
best_params = dict(study.best_params)
best_num_boost_round = best_params.pop('num_boost_round')
best_params['tree_method'] = 'hist'

dtrain = xgb.DMatrix(X_train_combined_selected, label=y_train, feature_names=selected_features_names)
best_model = xgb.train(best_params, dtrain, num_boost_round=best_num_boost_round)

# Predictions and evaluation on the test data
dtest = xgb.DMatrix(X_test_combined_selected, label=y_test, feature_names=selected_features_names)
predictions = best_model.predict(dtest)

# Visualise the predicted values against the true values
plt.figure(figsize=(20, 10))
plt.scatter(y_test, predictions, edgecolors=(0, 0, 0))
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)
plt.xlabel('Measured')
plt.ylabel('Predicted')
plt.title('Measured vs. Predicted Values')
plt.show()

mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
rmsle_val = rmsle(y_test, predictions)
print(f"MAE: {mae}, MSE: {mse}, R2: {r2}, RMSLE: {rmsle_val}")

# One feature-importance figure per importance type.
for importance_type in ('weight', 'gain', 'cover'):
    fig, ax = plt.subplots(figsize=(20, 30))
    xgb.plot_importance(best_model, importance_type=importance_type, ax=ax)
    ax.set_title(f'Feature Importance by {importance_type.capitalize()}', fontsize=16)
    plt.show()




In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR


def _print_metrics(y_true, y_hat, prefix=''):
    """Print MAE, MSE, R2 and RMSLE of ``y_hat`` against ``y_true`` on one line.

    ``rmsle`` is expected to have been defined by an earlier cell.
    """
    mae = mean_absolute_error(y_true, y_hat)
    mse = mean_squared_error(y_true, y_hat)
    r2 = r2_score(y_true, y_hat)
    rmsle_val = rmsle(y_true, y_hat)
    print(f"{prefix}MAE: {mae}, MSE: {mse}, R2: {r2}, RMSLE: {rmsle_val}\n\n")


# Flat prediction vector of every network on the training and the test rows.
predictions_train = [model.predict(X_train_NN, verbose=0).flatten() for model in best_optuna_models]
predictions_test = [model.predict(X_test_NN, verbose=0).flatten() for model in best_optuna_models]


# Plain average of the networks.
print('Keskiarvo ')
predictions_mean = np.mean(predictions_test, axis=0)
_print_metrics(y_test, predictions_mean)

# Average weighted by the inverse of each model's validation RMSLE, so that a
# lower (better) score gives a larger weight.
print('Painotettu keskiarvo käänteisillä')
# Defensive: ignore non-finite scores so the weights line up with the loaded models.
finite_scores = np.asarray([val for val in model_best_vals if np.isfinite(val)], dtype=float)
if len(finite_scores) != len(predictions_test):
    raise ValueError(
        f"Got {len(finite_scores)} model scores for {len(predictions_test)} models; cannot compute weights."
    )
inverse_scores = 1.0 / np.maximum(finite_scores, 1e-12)  # guard against a zero score
weights = inverse_scores / inverse_scores.sum()
weighted_predictions = np.average(predictions_test, axis=0, weights=weights)
_print_metrics(y_test, weighted_predictions)


# Stacking: one column per network, used as input of a meta-model.
# NOTE(review): the meta-models are fitted on predictions for X_train_NN,
# rows that the networks themselves were trained on -- confirm this is intended.
X_meta_train = np.stack(predictions_train, axis=1)
X_meta_test = np.stack(predictions_test, axis=1)
# Train the meta-model
meta_model = LinearRegression()
meta_model.fit(X_meta_train, y_train)

# Predict with the meta-model
linear_predictions = meta_model.predict(X_meta_test)

print('Linear meta')
_print_metrics(y_test, linear_predictions)


print('SVM')
X_train_svm = np.column_stack(predictions_train)
X_test_svm = np.column_stack(predictions_test)

svm_regressor = SVR(kernel='linear')
svm_regressor.fit(X_train_svm, y_train)

# Predict the test targets with the fitted SVM regressor
predictions_svm = svm_regressor.predict(X_test_svm)

# Evaluate the model
_print_metrics(y_test, predictions_svm, prefix='SVM ')

