In [None]:
import pandas as pd
import numpy as np
# Load the training DataFrame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code, so this file must come from a trusted source.
df_train_filtered = pd.read_pickle('./data/df_train_filtered.pkl')


In [None]:
from sklearn.model_selection import train_test_split

# 'Talot.' alone has only a few distinct values, so it is joined with 'Kaupunginosa'
# to form the label that the train/test split is stratified on.
df_train_filtered['combined'] = (
    df_train_filtered['Kaupunginosa'].astype(str) + '-' + df_train_filtered['Talot.'].astype(str)
)
counts = df_train_filtered['combined'].value_counts()
# Labels that occur fewer than two times are pooled into a single 'other' label.
is_rare = df_train_filtered['combined'].map(counts) < 2
df_train_filtered['combined'] = df_train_filtered['combined'].mask(is_rare, 'other')

X = df_train_filtered.drop(columns='Hinta')
y = df_train_filtered['Hinta']

# Only the feature halves of the split are kept here; the targets are discarded.
X_train, X_test, _, _ = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=df_train_filtered['combined']
)
# The stratification label is not a model feature.
X_train = X_train.drop(columns='combined')
X_test = X_test.drop(columns='combined')






In [None]:
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler 

# Scale the numeric variables. In the author's judgement the coordinates, 'Rv' and 'm2'
# are not features where train/test leakage is a concern, so the scalers are fitted on
# the whole data set at once.
robust_scaler = RobustScaler()
std_scaler = StandardScaler()
minmax_scaler = MinMaxScaler()
df_train_NN = df_train_filtered.copy()

coordinate_columns = ['Pituusaste', 'Leveysaste']
df_train_NN[coordinate_columns] = minmax_scaler.fit_transform(df_train_NN[coordinate_columns])
# The same scaler object is re-fitted for every column, so each column is scaled independently.
for numeric_column in ('Rv', 'm2'):
    df_train_NN[numeric_column] = minmax_scaler.fit_transform(df_train_NN[[numeric_column]])

# One-hot encode the categorical variables: the indicator columns are appended in this
# order, and the source columns are removed afterwards.
categorical_columns = ['Kaupunginosa', 'kerros', 'max_kerros', 'Kunto', 'Hissi', 'Asunnon tyyppi', "Talot."]
for categorical_column in categorical_columns:
    df_hot = pd.get_dummies(df_train_NN[categorical_column], prefix=categorical_column).astype('int')
    df_train_NN = pd.concat([df_train_NN, df_hot], axis=1)

df_train_NN = df_train_NN.drop(columns=categorical_columns)




In [None]:
from sklearn.model_selection import train_test_split

# Build X and y and split them into training and test sets.
X = df_train_NN.drop(columns='Hinta')
y = df_train_NN['Hinta']

X_train_NN, X_test_NN, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=df_train_NN['combined']
)

# Stratification labels of the training rows, re-indexed from 0 so that they line up
# positionally with the NumPy arrays created below.
df_strat = X_train_NN['combined'].reset_index(drop=True)

# Drop the label column and hand everything over as float32 arrays.
X_train_NN = X_train_NN.drop(columns='combined').to_numpy().astype('float32')
X_test_NN = X_test_NN.drop(columns='combined').to_numpy().astype('float32')

y_train = y_train.reset_index(drop=True).to_numpy().astype('float32')
y_test = y_test.reset_index(drop=True).to_numpy().astype('float32')




In [None]:
import optuna
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from keras import regularizers, layers, optimizers, initializers
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, TerminateOnNaN
import time 
from optuna.integration import TFKerasPruningCallback
import os 
import pickle 
from datetime import timedelta

# Experiment: check how much mixed precision speeds up the NAS search
# from tensorflow.keras import mixed_precision
# mixed_precision.set_global_policy('mixed_float16')



# Name of the search; used as the prefix of the per-fold Optuna study names
study_name = '0329_rmsle_stratfold_layers'
# Number of stratified folds
folds = 5
# Number of epochs each trial is trained for during the search
epochs_search = 50
# Number of QMC ("random") trials per fold in each search round
num_random = 0
# Number of TPE trials per fold in each search round
num_tpe = 50

# Time budget for the search, in seconds
max_search_time = 40000
# Maximum number of neurons in a layer
max_units_all = 128

def rmsle_loss(y_true, y_pred):
    """Root mean squared logarithmic error used as the Keras training loss.

    Predictions that are not strictly positive are treated as invalid: if any element of
    ``y_pred`` is <= 0, the function returns the constant penalty 1e5 instead of the RMSLE.

    NOTE(review): the penalty is a constant, so a batch that hits this branch presumably
    provides no gradient from the loss - confirm this is intended.
    """
    penalty = tf.constant(1e5, dtype=tf.float32)
    is_positive = tf.math.greater(y_pred, 0.0)
    # Replace invalid predictions before taking the logarithm.
    guarded_pred = tf.where(is_positive, y_pred, penalty)
    log_error = tf.math.log1p(guarded_pred) - tf.math.log1p(y_true)
    rmsle = tf.sqrt(tf.reduce_mean(tf.square(log_error)))
    any_invalid = tf.reduce_any(tf.math.logical_not(is_positive))
    return tf.where(any_invalid, penalty, rmsle)
custom_objects = {"rmsle_loss": rmsle_loss}


def create_model(trial):
    """Build and compile a dense Keras regressor from the hyperparameters of ``trial``.

    Suggested parameters: ``n_layers`` (1-3) and, per layer ``i``, ``n_units_{i}``,
    ``dropout_{i}``, ``l1_reg_{i}``, ``l2_reg_{i}`` and ``activation_{i}``; finally
    ``optimizer``. Each hidden layer may have at most as many units as the previous one.
    The model takes ``X_train_NN.shape[1]`` inputs, ends in a single linear unit and is
    compiled with ``rmsle_loss``.
    """
    model = keras.Sequential()
    model.add(keras.layers.InputLayer(input_shape=(X_train_NN.shape[1],)))

    num_layers = trial.suggest_int('n_layers', 1, 3)
    unit_cap = max_units_all

    for layer_idx in range(num_layers):
        # The unit count is bounded above by the width of the previous layer.
        num_units = trial.suggest_int(f'n_units_{layer_idx}', 4, unit_cap)
        dropout_rate = trial.suggest_float(f'dropout_{layer_idx}', 0.0, 0.5)
        l1_strength = trial.suggest_float(f'l1_reg_{layer_idx}', 1e-6, 1, log=True)
        l2_strength = trial.suggest_float(f'l2_reg_{layer_idx}', 1e-6, 1, log=True)
        activation = trial.suggest_categorical(f'activation_{layer_idx}', ['relu', 'elu', 'LeakyReLU', 'tanh'])

        model.add(keras.layers.Dense(
            num_units,
            activation=activation,
            kernel_regularizer=regularizers.l1_l2(l1=l1_strength, l2=l2_strength),
        ))
        model.add(keras.layers.Dropout(rate=dropout_rate))

        # Tighten the cap so the next layer cannot be wider than this one.
        unit_cap = min(unit_cap, num_units)

    model.add(keras.layers.Dense(1, activation='linear'))

    # Optimizer choice; every optimizer is created with its default settings.
    optimizer_selected = trial.suggest_categorical(
        'optimizer', ['adam', 'rmsprop', 'Nadam', 'adamax', 'Adagrad', 'Adadelta'])
    optimizer_classes = {
        'adam': optimizers.Adam,
        'rmsprop': optimizers.RMSprop,
        'Nadam': optimizers.Nadam,
        'Adagrad': optimizers.Adagrad,
        'Adadelta': optimizers.Adadelta,
    }
    # Any name without an entry above ('adamax') falls back to Adamax.
    optimizer = optimizer_classes.get(optimizer_selected, optimizers.Adamax)()

    model.compile(optimizer=optimizer, loss=rmsle_loss, metrics=['mse', 'mae'])

    return model

def objective(trial):
    """Optuna objective: train one candidate network on the current fold.

    Reads the module-level fold arrays ``X_train_b``, ``y_train_b``, ``X_val_b`` and
    ``y_val_b`` and returns the lowest validation loss seen during training.
    """
    model = create_model(trial)
    batch_size = trial.suggest_int('batch_size', 16, 128, log=True)

    history = model.fit(
        X_train_b,
        y_train_b,
        epochs=epochs_search,
        validation_data=(X_val_b, y_val_b),
        batch_size=batch_size,
        verbose=0,
        callbacks=[
            TFKerasPruningCallback(trial, 'val_loss'),
            ReduceLROnPlateau('val_loss', patience=5, factor=0.5),
            TerminateOnNaN(),
        ],
    )

    return np.min(history.history['val_loss'])


#### Search driver: repeats a full pass over all folds until the time budget is used up.
# NOTE: the budget is only compared in the `while` condition, i.e. after a complete pass
# over every fold, so the run can overshoot max_search_time by up to one full round.

total_time_start = time.time()  
search_time_start = time.time() 
num_completed_trials = 0
search_rounds = 0
time_taken = 0

while time_taken < max_search_time:
        
    fold = 0 
    time_fold_start = time.time()    
    # Fixed random_state: every round produces the same fold assignment.
    skf =  StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

    stratified_labels = df_strat
    
    for train_index, val_index in skf.split(X_train_NN, stratified_labels):

        print('-------------------')
        print(f"Starting fold {fold} search...")
        # These fold arrays are module-level names that objective() reads.
        X_train_b, X_val_b = X_train_NN[train_index], X_train_NN[val_index]    
        y_train_b, y_val_b = y_train[train_index], y_train[val_index]

        fold_name = f'{study_name}_{fold}'
       
        # One study per fold, stored in SQLite; an existing study with this name is resumed.
        study = optuna.create_study(direction='minimize',
                                    pruner=optuna.pruners.HyperbandPruner(min_resource=10),
                                    study_name=fold_name,
                                    storage=f'sqlite:///tampere_reg.db',
                                    load_if_exists=True                                 
                                    )
        
    
        fold_time = time.time()    

        fold_random = time.time()
        optuna.logging.set_verbosity(optuna.logging.WARNING)     

        if num_random > 0:   
            study.sampler = optuna.samplers.QMCSampler(warn_independent_sampling = False) # TODO evaluate this; it looked much better than plain random sampling
            print(f'Random search for fold {fold}...')
            study.optimize(objective, n_trials=num_random)
            print(f'Time taken for random search: {str(timedelta(seconds=(time.time() - fold_random)))}')

        fold_tpe = time.time()  
        if num_tpe > 0:
            study.sampler = optuna.samplers.TPESampler(n_startup_trials=0)
            print(f'TPE search for fold {fold}...')
            study.optimize(objective, n_trials=num_tpe)
            print(f'Time taken for TPE search: {str(timedelta(seconds=(time.time() - fold_tpe)))}')

        # Counts the trials that were requested, not the ones that actually finished.
        num_completed_trials += num_random + num_tpe
        print('-------------------')
        print(f'Finished fold {fold} search.')
        print(f"Time taken for this fold: {str(timedelta(seconds=(time.time() - fold_time)))}")                
        print(f'Fold {fold} best value so far: {study.best_value}')
        print(f'Best parameters so far: {study.best_params}')
        # NOTE: this divides by num_random + num_tpe, which is zero if both are set to 0.
        print(f'Mean time for one trial this fold: {str(timedelta(seconds=(time.time() - fold_time) / (num_random + num_tpe)))}')
        print(f'This fold has made total {study.trials_dataframe().shape[0]} trials.')

        fold += 1
    search_rounds += 1
    
    time_taken = time.time() - search_time_start
    
    print(f'\n# Completed search round: {search_rounds} #')
    print(f'Time taken for all folds this round: {str(timedelta(seconds=(time.time() - time_fold_start)))}')
    print(f'Total time taken for search: {str(timedelta(seconds=(time.time() - search_time_start)))}')
    print(f'Made trials this far: {num_completed_trials}')
    print(f"Current mean time for one trial: {str(timedelta(seconds=(time.time() - search_time_start) / num_completed_trials))}\n")





In [None]:
from datetime import timedelta
from sklearn.model_selection import train_test_split, KFold
import optuna
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from keras import regularizers, layers, optimizers, initializers
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, TerminateOnNaN
import time 
import os 

def rmsle_score(y_true, y_pred):
    """Return the root mean squared logarithmic error between targets and predictions.

    Computes ``sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2))``. ``log1p`` already adds
    the +1 offset, so no further offset is applied; this matches ``rmsle_loss``.

    Args:
        y_true: array-like of target values.
        y_pred: array-like of predictions. A column vector of shape (n, 1), as returned
            by ``model.predict``, is accepted alongside a flat ``y_true``.

    Returns:
        The RMSLE as a NumPy float. It is NaN if any value is below -1.
    """
    # Flatten both inputs so that (n,) targets and (n, 1) predictions pair up element-wise
    # instead of broadcasting to an (n, n) matrix.
    true_values = np.asarray(y_true, dtype=float).ravel()
    pred_values = np.asarray(y_pred, dtype=float).ravel()
    log_error = np.log1p(pred_values) - np.log1p(true_values)
    return np.sqrt(np.mean(np.square(log_error)))

folds = 5
# Maximum number of epochs when refitting each of the best trials
epochs_best_fit = 500
# How many of the best trials are taken from each fold's study
num_best = 4
# How many times each of those trials is refitted
num_best_fits = 1

# Per-fold winners collected by the refit loop below
best_optuna_models = []
best_val_scores = []
best_optuna_trials = [] 
fitting_search_start = time.time()



# Refit the top `num_best` trials of every fold's study for up to `epochs_best_fit` epochs
# and keep, per fold, the model with the lowest validation RMSLE (rmsle_score).
# Same splitter settings as in the search cell, so the fold assignment is the same.
skf =  StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
stratified_labels = df_strat
fold_num = 0    
for train_index, val_index in skf.split(X_train_NN, stratified_labels):

    best_fitting_time = time.time()
    print(f"Fold {fold_num} Best best trial fitting...")

    X_train_b, X_val_b = X_train_NN[train_index], X_train_NN[val_index]    
    y_train_b, y_val_b = y_train[train_index], y_train[val_index]
    
    fold_name = f'{study_name}_{fold_num}'
       
    # Re-opens the fold's study from SQLite (or creates an empty one if the name is unknown).
    study = optuna.create_study(                                
                                study_name=fold_name,
                                storage=f'sqlite:///tampere_reg.db',
                                load_if_exists=True
                                )

    # Keep every trial that has a value and rank ascending (lower loss is better).
    # NOTE(review): pruned trials may also carry a value in Optuna - confirm whether they should be excluded.
    valid_trials = [trial for trial in study.trials if trial.value is not None]
    sorted_trials = sorted(valid_trials, key=lambda trial: trial.value)
    best_trials = sorted_trials[:num_best]
    best_val = np.inf
    best_model = None

    print('='*30)
    print(f'Fitting best trials for fold {fold_num}...')
    fitting_fold_best_start = time.time()
    
    for trial in best_trials:

        for fit_num in range(num_best_fits):
            
            print('-'*30)
            print(f"Trial ID: {trial.number}, Value: {trial.value}, fit number: {fit_num}")

            # The same checkpoint file is reused (overwritten) by every fit in every fold.
            checkpoint_filepath = f'./NN_search/optuna_search_checkpoint.h5'
            model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
                filepath=checkpoint_filepath,
                save_weights_only=True,
                monitor='val_loss',
                mode='min',
                save_best_only=True)

            best_callback = [model_checkpoint_callback,                  
                            ReduceLROnPlateau('val_loss', patience=10, factor=0.8), 
                            TerminateOnNaN(),
                            EarlyStopping(monitor='val_loss', patience=100, verbose=1)
                        ]


            # create_model is called with a stored trial here, so it rebuilds that trial's architecture.
            model = create_model(trial)
            model.fit(X_train_b, y_train_b, epochs=epochs_best_fit, validation_data=(X_val_b, y_val_b), batch_size=trial.params['batch_size'], verbose=0, callbacks=best_callback)
            # Restore the weights of the epoch with the lowest val_loss.
            model.load_weights(checkpoint_filepath)

            predictions = model.predict(X_val_b, verbose=0)
            mse = mean_squared_error(y_val_b, predictions)
            mae = mean_absolute_error(y_val_b, predictions)
            r2 = r2_score(y_val_b, predictions)
            rmsle = rmsle_score(y_val_b, predictions)

                        
            print(f'MSE:{mse:.5f}\nMAE:{mae:.5f}\nRMSLE:{rmsle:.5f}\nR2:{r2:.5f}')

            if rmsle < best_val:
                best_model = model
                best_val = rmsle
                best_trial_num = trial.number
                best_trial = trial
                print(f'*** New best model for fold {fold_num} is Trial {best_trial_num} with RMSLE {best_val} ***')
                print(f'Best trial hyperparameters: {trial.params}')
    
    # Nothing is recorded for a fold where no refit improved on best_val = inf.
    if best_model is not None:
        
        best_optuna_models.append(best_model)
        best_val_scores.append(best_val)
        best_optuna_trials.append(best_trial)
        print('*'*40)
        print(f"Best model for fold {fold_num} RMSLE: {best_val}\nTrial number: {best_trial_num}\nHyperparameters: {best_trial.params}")
        print('*'*40)

    print(f"Time taken for best fitting in fold {fold_num}: {str(timedelta(seconds=(time.time() - best_fitting_time)) )}")

    fold_num += 1

# Timing summary; total_time_start is set in the search cell.
print('*'*40)
print(f'Best models fitting time total:', str(timedelta(seconds=(time.time() - fitting_search_start))))
print(f"Total time taken for search and fitting best models: {str(timedelta(seconds=(time.time() - total_time_start)))}")
print('*'*40)   

from datetime import datetime

# Save each fold's best model. The validation score is embedded in the file name
# ("..._score_<score>_<timestamp>.h5"); the loading cell parses it back out of the name.
for i, (model, score) in enumerate(zip(best_optuna_models, best_val_scores)):
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    directory = f"./NN_search/{study_name}_foldmodel{i}_score_{score:.4f}_{timestamp}.h5"
    print(f"Saving model {i} with score {score:.4f} to {directory}")
    model.save(directory)




In [None]:
import optuna
from sklearn.model_selection import KFold
import time

# Assumes rmsle_score and create_model have been defined by earlier cells.

folds = 5
epochs_best_fit = 500

# Open every per-fold study and look for the single best trial across all of them.
best_global_val = float('inf')
best_global_trial = None
best_fold = None
best_optuna_models_global = []

for fold_num in range(folds):

    fold_name = f'{study_name}_{fold_num}'
    study = optuna.create_study(
                                study_name=fold_name,
                                storage=f'sqlite:///tampere_reg.db',
                                load_if_exists=True
                                )
    valid_trials = [trial for trial in study.trials if trial.value is not None]
    if not valid_trials:
        # An empty study (for example one that was just created because the name did not
        # exist yet) has nothing to rank; indexing its sorted list would raise IndexError.
        print(f'Study {fold_name} has no trials with a value, skipping it')
        continue

    sorted_trials = sorted(valid_trials, key=lambda trial: trial.value)
    # Despite the name, this holds the *value* of the fold's best trial.
    best_trial = sorted_trials[0].value

    if best_trial < best_global_val:
        best_global_val = best_trial
        best_global_trial = sorted_trials[0]
        best_fold = fold_num
        print(f'New best global trial value: {best_global_val:.4f} found in fold {best_fold}')

if best_global_trial is None:
    # Fail here with a clear message instead of later inside create_model(None).
    raise RuntimeError(f'No trials with a value were found in any study named {study_name}_<fold>')

print(f'Best global trial value: {best_global_val:.4f} found in fold {best_fold}')

# The best trial is now known; its hyperparameters are used to train one model per fold.

skf =  StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
stratified_labels = df_strat

fold_num = 0
for train_index, val_index in skf.split(X_train_NN, stratified_labels):
    print(f"Training fold {fold_num} using best global trial...")
    
    X_train_b, X_val_b = X_train_NN[train_index], X_train_NN[val_index]
    y_train_b, y_val_b = y_train[train_index], y_train[val_index]

    # Build the model from the best trial's parameters
    model = create_model(best_global_trial)

    # The same checkpoint file is reused (overwritten) for every fold.
    checkpoint_filepath = f'./NN_search/optuna_search_checkpoint.h5'
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_loss',
        mode='min',
        save_best_only=True)


    best_callback = [model_checkpoint_callback,                  
                        ReduceLROnPlateau('val_loss', patience=10, factor=0.8), 
                        TerminateOnNaN(),
                        EarlyStopping(monitor='val_loss', patience=100, verbose=1)
                    ]
    
    # Train the model, then restore the weights of the epoch with the lowest val_loss
    model.fit(X_train_b, y_train_b, epochs=epochs_best_fit, validation_data=(X_val_b, y_val_b), batch_size=best_global_trial.params['batch_size'], verbose=0, callbacks=best_callback)
    model.load_weights(checkpoint_filepath)
    
    # Check the model's performance on the validation fold (may need adapting to the project's needs)
    predictions = model.predict(X_val_b)
    rmsle = rmsle_score(y_val_b, predictions)
    print(f"Fold {fold_num} RMSLE: {rmsle}")

    # `datetime` here is the class imported by the previous cell (from datetime import datetime).
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    directory = f"./NN_search/{study_name}_best_foldmodel{fold_num}_score_{rmsle:.4f}_{timestamp}.h5"
    print(f"Saving model {fold_num} with score {rmsle:.4f} to {directory}")
    model.save(directory)
    best_optuna_models_global.append(model)
    fold_num += 1



In [None]:
# Report test-set metrics for every model trained from the globally best trial.
for idx, model in enumerate(best_optuna_models_global):
    print(f"\nModel {idx} Summary:")
    # model.summary()

    # Evaluate the model on the held-out test data
    predictions = model.predict(X_test_NN, verbose=0)
    mse, mae, r2 = (
        metric(y_test, predictions)
        for metric in (mean_squared_error, mean_absolute_error, r2_score)
    )
    rmsle = rmsle_score(y_test, predictions)

    print(
        f"\nModel {idx} Performance on Test Data:",
        f"MSE: {mse:.3f}",
        f"MAE: {mae:.3f}",
        f"R2: {r2:.3f}",
        f"RMSLE: {rmsle:.3f}",
        "*" * 40,
        sep="\n",
    )




In [None]:
# Report test-set metrics for the per-fold best models.
print('Models with best fold trial fitted')
for idx, model in enumerate(best_optuna_models):
    print(f"\nModel {idx} Summary:")
    # model.summary()

    # Evaluate the model on the held-out test data
    predictions = model.predict(X_test_NN, verbose=0)
    mse, mae, r2 = (
        metric(y_test, predictions)
        for metric in (mean_squared_error, mean_absolute_error, r2_score)
    )
    rmsle = rmsle_score(y_test, predictions)

    print(
        f"\nModel {idx} Performance on Test Data:",
        f"MSE: {mse:.3f}",
        f"MAE: {mae:.3f}",
        f"R2: {r2:.3f}",
        f"RMSLE: {rmsle:.3f}",
        "*" * 40,
        sep="\n",
    )




In [None]:
import optuna
import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
from datetime import timedelta
import time
from tensorflow.keras.models import Model
import glob
from tensorflow.keras.models import load_model
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.model_selection import train_test_split, StratifiedKFold
import warnings

# Suppress all UserWarning messages
warnings.filterwarnings("ignore", category=UserWarning)


# study_name = '0329_rmsle5_layers'
# study_name = 'rmsle5_random_2503'

# Prefix of the saved NN model files that are loaded below
study_name = '0329_rmsle_stratfold_layers'

# Number of QMC ("random") trials and TPE trials for the XGBoost search in this cell
num_random = 4000
num_tpe = 200

time_started_xgb = time.time()

def rmsle_loss(y_true, y_pred):
    """Root mean squared logarithmic error; needed here to deserialize the saved models.

    This repeats the definition from the search cell. If any element of ``y_pred`` is
    <= 0, the function returns the constant penalty 1e5 instead of the RMSLE.
    """
    penalty = tf.constant(1e5, dtype=tf.float32)
    is_positive = tf.math.greater(y_pred, 0.0)
    # Replace invalid predictions before taking the logarithm.
    guarded_pred = tf.where(is_positive, y_pred, penalty)
    log_error = tf.math.log1p(guarded_pred) - tf.math.log1p(y_true)
    rmsle = tf.sqrt(tf.reduce_mean(tf.square(log_error)))
    any_invalid = tf.reduce_any(tf.math.logical_not(is_positive))
    return tf.where(any_invalid, penalty, rmsle)
custom_objects = {"rmsle_loss": rmsle_loss}

# Parallel lists: model_best_vals[i] is the validation score parsed from the file name of
# best_optuna_models[i]. Both are appended together so they can never get out of step
# (the scores are used later as per-model weights).
model_best_vals = []
best_optuna_models = []

folds = 5

for fold_num in range(folds): # TODO test with the best models
    # Both naming schemes written by the refit cells are considered for each fold.
    patterns = [
        f"./NN_search/{study_name}_best_foldmodel{fold_num}_score_*.h5",
        f"./NN_search/{study_name}_foldmodel{fold_num}_score_*.h5"
    ]

    best_score = float('inf')
    best_model_file = None
    for pattern in patterns:
        for model_file in glob.glob(pattern):
            # File names look like "..._score_<score>_<timestamp>.h5".
            score_part = model_file.split('_score_')[1]
            score = float(score_part.split('_')[0])
            if score < best_score:
                best_score = score
                best_model_file = model_file

    # Load the file with the lowest score; a fold without any file adds nothing to either list.
    if best_model_file is not None:
        best_model = load_model(best_model_file, custom_objects=custom_objects)
        best_optuna_models.append(best_model)
        model_best_vals.append(best_score)
        print(f"Loaded best model for fold {fold_num} from {best_model_file} with score {best_score:.4f}")
    else:
        print(f"No model files found for fold {fold_num} matching patterns {patterns}")



# For every loaded network, take the output of its second-to-last layer as a feature
# vector for the training and test rows. Entry i of each list belongs to model i.
X_train_features_list = []
X_test_features_list = []
features_names_list = []

for idx, model in enumerate(best_optuna_models):
    # Sub-model that stops at the layer just before the final output layer.
    feature_extractor = Model(inputs=model.inputs, outputs=model.layers[-2].output)
    X_train_features = feature_extractor.predict(X_train_NN)
    X_test_features = feature_extractor.predict(X_test_NN)
    
    X_train_features_list.append(X_train_features)
    X_test_features_list.append(X_test_features)

    print(f'Model train feature shape: {X_train_features.shape}')
    print(f'Model test feature shape: {X_test_features.shape}')

    # One generated name per extracted feature column, e.g. "model_0_feature_3".
    num_features = X_train_features.shape[1]
    model_feature_names = [f"model_{idx}_feature_{feature_idx}" for feature_idx in range(num_features)]
    features_names_list.append(model_feature_names)


# Column names of X_train, the feature frame produced by the first train/test split cell.
original_feature_names = list(X_train.columns) 
# combined_feature_names = original_feature_names + features_names_list

# print(f'Train combined feature shape: {X_train_features.shape}')
# print(f'Test combined feature shape: {X_test_features.shape}')

# Yhdistetään ominaisuusvektorit

def select_models(X_train_features_list, X_test_features_list, features_names_list, which_models):
    """Concatenate the feature blocks of the models flagged in ``which_models``.

    Args:
        X_train_features_list: per-model training feature arrays.
        X_test_features_list: per-model test feature arrays.
        features_names_list: per-model lists of feature names.
        which_models: one truthy/falsy flag per model.

    Returns:
        ``(X_train_selected, X_test_selected, selected_names)``: the chosen arrays joined
        column-wise and the flat list of their names, or ``(None, None, None)`` when no
        flag is set.
    """
    chosen = [position for position, flag in enumerate(which_models) if flag]
    if not chosen:
        return None, None, None

    X_train_selected = np.concatenate([X_train_features_list[position] for position in chosen], axis=1)
    X_test_selected = np.concatenate([X_test_features_list[position] for position in chosen], axis=1)
    selected_names = [name for position in chosen for name in features_names_list[position]]

    return X_train_selected, X_test_selected, selected_names
    

# X_train_combined = np.concatenate(X_train_features_list, axis=1)
# X_test_combined = np.concatenate(X_test_features_list, axis=1)

# X_train_combined = np.concatenate([X_train_combined, X_train], axis=1)   
# X_test_combined = np.concatenate([X_test_combined, X_test], axis=1)

def select_features(X_train_combined, X_test_combined, y_train, combined_feature_names, method, max_feature):
    """Optionally reduce the feature matrices to the ``max_feature`` best columns.

    Args:
        X_train_combined: training feature matrix; the selector is fitted on it.
        X_test_combined: test feature matrix; it is only transformed.
        y_train: training targets used for scoring the features.
        combined_feature_names: one name per column.
        method: ``'f_regression'`` or ``'mutual_info_regression'``; any other value
            disables selection.
        max_feature: number of columns to keep (``k`` of ``SelectKBest``).

    Returns:
        ``(X_train, X_test, names)`` restricted to the selected columns, or the three
        inputs unchanged when ``method`` names no known scoring function.
    """
    if method == 'f_regression':
        score_function = f_regression
    elif method == 'mutual_info_regression':
        score_function = mutual_info_regression
    else:
        # Unknown method (for example the string 'None'): pass everything through untouched.
        return X_train_combined, X_test_combined, combined_feature_names

    selector = SelectKBest(score_function, k=max_feature)
    train_selected = selector.fit_transform(X_train_combined, y_train)
    test_selected = selector.transform(X_test_combined)

    # Map the surviving column indices back to their names.
    kept_indices = selector.get_support(indices=True)
    kept_names = np.array(combined_feature_names)[kept_indices].tolist()

    return train_selected, test_selected, kept_names


def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error with a penalty for non-positive predictions.

    Returns the constant 1e6 if any element of ``y_pred`` is <= 0; otherwise
    ``sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2))``.
    """
    has_invalid_prediction = np.any(y_pred <= 0)
    if has_invalid_prediction:
        return 1e6
    log_gap = np.log1p(y_pred) - np.log1p(y_true)
    mean_squared_gap = np.mean(np.square(log_gap))
    return np.sqrt(mean_squared_gap)
from sklearn.model_selection import KFold



def objective(trial):
    """Optuna objective for the stacked XGBoost model.

    Each trial picks the XGBoost hyperparameters, the number of boosting rounds, which NN
    models' extracted features to append to ``X_train`` (flags ``S0``..``S4``), and an
    optional ``SelectKBest`` step (``S``, ``N_fea``). It returns the mean RMSLE over the
    stratified folds of the training data.

    Reads the module-level names ``X_train``, ``X_test``, ``y_train``, ``X_train_NN``,
    ``df_strat``, ``folds`` and the extracted feature lists.
    """
    # Parameters to optimize
    param = {
        "booster": "dart",
        # "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
        'max_depth': trial.suggest_int('max_depth', 1, 6),
        "lambda": trial.suggest_float("lambda", 1e-4, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-4, 1.0, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 1e-2, 2, log=True),
        'subsample': trial.suggest_float('subsample', 0.2, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0),
        'gamma': trial.suggest_float('gamma', 0.1, 42, log = True)     
        # 'nthread' : -3 
    }

    num_boost_round = trial.suggest_int('BRounds', 1, 142)  
    # 'None' is a string here; select_features treats any unknown method as "no selection".
    selector = trial.suggest_categorical('S', choices = ['f_regression', 'mutual_info_regression', 'None'])

    # One on/off flag per NN model; indices 0..4 must exist in the feature lists.
    select_0 = trial.suggest_categorical('S0', [True, False])
    select_1 = trial.suggest_categorical('S1', [True, False])
    select_2 = trial.suggest_categorical('S2', [True, False])
    select_3 = trial.suggest_categorical('S3', [True, False])
    select_4 = trial.suggest_categorical('S4', [True, False])

    X_train_combined_selected, X_test_combined_selected , selected_features_names = select_models(X_train_features_list, X_test_features_list, features_names_list, [select_0, select_1, select_2, select_3, select_4])

    # NN features (if any were selected) go first, followed by the original columns.
    if X_train_combined_selected is not None:
        combined_feature_names = selected_features_names + original_feature_names
        X_train_combined_selected = np.concatenate([X_train_combined_selected, X_train], axis=1)
        X_test_combined_selected = np.concatenate([X_test_combined_selected, X_test], axis=1)
    else:
        X_train_combined_selected = X_train
        X_test_combined_selected = X_test
        combined_feature_names = original_feature_names


    # The upper bound of N_fea depends on how many columns this trial ended up with.
    num_selected = trial.suggest_int('N_fea', 1, X_train_combined_selected.shape[1])
    # NOTE: the selector is fitted on all training rows before the fold split below,
    # so each validation fold has already influenced which features are kept.
    X_train_combined_selected, _ , combined_feature_names = select_features(X_train_combined_selected, X_test_combined_selected, y_train, combined_feature_names, selector, num_selected)
    # print(f"Selected features: {combined_feature_names}")
    
    
    rmsle_scores = []
    dtrain_full = xgb.DMatrix(X_train_combined_selected, label=y_train, feature_names=combined_feature_names)

    skf =  StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
    stratified_labels = df_strat
    # X_train_NN is only used to generate row indices; the rows are sliced from dtrain_full.
    for train_index, val_index in skf.split(X_train_NN, stratified_labels):
        
        dtrain = dtrain_full.slice(train_index)
        dval = dtrain_full.slice(val_index)

        evals_result = {}
        # early_stopping_rounds (1000) exceeds the largest possible num_boost_round (142).
        bst = xgb.train(param, dtrain, num_boost_round=num_boost_round, evals=[(dval, 'val')], evals_result=evals_result, verbose_eval=False, early_stopping_rounds=1000)
        best_iteration = bst.best_iteration
        preds = bst.predict(dval, iteration_range=(0, best_iteration + 1))
        y_true = y_train[val_index]
        # loss = r2_score(y_true, preds)
        loss = rmsle(y_true, preds)
        rmsle_scores.append(loss)

    average_rmsle = np.mean(rmsle_scores)
    return average_rmsle

# study = optuna.create_study(direction='maximize', 
#                             storage='sqlite:///tampere_reg.db', 
#                             study_name='0326_xgb_comb_R2', # TODO change the name if needed
#                             load_if_exists=False) 

# In-memory study (no storage argument): results are not persisted.
study = optuna.create_study(direction='minimize')

# optuna.logging.set_verbosity(optuna.logging.WARNING)
optuna.logging.set_verbosity(optuna.logging.INFO)
# Phase 1: quasi-random (QMC) exploration; phase 2: TPE on the same study.
study.sampler = optuna.samplers.QMCSampler(warn_independent_sampling = False)
print(f'Random sampling {num_random} trials...')
study.optimize(objective, n_trials=num_random)
study.sampler = optuna.samplers.TPESampler()
print(f'TPE sampling {num_tpe} trials...')
study.optimize(objective, n_trials=num_tpe)

print(f'Time taken for XGBoost optimization: {str(timedelta(seconds=(time.time() - time_started_xgb)))}')
print(f'Time taken for one trial: {str(timedelta(seconds=(time.time() - time_started_xgb) / (num_random + num_tpe)))}')

# Print the best value and parameters
print(f"Best val: {study.best_trial.value}")
print(f'Best params: {study.best_params}')

# Rebuild the feature matrices exactly as the best trial did: chosen NN feature blocks
# first, then the original columns, then the optional SelectKBest step.
X_train_combined_selected, X_test_combined_selected , selected_features_names = select_models(X_train_features_list, X_test_features_list, features_names_list, [study.best_params['S0'], study.best_params['S1'], study.best_params['S2'], study.best_params['S3'], study.best_params['S4']])

if X_train_combined_selected is not None:
    combined_feature_names = selected_features_names + original_feature_names
    X_train_combined_selected = np.concatenate([X_train_combined_selected, X_train], axis=1)
    X_test_combined_selected = np.concatenate([X_test_combined_selected, X_test], axis=1)
else:
    X_train_combined_selected = X_train
    X_test_combined_selected = X_test
    combined_feature_names = original_feature_names

X_train_combined_selected, X_test_combined_selected, selected_features_names = select_features(X_train_combined_selected, X_test_combined_selected, y_train, combined_feature_names, study.best_params['S'], study.best_params['N_fea'])

dtrain = xgb.DMatrix(X_train_combined_selected, label=y_train, feature_names=selected_features_names)

# study.best_params only holds the *suggested* values. It lacks the fixed "booster": "dart"
# used by the objective, and it contains keys that configure this pipeline rather than
# XGBoost (BRounds, S, S0..S4, N_fea). Build the booster parameters explicitly so the
# final model is the same configuration that was tuned.
xgb_param_names = ('max_depth', 'lambda', 'alpha', 'learning_rate', 'subsample', 'colsample_bytree', 'gamma')
best_xgb_params = {name: study.best_params[name] for name in xgb_param_names}
best_xgb_params['booster'] = 'dart'

best_model = xgb.train(best_xgb_params, dtrain, num_boost_round=study.best_params['BRounds'])

# Evaluate on the training data as well, so that overfitting can be judged
pred_train = best_model.predict(dtrain)
mae_train = mean_absolute_error(y_train, pred_train)
mse_train = mean_squared_error(y_train, pred_train)
r2_train = r2_score(y_train, pred_train)
rmsle_train = rmsle(y_train, pred_train)
print(f"Train MAE: {mae_train}, Train MSE: {mse_train}, Train R2: {r2_train}, Train RMSLE: {rmsle_train}")

# Held-out test set evaluation
dtest = xgb.DMatrix(X_test_combined_selected, label=y_test, feature_names=selected_features_names)
predictions = best_model.predict(dtest)

mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
rmsle_val = rmsle(y_test, predictions)
print(f"MAE: {mae}, MSE: {mse}, R2: {r2}, RMSLE: {rmsle_val}")

# Plot predicted values against the measured ones; the dashed line is the ideal y = x
plt.figure(figsize=(20, 10))
plt.scatter(y_test, predictions, edgecolors=(0, 0, 0))
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)
plt.xlabel('Measured')
plt.ylabel('Predicted')
plt.title('Measured vs. Predicted Values')
plt.show()

# Feature importance of the final model by three importance types: weight, gain, cover
fig, ax = plt.subplots(figsize=(20, 30))
xgb.plot_importance(best_model, importance_type='weight', ax=ax)
ax.set_title('Feature Importance by Weight', fontsize=16)
plt.show()

fig, ax = plt.subplots(figsize=(20, 30))
xgb.plot_importance(best_model, importance_type='gain', ax=ax)
ax.set_title('Feature Importance by Gain', fontsize=16)
plt.show()

fig, ax = plt.subplots(figsize=(20, 30))
xgb.plot_importance(best_model, importance_type='cover', ax=ax)
ax.set_title('Feature Importance by Cover', fontsize=16)
plt.show()




In [None]:

### Testataan vielä minkälaisia tuloksia vain NN mallien ennustuksilla ### 

predictions_train = []

for idx, model in enumerate(best_optuna_models):
    pred = model.predict(X_train_NN, verbose=0).flatten()
    predictions_train.append(pred)

predictions_test = []

for idx, model in enumerate(best_optuna_models):
    pred = model.predict(X_test_NN, verbose=0).flatten()
    predictions_test.append(pred)

### Keskiarvo 
print('Keskiarvo ')
predictions_mean = np.mean(predictions_test, axis=0)
mse = mean_squared_error(y_test, predictions_mean)
mae = mean_absolute_error(y_test, predictions_mean)
r2 = r2_score(y_test, predictions_mean)
rmsle_val = rmsle(y_test, predictions_mean)  
print(f"MAE: {mae}, MSE: {mse}, R2: {r2}, RMSLE: {rmsle_val}\n\n")

### Painotettu keskiarvo
print('Painotettu keskiarvo käänteisillä')
best_rmsle = float('inf')
for pot in range (1,20):
    weights = [1 / x**pot for x in model_best_vals]
    w_sum = sum(weights)
    weights = [x / w_sum for x in weights]
    weighted_predictions = np.average(predictions_test, axis=0, weights=weights)
    mse = mean_squared_error(y_test, weighted_predictions)
    mae = mean_absolute_error(y_test, weighted_predictions)
    r2 = r2_score(y_test, weighted_predictions)
    rmsle_val = rmsle(y_test, weighted_predictions)  # Oletetaan että sinulla on rmsle funktio määritelty
    if rmsle_val < best_rmsle:
        best_rmsle = rmsle_val
        bestpot = pot
print(f'Paras arvo löytyi potenssilla {bestpot} arvolla {best_rmsle}')
        

### Linear-regression stacking
from sklearn.linear_model import LinearRegression

# Meta-features: one column per NN, holding that model's predictions for the
# training set and the test set respectively.
X_meta_train = np.stack(predictions_train, axis=1)
X_meta_test = np.stack(predictions_test, axis=1)

# Fit the meta-model on the training-set predictions, then predict the test set.
meta_model = LinearRegression().fit(X_meta_train, y_train)
linear_predictions = meta_model.predict(X_meta_test)

print('Linear meta')
mae = mean_absolute_error(y_test, linear_predictions)
mse = mean_squared_error(y_test, linear_predictions)
rmsle_val = rmsle(y_test, linear_predictions)  # relies on the notebook's own rmsle helper
r2 = r2_score(y_test, linear_predictions)
print(f"MAE: {mae}, MSE: {mse}, R2: {r2}, RMSLE: {rmsle_val}\n\n")

### XGBoost stacking: setup
import optuna
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np

# Start of the wall-clock timer for the whole XGBoost stacking section.
time_xgb = time.time()

# Meta-feature matrices: one column per NN model's prediction vector.
X_train_XGB = np.column_stack(predictions_train)
X_test_XGB = np.column_stack(predictions_test)

# Feature labels NN_0, NN_1, ... - one per base model, used by the DMatrix objects.
NN_names = [f'NN_{i}' for i, _ in enumerate(best_optuna_models)]

def objective(trial):
    """Optuna objective: mean 5-fold cross-validated R2 of an XGBoost meta-model.

    The meta-model is trained on the stacked NN predictions (`X_train_XGB`)
    against `y_train`. The booster hyperparameters and the number of boosting
    rounds are suggested by `trial`.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        The mean R2 score over the 5 validation folds (higher is better).
    """
    # XGBoost parameters to optimize; the objective itself is fixed.
    param = {
        'objective': 'reg:absoluteerror',
        "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-6, 1.0, log = True),
        "alpha": trial.suggest_float("alpha", 1e-6, 1.0, log = True),
        "max_depth": trial.suggest_int("max_depth", 1, 6),
        # Recorded in trial.params under the name "eta".
        "learning_rate": trial.suggest_float("eta", 1e-2, 1.0),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 10)
    }

    num_boost_round = trial.suggest_int('num_boost_round', 1, 142)

    kf = KFold(n_splits=5)
    fold_scores = []  # one R2 value per validation fold

    dtrain_full = xgb.DMatrix(X_train_XGB, label=y_train, feature_names=NN_names)

    # KFold yields positional indices. Index a plain array so that a pandas
    # Series with a non-default index is not looked up by label (which would
    # select the wrong rows or raise KeyError).
    y_train_values = np.asarray(y_train)

    for train_index, val_index in kf.split(X_train_XGB):
        dtrain = dtrain_full.slice(train_index)
        dval = dtrain_full.slice(val_index)

        # early_stopping_rounds (200) exceeds the largest possible
        # num_boost_round (142), so training is never cut short.
        # NOTE(review): best_iteration is picked on the same fold that is
        # scored below, which makes the fold score slightly optimistic.
        bst = xgb.train(param, dtrain, num_boost_round=num_boost_round, evals=[(dval, 'val')], verbose_eval=False, early_stopping_rounds=200)

        preds = bst.predict(dval, iteration_range=(0, bst.best_iteration + 1))
        fold_scores.append(r2_score(y_train_values[val_index], preds))

    return np.mean(fold_scores)

# The objective returns the mean cross-validated R2, hence "maximize".
study = optuna.create_study(direction="maximize")

# Phase 1: quasi-random (QMC) exploration of the search space.
# The progress message is printed before the trials run, not after them.
print('Random sampling trials...')
study.sampler = optuna.samplers.QMCSampler(warn_independent_sampling = False)
study.optimize(objective, n_trials=42)

# Phase 2: TPE trials on the same study, so they build on the trials above.
print('TPE sampling trials...')
study.sampler = optuna.samplers.TPESampler()
study.optimize(objective, n_trials=10)


print("Best trial:")
trial = study.best_trial

# The study value is the mean CV R2 returned by `objective`, not an RMSLE.
print(f"Mean CV R2: {trial.value}")
print("Best hyperparameters: {}".format(trial.params))

# Retrain the best configuration on the full training set.
# trial.params holds only the *suggested* values, so two things are restored
# to reproduce the configuration that was actually tuned:
#   * num_boost_round is an argument of xgb.train, not a booster parameter;
#     left in the dict, the final model would silently use the default
#     number of rounds (10).
#   * the objective was a fixed entry of the search dict and is therefore
#     missing from trial.params; without it XGBoost falls back to its
#     default regression objective.
best_params = dict(trial.params)
best_num_boost_round = best_params.pop('num_boost_round', 10)
best_params['objective'] = 'reg:absoluteerror'

dtrain = xgb.DMatrix(X_train_XGB, label=y_train, feature_names= NN_names)
final_model = xgb.train(best_params, dtrain, num_boost_round=best_num_boost_round)

dtest = xgb.DMatrix(X_test_XGB, label=y_test, feature_names=NN_names)

# Evaluate the retrained meta-model on the held-out test set.
predictions_XGB = final_model.predict(dtest)
mse = mean_squared_error(y_test, predictions_XGB)
mae = mean_absolute_error(y_test, predictions_XGB)
r2 = r2_score(y_test, predictions_XGB)
rmsle_val = rmsle(y_test, predictions_XGB)

print(f"Parhaan mallin tulokset testidatalla:")
print(f"MAE: {mae}, MSE: {mse}, R2: {r2}, RMSLE: {rmsle_val}")
print(f'Time taken for XGBoost optimization: {str(timedelta(seconds=(time.time() - time_xgb)))}')

# Scatter of predicted vs. measured values; the dashed diagonal marks a
# perfect prediction.
plt.figure(figsize=(20, 10))
plt.scatter(y_test, predictions_XGB, edgecolors=(0, 0, 0))
diagonal = [y_test.min(), y_test.max()]
plt.plot(diagonal, diagonal, 'k--', lw=4)
plt.xlabel('Measured')
plt.ylabel('Predicted')
plt.title('Measured vs. Predicted Values')
plt.show()

# One tall figure per importance type of the stacked XGBoost model.
for importance_type, plot_title in (
    ('weight', 'Feature Importance by Weight'),
    ('gain', 'Feature Importance by Gain'),
    ('cover', 'Feature Importance by Cover'),
):
    fig, ax = plt.subplots(figsize=(20, 30))
    xgb.plot_importance(final_model, importance_type=importance_type, ax=ax)
    ax.set_title(plot_title, fontsize=16)
    plt.show()


