In [1]:
import pandas as pd
import numpy as np
# Load the filtered training frame from the local pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
df_train_filtered = pd.read_pickle("./data/df_train_filtered.pkl")


In [2]:
from sklearn.model_selection import train_test_split

# Separate the target column ('Hinta') from the remaining feature columns.
y = df_train_filtered['Hinta']
X = df_train_filtered.drop(columns='Hinta')

# Hold out 10 % of the rows, stratified by 'Kaupunginosa'; the fixed seed
# keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    stratify=df_train_filtered['Kaupunginosa'],
)





In [3]:
from sklearn.metrics import mean_squared_error, r2_score,  mean_absolute_error

def rmsle_score(y_true, y_pred):
    """Return the root mean squared logarithmic error (RMSLE).

    RMSLE = sqrt(mean((log(1 + y_true) - log(1 + y_pred)) ** 2))

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values. A column vector of shape (n, 1) is accepted
        and flattened before comparison.

    Returns
    -------
    float
        The RMSLE. The result is NaN if any value is below -1, because the
        logarithm is undefined there.

    Raises
    ------
    ValueError
        If the inputs are empty or contain a different number of elements.
    """
    # Flatten both inputs so that (n,) targets and (n, 1) predictions are
    # compared element by element instead of being broadcast to (n, n).
    true_values = np.asarray(y_true, dtype=float).ravel()
    pred_values = np.asarray(y_pred, dtype=float).ravel()

    if true_values.size == 0:
        raise ValueError("rmsle_score requires at least one sample")
    if true_values.shape != pred_values.shape:
        raise ValueError(
            "y_true and y_pred must contain the same number of elements: "
            f"{true_values.size} != {pred_values.size}"
        )

    # np.log1p(x) already evaluates log(1 + x); adding a further 1 to the
    # argument would compute log(2 + x) and distort the metric.
    log_diff = np.log1p(true_values) - np.log1p(pred_values)
    return np.sqrt(np.mean(log_diff ** 2))

In [4]:
from sklearn.linear_model import LinearRegression
# Baseline model: ordinary least squares fitted on the training split.
lr = LinearRegression().fit(X_train, y_train)
predictions = lr.predict(X_test)

# Score the baseline on the held-out split.
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
print(
    f"Mean squared error: {mse:.2f}\n"
    f"Mean absolute error: {mae:.2f}\n"
    f"R²-arvo: {r2:.2f}\n"
    f"RMSLE: {rmsle_score(y_test, predictions):.2f}"
)


Mean squared error: 9450.37
Mean absolute error: 68.02
R²-arvo: 0.63
RMSLE: 0.32
Parhaan mallin R²-arvo: 0.6270273210094915


In [None]:
import matplotlib.pyplot as plt

# Measured vs. predicted scatter for the current `predictions`; the dashed
# diagonal marks where a perfect prediction would fall.
fig, ax = plt.subplots(figsize=(20, 10))
diagonal = [y_test.min(), y_test.max()]
ax.scatter(y_test, predictions, edgecolors=(0, 0, 0))
ax.plot(diagonal, diagonal, 'k--', lw=4)
ax.set(
    xlabel='Measured',
    ylabel='Predicted',
    title='Measured vs. Predicted Values',
)
plt.show()


In [None]:
import numpy as np
import xgboost
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt


# Two XGBoost regressors that differ only in their training objective, each
# paired (by position) with the scikit-learn scorer used to rank candidates.
virhe_mallit = [xgboost.XGBRegressor(objective='reg:absoluteerror'), xgboost.XGBRegressor(objective='reg:squarederror')]
virhe_nimi = ['neg_mean_absolute_error', 'neg_mean_squared_error']

# The search space is the same for both objectives, so it is built once
# instead of on every loop iteration.
param_space = {
    'n_estimators': np.arange(1, 500, 10),
    'max_depth': np.arange(3, 11),
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'gamma': [0, 1, 5],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2]
}

for idx, malli in enumerate(virhe_mallit):

    random_search = RandomizedSearchCV(
        estimator=malli,
        param_distributions=param_space,
        cv=5,
        n_jobs=-2,
        n_iter=1000,
        verbose=1,
        scoring=virhe_nimi[idx],
        random_state=42,  # seed the candidate sampling so the search is reproducible
    )

    random_search.fit(X_train, y_train)

    # Collect the per-split validation scores of the winning candidate.
    # n_splits_ is the number of CV splits actually used by the fitted search.
    best_index = random_search.best_index_
    cv_results = random_search.cv_results_
    cv_splits = random_search.n_splits_
    best_scores = [cv_results[f'split{i}_test_score'][best_index] for i in range(cv_splits)]

    print(f"With error: {virhe_nimi[idx]}")
    # The scorers are negated errors, so flip the sign to print the error itself.
    for i, score in enumerate(best_scores):
        print(f"Ositus {i}: {-score}")

    # Evaluate the refitted best estimator on the held-out split.
    best_model = random_search.best_estimator_
    predictions = best_model.predict(X_test)

    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    # rmsle_score must be called; formatting the function object itself with
    # ':.4f' raises a TypeError.
    rmsle = rmsle_score(y_test, predictions)
    print(f"Mean squared error: {mse:.2f}\nMean absolute error: {mae:.2f}\nRMSLE: {rmsle:.4f}\nParhaan mallin R²-arvo: {r2:.4f}")

    # Measured vs. predicted scatter; the dashed diagonal marks a perfect fit.
    plt.figure(figsize=(20, 10))
    plt.scatter(y_test, predictions, edgecolors=(0, 0, 0))
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)
    plt.xlabel('Measured')
    plt.ylabel('Predicted')
    plt.title(f'{virhe_nimi[idx]} Measured vs. Predicted Values')
    plt.show()




In [None]:
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler 

# Scale the numeric features. All three scaler objects are created here, but
# only the min-max scaler is applied in this cell.
robust_scaler = RobustScaler()
std_scaler = StandardScaler()
minmax_scaler = MinMaxScaler()

df_train_NN = df_train_filtered.copy()

# The same scaler instance is re-fitted for every column group, so after the
# loop it only holds the parameters of the last group ('m2').
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split, so test rows influence the scaling range — confirm this is acceptable.
for scaled_cols in (['Pituusaste', 'Leveysaste'], ['Rv'], ['m2']):
    df_train_NN[scaled_cols] = minmax_scaler.fit_transform(df_train_NN[scaled_cols])

# One-hot encode the categorical features: the indicator columns of each
# feature are appended in list order, then the source columns are removed.
categorical_cols = ['Kaupunginosa', 'kerros', 'max_kerros', 'Kunto', 'Hissi', 'Asunnon tyyppi', "Talot."]
for col in categorical_cols:
    df_hot = pd.get_dummies(df_train_NN[col], prefix=col).astype('int')
    df_train_NN = pd.concat([df_train_NN, df_hot], axis=1)

df_train_NN = df_train_NN.drop(columns=categorical_cols)




In [None]:
# Build the feature matrix X and the target vector y as NumPy arrays, then
# split them into training and test sets.
y = df_train_NN['Hinta'].to_numpy()
X = df_train_NN.drop(columns='Hinta').to_numpy()

# NOTE(review): this split is not stratified, unlike the earlier split of
# df_train_filtered, so the held-out rows presumably differ between the two
# splits — confirm that comparing metrics across them is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.1,
)

In [None]:
# import tensorflow as tf
# from tensorflow import keras
# from keras_tuner import Hyperband
# from keras import regularizers, layers, optimizers, initializers
# from keras.callbacks import EarlyStopping, ReduceLROnPlateau, TerminateOnNaN
# import pandas as pd
# from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# from sklearn.model_selection import KFold, train_test_split
# import numpy as np
# import matplotlib.pyplot as plt
# import time


# models_hyperband = []
# mse_scores = []
# mae_scores = []
# r2_scores = []
# best_hyperparameters_hyperband = []

# search_time_start = time.time() 

# def build_model(hp):
#     model = keras.Sequential()
#     model.add(layers.Input(shape=(X_train.shape[1],)))

#     # Luodaan kerroksia käyttäen Hyperband-optimoinnin hyperparametreja
#     for i in range(hp.Int('num_layers', 1, 4)):  # Vaihteluväli kerrosten määrälle
#         model.add(layers.Dense(
#             units=hp.Int(f'units_{i}', min_value=8, max_value=512, step=16),
#             activation=hp.Choice(f'activation_{i}', values=['relu', 'linear', 'selu', 'elu', 'sigmoid', 'tanh']),
#             kernel_regularizer=regularizers.l1_l2(
#                 l1=hp.Float(f'l1_reg_{i}', min_value=1e-6, max_value=1, sampling='log'),
#                 l2=hp.Float(f'l2_reg_{i}', min_value=1e-6, max_value=1, sampling='log')),
#             kernel_initializer=hp.Choice('initializer', values=['he_normal', 'glorot_uniform', 'lecun_normal', 'glorot_normal'])
#             )
#         )
#         model.add(layers.Dropout(hp.Float(f'dropout_{i}', min_value=0.0, max_value=0.5, step=0.05)))

#     model.add(layers.Dense(1, activation='linear'))
#     optimizer_choice = hp.Choice('optimizer', values=['rmsprop', 'nadam', 'adamax'])
#     learning_rate = hp.Choice('learning_rate', values=[1.0, 1e-1, 1e-2, 1e-3])
    
#     if optimizer_choice == 'rmsprop':
#         optimizer = optimizers.RMSprop(learning_rate=learning_rate)
#     elif optimizer_choice == 'nadam':
#         optimizer = optimizers.Nadam(learning_rate=learning_rate)
#     else:
#         optimizer = optimizers.Adamax(learning_rate=learning_rate)

#     model.compile(optimizer=optimizer, loss='mse') # TODO RMSLE on parempi metriikka! huomenna se testiin!
#     return model

# # Callbacks määritelty
# callbacks = [    
#     ReduceLROnPlateau(monitor='val_loss', factor=0.7, patience=5, min_lr=1e-6, verbose=1),
#     TerminateOnNaN()
# ]

# # Käytetään Hyperband-tuneria


# kf = KFold(n_splits=5, random_state=42)
# fold = 0
# for train_index, val_index in kf.split(X_train):


#     X_train_b, X_val_b = X_train[train_index], X_train[val_index]    
#     y_train_b, y_val_b = y_train[train_index], y_train[val_index]
#     y_train_b = tf.data.Dataset.from_tensor_slices(y_train_b)
#     X_train_b = tf.data.Dataset.from_tensor_slices(X_train_b)
#     train_dataset = tf.data.Dataset.zip((X_train_b, y_train_b)).batch(64)
#     train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

    
#     tuner = Hyperband(
#     build_model,
#     objective='val_loss',
#     max_epochs=300,
#     factor=3,
#     directory='NN_search',
#     project_name=f'kt_hyperband_fold_{fold}',
#     # overwrite=True, # Otetaan pois niin saan vanhat tulokset mukaan optimointiin
#     hyperband_iterations=3
#     )
    
#     tuner.search(train_dataset, validation_data=(X_val_b, y_val_b), callbacks=callbacks, verbose=2)

#     best_model = tuner.get_best_models(num_models=1)[0]
#     models_hyperband.append(best_model)
#     best_hyperparameters_hyperband.append(tuner.get_best_hyperparameters(num_trials=1)[0])

#     # Evaluoi malli
#     predictions = best_model.predict(X_test)
#     mse_scores.append(mean_squared_error(y_test, predictions))
#     mae_scores.append(mean_absolute_error(y_test, predictions))
#     r2_scores.append(r2_score(y_test, predictions))
#     fold += 1




In [None]:
# # TODO lisää sns y ja predicted jakaumat x-y reunoille ja keskelle R2, MSE, MAE
# search_time_end = time.time()
# print(f"Hyperband search took {search_time_end - search_time_start} seconds")

# # Tulostetaan kaikki tulokset alkuun
# for idx, (mae, mse, r2) in enumerate(zip(mae_scores, mse_scores, r2_scores), start=1):
#     print(f"Model {idx} - MAE: {mae}, MSE: {mse}, R2: {r2}")

# # Käydään vielä eri mallit lävitse selvyyden vuoksi
# for idx, model in enumerate(models_hyperband):

#     predictions = model.predict(X_test)
#     plt.figure(figsize=(20, 10)) 
#     plt.scatter(y_test, predictions, edgecolors=(0, 0, 0))
#     plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)
#     plt.xlabel('Measured')  
#     plt.ylabel('Predicted') 
#     plt.title('Measured vs. Predicted Values')
#     plt.show()

#     print(f"Fold {idx} - MSE: {mse_scores[idx]}, MAE: {mae_scores[idx]}, R2: {r2_scores[idx]}")
#     hp = best_hyperparameters_hyperband[idx]
#     print(f"Best hyperparameters for model {i+1}:")
#     for key in hp.values:
#         print(f"{key}: {hp.get(key)}")
#     print("-" * 50)
#     model.summary()


In [None]:
# import xgboost
# from sklearn.model_selection import RandomizedSearchCV
# import matplotlib.pyplot as plt
# import numpy as np
# import pandas as pd
# from sklearn.metrics import mean_squared_error, mean_absolute_error
# from keras.models import Model

# # Kerätään ensin kaikkien mallien ominaisuusvektorit
# X_train_features_list = []
# X_test_features_list = []

# for model in models_hyperband:
#     feature_extractor = Model(inputs=model.inputs, outputs=model.layers[-2].output)
#     X_train_features = feature_extractor.predict(X_train)
#     X_test_features = feature_extractor.predict(X_test)
    
#     X_train_features_list.append(X_train_features)
#     X_test_features_list.append(X_test_features)

# # Yhdistetään ominaisuusvektorit
# X_train_combined = np.concatenate(X_train_features_list, axis=1)
# X_test_combined = np.concatenate(X_test_features_list, axis=1)

# xgb = xgboost.XGBRegressor(objective ='reg:squarederror')
# param_space = {
#     'n_estimators': np.arange(1, 500, 20),
#     'max_depth': np.arange(2, 11),
#     'learning_rate': [0.1, 0.01, 0.001],
#     'subsample': [0.8, 0.9, 1.0],
#     'colsample_bytree': [0.8, 0.9, 1.0],
#     'gamma': [0, 1, 5],
#     'reg_alpha': [0, 0.1, 0.5],
#     'reg_lambda': [1, 1.5, 2]
# }

# random_search = RandomizedSearchCV(
#     estimator=xgb,
#     param_distributions=param_space,
#     cv=5,
#     n_jobs=-2,
#     n_iter=100,
#     verbose=2
# )

# start_time = time.time()
# random_search.fit(X_train_combined, y_train)
# best_model = random_search.best_estimator_
# end_time = time.time()
# elapsed_time = end_time - start_time

# predictions = best_model.predict(X_test_combined)




In [None]:
# mse = mean_squared_error(y_test, predictions)
# mae = mean_absolute_error(y_test, predictions)
# r2 = r2_score(y_test, predictions)
# print(f"MAE: {mae}, MSE: {mse}, R2: {r2}")
# print(f"Random search took {elapsed_time} seconds")
# print(f"Feature shape: {X_train_combined.shape}")

# plt.figure(figsize=(20, 10)) 
# plt.scatter(y_test, predictions, edgecolors=(0, 0, 0))
# plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)
# plt.xlabel('Measured')  
# plt.ylabel('Predicted') 
# plt.title('Measured vs. Predicted Values NN Hyperband features')
# plt.show()

In [None]:
import tensorflow as tf
from keras_tuner import BayesianOptimization
from keras import regularizers, Sequential, layers, optimizers
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, TerminateOnNaN
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler 
from tensorflow import keras
import time
from keras import initializers
import matplotlib.pyplot as plt

# Wall-clock start of the Bayesian search; read again when results are reported.
search_time_start = time.time()

# Per-fold results: the best model found and its test-set metrics.
models_bayes = []
mse_scores, mae_scores, r2_scores = [], [], []

def build_model(hp):
    """Build and compile a dense regression network for one tuner trial.

    The architecture and optimizer are drawn from the tuner's hyperparameter
    object: 1-4 hidden Dense layers (each followed by Dropout), per-layer
    units, activation and L1/L2 regularisation, one kernel initializer shared
    by all hidden layers, and the optimizer with its learning rate.

    Parameters
    ----------
    hp
        Hyperparameter container supplied by the tuner; must provide
        ``Int``, ``Float`` and ``Choice``.

    Returns
    -------
    A compiled Keras model with a single linear output unit and MSE loss.
    The input width is read from the module-level ``X_train``.
    """
    network = keras.Sequential()
    network.add(layers.Input(shape=(X_train.shape[1],)))

    hidden_layer_count = hp.Int('num_layers', 1, 4)
    for layer_idx in range(hidden_layer_count):
        # Hyperparameters are requested in a fixed order per layer:
        # units, activation, l1, l2, initializer, dropout.
        units = hp.Int(f'units_{layer_idx}', min_value=8, max_value=512, step=16)
        activation = hp.Choice(
            f'activation_{layer_idx}',
            values=['relu', 'linear', 'selu', 'elu', 'sigmoid', 'tanh'],
        )
        l1_strength = hp.Float(f'l1_reg_{layer_idx}', min_value=1e-6, max_value=1e-1, sampling='log')
        l2_strength = hp.Float(f'l2_reg_{layer_idx}', min_value=1e-6, max_value=1e-1, sampling='log')
        # 'initializer' has no layer suffix, so every hidden layer shares it.
        initializer = hp.Choice(
            'initializer',
            values=['he_normal', 'glorot_uniform', 'lecun_normal', 'glorot_normal'],
        )
        network.add(layers.Dense(
            units=units,
            activation=activation,
            kernel_regularizer=regularizers.l1_l2(l1=l1_strength, l2=l2_strength),
            kernel_initializer=initializer,
        ))

        dropout_rate = hp.Float(f'dropout_{layer_idx}', min_value=0.0, max_value=0.5, step=0.05)
        network.add(layers.Dropout(dropout_rate))

    # Single linear unit for the regression output.
    network.add(layers.Dense(1, activation='linear'))

    optimizer_choice = hp.Choice('optimizer', values=['rmsprop', 'nadam', 'adamax', 'adam'])
    learning_rate = hp.Choice('learning_rate', values=[1.0, 1e-1, 1e-2, 1e-3, 1e-4])

    # Any choice not listed here (i.e. 'adamax') falls back to Adamax.
    optimizer_classes = {
        'rmsprop': optimizers.RMSprop,
        'nadam': optimizers.Nadam,
        'adam': optimizers.Adam,
    }
    optimizer_cls = optimizer_classes.get(optimizer_choice, optimizers.Adamax)

    network.compile(optimizer=optimizer_cls(learning_rate=learning_rate), loss='mse')
    return network

# Callbacks passed to every tuner trial.
early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    patience=10,  # Number of epochs with no improvement after which training will be stopped
    mode='min',
    verbose=1
)

reduce_lr_callback = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.8,  # New learning rate = learning rate * factor
    patience=5,  # Number of epochs with no improvement after which learning rate will be reduced
    min_lr=1e-6,  # Lower bound on the learning rate
    verbose=1
)

terminate_on_nan = TerminateOnNaN()

# Best hyperparameter set found for each fold, in fold order.
best_hyperparameters_bayes = []

kf = KFold(n_splits=5)

# Run one independent Bayesian search per cross-validation fold and keep the
# best model of each fold together with its test-set metrics.
for fold, (train_index, val_index) in enumerate(kf.split(X_train)):

    # Each fold gets its own tuner project directory under 'NN_search'.
    tuner = BayesianOptimization(
        build_model,
        objective='val_loss',
        max_trials=420,
        executions_per_trial=1,
        directory='NN_search',
        project_name=f'kt_bayesian_fold_{fold}',
        # overwrite=True,  # left disabled so that results of earlier runs are included
        max_consecutive_failed_trials=10,
        max_retries_per_trial=0,
    )

    X_train_b, X_val_b = X_train[train_index], X_train[val_index]
    y_train_b, y_val_b = y_train[train_index], y_train[val_index]

    # Batched (features, target) pipeline for the training part of the fold.
    train_dataset = (
        tf.data.Dataset.from_tensor_slices((X_train_b, y_train_b))
        .batch(64)
        .prefetch(tf.data.AUTOTUNE)
    )

    tuner.search(
        train_dataset,
        epochs=300,
        validation_data=(X_val_b, y_val_b),
        callbacks=[early_stopping_callback, reduce_lr_callback, terminate_on_nan],
        verbose=2
    )

    best_model = tuner.get_best_models(num_models=1)[0]
    models_bayes.append(best_model)
    best_hyperparameters_bayes.append(tuner.get_best_hyperparameters(num_trials=1)[0])

    # Evaluate the fold's best model on the held-out test split.
    predictions = best_model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    # Store the performance metrics
    mse_scores.append(mse)
    mae_scores.append(mae)
    r2_scores.append(r2)





In [None]:

# Elapsed wall-clock time since search_time_start was recorded.
search_time_end = time.time()
print(f"Bayesian search took {search_time_end - search_time_start} seconds")

# Print all results first
for idx, (mae, mse, r2) in enumerate(zip(mae_scores, mse_scores, r2_scores), start=1):
    print(f"Model {idx} - MAE: {mae}, MSE: {mse}, R2: {r2}")

# Then go through each model once more for clarity
for idx, model in enumerate(models_bayes):

    # Measured vs. predicted scatter; the dashed diagonal marks a perfect fit.
    predictions = model.predict(X_test)
    plt.figure(figsize=(20, 10))
    plt.scatter(y_test, predictions, edgecolors=(0, 0, 0))
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)
    plt.xlabel('Measured')
    plt.ylabel('Predicted')
    plt.title('Measured vs. Predicted Values')
    plt.show()

    print(f"Fold {idx} - MSE: {mse_scores[idx]}, MAE: {mae_scores[idx]}, R2: {r2_scores[idx]}")
    hp = best_hyperparameters_bayes[idx]
    # Number the model from this loop's own index rather than from a variable
    # left over by an unrelated earlier loop.
    print(f"Best hyperparameters for model {idx + 1}:")
    for key in hp.values:
        print(f"{key}: {hp.get(key)}")
    print("-" * 50)
    model.summary()

In [None]:
import xgboost
from sklearn.model_selection import RandomizedSearchCV
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
from keras.models import Model

# First collect the feature vectors produced by every fold model.
X_train_features_list = []
X_test_features_list = []

for model in models_bayes:
    # Truncate the network at its second-to-last layer, so the extractor
    # outputs the activations that feed the final output layer.
    feature_extractor = Model(inputs=model.inputs, outputs=model.layers[-2].output)
    X_train_features = feature_extractor.predict(X_train)
    X_test_features = feature_extractor.predict(X_test)

    X_train_features_list.append(X_train_features)
    X_test_features_list.append(X_test_features)

# Concatenate the per-model feature vectors column-wise.
X_train_combined = np.concatenate(X_train_features_list, axis=1)
X_test_combined = np.concatenate(X_test_features_list, axis=1)

xgb = xgboost.XGBRegressor(objective ='reg:squarederror')
param_space = {
    'n_estimators': np.arange(1, 500, 20),
    'max_depth': np.arange(2, 11),
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'gamma': [0, 1, 5],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2]
}

# No `scoring` is given, so candidates are ranked by the estimator's own
# score method.
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_space,
    cv=5,
    n_jobs=-2,
    n_iter=100,
    verbose=2,
    random_state=42,  # seed the candidate sampling so the search is reproducible
)

# Time the search only; `time` is imported in an earlier cell.
start_time = time.time()
random_search.fit(X_train_combined, y_train)
best_model = random_search.best_estimator_
end_time = time.time()
elapsed_time = end_time - start_time

predictions = best_model.predict(X_test_combined)




In [None]:
# Test-set metrics of the XGBoost model trained on the combined NN features.
r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
for report_line in (
    f"MAE: {mae}, MSE: {mse}, R2: {r2}",
    f"Random search took {elapsed_time} seconds",
    f"Feature shape: {X_train_combined.shape}",
):
    print(report_line)

# Measured vs. predicted scatter; the dashed diagonal marks a perfect fit.
diagonal_limits = [y_test.min(), y_test.max()]
plt.figure(figsize=(20, 10))
plt.scatter(y_test, predictions, edgecolors=(0, 0, 0))
plt.plot(diagonal_limits, diagonal_limits, 'k--', lw=4)
plt.xlabel('Measured')
plt.ylabel('Predicted')
plt.title('Measured vs. Predicted Values NN Bayesian features')
plt.show()