In [None]:
import pandas as pd
import numpy as np
# Load the training DataFrame from a local pickle file (presumably produced by an
# earlier filtering step -- confirm where './data/df_train_filtered.pkl' is written).
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# when it comes from a trusted source.
df_train_filtered = pd.read_pickle('./data/df_train_filtered.pkl')


In [None]:
from sklearn.model_selection import train_test_split


# Because there are only a few distinct 'Talot.' values, combine them with
# 'Kaupunginosa' into one key and stratify the train/test split on that key.
combined_key = df_train_filtered['Kaupunginosa'].astype(str) + '-' + df_train_filtered['Talot.'].astype(str)
counts = combined_key.value_counts()
# Keys that occur only once are pooled into a shared 'other' bucket so that they
# can take part in the stratified split.
# NOTE(review): if exactly one row ends up in 'other', the stratified split will
# presumably reject that single-member class -- confirm against the data.
df_train_filtered['combined'] = combined_key.where(combined_key.map(counts) >= 2, 'other')

X = df_train_filtered.drop('Hinta', axis=1)
y = df_train_filtered['Hinta']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=df_train_filtered['combined'], random_state=42)
# 'combined' is only a stratification helper, not a model feature. Reassign
# instead of dropping in place so the split outputs are never mutated.
X_train = X_train.drop(columns='combined')
X_test = X_test.drop(columns='combined')







In [None]:
from sklearn.metrics import mean_squared_error, r2_score,  mean_absolute_error

def rmsle_score(y_true, y_pred):
    """Return the root mean squared logarithmic error (RMSLE).

    Computes ``sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2))`` over all
    elements of the two inputs.

    Args:
        y_true: Array-like of observed target values; each must be finite and
            greater than -1.
        y_pred: Array-like of predicted values; each must be finite and
            greater than -1.

    Returns:
        The RMSLE as a NumPy float.

    Raises:
        ValueError: If the inputs differ in size, or contain non-finite values
            or values <= -1 (where ``log1p`` is undefined).
    """
    true_values = np.asarray(y_true, dtype=float).ravel()
    pred_values = np.asarray(y_pred, dtype=float).ravel()
    if true_values.size != pred_values.size:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {true_values.size} and {pred_values.size}"
        )
    for label, values in (("y_true", true_values), ("y_pred", pred_values)):
        if not np.all(np.isfinite(values)) or np.any(values <= -1):
            raise ValueError(f"{label} must contain only finite values greater than -1")

    # log1p(x) already evaluates log(1 + x); adding another 1 beforehand would
    # measure log(x + 2) and no longer be the RMSLE.
    log_error = np.log1p(pred_values) - np.log1p(true_values)
    return np.sqrt(np.mean(np.square(log_error)))

In [None]:
from sklearn.linear_model import LinearRegression
# Baseline: ordinary least squares fitted on the training split and scored on
# the held-out test split.
lr = LinearRegression().fit(X_train, y_train)
predictions = lr.predict(X_test)

mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
lr_rmsle = rmsle_score(y_test, predictions)

# One metric per output line, each rounded to two decimals.
print(
    f"Mean squared error: {mse:.2f}\n"
    f"Mean absolute error: {mae:.2f}\n"
    f"R²-arvo: {r2:.2f}\n"
    f"RMSLE: {lr_rmsle:.2f}"
)


In [None]:
import matplotlib.pyplot as plt

# Measured vs. predicted scatter; the dashed diagonal marks perfect predictions.
fig, ax = plt.subplots(figsize=(20, 10))
lowest, highest = y_test.min(), y_test.max()
ax.scatter(y_test, predictions, edgecolors=(0, 0, 0))
ax.plot([lowest, highest], [lowest, highest], 'k--', linewidth=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
ax.set_title('Measured vs. Predicted Values')
plt.show()


In [None]:
import optuna
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import time

def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error with a penalty for non-positive predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted values.

    Returns:
        ``sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2))``, or the constant
        ``1e6`` when any prediction is less than or equal to zero.
    """
    has_non_positive = np.any(y_pred <= 0)
    if not has_non_positive:
        log_gap = np.log1p(y_pred) - np.log1p(y_true)
        return np.sqrt(np.mean(np.square(log_gap)))
    # Large fixed score returned instead of evaluating the logarithm.
    return 1e6

# Plain NumPy copies of the training split, indexed by position inside the
# cross-validation loop, plus the column names to label the DMatrix features.
X_train_alku, y_train_alku = np.array(X_train), np.array(y_train)
original_feature_names = [column_name for column_name in X_train.columns]

def objective(trial):
    """Optuna objective: mean 5-fold cross-validated RMSLE of an XGBoost booster.

    Samples the booster hyperparameters and the number of boosting rounds from
    ``trial``, trains one booster per fold on the NumPy training arrays
    (``X_train_alku`` / ``y_train_alku``) and scores it on the held-out fold.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The mean of the per-fold values returned by ``rmsle`` (lower is better).
    """
    params = {
        "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
        "max_depth": trial.suggest_int("max_depth", 1, 10),
        "lambda": trial.suggest_float("lambda", 1e-6, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-6, 1.0, log=True),
        "learning_rate": trial.suggest_float("learning_rate", 1e-2, 0.3),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 20),
    }
    # Sampled once per trial, after the booster parameters (same suggestion
    # order as before), instead of being re-requested inside every fold.
    num_boost_round = trial.suggest_int("num_boost_round", 1, 100)

    fold_scores = []
    kf = KFold(n_splits=5)

    # Split the same array that the fold indices are applied to.
    for train_index, val_index in kf.split(X_train_alku):
        X_train_k, X_val_k = X_train_alku[train_index], X_train_alku[val_index]
        y_train_k, y_val_k = y_train_alku[train_index], y_train_alku[val_index]

        dtrain = xgb.DMatrix(X_train_k, label=y_train_k, feature_names=original_feature_names)
        dval = xgb.DMatrix(X_val_k, label=y_val_k, feature_names=original_feature_names)

        # No early stopping: the previous patience of 300 rounds could never
        # trigger with at most 100 boosting rounds, and the final model is
        # trained for exactly `num_boost_round` rounds as well.
        bst = xgb.train(params, dtrain, num_boost_round=num_boost_round, verbose_eval=False)
        fold_scores.append(rmsle(y_val_k, bst.predict(dval)))

    return np.mean(fold_scores)

# Create the study in the SQLite storage, or load it when a study with this
# name already exists there.
study = optuna.create_study(
    study_name='0329_base',
    storage='sqlite:///tampere_reg.db',
    direction='minimize',
    load_if_exists=True,
)

# Two search phases on the same study: a QMC sampler first, then TPE.
# Each sampler is built only when its phase starts.
for phase_label, build_sampler, phase_trials in (
    ('Random...', lambda: optuna.samplers.QMCSampler(warn_independent_sampling=False), 1000),
    ('TPE...', optuna.samplers.TPESampler, 420),
):
    study.sampler = build_sampler()
    print(phase_label)
    study.optimize(objective, n_trials=phase_trials)

print(f'Best value is {study.best_value}')
print(f'Best parameters are {study.best_params}')

# Retrain the model with the best hyperparameters on the full training split.
best_params = study.best_trial.params
# 'num_boost_round' was tuned as a trial parameter, but it is an argument of
# xgb.train rather than a booster parameter, so keep it out of the params dict.
booster_params = {name: value for name, value in best_params.items() if name != 'num_boost_round'}
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=original_feature_names)
best_model_base = xgb.train(booster_params, dtrain, num_boost_round=best_params['num_boost_round'])

# Metrics on the data the model was fitted on (for comparison with the test metrics).
train_pred = best_model_base.predict(dtrain)
train_rmsle = rmsle(y_train, train_pred)
train_mse = mean_squared_error(y_train, train_pred)
train_mae = mean_absolute_error(y_train, train_pred)
train_r2 = r2_score(y_train, train_pred)
print(f'Train RMSLE: {train_rmsle}, Train MSE: {train_mse}, Train MAE: {train_mae}, Train R2: {train_r2}')

# Score the retrained booster on the held-out test split.
dtest = xgb.DMatrix(
    X_test,
    label=y_test,
    feature_names=original_feature_names,
)
predictions = best_model_base.predict(dtest)

mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
rmsle_val = rmsle(y_test, predictions)

print(f"MAE: {mae}, MSE: {mse}, RMSLE: {rmsle_val}, R2: {r2}")

# Measured vs. predicted scatter for the XGBoost model; the dashed diagonal
# marks perfect predictions.
fig_xgb, ax_xgb = plt.subplots(figsize=(20, 10))
measured_min, measured_max = y_test.min(), y_test.max()
ax_xgb.scatter(y_test, predictions, edgecolors=(0, 0, 0))
ax_xgb.plot([measured_min, measured_max], [measured_min, measured_max], 'k--', lw=4)
ax_xgb.set_xlabel('Measured')
ax_xgb.set_ylabel('Predicted')
ax_xgb.set_title('Measured vs. Predicted Values')
plt.show()

# One feature-importance chart per importance type. The axes are created
# explicitly and handed to plot_importance: without `ax` it draws on a new
# figure of its own, leaving the pre-sized figure empty and the size unused.
for importance_type in ('weight', 'gain', 'cover'):
    fig_importance, ax_importance = plt.subplots(figsize=(15, 30))
    xgb.plot_importance(
        best_model_base,
        ax=ax_importance,
        importance_type=importance_type,
        title=f'Feature Importance by {importance_type.capitalize()}',
    )
    plt.show()


