In [2]:
## Importing libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import Lasso
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

In [4]:
## Data Cleaning
# Load the cleaned listings. No dummy encoding is done in this cell; any
# dummy columns used later must already be present in the CSV.
REFERENCE_YEAR = 2023  # year against which the construction year is converted to an age

sample_data = pd.read_csv('../EXAM/data/cleaned_data.csv',encoding='utf-16')
# read_csv already returns a DataFrame; work on a copy so `sample_data`
# keeps the frame exactly as it was loaded.
df = sample_data.copy()

# Change constructing year to construction age and squared term for OLS.
df['age'] = REFERENCE_YEAR - df['year']
df['age_squared'] = df['age'] ** 2

# Price per square metre.
# NOTE: this column is derived from the target (`price`), so it is a
# descriptive statistic only and must not be used as a feature for price.
df['price_per_m2'] = df['price'] / df['living_space']

# Mean price per square metre by area code. The result is bound to a name and
# displayed explicitly: a bare expression in the middle of a cell is
# evaluated and then thrown away.
price_per_m2_by_area = np.round(
    df[['price_per_m2','area_code']].groupby('area_code').mean(), 2
)
display(price_per_m2_by_area)

# Store the property type as a categorical column.
df['type'] = df['type'].astype('category')

print(df['type'])


Unnamed: 0,price,address,city,saledays,living_space,ground_space,rooms,owner_expenses,year,area_code,...,Energimærke A,Energimærke B,Energimærke C,Energimærke D,Energimærke E,Energimærke F,Energimærke G,age,age_squared,price_per_m2
0,1275000,Strandparken 46,Vest- og Sydsjælland,202.0,170.0,1019.0,2.0,2.598,1978.0,4591.0,...,0,0,0,1,0,0,0,45.0,2025.0,7500.000000
1,5095000,"Kalkbrænderihavnsgade 4A, 1. tv.",Byen København,0.0,94.0,0.0,3.0,2.389,2017.0,2100.0,...,1,0,0,0,0,0,0,6.0,36.0,54202.127660
2,14750000,Niels Andersens Vej 56,Københavns omegn,255.0,248.0,984.0,8.0,8.643,1941.0,2900.0,...,0,0,0,0,1,0,0,82.0,6724.0,59475.806452
3,4498000,"Helga Pedersens Gade 1, 2. 3.",Østjylland,7.0,107.0,0.0,3.0,3.849,2014.0,8000.0,...,0,1,0,0,0,0,0,9.0,81.0,42037.383178
4,2850000,Nøddevænget 20,Sydjylland,0.0,163.0,858.0,5.0,3.343,1965.0,7100.0,...,0,0,1,0,0,0,0,58.0,3364.0,17484.662577
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3645,1750000,"Prins Haralds Allé 101, 1.",Fyn,48.0,109.0,506.0,3.0,1.922,1935.0,5250.0,...,0,0,0,0,0,0,1,88.0,7744.0,16055.045872
3646,990000,Nissumvej 4,Vestjylland,48.0,117.0,800.0,3.0,1.566,1942.0,7620.0,...,0,0,1,0,0,0,0,81.0,6561.0,8461.538462
3647,1695000,Baunetoften 12,Nordsjælland,240.0,86.0,103.0,4.0,2.184,1990.0,3320.0,...,0,0,1,0,0,0,0,33.0,1089.0,19709.302326
3648,650000,Aalevej 40,Østjylland,48.0,79.0,829.0,3.0,1.464,1955.0,7160.0,...,0,0,0,1,0,0,0,68.0,4624.0,8227.848101


In [None]:
## Dataset split
# Columns excluded from the feature matrix.
# - 'price' is the target.
# - 'price_per_m2' is computed as price / living_space; leaving it in X next
#   to 'living_space' lets any model reconstruct the target (target leakage).
# - 'age_squared' is the squared term that was built for the OLS model.
# - the remaining names are excluded exactly as before.
columns_to_drop = ['price','address','city','type','energy', 'year','area_name', 'age_squared', 'price_per_m2']

# The frame preview shows an 'Unnamed: 0' column that simply counts rows
# (presumably an index written out with the CSV - TODO confirm). A row
# counter carries no information about price, so drop it when present.
row_id_columns = [col for col in ['Unnamed: 0'] if col in df.columns]

X,y = df.drop(columns=columns_to_drop + row_id_columns),df.price
test_size = 0.30 # taking 70:30 training and test set
seed = 17082023 # Random number seeding for repeatability of the code
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

In [None]:
## Main model
# `price` is a continuous target, so this is a regression problem. A
# classifier would treat every distinct price as its own class label, and
# classification_report would score exact price matches, so the regressor
# variant of histogram-based gradient boosting is used instead.
eta = 0.1  # learning rate passed to the booster
gradient_booster = HistGradientBoostingRegressor(learning_rate=eta, random_state=seed)
gradient_booster.fit(X_train,y_train)
y_test_predict = gradient_booster.predict(X_test)

# Regression metrics on the hold-out set: RMSE is in the same unit as
# `price`; R^2 is the share of the hold-out variance that is explained.
# np.sqrt(mse(...)) is used instead of `squared=False` because that keyword
# is not available in every scikit-learn release.
rmse_GBM = np.sqrt(mse(y_test, y_test_predict))
print(f"GBM test RMSE = {rmse_GBM:,.0f}")
print(f"GBM test R^2  = {r2_score(y_test, y_test_predict):.3f}")

In [None]:
## Lasso regularization
# Standardise the features. The scaler is fitted on the training split only
# and then applied to both splits, so no test-set statistics enter training.
norm_scaler = StandardScaler().fit(X_train)
X_train_std = norm_scaler.transform(X_train)
X_test_std = norm_scaler.transform(X_test)

# Fit one Lasso model per penalty strength on a log-spaced grid and record
# [lambda, train RMSE, test RMSE] for each of them.
output = []
lambdas = np.logspace(-4, 4, 20)
for lambda_ in lambdas:
    reg = Lasso(alpha=lambda_, random_state=1)
    reg.fit(X_train_std, y_train)
    output.append(
        [
            lambda_,
            # RMSE = sqrt(MSE). Computed explicitly because the
            # `squared=False` keyword of mean_squared_error is deprecated
            # and absent from recent scikit-learn releases. Arguments are
            # passed in the documented (y_true, y_pred) order.
            np.sqrt(mse(y_train, reg.predict(X_train_std))),
            np.sqrt(mse(y_test, reg.predict(X_test_std))),
        ]
    )

In [None]:
## Search for best lambda
# Each row of `output` is [lambda, train error, test error]. The errors are
# root mean squared errors; the historical "MSE ..." column labels are kept
# so that existing references to MSE_df keep working.
MSE_df = pd.DataFrame(
    data=output, columns=["lambda", "MSE train", "MSE test"]
).set_index("lambda")

# Plot with labels that say what the values really are (RMSE, not MSE).
ax = MSE_df.rename(
    columns={"MSE train": "RMSE train", "MSE test": "RMSE test"}
).plot(logx=True, logy=True)
ax.set_ylabel("RMSE")
ax.set_title("Lasso: RMSE as a function of lambda");

# find the minimal observations as a series
best_fit = MSE_df["MSE test"].nsmallest(1)

# take out the data minimum RMSE and the optimal lambda
# (read them from the index / values directly instead of calling next() on
# Series.items(), which only works while items() happens to be an iterator)
# NOTE: lambda is chosen on the test split here, so RMSE_min is an
# optimistic estimate of out-of-sample error.
lambda_opt, RMSE_min = best_fit.index[0], best_fit.iloc[0]
print(f"Minimum RMSE = {RMSE_min:.3f} found for lambda = {lambda_opt:.4f}.")


In [None]:
## 5-fold cross validation
# Split X, y into development (2/3) and test data (1/3).
# The hold-out part gets its own names so that the X_test / y_test of the
# earlier 70:30 split, which belong to `y_test_predict`, are not overwritten.
X_dev, X_cv_test, y_dev, y_cv_test = train_test_split(X, y, test_size=(1 / 3), random_state=1)

lambdas = np.logspace(-4, 4, 12)

kfolds = KFold(n_splits=5)
mses = []

for lambda_ in lambdas:

    # Fresh pipeline per penalty: cubic polynomial expansion, standardisation,
    # then Lasso. Scaling happens inside the pipeline, so it is re-fitted on
    # the training part of every fold.
    pipe_lasso = make_pipeline(
        PolynomialFeatures(degree=3, include_bias=False),
        StandardScaler(),
        Lasso(alpha=lambda_, random_state=1),
    )
    mses_test = []
    mses_train = []

    for train_idx, val_idx in kfolds.split(X_dev, y_dev):
        # KFold yields *positions*, so both X and y must be sliced with
        # .iloc. Plain y_dev[train_idx] is a label lookup on the shuffled
        # index left by train_test_split and would pick different rows than
        # X_dev.iloc[train_idx] (or raise KeyError for absent labels).
        X_fold_train, y_fold_train = X_dev.iloc[train_idx], y_dev.iloc[train_idx]
        X_val, y_val = X_dev.iloc[val_idx], y_dev.iloc[val_idx]

        pipe_lasso.fit(X_fold_train, y_fold_train)

        mses_train.append(mse(y_fold_train, pipe_lasso.predict(X_fold_train)))
        mses_test.append(mse(y_val, pipe_lasso.predict(X_val)))

    # Average over the five folds; "MSE_test" is the mean validation-fold MSE.
    mses.append([np.mean(mses_train), np.mean(mses_test), lambda_])

# Create df with MSE values
df_mses = pd.DataFrame(mses, columns=["MSE_train", "MSE_test", "lambda"])

# Index of the lambda that gives the lowest MSE_test in the dataframe
idx_optimal_lambda = df_mses["MSE_test"].idxmin()
lambda_opt_test = df_mses.loc[idx_optimal_lambda, "lambda"]
opt_test_mse = df_mses.loc[idx_optimal_lambda, "MSE_test"]
print(
    f"Lowest test MSE equal to {opt_test_mse:.4f} is"
    f" achieved with lambda = {lambda_opt_test:.3f}."
)

In [None]:
## Root mean squared errors
# rmse_GBM = mse(y_test,y_test_predict,squared=False)
# rmse_GBM

In [None]:
## R-squared
# `y_test_predict` was produced on the hold-out part of the 70:30 split
# (test_size / seed). A later cell re-splits the data and rebinds `y_test`
# to a hold-out set of a different size, so the target is re-derived here
# from the same deterministic split instead of trusting whatever `y_test`
# currently points to. Element [3] of the returned list is the test target.
y_test_GBM = train_test_split(X, y, test_size=test_size, random_state=seed)[3]
r2_GBM = r2_score(y_test_GBM, y_test_predict)
r2_GBM