In [None]:
import pandas as pd
import pgeocode

In [5]:
# Read the home price csv file
data_path = "../data/homegate_final_unprocessed.csv"

df = pd.read_csv(data_path, index_col=0)
df

Unnamed: 0,price,type,room_num,floor,area_m2,floors_num,year_built,last_refurbishment,address
0,"CHF1,235,000.–",Apartment,2.5,GF,138 m2,4.0,2015.0,2018.0,8914 Aeugst am Albis
1,"CHF650,000.–",Apartment,4.5,1,121 m2,,1987.0,2021.0,"Via Vecchio Canale 2, 6982 Agno"
2,"CHF1,062,900.–",Apartment,2.5,1,63 m2,,,,"Via Campione, 6816 Bissone"
3,"CHF1,125,500.–",Attic flat,2.5,3,63 m2,,,,"Via Campione, 6816 Bissone"
4,"CHF2,180,700.–",Apartment,3.5,2,125 m2,,,,"Via Campione, 6816 Bissone"
...,...,...,...,...,...,...,...,...,...
2181,"CHF695,000.–",Bifamiliar house,4.5,,144 m2,,2022.0,,1994 Aproz (Nendaz)
2182,"CHF695,000.–",Bifamiliar house,4.5,,144 m2,,2022.0,,1994 Aproz (Nendaz)
2183,"CHF695,000.–",Row house,4.5,,144 m2,,2022.0,,1994 Aproz (Nendaz)
2184,"CHF695,000.–",Bifamiliar house,4.5,,144 m2,,2022.0,,1994 Aproz (Nendaz)


---
The first method we will use on our newly created dataframe is `.info()`. Here we can get a first overview of the types of data we have, and if we have missing values.

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2186 entries, 0 to 2185
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   price               2186 non-null   object 
 1   type                2186 non-null   object 
 2   room_num            2072 non-null   float64
 3   floor               928 non-null    object 
 4   area_m2             2021 non-null   object 
 5   floors_num          713 non-null    float64
 6   year_built          1652 non-null   float64
 7   last_refurbishment  489 non-null    float64
 8   address             2186 non-null   object 
dtypes: float64(4), object(5)
memory usage: 170.8+ KB


---
The price column is formatted with alphanumeric values. In order to properly do data exploration, we need to treat this column as an integer (number), so let's clean this entries using a Regular Expression (regex) so it only keeps the digits

In [7]:
df["price"]

0       CHF1,235,000.–
1         CHF650,000.–
2       CHF1,062,900.–
3       CHF1,125,500.–
4       CHF2,180,700.–
             ...      
2181      CHF695,000.–
2182      CHF695,000.–
2183      CHF695,000.–
2184      CHF695,000.–
2185      CHF695,000.–
Name: price, Length: 2186, dtype: object

In [8]:
df["price"].sample(n=10).unique()

array(['CHF340,000.–', 'CHF1,040,000.–', 'CHF745,000.–', 'CHF1,890,000.–',
       'CHF525,000.–', 'CHF2,010,000.–', 'CHF680,000.–', 'CHF570,000.–',
       'CHF672,000.–', 'CHF4,590,000.–'], dtype=object)

In [9]:
df['price'] = df['price'].str.replace('[^0-9]', '', regex=True)
df.head()

Unnamed: 0,price,type,room_num,floor,area_m2,floors_num,year_built,last_refurbishment,address
0,1235000,Apartment,2.5,GF,138 m2,4.0,2015.0,2018.0,8914 Aeugst am Albis
1,650000,Apartment,4.5,1,121 m2,,1987.0,2021.0,"Via Vecchio Canale 2, 6982 Agno"
2,1062900,Apartment,2.5,1,63 m2,,,,"Via Campione, 6816 Bissone"
3,1125500,Attic flat,2.5,3,63 m2,,,,"Via Campione, 6816 Bissone"
4,2180700,Apartment,3.5,2,125 m2,,,,"Via Campione, 6816 Bissone"


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2186 entries, 0 to 2185
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   price               2186 non-null   object 
 1   type                2186 non-null   object 
 2   room_num            2072 non-null   float64
 3   floor               928 non-null    object 
 4   area_m2             2021 non-null   object 
 5   floors_num          713 non-null    float64
 6   year_built          1652 non-null   float64
 7   last_refurbishment  489 non-null    float64
 8   address             2186 non-null   object 
dtypes: float64(4), object(5)
memory usage: 170.8+ KB


---
Even though we stripped the values of non-numerical characters, we still need to convert the data type so it can be interpreted as an integer. This way we can later take advantage of this for plotting and applying methods if needed.

In [11]:
df['price'] = df['price'].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2186 entries, 0 to 2185
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   price               2186 non-null   int32  
 1   type                2186 non-null   object 
 2   room_num            2072 non-null   float64
 3   floor               928 non-null    object 
 4   area_m2             2021 non-null   object 
 5   floors_num          713 non-null    float64
 6   year_built          1652 non-null   float64
 7   last_refurbishment  489 non-null    float64
 8   address             2186 non-null   object 
dtypes: float64(4), int32(1), object(4)
memory usage: 162.2+ KB


In [12]:
df['area_m2'] = df['area_m2'].str.split(' ').str[0]
df['area_m2'] = df['area_m2'].astype(float)
df.head(3)

Unnamed: 0,price,type,room_num,floor,area_m2,floors_num,year_built,last_refurbishment,address
0,1235000,Apartment,2.5,GF,138.0,4.0,2015.0,2018.0,8914 Aeugst am Albis
1,650000,Apartment,4.5,1,121.0,,1987.0,2021.0,"Via Vecchio Canale 2, 6982 Agno"
2,1062900,Apartment,2.5,1,63.0,,,,"Via Campione, 6816 Bissone"


In [13]:
print(f"Missing values in 'floors_num': {df['floors_num'].isna().sum()}")

Missing values in 'floors_num': 1473


In [16]:
# Most missing values in 'floors_num' are Ground Floor properties
df['floors_num'] = df['floors_num'].fillna(1).astype(int)
df.head()

Unnamed: 0,price,type,room_num,floor,area_m2,floors_num,year_built,last_refurbishment,address
0,1235000,Apartment,2.5,GF,138.0,4,2015.0,2018.0,8914 Aeugst am Albis
1,650000,Apartment,4.5,1,121.0,1,1987.0,2021.0,"Via Vecchio Canale 2, 6982 Agno"
2,1062900,Apartment,2.5,1,63.0,1,,,"Via Campione, 6816 Bissone"
3,1125500,Attic flat,2.5,3,63.0,1,,,"Via Campione, 6816 Bissone"
4,2180700,Apartment,3.5,2,125.0,1,,,"Via Campione, 6816 Bissone"


In [17]:
print(f"Missing values in 'floor': {df['floor'].isna().sum()}")

Missing values in 'floor': 1258


In [18]:
# Filling the missing values of the 'floor' column with "GF"
df['floor'] = df['floor'].fillna("GF")
df.tail()

Unnamed: 0,price,type,room_num,floor,area_m2,floors_num,year_built,last_refurbishment,address
2181,695000,Bifamiliar house,4.5,GF,144.0,1,2022.0,,1994 Aproz (Nendaz)
2182,695000,Bifamiliar house,4.5,GF,144.0,1,2022.0,,1994 Aproz (Nendaz)
2183,695000,Row house,4.5,GF,144.0,1,2022.0,,1994 Aproz (Nendaz)
2184,695000,Bifamiliar house,4.5,GF,144.0,1,2022.0,,1994 Aproz (Nendaz)
2185,695000,Bifamiliar house,4.5,GF,144.0,1,2022.0,,1994 Aproz (Nendaz)


In [19]:
print(f"Missing values in 'last_refurbishment': {df['last_refurbishment'].isna().sum()}")

Missing values in 'last_refurbishment': 1697


In [20]:
df["year_built"]

0       2015.0
1       1987.0
2          NaN
3          NaN
4          NaN
         ...  
2181    2022.0
2182    2022.0
2183    2022.0
2184    2022.0
2185    2022.0
Name: year_built, Length: 2186, dtype: float64

In [21]:
# Filling the missing values in the column 'last_refurbishment' with the values from the column 'year_built'
df['last_refurbishment'] = df['last_refurbishment'].fillna(df['year_built'])

df

Unnamed: 0,price,type,room_num,floor,area_m2,floors_num,year_built,last_refurbishment,address
0,1235000,Apartment,2.5,GF,138.0,4,2015.0,2018.0,8914 Aeugst am Albis
1,650000,Apartment,4.5,1,121.0,1,1987.0,2021.0,"Via Vecchio Canale 2, 6982 Agno"
2,1062900,Apartment,2.5,1,63.0,1,,,"Via Campione, 6816 Bissone"
3,1125500,Attic flat,2.5,3,63.0,1,,,"Via Campione, 6816 Bissone"
4,2180700,Apartment,3.5,2,125.0,1,,,"Via Campione, 6816 Bissone"
...,...,...,...,...,...,...,...,...,...
2181,695000,Bifamiliar house,4.5,GF,144.0,1,2022.0,2022.0,1994 Aproz (Nendaz)
2182,695000,Bifamiliar house,4.5,GF,144.0,1,2022.0,2022.0,1994 Aproz (Nendaz)
2183,695000,Row house,4.5,GF,144.0,1,2022.0,2022.0,1994 Aproz (Nendaz)
2184,695000,Bifamiliar house,4.5,GF,144.0,1,2022.0,2022.0,1994 Aproz (Nendaz)


In [22]:
# Getting the price per square meter
df = df[df["area_m2"].notna()].copy()
df["price_sqm"] = df['price'] / df['area_m2']
df["price_sqm"] = df["price_sqm"].astype(int)
print(df.shape)
df.head()

(2021, 10)


Unnamed: 0,price,type,room_num,floor,area_m2,floors_num,year_built,last_refurbishment,address,price_sqm
0,1235000,Apartment,2.5,GF,138.0,4,2015.0,2018.0,8914 Aeugst am Albis,8949
1,650000,Apartment,4.5,1,121.0,1,1987.0,2021.0,"Via Vecchio Canale 2, 6982 Agno",5371
2,1062900,Apartment,2.5,1,63.0,1,,,"Via Campione, 6816 Bissone",16871
3,1125500,Attic flat,2.5,3,63.0,1,,,"Via Campione, 6816 Bissone",17865
4,2180700,Apartment,3.5,2,125.0,1,,,"Via Campione, 6816 Bissone",17445


In [23]:
for address in df['address'].sample(n=20).unique():
  print(address)

Via Nosetto, 6987 Caslano
St. Johannserstrasse, 5312 Döttingen
6612 Ascona
2942 Alle
Nordstrasse 1, 8620 Wetzikon ZH
Seestrasse 43, 6354 Vitznau
6781 Bedretto
Kohlplatzweg 7, 4310 Rheinfelden
6883 Novazzano
8050 Zürich
6814 Cadempino
6596 Gordola
Via S. Gottardo, 6900 Lugano
rue auguste cuenin 10, 2900 Porrentruy
8461 Kleinandelfingen
6748 Anzonico
Via poncione e Vespero 34, 6780 Airolo
Via Trevano, 6900 Lugano
Pulvermühleweg 6, 6010 Kriens
1950 Sion


In [24]:
def extract_zip_city(address):
    if ',' in address:
        zip_and_city = address.split(', ')[1]
        zip_code = zip_and_city.split(' ')[0]
        city = zip_and_city.split(' ')[1]
    else:
        zip_and_city = address
        zip_code = zip_and_city.split(' ')[0]
        city = zip_and_city.split(' ')[1]
    return pd.Series([zip_code, city])

df[['zip_code', 'city']] = df['address'].apply(extract_zip_city)
df

Unnamed: 0,price,type,room_num,floor,area_m2,floors_num,year_built,last_refurbishment,address,price_sqm,zip_code,city
0,1235000,Apartment,2.5,GF,138.0,4,2015.0,2018.0,8914 Aeugst am Albis,8949,8914,Aeugst
1,650000,Apartment,4.5,1,121.0,1,1987.0,2021.0,"Via Vecchio Canale 2, 6982 Agno",5371,6982,Agno
2,1062900,Apartment,2.5,1,63.0,1,,,"Via Campione, 6816 Bissone",16871,6816,Bissone
3,1125500,Attic flat,2.5,3,63.0,1,,,"Via Campione, 6816 Bissone",17865,6816,Bissone
4,2180700,Apartment,3.5,2,125.0,1,,,"Via Campione, 6816 Bissone",17445,6816,Bissone
...,...,...,...,...,...,...,...,...,...,...,...,...
2181,695000,Bifamiliar house,4.5,GF,144.0,1,2022.0,2022.0,1994 Aproz (Nendaz),4826,1994,Aproz
2182,695000,Bifamiliar house,4.5,GF,144.0,1,2022.0,2022.0,1994 Aproz (Nendaz),4826,1994,Aproz
2183,695000,Row house,4.5,GF,144.0,1,2022.0,2022.0,1994 Aproz (Nendaz),4826,1994,Aproz
2184,695000,Bifamiliar house,4.5,GF,144.0,1,2022.0,2022.0,1994 Aproz (Nendaz),4826,1994,Aproz


In [25]:
pgeocode_nomi = pgeocode.Nominatim('ch')
pgeocode_nomi.query_postal_code("8600")

postal_code                         8600
country_code                          CH
place_name        Dübendorf, Dübendorf 1
state_name                 Kanton Zürich
state_code                            ZH
county_name                 Bezirk Uster
county_code                          109
community_name                 Dübendorf
community_code                       191
latitude                         47.3921
longitude                         8.6187
accuracy                             4.0
Name: 0, dtype: object

In [26]:
def add_canton(zip_code):
    zip_info = pgeocode_nomi.query_postal_code(zip_code)
    return zip_info["state_name"]

df["canton"] = df["zip_code"].apply(add_canton)
df.head(3)

Unnamed: 0,price,type,room_num,floor,area_m2,floors_num,year_built,last_refurbishment,address,price_sqm,zip_code,city,canton
0,1235000,Apartment,2.5,GF,138.0,4,2015.0,2018.0,8914 Aeugst am Albis,8949,8914,Aeugst,Kanton Zürich
1,650000,Apartment,4.5,1,121.0,1,1987.0,2021.0,"Via Vecchio Canale 2, 6982 Agno",5371,6982,Agno,Ticino
2,1062900,Apartment,2.5,1,63.0,1,,,"Via Campione, 6816 Bissone",16871,6816,Bissone,Ticino


In [27]:
def add_coordinates(zip_code): 
    zip_info = pgeocode_nomi.query_postal_code(zip_code)
    # Assign two variables called 'latitude' and 'longitude' with the corresponding keys from the 'zip_info' data
    latitude = zip_info["latitude"]
    longitude = zip_info["longitude"]
    return pd.Series([latitude, longitude])

df[["lat", "lon"]] = df["zip_code"].apply(add_coordinates)
df.head(3)

Unnamed: 0,price,type,room_num,floor,area_m2,floors_num,year_built,last_refurbishment,address,price_sqm,zip_code,city,canton,lat,lon
0,1235000,Apartment,2.5,GF,138.0,4,2015.0,2018.0,8914 Aeugst am Albis,8949,8914,Aeugst,Kanton Zürich,47.27515,8.48965
1,650000,Apartment,4.5,1,121.0,1,1987.0,2021.0,"Via Vecchio Canale 2, 6982 Agno",5371,6982,Agno,Ticino,46.0005,8.9028
2,1062900,Apartment,2.5,1,63.0,1,,,"Via Campione, 6816 Bissone",16871,6816,Bissone,Ticino,45.951,8.9655


In [28]:
df = df.drop('address', axis=1)

In [29]:
df

Unnamed: 0,price,type,room_num,floor,area_m2,floors_num,year_built,last_refurbishment,price_sqm,zip_code,city,canton,lat,lon
0,1235000,Apartment,2.5,GF,138.0,4,2015.0,2018.0,8949,8914,Aeugst,Kanton Zürich,47.27515,8.48965
1,650000,Apartment,4.5,1,121.0,1,1987.0,2021.0,5371,6982,Agno,Ticino,46.00050,8.90280
2,1062900,Apartment,2.5,1,63.0,1,,,16871,6816,Bissone,Ticino,45.95100,8.96550
3,1125500,Attic flat,2.5,3,63.0,1,,,17865,6816,Bissone,Ticino,45.95100,8.96550
4,2180700,Apartment,3.5,2,125.0,1,,,17445,6816,Bissone,Ticino,45.95100,8.96550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2181,695000,Bifamiliar house,4.5,GF,144.0,1,2022.0,2022.0,4826,1994,Aproz,Canton du Valais,46.20380,7.30700
2182,695000,Bifamiliar house,4.5,GF,144.0,1,2022.0,2022.0,4826,1994,Aproz,Canton du Valais,46.20380,7.30700
2183,695000,Row house,4.5,GF,144.0,1,2022.0,2022.0,4826,1994,Aproz,Canton du Valais,46.20380,7.30700
2184,695000,Bifamiliar house,4.5,GF,144.0,1,2022.0,2022.0,4826,1994,Aproz,Canton du Valais,46.20380,7.30700


In [39]:
df = df[df['price']< 5000000].reset_index(drop=True)
#df = df[~df["year_built"].isna()]
df.drop(['lat', 'lon', 'price_sqm'], axis=1, inplace=True, errors='ignore')
df

Unnamed: 0,price,type,room_num,floor,area_m2,floors_num,year_built,last_refurbishment,zip_code,city,canton
0,1235000,Apartment,2.5,GF,138.0,4,2015.0,2018.0,8914,Aeugst,Kanton Zürich
1,650000,Apartment,4.5,1,121.0,1,1987.0,2021.0,6982,Agno,Ticino
2,1062900,Apartment,2.5,1,63.0,1,,,6816,Bissone,Ticino
3,1125500,Attic flat,2.5,3,63.0,1,,,6816,Bissone,Ticino
4,2180700,Apartment,3.5,2,125.0,1,,,6816,Bissone,Ticino
...,...,...,...,...,...,...,...,...,...,...,...
1928,695000,Bifamiliar house,4.5,GF,144.0,1,2022.0,2022.0,1994,Aproz,Canton du Valais
1929,695000,Bifamiliar house,4.5,GF,144.0,1,2022.0,2022.0,1994,Aproz,Canton du Valais
1930,695000,Row house,4.5,GF,144.0,1,2022.0,2022.0,1994,Aproz,Canton du Valais
1931,695000,Bifamiliar house,4.5,GF,144.0,1,2022.0,2022.0,1994,Aproz,Canton du Valais


In [40]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
import joblib
from interpret import show
from interpret.glassbox import ExplainableBoostingRegressor
import time
from tensorflow.python.keras.losses import mean_absolute_error

In [None]:
# TensorBoard setup
log_dir = "logs/fit/" + time.strftime("%Y-%m-%d_%H-%M-%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# Preprocess data
X = df.drop(columns=['price'])
y = df['price']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define preprocessing for numeric features (standard scaling and imputation)
numeric_features = ['room_num', 'area_m2', 'floors_num', 'year_built', 'last_refurbishment']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Define preprocessing for categorical features (one-hot encoding)
categorical_features = ['type', 'floor', 'zip_code', 'city', 'canton']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessing for numeric and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Fit and transform the data
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Convert sparse matrix to dense matrix if needed
if hasattr(X_train_transformed, 'toarray'):
    X_train_transformed = X_train_transformed.toarray()
    X_test_transformed = X_test_transformed.toarray()

# Manually create feature names
feature_names = numeric_features.copy()
for cat_feature in categorical_features:
    n_unique_values = len(df[cat_feature].unique())
    feature_names.extend([f"{cat_feature}_{i}" for i in range(n_unique_values)])

print(f"X_train_transformed shape: {X_train_transformed.shape}")

# Build a neural network model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(units=X_train_transformed.shape[1]),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1)
])

# Compile the model
model.compile(optimizer='adam', loss='mean_absolute_error', metrics=['mean_absolute_error'])

# Train the neural network model and log the metrics
start_time = time.time()
history = model.fit(X_train_transformed, y_train, epochs=100, validation_split=0.2, callbacks=[tensorboard_callback])
end_time = time.time()
training_time = end_time - start_time
print(f"Time taken to train neural network model: {training_time} seconds")

# Evaluate the model on test data
y_pred = model.predict(X_test_transformed)
test_mae = mean_absolute_error(y_test, y_pred)

# Save the trained model and preprocessor
model.save('../web/rf_model.h5')
joblib.dump(preprocessor, '../web/preprocessor.joblib')

# Interpret ML feature engineering
ebm = ExplainableBoostingRegressor(random_state=42)

# Measure the time taken to fit the EBM model
start_time = time.time()
ebm.fit(X_train_transformed, y_train)
end_time = time.time()
ebm_training_time = end_time - start_time
print(f"Time taken to fit EBM model: {ebm_training_time} seconds")

# Assign the extracted feature names to the EBM model
ebm.feature_names = feature_names

# Measure the time taken to get the global explanation
start_time = time.time()
ebm_global = ebm.explain_global(name='EBM')
end_time = time.time()
global_explanation_time = end_time - start_time
print(f"Time taken to generate global explanation: {global_explanation_time} seconds")

# Show the global explanation
show(ebm_global)

X_train_transformed shape: (1353, 1106)
Epoch 1/100
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 27ms/step - loss: 1297441.7500 - mean_absolute_error: 1297441.7500 - val_loss: 1263358.2500 - val_mean_absolute_error: 1263358.2500
Epoch 2/100
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - loss: 1266639.3750 - mean_absolute_error: 1266639.3750 - val_loss: 1256022.6250 - val_mean_absolute_error: 1256022.6250
Epoch 3/100
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - loss: 1262221.3750 - mean_absolute_error: 1262221.3750 - val_loss: 1124633.6250 - val_mean_absolute_error: 1124633.5000
Epoch 4/100
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - loss: 1038081.4375 - mean_absolute_error: 1038081.4375 - val_loss: 665691.6875 - val_mean_absolute_error: 665691.6875
Epoch 5/100
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 41ms/step - loss: 682762.3750 - mean_absolute_error: 6



In [None]:
show(ebm_global)

In [None]:
pred = model.predict(X_test)

print('MAE', metrics.mean_absolute_error(y_test, pred))
print('R2 Score', metrics.r2_score(y_test, pred))

In [None]:
show(ebm.explain_local(X_test[5:], y_test[5:]), 0)

In [None]:
# Test for prediction
target_property = {
    'type' : ['Apartment'],
    'room_num' : [2.5],
    'floor' : ["2"],
    'area_m2' : [80],
    'floors_num' : [1],
    'year_built' : [1990],
    'last_refurbishment' : [2002],
    'zip_code' : ["8004"],
    'city' : ["Zürich"],
    'canton' : ["Kanton Zürich"],
}

to_predict = pd.DataFrame(target_property)

In [None]:
to_predict = preprocessor.transform(to_predict)
pred = model.predict(to_predict)
print(f"The value of the property using the trained machine learning algorithm is of {pred[0]} CHF")