In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import train_test_split


# Load the cleaned listings data.
df = pd.read_csv("../data/cleaned_data.csv")

# Columns kept for the price model. "building_type" is selected here but is
# not included in the feature matrix X assembled further down.
selected_columns = [
    "price",
    "living_area",
    "building_type",
    "terrace",
    "garden",
    "swimming_pool",
    "energy_class",
    "municipality",
    "province",
]

# Build the modelling frame in one chain so that re-running this cell always
# starts from `df` and never writes into a slice of it (item assignment on a
# derived frame can raise SettingWithCopyWarning on older pandas versions).
df_selection = (
    df[selected_columns]
    # Rows without a living area or a price cannot be used for training.
    .dropna(subset=["living_area", "price"])
    # log10 is only finite for strictly positive prices; a zero or negative
    # price would put -inf/NaN into the regression target.
    .loc[lambda frame: frame["price"] > 0]
    .assign(
        # The model is trained on log10(price) instead of the raw price.
        log_price=lambda frame: np.log10(frame["price"]),
        # A missing swimming_pool value is filled with 0.
        swimming_pool=lambda frame: frame["swimming_pool"].fillna(0),
    )
    .drop(columns=["price"])
)


# One-hot encode the two location columns. Passing a one-column DataFrame
# (double brackets) makes get_dummies prefix each new column with the
# source column name.
municipality_frame = df_selection[['municipality']]
province_frame = df_selection[['province']]
df_municipality = pd.get_dummies(municipality_frame)
df_province = pd.get_dummies(province_frame)

# Columns taken from df_selection unchanged.
direct_feature_columns = [
    "living_area",
    "terrace",
    "garden",
    "swimming_pool",
    "energy_class",
]

# Feature matrix: direct columns followed by the municipality and province
# indicator columns, joined side by side on the shared row index.
feature_parts = [df_selection[direct_feature_columns], df_municipality, df_province]
X = pd.concat(feature_parts, axis=1)

# Regression target.
y = df_selection['log_price']

# Fixed seed so the train/test partition, and every number derived from it,
# is identical on each fresh run of the notebook.
RANDOM_STATE = 42

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Fit a linear regression on the training rows; the target y is the
# log_price column, so predictions are on the log10 scale too.
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

y_pred = regr.predict(X_test)

# Undo the log10 transform so actual and predicted values are comparable
# as prices.
y_pred_full = 10**y_pred
y_test_full = 10**y_test

# Side-by-side table of actual vs. predicted prices, indexed like y_test.
df_preds = pd.DataFrame({'Actual': y_test_full.squeeze(), 'Predicted': y_pred_full.squeeze()})
print(df_preds)



        Actual      Predicted
2623  280000.0  302125.964959
8250  189000.0  226220.767778
5382  299000.0  278089.996358
2946  185000.0  196622.051198
7994  210000.0  264109.705359
...        ...            ...
1155  298000.0  279643.664549
8993  435000.0  464871.678775
1132  229000.0  193194.317509
7353  229000.0  304396.425243
6632  695000.0  550432.984283

[1835 rows x 2 columns]


In [26]:
from sklearn import metrics

# Coefficient of determination on the held-out rows. Both arguments are on
# the log10(price) scale (y_test is the log target, y_pred the model output),
# so this score describes the fit in log space, not in raw prices.
r2_log_scale = metrics.r2_score(y_test, y_pred)
print('R²:', r2_log_scale)

R²: 0.7008663213816937
