In [28]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns 
from typing import List
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score


df = pd.read_csv("../data/cleaned_data.csv")

df_studio = df[df["building_subtype"] == "flat-studio"].copy()
df_studio["bedrooms"].replace({np.nan: 1, 2: 1, 5 : 1}, inplace=True)
df.drop(df[df["building_subtype"] == "flat-studio"].index, inplace=True)
df = pd.concat([df, df_studio])
df = df.dropna(subset=["living_area", "price", "bedrooms"])

df['log_price'] = df['price'].apply(np.log10)

imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
imputed_columns = ['status_build', 'frontages', 'kitchen_type', 'bathrooms']
imputer = imputer.fit(df[imputed_columns])
df[imputed_columns] = imputer.transform(df[imputed_columns])

def remove_outliers(df: pd.DataFrame, columns: List[str], n_std: int) -> pd.DataFrame:
    for col in columns:
        mean = df[col].mean()
        sd = df[col].std()

        df = df[(df[col] <= mean+(n_std*sd))]

    return df
df = remove_outliers(df, ['living_area','log_price','bedrooms','terrace',
                          'garden', 'status_build', 'frontages', 'kitchen_type', 
                          'bathrooms'], 2)

df = df.drop(columns = ["price", "liv_room_surf", "id", "surroundings", 
                        "flood_zone", "showers", "toilets", "heating"])
df["swimming_pool"] = df["swimming_pool"].fillna(0)
df["basement"] = df["basement"].fillna(0)
df["elevator"] = df["elevator"].fillna(0)
df["furnished"] = df["furnished"].fillna(0)

df_zipcode = pd.get_dummies(df[['zipcode']])

X = pd.concat([df[["living_area", "bedrooms", "terrace", "garden", "swimming_pool", 
                   "energy_class",'status_build', 'frontages', 'kitchen_type', 'basement',
                   'elevator', 'furnished', 'bathrooms']], df_zipcode], axis=1)
y = df["log_price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

regr = linear_model.LinearRegression()
aaa = regr.fit(X_train, y_train)
score = regr.score(X_test, y_test)
print("Model Score:", score)

y_pred = regr.predict(X_test)
r2 = r2_score(y_pred, y_test)
print("r2: ",r2)

y_pred_full = 10**y_pred
y_test_full = 10**y_test

df_preds = pd.DataFrame({'Actual': y_test_full.squeeze(), 'Predicted': y_pred_full.squeeze()})
print(df_preds)


Model Score: 0.18020698701030669
r2:  -3.0639826242446846
         Actual      Predicted
16187  265000.0  134438.460490
10475  260000.0  174129.533929
4919   135000.0  165587.252555
6382   159000.0  187370.817685
18925  149000.0  194592.238259
...         ...            ...
19741  199999.0  199343.125707
25912  250000.0  197010.955297
29727  204000.0  189835.415422
24550  235000.0  219574.816140
2240   255000.0  172299.215136

[4632 rows x 2 columns]
