In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler

import pandas as pd
import numpy as np
import os
import time

In [2]:
YEAR = "2017"
TYPE = "house"
USE_LOG = True
USE_SS = True

# Import dataframe

In [3]:
df = pd.read_csv(f"real_estate/real_estate_{TYPE}{YEAR}.csv")

In [4]:
def split_train_data(df):
    df_train_x = df[~df["city"].isin(["saint nabord", "clermont ferrand", "la grande"])]
    df_train_x.drop(columns=["city"], inplace=True)
    df_train_y = df_train_x[["value_euros"]]
    df_train_x.drop(columns="value_euros", inplace=True)
    return df_train_x, df_train_y

def split_test_data(df):
    df_test_x = df[df["city"].isin(["saint nabord", "clermont ferrand", "la grande"])]
    df_test_x.drop(columns=["city"], inplace=True)
    df_test_y = df_test_x[["value_euros"]]
    df_test_x.drop(columns="value_euros", inplace=True)
    return df_test_x, df_test_y

In [5]:
df_train_x, df_train_y = split_train_data(df)
df_test_x, df_test_y = split_test_data(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [6]:
if USE_LOG:
    df_train_y = np.log(df_train_y)
    df_test_y = np.log(df_test_y)

In [7]:
if USE_SS:
    ss = StandardScaler()
    ss.fit(df_train_x)

def normalize(x):
    if USE_SS:
        return ss.transform(x)
    else:
        return x

# Train model

In [8]:
lr = LinearRegression()
lr.fit(normalize(df_train_x), df_train_y);

In [9]:
df_test_z = lr.predict(normalize(df_test_x))

In [10]:
r2score = r2_score(df_test_y, df_test_z)
print(r2score)

0.08577256435244662


In [11]:
mse = mean_squared_error(df_test_y, df_test_z, squared=False)
print(mse)

0.4348038274014522


# Export results

In [12]:
if not os.path.exists("results"):
    os.mkdir("results")

In [23]:
with open(f"results/{YEAR}_{TYPE}_{time.time()}.txt", "w") as f:
    f.write(f"USE_LOG\t{USE_LOG}\n")
    f.write(f"USE_SS\t{USE_SS}\n")
    f.write(f"COLUMNS\t{df_train_x.columns.values}\n")
    f.write(f"COEFS\t{lr.coef_[0]}\n")
    f.write(f"INTERCEPT\t{lr.intercept_[0]}\n")
    f.write(f"R2\t{r2score}\n")
    f.write(f"MSE\t{mse}\n")