In [1]:
!pip install torchmetrics


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error as  mape
from sklearn.metrics import r2_score
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, Lasso, LassoCV

from torchmetrics import MeanSquaredLogError


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from preprocess import EstateData

dataset = EstateData("final.csv")

In [4]:
dataset.split_data(stratify=True)

In [5]:
dataset.preprocess(tukey=True)

Start Process!
Fill missing values: Done
Remove outlier: Done
Scale features: Done


In [6]:
train = dataset.train
test = dataset.test

In [7]:
print("train size {}".format(train.shape))
print("test size {}".format(test.shape))

train size (31658, 12)
test size (9278, 12)


In [8]:
train, dev = train_test_split(train, test_size=0.1)

In [9]:
train = pd.get_dummies(train)
dev = pd.get_dummies(dev)
test = pd.get_dummies(test)

In [10]:
features = train.columns.tolist()
#features.remove("Price")
features.remove("PricePerM2")

In [11]:
linear_regression = LinearRegression()
linear_regression.fit(train[features], train["PricePerM2"])

filename = 'checkpoint/linear_regression.model'
pickle.dump(linear_regression, open(filename, 'wb'))

In [12]:
linear_regression = pickle.load(open(filename, 'rb'))
pred = linear_regression.predict(dev[features])
print("mean_absolute_percentage_error",mape(dev["PricePerM2"], pred))
print("r2_score",r2_score(dev["PricePerM2"], pred))

mean_absolute_percentage_error 1.8945030949765724
r2_score 0.6319975597549148


In [13]:
pred = linear_regression.predict(test[features])
print("mean_absolute_percentage_error",mape(test["PricePerM2"], pred))
print("r2_score",r2_score(test["PricePerM2"], pred))

mean_absolute_percentage_error 2.091867109914237
r2_score 0.5424499140163639


In [14]:
lasso = Lasso(alpha=0.1)
lasso.fit(train[features], train["PricePerM2"])

filename = 'checkpoint/Lasso.model'
pickle.dump(lasso, open(filename, 'wb'))

In [15]:
lasso = pickle.load(open(filename, 'rb'))

pred = lasso.predict(dev[features])
print("mean_absolute_percentage_error",mape(dev["PricePerM2"], pred))
print("r2_score",r2_score(dev["PricePerM2"], pred))

mean_absolute_percentage_error 1.8885659830690604
r2_score 0.6321904773692899


In [16]:
pred = lasso.predict(test[features])
print("mean_absolute_percentage_error",mape(test["PricePerM2"], pred))
print("r2_score",r2_score(test["PricePerM2"], pred))

mean_absolute_percentage_error 2.0937684480963727
r2_score 0.5407466286033031


In [17]:
lassoCV = LassoCV(cv=5, random_state=0)
lassoCV.fit(train[features], train["PricePerM2"])


filename = 'checkpoint/LassoCV.model'
pickle.dump(lassoCV, open(filename, 'wb'))

In [18]:
lassoCV = pickle.load(open(filename, 'rb'))

pred = lassoCV.predict(dev[features])
print("mean_absolute_percentage_error",mape(dev["PricePerM2"], pred))
print("r2_score",r2_score(dev["PricePerM2"], pred))

mean_absolute_percentage_error 1.8851427054243843
r2_score 0.6321805565781


In [19]:
pred = lassoCV.predict(test[features])
print("mean_absolute_percentage_error",mape(test["PricePerM2"], pred))
print("r2_score", r2_score(test["PricePerM2"], pred))

mean_absolute_percentage_error 2.098051173052952
r2_score 0.5389310296294785
