In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from lls import LLS
from train_test_split import TrainTestSplit

In [2]:
data = pd.read_csv('Inputs\HousePrice.csv')
data.head()

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
0,63,1,True,True,True,Shahran,1850000000.0,61666.67
1,60,1,True,True,True,Shahran,1850000000.0,61666.67
2,79,2,True,True,True,Pardis,550000000.0,18333.33
3,95,2,True,True,True,Shahrake Qods,902500000.0,30083.33
4,123,2,True,True,True,Shahrake Gharb,7000000000.0,233333.33


Preprocess data and update dollar price to july 2023.



In [3]:
data['Parking'].replace([True, False], [0, 1], inplace=True)
data['Warehouse'].replace([True, False], [0, 1], inplace=True)
data['Elevator'].replace([True, False], [0, 1], inplace=True)

In [4]:
data['Price(USD)'] = data['Price'] / 49000
data.head()

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
0,63,1,0,0,0,Shahran,1850000000.0,37755.102041
1,60,1,0,0,0,Shahran,1850000000.0,37755.102041
2,79,2,0,0,0,Pardis,550000000.0,11224.489796
3,95,2,0,0,0,Shahrake Qods,902500000.0,18418.367347
4,123,2,0,0,0,Shahrake Gharb,7000000000.0,142857.142857


Show the address of the 5 most expensive houses


In [5]:
price_sorted = data.sort_values(by=['Price'], ascending=False).reset_index(drop=True)
price_sorted.head(5)

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
0,420,4,0,0,0,Zaferanieh,92400000000.0,1885714.0
1,705,5,0,0,1,Abazar,91000000000.0,1857143.0
2,400,5,0,0,1,Lavasan,85000000000.0,1734694.0
3,680,5,0,0,1,Ekhtiarieh,81600000000.0,1665306.0
4,350,4,0,0,0,Niavaran,80500000000.0,1642857.0


Use all possible features for X_train

In [6]:
X = np.array(data[['Area', 'Room', 'Parking', 'Warehouse', 'Elevator']])
Y = np.array(data[['Price']])

In [7]:
data.drop('Address', axis=1, inplace=True)
data.head()

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Price,Price(USD)
0,63,1,0,0,0,1850000000.0,37755.102041
1,60,1,0,0,0,1850000000.0,37755.102041
2,79,2,0,0,0,550000000.0,11224.489796
3,95,2,0,0,0,902500000.0,18418.367347
4,123,2,0,0,0,7000000000.0,142857.142857


Split your dataset to train and test with train_test_split function

In [8]:
X_train, X_test, Y_train, Y_test = TrainTestSplit(X, Y, test_size=0.2, shuffle=True)



Fit the LLS model on your training dataset


In [9]:
lls = LLS()
lls.fit(X_train, Y_train)

array([[ 8.30411179e+07],
       [-1.10185793e+09],
       [-1.04203948e+09],
       [-1.54065292e+09],
       [-1.84244599e+09]])

Evaluate your model on your test dataset using MAE, MSE and RMSE loss functions. 

In [10]:
lls.evaluate(X_test, Y_test, 'mae')


28884444360.422688

In [11]:
lls.evaluate(X_test, Y_test, 'mse')

4.4636569080944755e+20

In [12]:
lls.evaluate(X_test, Y_test, 'rmse')

21127368288.77292

## Why the MSE metric is a very large number?
* Because the numbers of the differences have been raised to the power of 2 to get out of the negative state, but at the end, no root has been taken from them.

Compare your result with Scikit-Learn's results

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV 

In [14]:
lls_sk= LinearRegression()
lls_sk.fit(X_train,Y_train)

In [15]:
Y_pred = lls_sk.predict(X_test)
lls.evaluate_outer_model(Y_pred, Y_test, metric='mae')

2714691034.9825993

In [16]:
lls.evaluate_outer_model(Y_pred, Y_test, metric='mse')

2.964363840605792e+19

In [17]:
lls.evaluate_outer_model(Y_pred, Y_test, metric='rmse')

5444597175.738341

In [18]:
lls_sk= RidgeCV()
lls_sk.fit(X_train,Y_train)

In [19]:
Y_pred = lls_sk.predict(X_test)
lls.evaluate_outer_model(Y_pred, Y_test, metric='mae')

2711274484.027565

In [20]:
lls.evaluate_outer_model(Y_pred, Y_test, metric='mse')

2.9659570977522565e+19

In [21]:
lls.evaluate_outer_model(Y_pred, Y_test, metric='rmse')

5446060133.48389