In [192]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from lls import LLS
from train_test_split import TrainTestSplit

In [193]:
# Load the raw house-price table.
# A forward slash is used instead of '\' so the path is not parsed as an
# (invalid) '\H' escape sequence and resolves on Windows, Linux and macOS alike.
data = pd.read_csv('Inputs/HousePrice.csv')
data.head()

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
0,63,1,True,True,True,Shahran,1850000000.0,61666.67
1,60,1,True,True,True,Shahran,1850000000.0,61666.67
2,79,2,True,True,True,Pardis,550000000.0,18333.33
3,95,2,True,True,True,Shahrake Qods,902500000.0,30083.33
4,123,2,True,True,True,Shahrake Gharb,7000000000.0,233333.33


Preprocess the data and update the dollar price to the July 2023 exchange rate.



In [194]:
# Encode the boolean amenity flags as integers.
# The result is assigned back to the column instead of calling
# `data[col].replace(..., inplace=True)`: in-place methods on a selected column
# are deprecated chained assignment and do not update `data` under pandas
# Copy-on-Write.
# NOTE(review): the mapping True -> 0 / False -> 1 is kept exactly as the
# original cell had it (the stored outputs below depend on it), but it is the
# inverse of the usual 1 = "has feature" convention -- confirm it is intended.
# Assumes these columns contain only True/False; any other value becomes NaN.
for flag_col in ['Parking', 'Warehouse', 'Elevator']:
    data[flag_col] = data[flag_col].map({True: 0, False: 1})

In [195]:
# Recompute the dollar price from the 'Price' column with a fixed divisor of
# 49000 (presumably the July 2023 exchange rate mentioned in the text above).
usd_price = data['Price'].div(49000)
data['Price(USD)'] = usd_price
data.head()

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
0,63,1,0,0,0,Shahran,1850000000.0,37755.102041
1,60,1,0,0,0,Shahran,1850000000.0,37755.102041
2,79,2,0,0,0,Pardis,550000000.0,11224.489796
3,95,2,0,0,0,Shahrake Qods,902500000.0,18418.367347
4,123,2,0,0,0,Shahrake Gharb,7000000000.0,142857.142857


Show the address of the 5 most expensive houses


In [196]:
# Order the listings from most to least expensive with a fresh 0..n-1 index,
# then display the first five rows.
price_sorted = data.sort_values('Price', ascending=False, ignore_index=True)
price_sorted.head()

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
0,420,4,0,0,0,Zaferanieh,92400000000.0,1885714.0
1,705,5,0,0,1,Abazar,91000000000.0,1857143.0
2,400,5,0,0,1,Lavasan,85000000000.0,1734694.0
3,680,5,0,0,1,Ekhtiarieh,81600000000.0,1665306.0
4,350,4,0,0,0,Niavaran,80500000000.0,1642857.0


Use all possible features for X_train

In [197]:
# Feature matrix: every numeric column except the price columns.
feature_cols = ['Area', 'Room', 'Parking', 'Warehouse', 'Elevator']
# Target matrix: both price columns are modelled jointly (two outputs).
target_cols = ['Price', 'Price(USD)']

X = data[feature_cols].to_numpy(copy=True)
Y = data[target_cols].to_numpy(copy=True)

In [198]:
# Remove the free-text 'Address' column, which is not used as a feature.
# A new frame is bound (no inplace=True) and errors='ignore' is passed so that
# re-running this cell after the column is already gone does not raise KeyError.
data = data.drop(columns='Address', errors='ignore')
data.head()

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Price,Price(USD)
0,63,1,0,0,0,1850000000.0,37755.102041
1,60,1,0,0,0,1850000000.0,37755.102041
2,79,2,0,0,0,550000000.0,11224.489796
3,95,2,0,0,0,902500000.0,18418.367347
4,123,2,0,0,0,7000000000.0,142857.142857


Split your dataset into train and test sets with the train_test_split function

In [199]:
# Hold out 20% of the rows as a test set, shuffling before the split.
# NOTE(review): no seed is passed, so the partition (and every metric below)
# may differ between runs -- check whether TrainTestSplit accepts a seed.
split_parts = TrainTestSplit(X, Y, test_size=0.2, shuffle=True)
X_train, X_test, Y_train, Y_test = split_parts



Fit the LLS model on your training dataset


In [200]:
# Fit the custom linear-least-squares model on the training split and show
# whatever `fit` returns as the cell output.
lls = LLS()
fit_result = lls.fit(X_train, Y_train)
fit_result

array([[ 8.19498532e+07,  1.67244598e+03],
       [-1.04560809e+09, -2.13389406e+04],
       [-8.24472973e+08, -1.68259790e+04],
       [-1.56469394e+09, -3.19325295e+04],
       [-2.08529963e+09, -4.25571354e+04]])

Evaluate your model on your test dataset using MAE, MSE and RMSE loss functions. 

In [208]:
# Report each loss of the custom LLS model on the held-out test split.
# The label is derived from the metric name so the 'rmse' result is printed
# as RMSE (the original cell mislabelled it "RMAE").
for metric_name in ('mae', 'mse', 'rmse'):
    print(f'Evaluate {metric_name.upper()}:', lls.evaluate(X_test, Y_test, metric_name))

Evaluate MAE: 3574789572.306511
Evaluate MSE: 3.144273435298267e+19
Evaluate RMAE: 5607382130.101592


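## Why is the MSE metric such a large number?
* Because every prediction error is squared (which also removes its sign) and, unlike RMSE, no square root is taken at the end, so MSE is expressed in squared price units. With prices on the order of $10^9$, the errors are on the order of $10^9$ and their squares are on the order of $10^{19}$.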

Compare your result with Scikit-Learn's results

In [209]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV 
import sklearn.metrics as metrics

In [210]:
# Fit scikit-learn's ordinary least squares model on the same training split.
# NOTE(review): this rebinds `lls`, which previously held the custom LLS
# model; cells that call `lls.evaluate` will not work after this point
# unless the LLS cell is re-run.
lls = LinearRegression()
lls.fit(X=X_train, y=Y_train)

In [211]:
# Score the fitted scikit-learn model on the held-out test split.
# scikit-learn metrics take (y_true, y_pred) in that order; the original cell
# passed them reversed, which these symmetric metrics tolerate but others
# (e.g. r2_score) would not.
Y_pred = lls.predict(X_test)
mse = metrics.mean_squared_error(Y_test, Y_pred)  # computed once, reused for RMSE
print('Evaluate MAE:', metrics.mean_absolute_error(Y_test, Y_pred))
print('Evaluate MSE:', mse)
print('Evaluate RMSE:', np.sqrt(mse))  # label fixed: this is RMSE, not "RMAE"

Evaluate MAE: 2634305397.6035657
Evaluate MSE: 3.726660169728482e+19
Evaluate RMAE: 6104637720.396257


In [212]:
# Fit scikit-learn's cross-validated ridge regression on the same training split.
# NOTE(review): `lls` is rebound once more; from here on it refers to the
# RidgeCV estimator, not the earlier models.
lls = RidgeCV()
lls.fit(X=X_train, y=Y_train)

In [213]:
# Score the fitted scikit-learn model on the held-out test split.
# scikit-learn metrics take (y_true, y_pred) in that order; the original cell
# passed them reversed, which these symmetric metrics tolerate but others
# (e.g. r2_score) would not.
Y_pred = lls.predict(X_test)
mse = metrics.mean_squared_error(Y_test, Y_pred)  # computed once, reused for RMSE
print('Evaluate MAE:', metrics.mean_absolute_error(Y_test, Y_pred))
print('Evaluate MSE:', mse)
print('Evaluate RMSE:', np.sqrt(mse))  # label fixed: this is RMSE, not "RMAE"

Evaluate MAE: 2634304674.3762836
Evaluate MSE: 3.726656400118104e+19
Evaluate RMAE: 6104634632.898273
