<a href="https://colab.research.google.com/github/Volks44/ML_Lab/blob/main/Task_1_03_HousingPriceForecast02_class.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task 1: Housing Price Forecasting


# Import

In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

# Data understanding

## Reading the dataset (csv file) into Pandas dataframe

In [None]:
# Load the labeled housing dataset. read_csv already returns a DataFrame,
# so no extra pd.DataFrame(...) wrapper is needed.
housing_file_path = "DataSet_LakasArak_labeled.csv"
housing = pd.read_csv(housing_file_path)

## Exploring the dataframe

In [None]:
# Preview the first rows of the dataset to see the available columns and some sample values
housing.head()

Unnamed: 0,county,city,postcode,property_type,property_subtype,property_condition_type,property_floor,building_floor_count,view_type,orientation,...,room_cnt,small_room_cnt,created_at,property_area,balcony_area,price_created_at,ad_view_cnt,active_days,nr,split
0,Budapest,Budapest XII.,,flat,brick flat (for sale),good,1,,street view,,...,2.0,1.0,2015-02-09,65.0,0.0,23.5,605.0,119.0,4,test
1,Budapest,Budapest I.,1016.0,flat,brick flat (for sale),novel,2,,street view,,...,1.0,1.0,2015-02-09,45.0,0.0,20.0,49.0,25.0,12,train
2,Budapest,Budapest XVI.,1164.0,flat,brick flat (for sale),novel,1,,garden view,,...,2.0,1.0,2015-02-09,60.0,0.0,22.0,77.0,77.0,14,train
3,Budapest,Budapest X.,,flat,brick flat (for sale),good,4,,garden view,,...,2.0,0.0,2015-02-09,55.0,4.0,11.0,139.0,18.0,21,train
4,Budapest,Budapest XVIII.,1181.0,flat,prefabricated panel flat (for sale),renewed,6,,,,...,2.0,1.0,2015-02-09,60.0,3.0,10.2,176.0,69.0,31,test


In [None]:
# Dimensions of the full dataset as (rows, columns)
housing.shape

(78539, 23)

In [None]:
# housing.shape is (rows, columns): rows are the instances, columns are the features
num_instances, num_features = housing.shape

In [None]:
# Per-column non-null counts and dtypes, to spot missing values and non-numeric columns
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78539 entries, 0 to 78538
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   county                   78539 non-null  object 
 1   city                     77980 non-null  object 
 2   postcode                 49585 non-null  float64
 3   property_type            78539 non-null  object 
 4   property_subtype         76880 non-null  object 
 5   property_condition_type  78539 non-null  object 
 6   property_floor           74746 non-null  object 
 7   building_floor_count     36429 non-null  object 
 8   view_type                42878 non-null  object 
 9   orientation              47647 non-null  object 
 10  garden_access            17200 non-null  object 
 11  heating_type             67233 non-null  object 
 12  elevator_type            64388 non-null  object 
 13  room_cnt                 78539 non-null  float64
 14  small_room_cnt        

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
housing.describe()

Unnamed: 0,postcode,room_cnt,small_room_cnt,property_area,balcony_area,price_created_at,ad_view_cnt,active_days,nr
count,49585.0,78539.0,78539.0,78539.0,78539.0,78539.0,78539.0,78539.0,78539.0
mean,1103.35898,1.467666,0.559875,48.440584,1.953182,19.341475,259.59932,44.173684,196334.09324
std,50.769326,0.59984,0.737015,12.716653,4.677227,8.900296,512.351553,47.821006,113305.083861
min,1011.0,0.0,0.0,5.0,0.0,0.2,0.0,1.0,4.0
25%,1064.0,1.0,0.0,40.0,0.0,13.2,42.0,11.0,97959.0
50%,1101.0,1.0,0.0,50.0,0.0,16.9,103.0,28.0,196095.0
75%,1142.0,2.0,1.0,60.0,3.0,23.9,263.0,61.0,294516.5
max,1239.0,7.0,4.0,70.0,97.0,99.6,28096.0,537.0,394181.0


## Features and ground truth labels

In [None]:
# The task is to predict prices from the other information in the dataset,
# so the column that contains the prices is the ground truth label.
# 'price_created_at' holds that label; it is used for training and testing later.
gt_feature = 'price_created_at'

# Data preparation

In [None]:
# As we learned earlier, prepare the data for the next steps (e.g. train and test).
# You might use all the available features or part of them. Please justify your choices.
# Attention!!! Do not drop any N/A value.

In [None]:
# Input columns used for prediction, plus the column to be predicted
selected_features = [
    'room_cnt',
    'small_room_cnt',
    'property_area',
    'balcony_area',
]
target = 'price_created_at'

all_small_columns = [*selected_features, target]
print(all_small_columns)

# Narrow the frame down to just those columns and show its dimensions
housing_clean_small = housing[all_small_columns]
housing_clean_small.shape

['room_cnt', 'small_room_cnt', 'property_area', 'balcony_area', 'price_created_at']


(78539, 5)



---


#### Justification:
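I decided to keep the selected features because each of them has a value for every instance (78,539 non-null entries, so no N/A handling is needed) and all of them share the same numeric data type (float64), so they can be fed to the models directly.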



---



## Holding out a test set for performance evaluation

In [None]:
# 1- How much of the data is used for testing?
#    The rows are labeled beforehand in the 'split' column; 30% of the data is marked for testing.
# 2- How many instances do we have for training and testing? (see the shapes shown below)

train_set = housing.loc[housing['split'].eq('train')]
test_set = housing.loc[housing['split'].eq('test')]

(train_set.shape, test_set.shape)

((54977, 23), (23562, 23))

In [None]:
# Sanity-check the predefined split: the data must be complete (no instance
# dropped or counted twice) and the subset sizes must match the 70/30 proportions.
test_perc = 0.3
train_perc = 1 - test_perc

assert (len(train_set) + len(test_set)) == num_instances, \
    "train and test sets do not add up to the full dataset"

# Compare against the un-truncated expected sizes with a one-row tolerance.
# int(train_perc * num_instances) is fragile: float rounding can truncate to the
# wrong integer (e.g. 0.7 * 90 evaluates to 62.99999999999999, i.e. 62 instead of 63).
assert abs(len(train_set) - train_perc * num_instances) < 1, \
    "train set size does not match the expected proportion"
assert abs(len(test_set) - test_perc * num_instances) < 1, \
    "test set size does not match the expected proportion"

# Model selection

In [None]:
# After the data preparation/preprocessing step, the names of the selected features (as strings) are saved in a list of the form:
# features = [feature1, feature2, ...]

# Copy the whole list instead of indexing four fixed positions, so that adding or
# removing entries in selected_features cannot silently drop a feature or raise an IndexError.
features = list(selected_features)
features

['room_cnt', 'small_room_cnt', 'property_area', 'balcony_area']

In [None]:
# Build the feature set (X) and the ground truth set (y) for both splits, using 'features' and 'gt_feature'.

X_train, y_train = train_set[features], train_set[gt_feature]
X_test, y_test = test_set[features], test_set[gt_feature]

(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

((54977, 4), (54977,), (23562, 4), (23562,))

In [None]:
# For prediction, you should use the following function. As you can see, it is incomplete, please fill the gaps.
# The selected model has to learn the features in the data before giving an educated prediction. Why?
# We first fit the model using the train data, then use it to predict labels (prices) for the test instances. Why?

def model_predict(model, X_train, y_train, X_test):
    """Fit `model` on the training data and predict labels for the test data.

    Parameters
    ----------
    model : object exposing `fit(X, y)` and `predict(X)`.
    X_train, y_train : training features and their ground truth labels.
    X_test : features of the instances to predict.

    Returns
    -------
    tuple
        `(model, predictions)`: the same model object after fitting, and
        whatever `model.predict(X_test)` returned.
    """
    # The model has to learn from the training data before it can predict.
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return model, predictions

In [None]:
# Using the selected models, You can make the predictions using 'model_predict' function. Please save the returned values so we can check their performance.

### Model 1: Linear Regression

In [None]:
# Linear regression: fit on the training split, then predict the test split
model_1, pred_1 = model_predict(
    model=LinearRegression(),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
)

In [None]:
# Shape of the prediction array, to compare against the number of test instances
pred_1.shape

(23562,)

### Model 2: Random Forest Regressor

In [None]:
# A random forest draws random bootstrap samples while building its trees, so
# random_state is fixed to make the fit and the reported metrics reproducible across runs.
model_2 = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
model_2, pred_2 = model_predict(model_2, X_train, y_train, X_test)

### Model 3: Gradient Boosting Regressor

In [None]:
# random_state pins the estimator's internal randomness so that re-running the
# notebook reproduces the same model and the same metrics.
model_3 = GradientBoostingRegressor(n_estimators=100, max_depth=5, random_state=42)
model_3, pred_3 = model_predict(model_3, X_train, y_train, X_test)

# Evaluation

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
# For evaluation we use Mean Absolute Error (MAE), Mean Absolute Percentage Error (MAPE), and Root Mean Square Error (RMSE).
# Please complete the following function:

def model_evaluate(pred, target):
    """Score predictions against the ground truth labels.

    Parameters
    ----------
    pred : predicted values.
    target : ground truth values, element-wise aligned with `pred`.

    Returns
    -------
    tuple
        `(mae, mape, rmse)`: Mean Absolute Error, Mean Absolute Percentage
        Error (expressed in percent) and Root Mean Square Error.

    Notes
    -----
    MAPE divides by `target`, so it is not well-defined where a target value is zero.
    """
    return (
        mean_absolute_error(target, pred),
        np.mean(np.abs((target - pred) / target)) * 100,
        np.sqrt(mean_squared_error(target, pred)),
    )

In [None]:
# Check and compare the performance of all the models. Do you find any interesting observation(s)? What are your conclusion(s)?

### Model 1

In [None]:
# Test-set MAE, MAPE (%) and RMSE of Model 1
mae, mape, rmse = model_evaluate(pred=pred_1, target=y_test)
print(mae, mape, rmse)

5.776043449065582 32.46755572098473 7.890516253261829


### Model 2

In [None]:
# Test-set MAE, MAPE (%) and RMSE of Model 2
mae, mape, rmse = model_evaluate(pred=pred_2, target=y_test)
print(mae, mape, rmse)

5.5576627172914685 31.382341120500374 7.689748003287972


### Model 3


In [None]:
# Test-set MAE, MAPE (%) and RMSE of Model 3
mae, mape, rmse = model_evaluate(pred=pred_3, target=y_test)
print(mae, mape, rmse)

5.438976376819999 30.777499304414548 7.605688211412157


---
## Conclusion:
All the prediction models show an MAE between 5 and 6. Model 3 (Gradient Boosting Regressor) has the best MAPE, with an error of 30.77%, but the difference from the other models is small. I can conclude that the three models have similar performance in predicting the housing price.

## Extra models

### XGBoost

In [None]:
from xgboost import XGBRegressor

# Fit an XGBoost regressor on the training split, predict the test split,
# then report the test-set MAE, MAPE (%) and RMSE.
model_4, pred_4 = model_predict(
    XGBRegressor(n_estimators=100, max_depth=5),
    X_train,
    y_train,
    X_test,
)
mae, mape, rmse = model_evaluate(pred=pred_4, target=y_test)
print(mae, mape, rmse)

5.3899010089690185 30.43370834154423 7.56725508446498


### CatBoost

In [None]:
!pip install catboost --q
from catboost import CatBoostRegressor

model_5 = CatBoostRegressor(n_estimators=100, max_depth=5)
model_5, pred_5 = model_predict(model_5, X_train, y_train, X_test)
mae, mape, rmse = model_evaluate(pred_5, y_test)
print(mae, mape, rmse)

Learning rate set to 0.5
0:	learn: 7.9609445	total: 31.4ms	remaining: 3.11s
1:	learn: 7.6962551	total: 54.8ms	remaining: 2.69s
2:	learn: 7.5775298	total: 69ms	remaining: 2.23s
3:	learn: 7.5220184	total: 82ms	remaining: 1.97s
4:	learn: 7.5038992	total: 124ms	remaining: 2.36s
5:	learn: 7.4877730	total: 160ms	remaining: 2.51s
6:	learn: 7.4812169	total: 191ms	remaining: 2.54s
7:	learn: 7.4641103	total: 226ms	remaining: 2.6s
8:	learn: 7.4580627	total: 249ms	remaining: 2.52s
9:	learn: 7.4470572	total: 276ms	remaining: 2.48s
10:	learn: 7.4349304	total: 303ms	remaining: 2.45s
11:	learn: 7.4293931	total: 333ms	remaining: 2.44s
12:	learn: 7.4258589	total: 366ms	remaining: 2.45s
13:	learn: 7.4219559	total: 393ms	remaining: 2.42s
14:	learn: 7.4189564	total: 418ms	remaining: 2.37s
15:	learn: 7.4181914	total: 445ms	remaining: 2.34s
16:	learn: 7.4149360	total: 486ms	remaining: 2.37s
17:	learn: 7.4125115	total: 516ms	remaining: 2.35s
18:	learn: 7.4064276	total: 539ms	remaining: 2.3s
19:	learn: 7.40366

### Lasso Regression

In [None]:
from sklearn.linear_model import Lasso

# Fit a Lasso regression with its default settings on the training split,
# predict the test split, then report the test-set MAE, MAPE (%) and RMSE.
model_6, pred_6 = model_predict(
    model=Lasso(),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
)
mae, mape, rmse = model_evaluate(pred=pred_6, target=y_test)
print(mae, mape, rmse)

5.818328898748973 32.75958504487473 7.9265688047942


---
All the extra models show results similar to those of the previously selected models. In conclusion, the price prediction would be almost the same for all the models.