In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import accuracy_score
from sklearn.metrics import root_mean_squared_error

import numpy as np

pd.set_option('display.max_columns', None)

# Upload csv file : file is copied from "C:\repos\immo-eliza-team6-analysis\analysis\Team_6_Step_3b_outliers_cat.ipynb"
# after running "analysis\Team_6_Step_3b_outliers_cat.ipynb" without the coding blocks for 
# "Encoding - label encoding" on kitchen type, state of building and EPC 
# (hence, missing values on categorical values was treated)
data = r'raw.csv' 
df = pd.read_csv(data, sep = ',')

In [3]:
### dropping columns not to be considered in model < df2 ###
#print(df['Number_of_facades'].value_counts())
df2 = df.drop(['locality_name', 'Postal_code','street', 'number', 'Subtype','latitude','longitude','hasTerrace','terraceSurface', 'gardenSurface', 'Furnished','price_per_sqm',
       'price_per_sqm_land', 'Assigned_City','Assigned_City_5', 'Has_Assigned_City_5', 'Assigned_City_10','Has_Assigned_City_10', 'Assigned_City_15', 'Has_Assigned_City_15'],axis=1)

print("Remaining dataframe (df2) :")
df2.info()
print("shape of df2:",df2.shape)
print("type of df2:", type(df2))


Remaining dataframe (df2) :
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4180 entries, 0 to 4179
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  4180 non-null   int64 
 1   Price               4180 non-null   int64 
 2   Number_of_bedrooms  4180 non-null   int64 
 3   Living_area         4180 non-null   int64 
 4   Open_fire           4180 non-null   bool  
 5   Swimming_Pool       4180 non-null   bool  
 6   Kitchen_type        4180 non-null   object
 7   Number_of_facades   4180 non-null   int64 
 8   State_of_building   4180 non-null   object
 9   epc                 4180 non-null   object
 10  landSurface         4180 non-null   int64 
 11  Province            4180 non-null   object
 12  Has_Assigned_City   4180 non-null   bool  
dtypes: bool(3), int64(6), object(4)
memory usage: 338.9+ KB
shape of df2: (4180, 13)
type of df2: <class 'pandas.core.frame.DataFrame'>


In [4]:
### splitting in input and output : X and y ###
X = df2.drop(['id', 'Price'], axis=1)
y = df2["Price"]
print("data type of X and y:",type(X), type(y))
all = X.columns 
print("columns in X", all)

data type of X and y: <class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>
columns in X Index(['Number_of_bedrooms', 'Living_area', 'Open_fire', 'Swimming_Pool',
       'Kitchen_type', 'Number_of_facades', 'State_of_building', 'epc',
       'landSurface', 'Province', 'Has_Assigned_City'],
      dtype='object')


In [5]:
### splitting in training and testing data ###
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=41)
print("shape of X_train and X_test:" , X_train.shape, X_test.shape)

shape of X_train and X_test: (3135, 11) (1045, 11)


In [10]:
### CatBoostRegressor ###
cat_features = ['State_of_building', 'epc', 'Province', 'Kitchen_type']
data = X_train
label = y_train
dataset = Pool(data, label, cat_features)
# initialize Pool
train_pool = Pool(X_train,
                  y_train,
                  cat_features)
test_pool = Pool(X_test,
                 y_test,
                 cat_features)

# # specify the training parameters
model = CatBoostRegressor(objective='RMSE')

#train the model
model.fit(train_pool,verbose=False,plot=True, eval_set=test_pool) 
model.best_score_
       
### EVALUATE ###
y_train_pred = model.predict(train_pool)
r2_score_train=r2_score(y_train, y_train_pred)
print("r2 on training data: ", r2_score_train)
# make the prediction using the resulting model
y_test_pred = model.predict(test_pool)
r2_score_test=r2_score(y_test, y_test_pred)
print("r2 on testing data: ",r2_score_test)

RMSE = root_mean_squared_error(y_test, y_test_pred)
print("RMSE on test data is: ", RMSE)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

r2 on training data:  0.7878975514339925
r2 on testing data:  0.6786242332741987
RMSE on test data is:  74202.81573722234


In [12]:
### INDIVIDUAL TESTS ON PARAMETERS ###
# # specify the training parameters
model = CatBoostRegressor(objective='RMSE', iterations = 2000, learning_rate = 0.01, depth = 4, l2_leaf_reg = 1)

#train the model

model.fit(train_pool,verbose=False,plot=True, eval_set=test_pool) 
model.best_score_
       
### EVALUATE ###
y_train_pred = model.predict(train_pool)
r2_score_train=r2_score(y_train, y_train_pred)
print("r2 on training data: ", r2_score_train)
# make the prediction using the resulting model
y_test_pred = model.predict(test_pool)
r2_score_test=r2_score(y_test, y_test_pred)
print("r2 on testing data: ",r2_score_test)

RMSE = root_mean_squared_error(y_test, y_test_pred)
print("RMSE on test data is: ", RMSE)



MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

r2 on training data:  0.7231837224608006
r2 on testing data:  0.672824246351415
RMSE on test data is:  74869.40442631856
