In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import accuracy_score

import numpy as np

pd.set_option('display.max_columns', None)

# Upload csv file : file is copied from "C:\repos\immo-eliza-team6-analysis\analysis\Team_6_Step_3b_outliers_cat.ipynb"
# after running "analysis\Team_6_Step_3b_outliers_cat.ipynb" without the coding blocks for 
# "Encoding - label encoding" on kitchen type, state of building and EPC 
# (hence, missing values on categorical values was treated)
data = r'raw.csv' 
df = pd.read_csv(data, sep = ',')

In [2]:
### dropping columns not to be considered in model < df2 ###
#print(df['Number_of_facades'].value_counts())
df2 = df.drop(['locality_name', 'Postal_code','street', 'number', 'Subtype','latitude','longitude','hasTerrace','terraceSurface', 'gardenSurface', 'Furnished','price_per_sqm',
       'price_per_sqm_land', 'Assigned_City','Assigned_City_5', 'Has_Assigned_City_5', 'Assigned_City_10','Has_Assigned_City_10', 'Assigned_City_15', 'Has_Assigned_City_15'],axis=1)

print("Remaining dataframe (df2) :")
df2.info()
print("shape of df2:",df2.shape)
print("type of df2:", type(df2))


Remaining dataframe (df2) :
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4180 entries, 0 to 4179
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  4180 non-null   int64 
 1   Price               4180 non-null   int64 
 2   Number_of_bedrooms  4180 non-null   int64 
 3   Living_area         4180 non-null   int64 
 4   Open_fire           4180 non-null   bool  
 5   Swimming_Pool       4180 non-null   bool  
 6   Kitchen_type        4180 non-null   object
 7   Number_of_facades   4180 non-null   int64 
 8   State_of_building   4180 non-null   object
 9   epc                 4180 non-null   object
 10  landSurface         4180 non-null   int64 
 11  Province            4180 non-null   object
 12  Has_Assigned_City   4180 non-null   bool  
dtypes: bool(3), int64(6), object(4)
memory usage: 338.9+ KB
shape of df2: (4180, 13)
type of df2: <class 'pandas.core.frame.DataFrame'>


In [3]:
### splitting in input and output : X and y ###
X = df2.drop(['id', 'Price'], axis=1)
y = df2["Price"]
print("data type of X and y:",type(X), type(y))
all = X.columns 
print("columns in X", all)

data type of X and y: <class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>
columns in X Index(['Number_of_bedrooms', 'Living_area', 'Open_fire', 'Swimming_Pool',
       'Kitchen_type', 'Number_of_facades', 'State_of_building', 'epc',
       'landSurface', 'Province', 'Has_Assigned_City'],
      dtype='object')


In [4]:
### splitting in training and testing data ###
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=41)
print("shape of X_train and X_test:" , X_train.shape, X_test.shape)

shape of X_train and X_test: (3135, 11) (1045, 11)


In [12]:
### CatBoostRegressor ###
cat_features = ['State_of_building', 'epc', 'Province', 'Kitchen_type']
data = X_train
label = y_train
dataset = Pool(data, label, cat_features)
# initialize Pool
train_pool = Pool(X_train,
                  y_train,
                  cat_features)
test_pool = Pool(X_test,
                 y_test,
                 cat_features)
# specify the training parameters

model = CatBoostRegressor(objective='RMSE')
#train the model

model.fit(train_pool,verbose=False,plot=True, eval_set=test_pool) 
model.best_score_
       
### EVALUATE ###
preds_train = model.predict(train_pool)
r2_score_train=r2_score(y_train, preds_train)
print("r2 on training data: ", r2_score_train)
# make the prediction using the resulting model
preds = model.predict(test_pool)
r2_score_test=r2_score(y_test, preds)
print("r2 on testing data: ",r2_score_test)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

r2 on training data:  0.7878975514339925
r2 on testing data:  0.6786242332741987


In [18]:
### CatBoostRegressor ###
cat_features = ['State_of_building', 'epc', 'Province', 'Kitchen_type']
data = X_train
label = y_train
dataset = Pool(data, label, cat_features)
# initialize Pool
train_pool = Pool(X_train,
                  y_train,
                  cat_features)
test_pool = Pool(X_test,
                 y_test,
                 cat_features)
# specify the training parameters

model = CatBoostRegressor(objective='RMSE')
grid = {'iterations': [100, 150, 200],
        'learning_rate': [0.03, 0.1],
        'depth': [2, 4, 6, 8],
        'l2_leaf_reg': [0.2, 0.5, 1, 3]}
model.grid_search(grid, train_pool)

pred = model.predict(X_test)
rmse = (np.sqrt(mean_squared_error(y_test, pred)))
r2 = r2_score(y_test, pred)
print('Testing performance')
print('RMSE: {:.2f}'.format(rmse))
print('‘R2: {:.2f}'.format(r2))

0:	learn: 361837.8545674	test: 351499.9685161	best: 351499.9685161 (0)	total: 4.48ms	remaining: 444ms
1:	learn: 352334.0435220	test: 342029.2702781	best: 342029.2702781 (1)	total: 14.8ms	remaining: 727ms
2:	learn: 343094.9522908	test: 332911.7634366	best: 332911.7634366 (2)	total: 34ms	remaining: 1.1s
3:	learn: 334046.8238615	test: 323895.3226937	best: 323895.3226937 (3)	total: 50.6ms	remaining: 1.21s
4:	learn: 325501.9897193	test: 315392.3078234	best: 315392.3078234 (4)	total: 65.5ms	remaining: 1.25s
5:	learn: 317067.6358728	test: 306964.3942050	best: 306964.3942050 (5)	total: 80.7ms	remaining: 1.26s
6:	learn: 308918.2639782	test: 298810.6166459	best: 298810.6166459 (6)	total: 96.9ms	remaining: 1.29s
7:	learn: 301230.6183672	test: 291150.0349334	best: 291150.0349334 (7)	total: 113ms	remaining: 1.3s
8:	learn: 293720.6721754	test: 283708.0587356	best: 283708.0587356 (8)	total: 128ms	remaining: 1.3s
9:	learn: 286547.6208825	test: 276554.8142415	best: 276554.8142415 (9)	total: 144ms	remai

NameError: name 'mean_squared_error' is not defined

In [19]:
sorted_feature_importance = model.feature_importances_.argsort()
plt.barh(boston.feature_names[sorted_feature_importance], 
        model.feature_importances_[sorted_feature_importance], 
        color='turquoise')
plt.xlabel("CatBoost Feature Importance")

NameError: name 'boston' is not defined

In [20]:
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test, feature_names = boston.feature_names[sorted_feature_importance])

NameError: name 'shap' is not defined