# Parameter optimization

In [1]:
import pandas as pd
def _fmt_three_decimals(value):
    """Render a number with exactly three digits after the decimal point."""
    return '%.3f' % value


# Use the formatter for every float pandas displays in this notebook.
pd.set_option('display.float_format', _fmt_three_decimals)
# Relative path to the cleaned modelling dataset (CSV).
path_dataset = 'dataset/datos_properati_limpios_model.csv'

# Load the whole file into memory and preview the first five rows.
df = pd.read_csv(filepath_or_buffer=path_dataset)
df.head(n=5)

Unnamed: 0,lat,lon,price_aprox_usd,surface_total_in_m2,surface_covered_in_m2,rooms,barrio_match,PH,apartment,house,...,VILLA LUGANO,VILLA LURO,VILLA ORTUZAR,VILLA PUEYRREDON,VILLA REAL,VILLA RIACHUELO,VILLA SANTA RITA,VILLA SOLDATI,VILLA URQUIZA,outlier_price_m2
0,-34.589,-58.417,170000.0,40.0,38.0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,-34.591,-58.418,90000.0,27.0,27.0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,-34.587,-58.437,150000.0,44.0,44.0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,-34.593,-58.428,154000.0,58.0,58.0,2,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-34.593,-58.428,154000.0,58.0,58.0,3,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


I split the dataset into training (80%) and test (20%) sets, using the column `price_aprox_usd` as the target.

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split

# Seed NumPy's global random number generator.
np.random.seed(123)

# The target is the approximate USD price; every other column is a feature.
target_column = "price_aprox_usd"
y = df[target_column]
X = df.drop(columns=target_column)

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Number of rows in each split.
print(len(X_train), len(X_test))

5100 1276


## Scikit-learn - Training

First of all, let's see how to do cross-validation. For that we need to define the number of folds; in this case we are going to use 5.

`GridSearchCV` lets us search a parameter space to find the best combination of hyperparameters for a given estimator.

In [3]:
# First search space: depths 1..5 crossed with 1..5 features considered per split.
param_grid = [
    {
        "max_depth": list(range(1, 6)),
        "max_features": list(range(1, 6)),
    }
]

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# Base estimator; random_state makes the tree construction reproducible.
dtreg = DecisionTreeRegressor(random_state=42)

# 5-fold cross-validated search scored by negative MSE (higher is better).
# Train scores are kept so they can be compared with the validation scores.
grid_search = GridSearchCV(
    estimator=dtreg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    return_train_score=True,
)

In [5]:
# Run the cross-validated search over param_grid on the training split.
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(random_state=42),
             param_grid=[{'max_depth': [1, 2, 3, 4, 5],
                          'max_features': [1, 2, 3, 4, 5]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [6]:
# cv_results_ holds per-candidate timings and scores; tabulate it and
# preview the first five candidates.
x = grid_search.cv_results_
pd.DataFrame(data=x).head(n=5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.005,0.001,0.002,0.0,1,1,"{'max_depth': 1, 'max_features': 1}",-994459406.735,-988137206.161,-933590398.944,...,-977096844.342,25168954.571,22,-971802086.985,-973477618.12,-987115312.252,-969720705.635,-979052090.496,-976233562.698,6261103.528
1,0.006,0.005,0.001,0.001,1,2,"{'max_depth': 1, 'max_features': 2}",-994459406.735,-988137206.161,-933590398.944,...,-977096844.342,25168954.571,22,-971802086.985,-973477618.12,-987115312.252,-969720705.635,-979052090.496,-976233562.698,6261103.528
2,0.006,0.008,0.0,0.0,1,3,"{'max_depth': 1, 'max_features': 3}",-994459406.735,-988137206.161,-933590398.944,...,-977096844.342,25168954.571,22,-971802086.985,-973477618.12,-987115312.252,-969720705.635,-979052090.496,-976233562.698,6261103.528
3,0.003,0.006,0.003,0.006,1,4,"{'max_depth': 1, 'max_features': 4}",-994459406.735,-988137206.161,-933590398.944,...,-977096844.342,25168954.571,20,-971802086.985,-973477618.12,-987115312.252,-969720705.635,-979052090.496,-976233562.698,6261103.528
4,0.003,0.006,0.003,0.006,1,5,"{'max_depth': 1, 'max_features': 5}",-994459406.735,-988137206.161,-933590398.944,...,-977096844.342,25168954.571,20,-971802086.985,-973477618.12,-987115312.252,-969720705.635,-979052090.496,-976233562.698,6261103.528


In [7]:
# Parameter combination that obtained the best cross-validated score.
grid_search.best_params_

{'max_depth': 4, 'max_features': 5}

__We will search for the best model for the following search space:__

* `"min_samples_split": [2, 10, 20]`
* `"max_depth": [None, 2, 5, 10, 15]`
* `"min_samples_leaf": [1, 5, 10, 15]`
* `"max_leaf_nodes": [None, 5, 10, 20]`

In [9]:
# Second, wider search space; None leaves the corresponding limit unset.
param_grid2 = [
    {
        "max_depth": [None, 2, 5, 10, 15],
        "min_samples_split": [2, 10, 20],
        "min_samples_leaf": [1, 5, 10, 15],
        "max_leaf_nodes": [None, 5, 10, 20],
    }
]

# Same estimator and scoring as the first search, but over param_grid2
# and with 10 cross-validation folds.
grid_search2 = GridSearchCV(
    estimator=dtreg,
    param_grid=param_grid2,
    scoring='neg_mean_squared_error',
    cv=10,
    return_train_score=True,
)

In [10]:
# Evaluate every combination in param_grid2 with 10-fold cross-validation
# on the training split.
grid_search2.fit(X_train,y_train)

GridSearchCV(cv=10, estimator=DecisionTreeRegressor(random_state=42),
             param_grid=[{'max_depth': [None, 2, 5, 10, 15],
                          'max_leaf_nodes': [None, 5, 10, 20],
                          'min_samples_leaf': [1, 5, 10, 15],
                          'min_samples_split': [2, 10, 20]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [11]:
# Keep the estimator selected by the second search. GridSearchCV's `refit`
# option is left at its default here, so this should be the model refit on
# the training data with the best parameters.
optimised_decision_tree = grid_search2.best_estimator_

__Let's evaluate the performance of this model on the test set.__


In [12]:
from sklearn.metrics import mean_squared_error

# Predict on the held-out rows, then take the square root of the MSE so the
# error is expressed in the same units as the target.
y_opt_pred = optimised_decision_tree.predict(X_test)
test_mse = mean_squared_error(y_test, y_opt_pred)
rmse = np.sqrt(test_mse)

# Display the RMSE rounded to the nearest whole number.
np.round(rmse)

21301.0

Below are the first 10 property value predictions, next to the real values and the absolute difference between them.

In [13]:
# Wrap the raw arrays in new Series so both get a default 0..n-1 index
# (the row labels of y_test are dropped by going through .values).
val_real, val_pred = (
    pd.Series(values) for values in (y_test.values, y_opt_pred)
)

In [14]:
# Absolute gap between each real value and its prediction.
abs_error = (val_real - val_pred).abs()

# Side-by-side table: real value, predicted value, absolute difference.
predicciones = pd.concat(
    [
        val_real.rename('Real value'),
        val_pred.rename('Pred value'),
        abs_error.rename('Dif(+/-)'),
    ],
    axis=1,
)

In [15]:
# Show the first ten rows of the real-vs-predicted comparison table.
predicciones.head(10)

Unnamed: 0,Real value,Pred value,Dif(+/-)
0,80000.0,103438.66,23438.66
1,128000.0,135705.882,7705.882
2,150000.0,156075.759,6075.759
3,85000.0,102400.991,17400.991
4,135000.0,135571.622,571.622
5,135000.0,109560.0,25440.0
6,68000.0,75181.25,7181.25
7,110000.0,140444.444,30444.444
8,134000.0,158431.25,24431.25
9,110000.0,76701.202,33298.798
