In [7]:
import pandas as pd
import xgboost as xgb
import numpy as np
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.metrics import mean_squared_error

training_set = pd.read_csv('../../res/ftr/base_data_train.csv').drop(columns=['Unnamed: 0'])
pd.set_option('display.max_columns', 30)

training_set.head()

Unnamed: 0,id,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,idzona,fecha,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,precio
0,254099,,2.0,1.0,2.0,80.0,80.0,23533.0,16.67,0,0,0,0,0,2273000
1,53461,10.0,3.0,2.0,2.0,268.0,180.0,24514.0,15.884,0,0,0,1,1,3600000
2,247984,5.0,3.0,2.0,2.0,144.0,166.0,48551.0,16.725,0,0,0,0,0,1200000
3,209067,1.0,2.0,1.0,1.0,63.0,67.0,53666.0,15.408,0,0,0,1,1,650000
4,185997,10.0,2.0,1.0,1.0,95.0,95.0,47835.0,16.959,0,0,0,0,0,1150000


In [8]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin,tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i : %i : %i' % (thour, tmin, round(tsec,2)))

In [9]:
Y = training_set['precio'].values
X = training_set.drop(['precio', 'id'], axis = 1)

evaluation_set = pd.read_csv('../../res/ftr/base_data_evaluation.csv').drop(columns=['Unnamed: 0', 'id'], axis=1)

In [10]:
parameters = {
    'learning_rate': [0.02, 0.04, 0.08, 0.2, 0.4],
    'max_depth': [2, 4, 6, 10, 16, 32],
    'min_child_weight': [0, 0.5, 1, 5, 10],
    'gamma': [0, 1, 5],
    'subsample': [0.25, 0.5, 0.75, 1.0],
    'colsample_bytree': [0.5, 0.75, 1],
    'n_estimators': [50, 100, 200]
}

regressor = xgb.XGBRegressor(objective='reg:squarederror')

In [11]:
skf = StratifiedKFold(10, shuffle=True, random_state=1001)

random_search = RandomizedSearchCV(regressor, param_distributions=parameters,
                                   n_iter=32, scoring='neg_mean_squared_error',
                                   n_jobs=4, cv=skf.split(X,Y), verbose=3, random_state=1001)

start_time = timer(None)

random_search.fit(X, Y)

timer(start_time)



Fitting 10 folds for each of 32 candidates, totalling 320 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  3.5min
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed: 24.8min
[Parallel(n_jobs=4)]: Done 280 tasks      | elapsed: 69.7min
[Parallel(n_jobs=4)]: Done 320 out of 320 | elapsed: 88.3min finished



 Time taken: 1 : 30 : 12


In [12]:
print('\n Best hyperparameters:')
print(random_search.best_params_)


 Best hyperparameters:
{'subsample': 0.75, 'n_estimators': 200, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.2, 'gamma': 0, 'colsample_bytree': 0.5}
