In [1]:
import pandas as pd
import xgboost as xgb

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split

from hyperopt import fmin, tpe, hp, anneal, Trials, STATUS_OK 

import numpy as np

from datetime import datetime


# Let the notebook render up to 30 columns when a frame is displayed.
pd.set_option('display.max_columns', 30)

# Read the base training table, then discard its 'Unnamed: 0' column
# (presumably the index written out by an earlier to_csv -- confirm).
training_set = pd.read_csv('../../res/ftr/base_data_train.csv')
training_set = training_set.drop(columns=['Unnamed: 0'])

training_set.head()

Unnamed: 0,id,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,idzona,fecha,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,precio
0,254099,,2.0,1.0,2.0,80.0,80.0,23533.0,16.67,0,0,0,0,0,2273000
1,53461,10.0,3.0,2.0,2.0,268.0,180.0,24514.0,15.884,0,0,0,1,1,3600000
2,247984,5.0,3.0,2.0,2.0,144.0,166.0,48551.0,16.725,0,0,0,0,0,1200000
3,209067,1.0,2.0,1.0,1.0,63.0,67.0,53666.0,15.408,0,0,0,1,1,650000
4,185997,10.0,2.0,1.0,1.0,95.0,95.0,47835.0,16.959,0,0,0,0,0,1150000


In [2]:
def load_features(train_df, features_list, ftr_dir='../../res/ftr/'):
    """Inner-join precomputed feature tables onto ``train_df`` by ``id``.

    For every name in ``features_list`` the file ``<ftr_dir><name>_train.csv``
    is read and merged onto the running frame with ``how='inner'`` on the
    ``id`` column, so rows whose ``id`` is missing from any feature file are
    dropped from the result.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Base table; must contain an ``id`` column. It is not modified.
    features_list : iterable of str
        Feature-file stems, merged in the given order.
    ftr_dir : str, optional
        Directory holding the feature CSVs. A trailing path separator is
        added when missing. Defaults to the notebook's original location.

    Returns
    -------
    pandas.DataFrame
        ``train_df`` itself when ``features_list`` is empty, otherwise a new
        frame with the feature columns appended and no ``'Unnamed: 0'`` column.
    """
    # Accept the directory with or without a trailing separator.
    if ftr_dir and not ftr_dir.endswith(('/', '\\')):
        ftr_dir = ftr_dir + '/'

    for feature in features_list:
        train_ftr = pd.read_csv(ftr_dir + feature + '_train.csv')

        # Drop the saved-index column *before* merging: if both sides carried
        # it, merge would rename them to 'Unnamed: 0_x' / 'Unnamed: 0_y' and a
        # later drop by the plain name would fail. errors='ignore' also lets
        # feature files that were written without an index load cleanly.
        train_ftr = train_ftr.drop(columns=['Unnamed: 0'], errors='ignore')

        train_df = train_df.merge(train_ftr, on='id', how='inner')
        # Keep the original guarantee that the merged frame never exposes it.
        train_df = train_df.drop(columns=['Unnamed: 0'], errors='ignore')

    return train_df
    
# Feature-file stems, listed in the order they are merged onto the base table.
features_array = [
    'amenities',
    'avenida_in_direction',
    'encoded_provincia',
    'encoded_tipodepropiedad',
    'feature_hashed_ciudad',
    'mean_precio_encoded_ciudad',
    'mean_precio_encoded_provincia',
    'mean_precio_encoded_tipodepropiedad',
    'metros_totales_y_cubiertos_log',
    'murder_rate_of_entity',
    'provincia_borders_analysis',
    'provincia_economy',
    'qualificative_adjectives_in_description',
    'surface_features',
    'mean_idzona_price',
]

# NOTE: this rebinds `training_set` to the merged frame, so running the cell a
# second time without reloading the base table merges the same features again.
training_set = load_features(train_df=training_set, features_list=features_array)

# Separate the 'precio' column (the regression target) from everything else.
target = training_set['precio']
data = training_set.drop(columns=['precio'])

data.head()

Unnamed: 0,id,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,idzona,fecha,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,property_amenities,...,murder_rate_of_entity,na_border,sa_border,pacific_o,atlantic_o,cap_border,population_of_entity,gdp_of_entity,gdp_per_capita_of_entity,seguridad,metrostotalesporhabitacion,metroscubiertosporhabitacion,metroscubiertossobretotales,metrosdescubiertos,precio_por_metro
0,254099,,2.0,1.0,2.0,80.0,80.0,23533.0,16.67,0,0,0,0,0,0.0,...,9.666667,0.0,0.0,0.0,0.0,1.0,8851080.0,2254840.0,0.254753,0,40.0,40.0,1.0,0.0,23256.837144
1,53461,10.0,3.0,2.0,2.0,268.0,180.0,24514.0,15.884,0,0,0,1,1,0.0,...,9.666667,0.0,0.0,0.0,0.0,1.0,8851080.0,2254840.0,0.254753,0,60.0,89.333333,1.488889,-88.0,15614.90838
2,247984,5.0,3.0,2.0,2.0,144.0,166.0,48551.0,16.725,0,0,0,0,0,0.0,...,13.5,0.0,0.0,0.0,1.0,0.0,7350682.0,850237.0,0.115668,1,55.333333,48.0,0.86747,22.0,8441.101152
3,209067,1.0,2.0,1.0,1.0,63.0,67.0,53666.0,15.408,0,0,0,1,1,0.0,...,10.333333,0.0,0.0,0.0,0.0,1.0,15175862.0,1209424.0,0.079694,0,33.5,31.5,0.940299,4.0,9857.301547
4,185997,10.0,2.0,1.0,1.0,95.0,95.0,47835.0,16.959,0,0,0,0,0,0.0,...,13.5,0.0,0.0,0.0,1.0,0.0,7350682.0,850237.0,0.115668,0,47.5,47.5,1.0,0.0,13804.584787


In [3]:
# Seed shared by the hold-out split, the CV splitter, the model and the search.
random_state = 42
# Number of hyperparameter configurations the search evaluates.
n_iter = 10

# Hold out 20% of the rows; the search below only ever sees the remaining 80%.
train_data, test_data, train_targets, test_targets = train_test_split(data, target, test_size=0.20, shuffle=True, random_state = random_state)
n_folds = 5
# shuffle=True is required for random_state to take effect: without it the
# seed is ignored by older scikit-learn releases and rejected with a
# ValueError by current ones.
kfolds = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)

In [4]:
def xgb_crossvalidation_rmse(params, random_state=random_state, cv=kfolds, X = train_data, y = train_targets):
    """Cross-validated RMSE of an XGBoost regressor, used as the search loss.

    Parameters
    ----------
    params : mapping
        Must provide ``learning_rate``, ``max_depth``, ``min_child_weight``,
        ``gamma``, ``subsample``, ``colsample_bytree`` and ``n_estimators``.
        ``max_depth``, ``min_child_weight`` and ``n_estimators`` are cast to
        ``int`` because the search space yields them as floats.
    random_state : int
        Seed passed to the regressor.
    cv : cross-validation splitter
        Splitter handed to ``cross_val_score``.
    X, y : training features and targets.

    Note that the defaults are bound when this cell is executed: after
    re-running the cell that creates ``kfolds`` / ``train_data`` /
    ``train_targets``, this cell must be re-run as well to pick them up.

    Returns
    -------
    float
        Square root of the mean per-fold MSE (lower is better). The original
        code returned the mean MSE despite the function's name; taking the
        root is a monotone transform, so the ranking of configurations is
        unchanged while the reported loss is in the units of the target.
    """
    params = {
        'learning_rate': params['learning_rate'],
        'max_depth': int(params['max_depth']),
        'min_child_weight': int(params['min_child_weight']),
        'gamma': params['gamma'],
        'subsample': params['subsample'],
        'colsample_bytree': params['colsample_bytree'],
        'n_estimators': int(params['n_estimators'])
    }
    regressor = xgb.XGBRegressor(objective='reg:squarederror', random_state = random_state, **params)
    # The scorer returns negated MSE per fold; flip the sign before averaging.
    mean_mse = -cross_val_score(regressor, X, y, cv = cv, scoring='neg_mean_squared_error', n_jobs=1).mean()
    return np.sqrt(mean_mse)

In [5]:
%%time

# Search space. Every dimension is quantised (quniform rounds to a multiple of
# the last argument); the integer-valued ones are cast to int by the objective.
# Lower bounds start at one step rather than 0: subsample and colsample_bytree
# must lie in (0, 1] for XGBoost, and a learning rate of 0 trains nothing.
space = {
    'learning_rate': hp.quniform('learning_rate', 0.0005, 1, 0.0005),
    'max_depth': hp.quniform('max_depth', 2, 40, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 30, 1),
    'gamma': hp.quniform('gamma', 0, 5, 0.5),
    'subsample': hp.quniform('subsample', 0.1, 1, 0.1),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.1, 1, 0.1),
    'n_estimators': hp.quniform('n_estimators', 100, 4100, 100)
}

# Records every evaluated configuration and its loss.
trials = Trials()

# NOTE(review): newer hyperopt releases appear to expect
# np.random.default_rng(random_state) for `rstate` -- confirm against the
# installed version before upgrading.
best_hyperparameters = fmin(fn=xgb_crossvalidation_rmse, space=space, algo=anneal.suggest, max_evals=n_iter, trials=trials, rstate=np.random.RandomState(random_state))

best_hyperparameters

  0%|          | 0/10 [00:00<?, ?it/s, best loss: ?]

  if getattr(data, 'base', None) is not None and \



  0%|          | 0/10 [00:36<?, ?it/s, best loss: ?]


KeyboardInterrupt: 