In [1]:
import os

import numpy as np
import pandas as pd
import xgboost as xgb
    
# Locations of the base training / evaluation tables, relative to this notebook.
TRAIN_PATH = '../../res/ftr/base_data_train.csv'
EVALUATION_PATH = '../../res/ftr/base_data_evaluation.csv'

# Render up to 30 columns when a frame is displayed.
pd.set_option('display.max_columns', 30)

# Read each table and discard its 'Unnamed: 0' column in one chained,
# non-mutating expression (presumably the index written out when the CSV
# was saved -- confirm against the script that produced the files).
training_set = pd.read_csv(TRAIN_PATH).drop(columns=['Unnamed: 0'])
evaluation_set = pd.read_csv(EVALUATION_PATH).drop(columns=['Unnamed: 0'])

# Preview the first rows of the evaluation data.
evaluation_set.head()

Unnamed: 0,id,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,idzona,fecha,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos
0,4941,29.0,3.0,,4.0,300.0,,,15.906,0,0,0,0,0
1,51775,,1.0,1.0,1.0,67.0,67.0,113851.0,16.732,0,0,0,0,0
2,115253,0.0,2.0,1.0,2.0,87.0,100.0,23620.0,16.585,0,0,0,0,1
3,299321,2.0,2.0,2.0,2.0,86.0,86.0,129347.0,16.527,0,0,0,0,0
4,173570,10.0,2.0,1.0,1.0,80.0,76.0,57125.0,15.932,0,0,0,1,1


In [2]:
def load_features(train_df, evaluation_df, features_list, features_dir='../../res/ftr/'):
    """Join precomputed feature tables onto the training and evaluation frames.

    For every name in ``features_list`` the files ``<name>_train.csv`` and
    ``<name>_evaluation.csv`` are read from ``features_dir`` and inner-merged
    onto the matching frame using the ``id`` column.

    Parameters
    ----------
    train_df, evaluation_df : pandas.DataFrame
        Base frames; each must contain an ``id`` column. Neither is modified,
        new frames are returned instead.
    features_list : iterable of str
        Feature-file name prefixes, merged in the given order.
    features_dir : str, optional
        Directory holding the feature CSVs. Defaults to the location that was
        previously hard-coded.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The enriched ``(train_df, evaluation_df)`` pair.

    Notes
    -----
    The merge is an inner join, so any row whose ``id`` is absent from a
    feature file is dropped from the corresponding result.
    """
    index_column = 'Unnamed: 0'

    for feature in features_list:
        train_ftr = pd.read_csv(os.path.join(features_dir, feature + '_train.csv'))
        evaluation_ftr = pd.read_csv(os.path.join(features_dir, feature + '_evaluation.csv'))

        # Remove the leftover index column from the feature tables *before*
        # merging: if both sides carried it, merge would rename the pair to
        # 'Unnamed: 0_x' / 'Unnamed: 0_y'. errors='ignore' also tolerates
        # feature files that were saved without an index column.
        train_ftr = train_ftr.drop(columns=[index_column], errors='ignore')
        evaluation_ftr = evaluation_ftr.drop(columns=[index_column], errors='ignore')

        # The trailing drop keeps the previous behaviour of removing the column
        # when it is still present on the base frame itself.
        train_df = (
            train_df.merge(train_ftr, on='id', how='inner')
            .drop(columns=[index_column], errors='ignore')
        )
        evaluation_df = (
            evaluation_df.merge(evaluation_ftr, on='id', how='inner')
            .drop(columns=[index_column], errors='ignore')
        )

    return (train_df, evaluation_df)
    
# Feature tables to join onto the base data, applied in this order.
# NOTE: this cell rebinds training_set / evaluation_set, so running it twice
# merges the same feature files a second time.
feature_names = ['surface_features', 'murder_rate_of_entity', 'encoded_tipodepropiedad']
training_set, evaluation_set = load_features(training_set, evaluation_set, feature_names)

# Preview the enriched training data.
training_set.head()

Unnamed: 0,id,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,idzona,fecha,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,precio,...,type_Local en centro comercial,type_Bodega comercial,type_Otros,type_Villa,type_Duplex,type_Inmuebles productivos urbanos,type_Departamento Compartido,type_Nave industrial,type_Rancho,type_Terreno industrial,type_nan,type_Huerta,type_Lote,type_Hospedaje,type_Garage
0,254099,,2.0,1.0,2.0,80.0,80.0,23533.0,16.67,0,0,0,0,0,2273000,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,53461,10.0,3.0,2.0,2.0,268.0,180.0,24514.0,15.884,0,0,0,1,1,3600000,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,247984,5.0,3.0,2.0,2.0,144.0,166.0,48551.0,16.725,0,0,0,0,0,1200000,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,209067,1.0,2.0,1.0,1.0,63.0,67.0,53666.0,15.408,0,0,0,1,1,650000,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,185997,10.0,2.0,1.0,1.0,95.0,95.0,47835.0,16.959,0,0,0,0,0,1150000,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [3]:
# Separate the target vector from the feature matrix. Both are converted to
# plain NumPy arrays, so column labels are not passed on to the model.
# NOTE(review): every column other than 'precio' -- including 'id' -- is fed
# to the model as a feature; confirm that is intended.
target_column = 'precio'
training_set_Y = training_set[target_column].values
training_set_X = training_set.drop(columns=[target_column]).values

# Hyperparameters of the gradient-boosted regressor, gathered in one place.
xgb_params = dict(
    objective='reg:squarederror',
    n_estimators=500,
    learning_rate=0.04,
    gamma=1,
    subsample=0.75,
    colsample_bytree=0.75,
    max_depth=10,
)
regressor = xgb.XGBRegressor(**xgb_params)

# fit() is the cell's last expression, so its return value is displayed.
regressor.fit(training_set_X, training_set_Y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.75, gamma=1,
             importance_type='gain', learning_rate=0.04, max_delta_step=0,
             max_depth=10, min_child_weight=1, missing=None, n_estimators=500,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=0.75, verbosity=1)

In [4]:
# The model was fitted on a bare NumPy array, so features are matched purely by
# position. Select the evaluation columns by name, in the training column order
# (every training column except the 'precio' target), so a differently ordered
# or wider evaluation frame cannot silently misalign features. A column missing
# from the evaluation frame raises a KeyError here instead.
feature_columns = training_set.columns.drop('precio')
evaluation_set_prediction = regressor.predict(evaluation_set[feature_columns].values)

In [5]:
# Pair every evaluation id with its predicted value in a two-column frame.
estimated_prices = pd.DataFrame({
    'id': evaluation_set['id'],
    'target': evaluation_set_prediction,
})

# Write the predictions to disk with a header row and without the index.
estimated_prices.to_csv('../../res/reg/estimated_prices.csv', index=False, header=True)

In [6]:
# Display the id/target predictions frame built in the previous cell.
estimated_prices

Unnamed: 0,id,target
0,4941,6.094716e+06
1,51775,1.116059e+06
2,115253,2.566406e+06
3,299321,1.468590e+06
4,173570,6.023743e+05
...,...,...
59995,75094,4.567041e+06
59996,171847,7.301476e+05
59997,138313,1.025768e+06
59998,271268,1.456546e+06
