In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np
from datetime import datetime
    
# Locations of the base train / evaluation tables (relative to this notebook).
TRAIN_PATH = '../../res/ftr/base_data_train.csv'
EVALUATION_PATH = '../../res/ftr/base_data_evaluation.csv'

# Allow up to 30 columns to be shown when a frame is displayed.
pd.set_option('display.max_columns', 30)

# Read each base table and discard its 'Unnamed: 0' column in one expression
# (presumably a row index written out when the CSV was exported -- confirm).
training_set = pd.read_csv(TRAIN_PATH).drop(columns=['Unnamed: 0'])
evaluation_set = pd.read_csv(EVALUATION_PATH).drop(columns=['Unnamed: 0'])

In [2]:
def load_features(train_df, evaluation_df, features_list, features_dir='../../res/ftr/'):
    """Merge precomputed feature tables into the train and evaluation frames.

    For every name in ``features_list`` two CSV files are read from
    ``features_dir`` -- ``<name>_train.csv`` and ``<name>_evaluation.csv`` --
    and inner-joined on the ``id`` column onto ``train_df`` and
    ``evaluation_df`` respectively.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training frame; must contain an ``id`` column.
    evaluation_df : pandas.DataFrame
        Evaluation frame; must contain an ``id`` column.
    features_list : iterable of str
        Feature file name prefixes, merged in the given order.
    features_dir : str, optional
        Directory holding the feature CSV files. Defaults to the path the
        notebook has always used.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The enriched ``(train_df, evaluation_df)``. The input frames are not
        modified; ``merge`` returns new frames.

    Notes
    -----
    The join is ``how='inner'``, so rows whose ``id`` is absent from a feature
    file are dropped from the result.
    """
    import os  # local import so the function does not depend on another cell

    def _read_feature(feature, split):
        """Read one feature CSV, without its 'Unnamed: 0' column if it has one."""
        path = os.path.join(features_dir, feature + '_' + split + '.csv')
        # Drop before merging and tolerate absence: a file exported without the
        # extra column no longer raises KeyError, and the column can never
        # collide with a same-named column of the left frame.
        return pd.read_csv(path).drop(columns=['Unnamed: 0'], errors='ignore')

    for feature in features_list:
        train_df = train_df.merge(_read_feature(feature, 'train'), on='id', how='inner')
        evaluation_df = evaluation_df.merge(_read_feature(feature, 'evaluation'), on='id', how='inner')

    return (train_df, evaluation_df)
    
# Feature tables to join onto the base data, in merge order. Each name is the
# file prefix that load_features expands to '<name>_train.csv' and
# '<name>_evaluation.csv'.
features_array = [
    'amenities',
    'avenida_in_direction',
    'encoded_provincia',
    'encoded_tipodepropiedad',
    'feature_hashed_ciudad',
    'mean_precio_encoded_ciudad',
    'mean_precio_encoded_provincia',
    'mean_precio_encoded_tipodepropiedad',
    'metros_totales_y_cubiertos_log',
    'murder_rate_of_entity',
    'provincia_borders_analysis',
    'provincia_economy',
    'qualificative_adjectives_in_description',
    'surface_features',
    'mean_idzona_price',
    'dolar_for_date',
]

# Rebind both frames to their feature-enriched versions.
training_set, evaluation_set = load_features(training_set, evaluation_set, features_array)

In [3]:
# Separate the target ('precio') from the predictors.
# NOTE(review): only 'precio' is removed, so the 'id' merge key stays among
# the predictors -- confirm that is intended.
training_set_Y = training_set['precio']
training_set_X = training_set.drop(columns=['precio'])

# Gradient-boosted regressor, one hyperparameter per line for easy tuning.
regressor = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=2000,
    learning_rate=0.02,
    gamma=1,
    subsample=0.75,
    colsample_bytree=0.5,
    max_depth=16,
    min_child_weight=25,
)

# Kept as the cell's final expression so the fitted estimator is displayed.
regressor.fit(training_set_X, training_set_Y)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.5, gamma=1,
             importance_type='gain', learning_rate=0.02, max_delta_step=0,
             max_depth=16, min_child_weight=25, missing=None, n_estimators=2000,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=0.75, verbosity=1)

In [4]:
# Predict on the evaluation rows using exactly the columns, in exactly the
# order, that the model was fitted on. The train and evaluation frames are
# assembled by separate merges, so selecting by the training columns makes the
# alignment explicit instead of relying on both frames happening to match.
# When they already match, the result is identical to predicting on
# evaluation_set directly.
evaluation_set_prediction = regressor.predict(evaluation_set[training_set_X.columns])

In [5]:
# Submission frame: one row per evaluation id with its predicted price.
estimated_prices = pd.DataFrame({
    'id': evaluation_set['id'],
    'target': evaluation_set_prediction,
})

# Timestamp the file name (month-day-year-hour-minute) so each run is written
# to a different file as long as runs are at least a minute apart.
run_stamp = datetime.now().strftime("%m-%d-%Y-%H-%M")
export_path = '../../res/reg/estimated_prices-' + run_stamp + '.csv'

estimated_prices.to_csv(export_path, header=True, index=False)

In [6]:
# Show the submission frame (columns 'id' and 'target') as this cell's output.
estimated_prices

Unnamed: 0,id,target
0,4941,6.989552e+06
1,51775,1.016496e+06
2,115253,2.283658e+06
3,299321,1.428328e+06
4,173570,6.844388e+05
...,...,...
59995,75094,3.556392e+06
59996,171847,7.509093e+05
59997,138313,8.306864e+05
59998,271268,1.585818e+06


In [7]:
# One row per training column with the importance reported by the fitted
# model, ordered from most to least important.
# NOTE(review): if any top-ranked feature is derived from 'precio', it leaks
# the target -- confirm how the feature tables were computed.
feature_importance = (
    pd.DataFrame(
        data=regressor.feature_importances_,
        index=training_set_X.columns,
        columns=['Importance'],
    )
    .sort_values(by=['Importance'], ascending=False)
)

feature_importance

Unnamed: 0,Importance
precio_por_metro,0.056913
metroscubiertos,0.052858
mean_precio_encoded_ciudad,0.043649
type_Terreno,0.042024
metroscubiertos_log,0.039207
...,...
type_Lote,0.000000
ciudad_hash_38,0.000000
ciudad_hash_26,0.000000
ciudad_hash_27,0.000000
