In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np
from datetime import datetime
    
TRAIN_PATH = '../../res/ftr/base_data_train.csv'

pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 150)

training_set = pd.read_csv(TRAIN_PATH)

training_set.drop(columns=['Unnamed: 0'], inplace=True)

training_set.head()

Unnamed: 0,id,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,idzona,fecha,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,precio
0,254099,,2.0,1.0,2.0,80.0,80.0,23533.0,16.67,0,0,0,0,0,2273000
1,53461,10.0,3.0,2.0,2.0,268.0,180.0,24514.0,15.884,0,0,0,1,1,3600000
2,247984,5.0,3.0,2.0,2.0,144.0,166.0,48551.0,16.725,0,0,0,0,0,1200000
3,209067,1.0,2.0,1.0,1.0,63.0,67.0,53666.0,15.408,0,0,0,1,1,650000
4,185997,10.0,2.0,1.0,1.0,95.0,95.0,47835.0,16.959,0,0,0,0,0,1150000


In [2]:
def load_features(train_df, features_list):
    for feature in features_list:
        test_ftr = pd.read_csv('../../res/ftr/'+feature+'_train.csv')
    
        train_df = train_df.merge(test_ftr, on='id', how='inner').drop(columns=['Unnamed: 0'])
    return train_df
    
training_set = load_features(training_set, ['avenida_in_direction', 'qualificative_adjectives_in_description', 'security_words_in_description', 'provincia_borders_analysis', 'metros_totales_y_cubiertos_log', 'surface_features', 'provincia_features', 'mean_precio_encoded_tipodepropiedad', 'mean_precio_encoded_ciudad', 'mean_precio_encoded_provincia', 'murder_rate_of_entity', 'encoded_tipodepropiedad', 'encoded_provincia', 'hashed_ciudad', 'amenities'])

training_set.head()

Unnamed: 0,id,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,idzona,fecha,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,precio,...,ciudad_hash_34,ciudad_hash_35,ciudad_hash_36,ciudad_hash_37,ciudad_hash_38,ciudad_hash_39,property_amenities,any_property_amenity,all_propety_amenities,location_amenities,any_location_amenity,all_location_amenities,number_of_amenities,any_amenity,all_amenities
0,254099,,2.0,1.0,2.0,80.0,80.0,23533.0,16.67,0,0,0,0,0,2273000,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,53461,10.0,3.0,2.0,2.0,268.0,180.0,24514.0,15.884,0,0,0,1,1,3600000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0,1.0,2.0,1.0,0.0
2,247984,5.0,3.0,2.0,2.0,144.0,166.0,48551.0,16.725,0,0,0,0,0,1200000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,209067,1.0,2.0,1.0,1.0,63.0,67.0,53666.0,15.408,0,0,0,1,1,650000,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,1.0,1.0,2.0,1.0,0.0
4,185997,10.0,2.0,1.0,1.0,95.0,95.0,47835.0,16.959,0,0,0,0,0,1150000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
training_set_X = training_set.drop(columns=['precio'])
training_set_Y = training_set['precio']

regressor = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=2000, learning_rate=0.02,
                             gamma=1, subsample=0.75, colsample_bytree=0.5, max_depth=16, min_child_weight=25)

regressor.fit(training_set_X, training_set_Y)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.5, gamma=1,
             importance_type='gain', learning_rate=0.02, max_delta_step=0,
             max_depth=16, min_child_weight=25, missing=None, n_estimators=2000,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=0.75, verbosity=1)

In [7]:
feature_importance = pd.DataFrame(regressor.feature_importances_, columns = ['Importance'], index = training_set_X.columns)\
                                           .sort_values(['Importance'], ascending = False)

feature_importance

Unnamed: 0,Importance
entity_Distrito Federal,0.177342
mean_precio_encoded_ciudad,0.041753
type_Terreno,0.038006
metroscubiertos,0.036054
gdp_of_entity,0.027821
ciudad_hash_16,0.027632
type_Apartamento,0.027136
metroscubiertos_log,0.02523
mean_precio_encoded_provincia,0.02485
cap_border,0.023055


In [42]:
unimportant_features = feature_importance.loc[feature_importance.Importance < 0.003]
unimportant_features = unimportant_features.reset_index()

unimportant_features.to_csv('../../res/ftr/unimportant_features.csv')
unimportant_features['index'].values

array(['all_amenities', 'metrosdescubiertos', 'entity_Durango',
       'murder_rate_of_entity', 'avenida', 'sa_border', 'seguridad_x',
       'entity_Quintana Roo', 'entity_Querétaro', 'entity_Tabasco',
       'usosmultiples', 'centroscomercialescercanos', 'any_amenity',
       'number_of_amenities', 'type_Quinta Vacacional',
       'location_amenities', 'pacific_o', 'entity_Baja California Norte',
       'type_Casa uso de suelo', 'any_location_amenity',
       'entity_Tlaxcala', 'escuelascercanas', 'id',
       'all_location_amenities', 'entity_Chiapas', 'entity_Hidalgo',
       'entity_Sonora', 'entity_Sinaloa', 'entity_Baja California Sur',
       'type_Villa', 'entity_Veracruz', 'type_Otros', 'entity_Oaxaca',
       'entity_Tamaulipas', 'type_Local en centro comercial',
       'type_Nave industrial', 'entity_Michoacán',
       'type_Inmuebles productivos urbanos', 'ciudad_hash_18',
       'entity_Colima', 'entity_Aguascalientes',
       'type_Terreno industrial', 'type_nan',
      

In [41]:
total_loss = unimportant_features['Importance'].agg(sum)

print(f"Total loss: {total_loss}%")

Total loss: 0.08168040215969086%
