In [6]:
import pandas as pd
import lightgbm as lgb
import numpy as np
from datetime import datetime

from sklearn.externals import joblib
    
TRAIN_PATH = '../../res/ftr/base_data_train.csv'
EVALUATION_PATH = '../../res/ftr/base_data_evaluation.csv'

pd.set_option('display.max_columns', 30)

training_set = pd.read_csv(TRAIN_PATH)
evaluation_set = pd.read_csv(EVALUATION_PATH)

training_set.drop(columns=['Unnamed: 0'], inplace=True)
evaluation_set.drop(columns=['Unnamed: 0'], inplace=True)



In [2]:
def load_features(train_df, evaluation_df, features_list):
    for feature in features_list:
        test_ftr = pd.read_csv('../../res/ftr/'+feature+'_train.csv')
        evaluation_ftr = pd.read_csv('../../res/ftr/'+feature+'_evaluation.csv')
    
        train_df = train_df.merge(test_ftr, on='id', how='inner').drop(columns=['Unnamed: 0'])
        evaluation_df = evaluation_df.merge(evaluation_ftr, on='id', how='inner').drop(columns=['Unnamed: 0'])
    
    return (train_df, evaluation_df)
    
features_array = ['amenities', 'avenida_in_direction', 'encoded_provincia', 'encoded_tipodepropiedad',
                  'feature_hashed_ciudad', 'mean_precio_encoded_ciudad', 'mean_precio_encoded_provincia',
                  'mean_precio_encoded_tipodepropiedad', 'metros_totales_y_cubiertos_log', 'murder_rate_of_entity',
                  'provincia_borders_analysis', 'provincia_economy', 'qualificative_adjectives_in_description',
                  'surface_features', 'mean_idzona_price','security_words_in_description',
                  'metrostotales_bigger_than_metroscubiertos']
(training_set, evaluation_set) = load_features(training_set, evaluation_set, features_array)

In [3]:
training_set_X = training_set.drop(columns=['precio']).values
training_set_Y = training_set['precio'].values

lgb_train = lgb.Dataset(training_set_X, training_set_Y)

gbm = lgb.LGBMRegressor(boosting_type='gbdt', objective='regression', max_depth=10, n_estimators=3000, learning_rate=0.015,
                        min_child_weight=25, subsample=0.75, colsample_bytree=0.5)

gbm.fit(training_set_X, training_set_Y)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.5,
              importance_type='split', learning_rate=0.015, max_depth=10,
              min_child_samples=20, min_child_weight=25, min_split_gain=0.0,
              n_estimators=3000, n_jobs=-1, num_leaves=31,
              objective='regression', random_state=None, reg_alpha=0.0,
              reg_lambda=0.0, silent=True, subsample=0.75,
              subsample_for_bin=200000, subsample_freq=0)

In [4]:
evaluation_set_prediction = gbm.predict(evaluation_set.values)

In [7]:
estimated_prices = pd.DataFrame()
estimated_prices['id'] = evaluation_set['id']
estimated_prices['target'] = evaluation_set_prediction

prices_export_path = '../../res/reg/LightGBM/regressions/lightgbm_estimated_prices-'+datetime.now().strftime("%m-%d-%Y-%H-%M")+'.csv'
regressor_export_path = '../../res/reg/LightGBM/lightgbm_regressor'

estimated_prices.to_csv(prices_export_path, index=False, header=True)
joblib.dump(gbm, regressor_export_path)

estimated_prices

Unnamed: 0,id,target
0,4941,7.146841e+06
1,51775,8.740380e+05
2,115253,2.309505e+06
3,299321,1.124081e+06
4,173570,7.098178e+05
...,...,...
59995,75094,3.621106e+06
59996,171847,7.040239e+05
59997,138313,9.982874e+05
59998,271268,1.571560e+06
