In [1]:
import os
from datetime import datetime

import lightgbm as lgb
import numpy as np
import pandas as pd
    
# Locations of the base train / evaluation tables.
TRAIN_PATH = '../../res/ftr/base_data_train.csv'
EVALUATION_PATH = '../../res/ftr/base_data_evaluation.csv'

# Show up to 30 columns when a frame is displayed.
pd.set_option('display.max_columns', 30)

# Load both base tables and discard the 'Unnamed: 0' column right away.
# NOTE(review): 'Unnamed: 0' is presumably an index column written by an
# earlier to_csv call - confirm against the script that produced the files.
training_set = pd.read_csv(TRAIN_PATH).drop(columns=['Unnamed: 0'])
evaluation_set = pd.read_csv(EVALUATION_PATH).drop(columns=['Unnamed: 0'])

In [2]:
def load_features(train_df, evaluation_df, features_list, features_dir='../../res/ftr/'):
    """Merge precomputed feature files into the train and evaluation frames.

    For every name in ``features_list`` two CSV files are read from
    ``features_dir``: ``<name>_train.csv`` and ``<name>_evaluation.csv``.
    Each one is inner-joined onto the corresponding frame on the ``id``
    column, so rows whose ``id`` is missing from a feature file are dropped.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training frame; must contain an ``id`` column.
    evaluation_df : pandas.DataFrame
        Evaluation frame; must contain an ``id`` column.
    features_list : iterable of str
        Feature file prefixes to load; they are merged in the given order.
    features_dir : str, optional
        Directory that holds the feature CSV files.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The train and evaluation frames with the feature columns appended.
        With an empty ``features_list`` the inputs are returned unchanged.
    """
    for feature in features_list:
        train_ftr = pd.read_csv(os.path.join(features_dir, feature + '_train.csv'))
        evaluation_ftr = pd.read_csv(os.path.join(features_dir, feature + '_evaluation.csv'))

        # Drop the 'Unnamed: 0' column from the feature frames *before*
        # merging, and tolerate files that do not have it, so a feature CSV
        # saved without that column no longer raises KeyError.
        train_ftr = train_ftr.drop(columns=['Unnamed: 0'], errors='ignore')
        evaluation_ftr = evaluation_ftr.drop(columns=['Unnamed: 0'], errors='ignore')

        train_df = train_df.merge(train_ftr, on='id', how='inner')
        evaluation_df = evaluation_df.merge(evaluation_ftr, on='id', how='inner')

    return (train_df, evaluation_df)
    
# Prefixes of the feature files merged into the base tables, in merge order.
features_array = [
    'amenities',
    'avenida_in_direction',
    'encoded_provincia',
    'encoded_tipodepropiedad',
    'feature_hashed_ciudad',
    'mean_precio_encoded_ciudad',
    'mean_precio_encoded_provincia',
    'mean_precio_encoded_tipodepropiedad',
    'metros_totales_y_cubiertos_log',
    'murder_rate_of_entity',
    'provincia_borders_analysis',
    'provincia_economy',
    'qualificative_adjectives_in_description',
    'surface_features',
    'mean_idzona_price',
    'dolar_for_date',
]

# Rebind both frames to their feature-enriched versions.
training_set, evaluation_set = load_features(training_set, evaluation_set, features_array)

In [3]:
# Every column except the target 'precio' is used as a feature.
# NOTE(review): this includes 'id' (the merge key) - confirm that is intended.
training_set_X = training_set.drop(columns=['precio']).values
training_set_Y = training_set['precio'].values

lgb_train = lgb.Dataset(training_set_X, training_set_Y)

params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'max_depth': 16,
    'learning_rate': 0.02,
    'min_child_weight': 25,
    # NOTE(review): per the LightGBM parameter docs, 'subsample'
    # (bagging_fraction) only takes effect when 'subsample_freq'
    # (bagging_freq) is > 0; as written no row subsampling happens.
    # Left unchanged to keep the model output identical - confirm intent.
    'subsample': 0.75,
    'colsample_bytree': 0.5
}

# The number of boosting rounds is set in exactly one place. The params dict
# previously also carried 'n_estimators': 2000, an alias of num_iterations
# that LightGBM uses instead of the num_boost_round argument (with a warning),
# so the old num_boost_round=1000 was never applied. 2000 keeps the effective
# number of rounds unchanged.
gbm = lgb.train(params, lgb_train, num_boost_round=2000)



In [4]:
# Predict prices for the evaluation rows with the trained booster.
# NOTE(review): no validation set or early stopping is configured in the
# training call, so best_iteration presumably selects all trees - confirm.
evaluation_set_prediction = gbm.predict(
    evaluation_set,
    num_iteration=gbm.best_iteration,
)

In [5]:
# Pair each evaluation id with its predicted price.
estimated_prices = pd.DataFrame({
    'id': evaluation_set['id'],
    'target': evaluation_set_prediction,
})

# The file name carries a month-day-year-hour-minute timestamp of the run.
timestamp = datetime.now().strftime("%m-%d-%Y-%H-%M")
export_path = '../../res/reg/lightgbm_estimated_prices-' + timestamp + '.csv'
estimated_prices.to_csv(export_path, header=True, index=False)

estimated_prices

Unnamed: 0,id,target
0,4941,7.348536e+06
1,51775,8.568363e+05
2,115253,2.255800e+06
3,299321,1.080726e+06
4,173570,7.030899e+05
...,...,...
59995,75094,3.668390e+06
59996,171847,6.879884e+05
59997,138313,9.428035e+05
59998,271268,1.580784e+06
