In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import mean_absolute_error
from hyperopt import hp, tpe, Trials
from hyperopt.fmin import fmin
import hyperopt
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score as r2_score
import warnings
warnings.filterwarnings('ignore')


# Load the raw listings table from the project-relative data directory.
df = pd.read_csv('data/all_v2.csv')
# Preview the first rows as this cell's rich output.
df.head()

Unnamed: 0,price,date,time,geo_lat,geo_lon,region,building_type,level,levels,rooms,area,kitchen_area,object_type
0,6050000,2018-02-19,20:00:21,59.805808,30.376141,2661,1,8,10,3,82.6,10.8,1
1,8650000,2018-02-27,12:04:54,55.683807,37.297405,81,3,5,24,2,69.1,12.0,1
2,4000000,2018-02-28,15:44:00,56.29525,44.061637,2871,1,5,9,3,66.0,10.0,1
3,1850000,2018-03-01,11:24:52,44.996132,39.074783,2843,4,12,16,2,38.0,5.0,11
4,5450000,2018-03-01,17:42:43,55.918767,37.984642,81,3,13,14,2,60.0,10.0,1


# Preprocess data

In [2]:
# Keep rows whose price lies strictly between 100000 and 500000000 and whose
# `rooms` value is not -2, then renumber the rows 0..n-1.
df = df.query('price > 100000 and price < 500000000 and rooms != -2')
df.index = np.arange(len(df))

# Order the rows chronologically, keep only the calendar year as a feature and
# discard the raw date/time columns.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df = df.sort_values(by='date')
df['year'] = df['date'].dt.year
df = df.drop(columns=['date', 'time'])

# Columns that are label-encoded and treated as categorical later on.
categoricals = ['building_type', 'object_type', 'region', 'year']

In [3]:
def calc_mean_room_area(df):
    """Return the non-kitchen area divided by the absolute room count.

    Reads the ``area``, ``kitchen_area`` and ``rooms`` entries of ``df`` and
    computes ``(area - kitchen_area) / abs(rooms)`` element-wise.
    """
    non_kitchen_area = df['area'] - df['kitchen_area']
    # abs() because the room count can be stored as a negative code (e.g. -1).
    room_count = abs(df['rooms'])
    return non_kitchen_area / room_count

In [4]:
# Derived ratio features: average non-kitchen area per room, kitchen share of
# the total area, and the floor relative to the number of floors.
df = df.assign(
    mean_room_area=calc_mean_room_area(df),
    percent_of_kitchen_area=df['kitchen_area'] / df['area'],
    percent_of_level=df['level'] / df['levels'],
)

In [5]:
# Replace the values of every categorical column with integer codes.
# A fresh LabelEncoder is fitted per column on the whole frame (this cell runs
# before the hold-out split), so train and test rows share one code mapping.
for col in categoricals:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

In [6]:
# The data is sorted by time (see the sort_values call on 'date' earlier).
# For the hold-out set, take the last 100000 rows.
# Slice by position: after sorting, df.index still holds the pre-sort labels,
# so passing those labels to .iloc would pick rows by label value instead of by
# their place in the chronological order.
# .copy() makes both parts independent frames, since columns are added to each later.
df_test = df.iloc[-100000:].copy()
df = df.iloc[:-100000].copy()

print(f"Test target mean value : {df_test['price'].mean()}")
print(f"Train target mean value : {df['price'].mean()}")

Test target mean value : 6137584.3326
Train target mean value : 4361977.834525038


In [7]:
# Regions whose mean price exceeds this threshold are flagged as expensive.
thr = 5e6
# Mean price per region, computed once and only on the training part (`df`),
# so the flag does not use hold-out prices.
region_mean_price = df.groupby('region')['price'].mean()
expensive_regions = region_mean_price.index[region_mean_price > thr]
# Vectorised membership test instead of a Python-level lambda per row.
df['expensive_region'] = df['region'].isin(expensive_regions).astype(np.int8)
df_test['expensive_region'] = df_test['region'].isin(expensive_regions).astype(np.int8)

In [8]:
# Renumber the hold-out rows 0..n-1.
df_test.index = np.arange(len(df_test))

# Utils

In [9]:
def rmse(y_true, y_pred):
    """Root mean squared error: the square root of sklearn's mean_squared_error."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)


def pred_map(pred_df):
    """Scatter predicted against true values and annotate MAE, RMSE and R2.

    ``pred_df`` must provide ``y_true`` and ``y_pred`` columns. The plot is drawn
    on a new 8x8 figure with identical x/y limits and a red identity line;
    nothing is returned.
    """
    truth, guess = pred_df['y_true'], pred_df['y_pred']
    mae_value = mean_absolute_error(truth, guess)
    rmse_value = rmse(truth, guess)
    r2_value = r2_score(truth, guess)

    plt.figure(figsize=(8, 8))
    ax = plt.subplot(111)
    ax.scatter('y_true', 'y_pred', data=pred_df)
    ax.set_xlabel('True Value', fontsize=15)
    ax.set_ylabel('Predicted Value', fontsize=15)

    # Shared limits for both axes: overall min/max over every column of
    # pred_df, padded by 0.1 on each side.
    lower = pred_df.min().min() - 0.1
    upper = pred_df.max().max() + 0.1
    ax.set_xlim(lower, upper)
    ax.set_ylim(lower, upper)

    # Identity line y = x spanning the shared limits.
    diagonal = np.linspace(lower, upper, 2)
    ax.plot(diagonal, diagonal, 'r-')

    # Metric annotations stacked in the upper-left corner (axes coordinates).
    annotations = (
        ('MAE = {}', mae_value, 0.9),
        ('RMSE = {}', rmse_value, 0.8),
        ('R2 = {}', r2_value, 0.7),
    )
    for template, value, y_pos in annotations:
        plt.text(0.1, y_pos, template.format(str(round(value, 5))),
                 transform=ax.transAxes, fontsize=15)

In [10]:
#check if given parameter can be interpreted as a numerical value
def is_number(s):
    """Return True when ``float(s)`` succeeds, False otherwise.

    ``None`` and values that ``float`` rejects, either by value (``"log2"``)
    or by type (lists, dicts), are reported as non-numeric instead of raising.
    """
    if s is None:
        return False
    try:
        float(s)
    except (TypeError, ValueError):
        # ValueError: unparsable string; TypeError: unsupported type such as
        # a list or a nested choice dict.
        return False
    return True

#convert given set of parameters to integer values
#this at least cuts the excess float decimals if they are there
def convert_int_params(names, params):
    """Cast the numeric entries of ``params`` listed in ``names`` to ``int``.

    Parameters
    ----------
    names : iterable of str
        Keys of ``params`` to convert; every key must exist in ``params``.
    params : dict
        Parameter dict, modified in place.

    Returns
    -------
    dict
        The same ``params`` object.

    Non-numeric values (``None``, option names such as "log2") are left
    untouched. NaN raises ``ValueError`` and infinity raises ``OverflowError``,
    as ``int()`` does.
    """
    for name in names:
        #sometimes the parameters can be choices between options or numerical values. like "log2" vs "1-10"
        value = params[name]
        if not is_number(value):
            continue
        # int("3.0") raises, so textual numbers are parsed as floats first.
        if isinstance(value, (str, bytes)):
            value = float(value)
        params[name] = int(value)
    return params

#convert float parameters to 3 digit precision strings
#just for simpler display and all
def convert_float_params(names, params):
    """Replace the numeric entries of ``params`` listed in ``names`` with '%.3f' strings.

    Parameters
    ----------
    names : iterable of str
        Keys of ``params`` to format; every key must exist in ``params``.
    params : dict
        Parameter dict, modified in place.

    Returns
    -------
    dict
        The same ``params`` object. Non-numeric values are left untouched.
    """
    for name in names:
        value = params[name]
        if not is_number(value):
            continue
        # The 'f' format code rejects str/bytes, so textual numbers are parsed first.
        if isinstance(value, (str, bytes)):
            value = float(value)
        params[name] = '{:.3f}'.format(value)
    return params


# --- cross-validation / search sizing ---
# number of CV folds run on the data
n_folds = 5
# number of trials hyperopt is allowed to run
n_trials = 200
# optional cap on the number of rows of X and y that are used;
# lower it to cut run time and compare options faster, None keeps every row
max_n = None

# --- logging ---
# passed to LGBM as `verbose`: how often progress is printed, e.g. 100 prints
# progress every 100 rounds; 0 is presumably quiet
verbosity = 0
# when True, short per-run summaries are printed
print_summary = False

# --- history of every CV run (score and the parameters that produced it) ---
all_scores, all_params = [], []

# run n_folds of cross validation on the data
# averages fold results
def fit_cv(X, y, params, fit_params):
    """Run shuffled K-fold CV with an LGBMRegressor and return the mean fold MAE.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : array-like
        Targets aligned with ``X`` by position.
    params : dict
        Keyword arguments for ``lgb.LGBMRegressor``.
    fit_params : dict
        Extra keyword arguments forwarded to ``LGBMRegressor.fit``.

    Returns
    -------
    float
        Sum of the per-fold MAE values divided by ``n_folds``.

    Notes
    -----
    Reads the module-level ``max_n``, ``n_folds``, ``verbosity`` and
    ``print_summary``. Appends the score and ``params`` to ``all_scores`` and
    ``all_params``. Each validation fold is also handed to ``fit`` as
    ``eval_set``.
    """
    # cut the data if max_n is set
    if max_n is not None:
        X = X[:max_n]
        y = y[:max_n]

    y = np.array(y)

    # The categorical columns are the same for every fold, so resolve them once.
    # Only names present in X are passed on, so a feature subset that omits
    # one of them can still be trained.
    categoricals = [
        col
        for col in ('building_type', 'object_type', 'region', 'year', 'expensive_region')
        if col in X.columns
    ]

    score = 0
    # Fixed random_state keeps the folds identical across hyperopt trials.
    folds = KFold(n_splits=n_folds, shuffle=True, random_state=69)

    if print_summary:
        print(f"Running {n_folds} folds...")
    oof_preds = np.zeros((X.shape[0]))
    for i, (train_index, test_index) in enumerate(folds.split(X, y)):
        if verbosity > 0:
            print('-' * 20, f"RUNNING FOLD: {i}/{n_folds}", '-' * 20)

        model = lgb.LGBMRegressor(**params)
        X_train, y_train = X.iloc[train_index], y[train_index]
        X_test, y_test = X.iloc[test_index], y[test_index]
        #if 100 it prints progress 100,200,300,... iterations
        model.fit(X_train, y_train, eval_set=(X_test, y_test),
                  verbose=verbosity, **fit_params, categorical_feature=categoricals)
        oof_preds[test_index] = model.predict(X_test)
        score += mean_absolute_error(y_test, oof_preds[test_index])

    total_score = score / n_folds
    all_scores.append(total_score)
    all_params.append(params)
    if print_summary:
        print(f"total mae: {total_score}")
    return total_score

def create_fit_params(params):
    """Build the ``fit`` keyword arguments and set ``params['n_estimators']`` in place.

    Reads ``params['boosting_type']`` and ``params['objective']``. The
    evaluation metric is always RMSE (wrapped in a list when the objective
    itself is "rmse"). "dart" runs get a fixed 2000 estimators and no early
    stopping; every other booster gets 10000 estimators with early stopping
    after 100 rounds.

    Returns the dict of ``fit`` keyword arguments.
    """
    is_dart = params['boosting_type'] == "dart"
    # https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst
    eval_metric = ["rmse"] if params["objective"] == "rmse" else "rmse"
    fit_params = {"eval_metric": eval_metric}

    if is_dart:
        params["n_estimators"] = 2000
        return fit_params

    params["n_estimators"] = 10000
    fit_params["early_stopping_rounds"] = 100
    return fit_params


# this is the objective function the hyperopt aims to minimize
# it is called objective_sklearn because the lgbm functions called use the sklearn API
def objective_sklearn(params):
    """Score one sampled parameter set with cross-validation.

    Casts the integer-valued parameters, unwraps the sampled boosting type,
    derives the ``fit`` arguments and runs ``fit_cv`` on the module-level
    ``X`` and ``y``.

    Returns a dict with the CV score under both "loss" and "score", the final
    ``params`` and a hyperopt OK status.
    """
    params = convert_int_params(
        ["num_leaves", "min_child_samples", "subsample_for_bin", "min_data_in_leaf", "bagging_freq"],
        params,
    )

    # The sampled boosting type is a dict keyed by 'boosting_type'; keep only the name.
    params['boosting_type'] = params['boosting_type']['boosting_type']

    fit_params = create_fit_params(params)
    score = fit_cv(X, y, params, fit_params)

    if verbosity != 0:
        print("Score {:.3f} params {}".format(score, params))
    elif print_summary:
        print("Score {:.3f}".format(score))

    return {"loss": score, "score": score, "params": params, 'status': hyperopt.STATUS_OK}

def optimize_lgbm(max_n_search=None):
    """Search LightGBM hyper-parameters with hyperopt's TPE and return the best set.

    Parameters
    ----------
    max_n_search : int or None
        Row cap stored in the module-level ``max_n`` for the duration of the
        search (``fit_cv`` reads it); ``None`` means no cap.

    Returns
    -------
    dict
        The ``params`` dict stored in the result of the lowest-loss trial.

    Notes
    -----
    Runs ``n_trials`` evaluations of ``objective_sklearn``. The global
    ``max_n`` is reset to ``None`` when the search ends, even if it is
    interrupted or raises. Prints the best trial index, its record and its
    parameters.
    """
    global max_n

    # https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst
    # https://indico.cern.ch/event/617754/contributions/2590694/attachments/1459648/2254154/catboost_for_CMS.pdf
    # NOTE(review): hp.quniform rounds to a multiple of q, so the 'min_data_in_leaf'
    # and 'bagging_freq' ranges below can yield 0 (the recorded best trial shows
    # 'min_data_in_leaf': 0) - confirm that this is intended.
    space = {
        #this is just piling on most of the possible parameter values for LGBM
        #some of them apparently don't make sense together, but works for now.. :)
        'boosting_type': hp.choice('boosting_type',
                                   [{'boosting_type': 'gbdt',
                                     }]),
        'num_leaves': hp.quniform('num_leaves', 127, 511, 8),
        'learning_rate': hp.uniform('learning_rate', 0.05, 0.2),
        'subsample_for_bin': hp.quniform('subsample_for_bin', 100, 5000, 100),
        'feature_fraction': hp.uniform('feature_fraction', 0.8, 1),
        'bagging_fraction': hp.uniform('bagging_fraction', 0.8, 1), #alias "subsample"
        'min_data_in_leaf': hp.quniform('min_data_in_leaf', 10, 251, 50),
        'lambda_l1': hp.uniform('lambda_l1', 0, 0.3),
        'lambda_l2': hp.uniform('lambda_l2', 0, 0.3),
        'verbose': -1,
        'seed': 42,
        'bagging_freq': hp.quniform('bagging_freq', 1, 30, 5),
        #the LGBM parameters docs list various aliases, and the LGBM implementation seems to complain about
        #the following not being used due to other params, so trying to silence the complaints by setting to None
#         'subsample': None, #overridden by bagging_fraction
#         'reg_alpha': None, #overridden by lambda_l1
#         'reg_lambda': None, #overridden by lambda_l2
        'min_sum_hessian_in_leaf': None, #overrides min_child_weight
        'min_child_samples': None, #overridden by min_data_in_leaf
        'colsample_bytree': None, #overridden by feature_fraction
#        'min_child_samples': hp.quniform('min_child_samples', 20, 500, 5),
        'min_child_weight': hp.loguniform('min_child_weight', -16, 10), #also aliases to min_sum_hessian
        'metric': 'mae',
    }
    space['objective'] = "regression"

    max_n = max_n_search
    trials = Trials()
    try:
        fmin(fn=objective_sklearn,
             space=space,
             algo=tpe.suggest,
             max_evals=n_trials,
             trials=trials,
             verbose=1)
    finally:
        # Never leave the row cap behind: an interrupted or failed search would
        # otherwise keep truncating every later fit_cv call.
        max_n = None

    # find the trial with lowest loss value. this is what we consider the best one
    idx = np.argmin(trials.losses())
    print(idx)

    print(trials.trials[idx])

    # these should be the training parameters to use to achieve the best score in best trial
    params = trials.trials[idx]["result"]["params"]

    print('==============================')
    print('= PARAMS')
    print('==============================')
    print(params)
    return params

# run a search
def run_lgb(X_cols, df_train, df_test, y_param, max_n=60000):
    """Publish the training data as globals and run the hyper-parameter search.

    Parameters
    ----------
    X_cols : list-like of str
        Feature columns selected from ``df_train``.
    df_train : pandas.DataFrame
        Training frame.
    df_test : pandas.DataFrame
        Not used by the search; kept so existing calls keep working.
    y_param : array-like
        Training target, stored in the module-level ``y``.
    max_n : int or None
        Row cap forwarded to ``optimize_lgbm``.

    Returns
    -------
    dict
        The parameters returned by ``optimize_lgbm`` (also printed).
    """
    # objective_sklearn reads the data from these module-level names.
    global X
    global y
    y = y_param
    X = df_train[X_cols]

    # the param is the number of rows to use for training
    params = optimize_lgbm(max_n)
    print(params)

    return params

In [11]:
# Every column except the target is a feature. `columns=` replaces the
# positional axis argument, which pandas 2.x no longer accepts.
cols = df.drop(columns=['price']).columns
# Long-running cell: the recorded run below took about 1h40m for 200 trials.
params = run_lgb(cols, df, df_test, df['price'])

100%|██████████| 200/200 [1:39:09<00:00, 29.75s/trial, best loss: 644036.1581950441]
59
{'state': 2, 'tid': 59, 'spec': None, 'result': {'loss': 644036.1581950441, 'score': 644036.1581950441, 'params': {'bagging_fraction': 0.8876911520616036, 'bagging_freq': 5, 'boosting_type': 'gbdt', 'colsample_bytree': None, 'feature_fraction': 0.8568055674122264, 'lambda_l1': 0.11085417960346233, 'lambda_l2': 0.21631191989432652, 'learning_rate': 0.15926206787675018, 'metric': 'mae', 'min_child_samples': None, 'min_child_weight': 0.0002629359348261388, 'min_data_in_leaf': 0, 'min_sum_hessian_in_leaf': None, 'num_leaves': 328, 'objective': 'regression', 'seed': 42, 'subsample_for_bin': 2800, 'verbose': -1, 'n_estimators': 10000}, 'status': 'ok'}, 'misc': {'tid': 59, 'cmd': ('domain_attachment', 'FMinIter_Domain'), 'workdir': None, 'idxs': {'bagging_fraction': [59], 'bagging_freq': [59], 'boosting_type': [59], 'feature_fraction': [59], 'lambda_l1': [59], 'lambda_l2': [59], 'learning_rate': [59], 'min