In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_log_error as mse
from sklearn.metrics import make_scorer
import lightgbm as lgbm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform, expon
import pickle

# Plot theme: seaborn dark grid, scaled for notebook display.
sns.set(context="notebook", style="darkgrid")

# A single seed drives NumPy's global RNG and is reused as `random_state`
# by the estimators and the hyperparameter search.
rand_seed = 1009
np.random.seed(rand_seed)

# Presumably the default figure width/height; not referenced by the cells
# visible in this part of the notebook.
xsize = 12.0
ysize = 8.0

# Suppress every warning for the rest of the session. This also hides
# deprecation notices from scikit-learn and LightGBM.
import warnings
warnings.filterwarnings(action='ignore')

In [34]:
# Feature matrices. index_col=False tells pandas not to use the first CSV
# column as the index.
X_train, X_test = (
    pd.read_csv(csv_path, index_col=False)
    for csv_path in ("../data/X_train.csv", "../data/X_test.csv")
)

# Row identifiers: genfromtxt parses them as floats, so cast back to int.
train_id, test_id = (
    np.genfromtxt(id_path).astype(int)
    for id_path in ("../data/train_id.txt", "../data/test_id.txt")
)

# Regression target for the training rows.
y_train = np.genfromtxt("../data/y_train.txt")

In [35]:
def log_rmse(y_true, y_pred):
    """Return the root mean squared logarithmic error (RMSLE).

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values. Negative predictions are scored as 0.0, because
        the logarithmic error is not defined for negative inputs.

    Returns
    -------
    float
        Square root of the mean squared logarithmic error.

    Notes
    -----
    The clamp is applied to a new array, so the caller's ``y_pred`` is never
    modified (the previous in-place assignment silently overwrote the
    predictions held by the caller).
    """
    # np.clip returns a new array and also accepts plain Python sequences.
    clipped_pred = np.clip(y_pred, 0.0, None)
    # `mse` is the notebook's import alias for sklearn's mean_squared_log_error.
    return np.sqrt(mse(y_true, clipped_pred))

# greater_is_better=False makes the scorer return the negated loss, so
# cross-validation scores built from it must be sign-flipped before reporting.
log_rmse_scorer = make_scorer(log_rmse, greater_is_better=False)

In [36]:
# Baseline: LightGBM regressor with its hyperparameters spelled out
# explicitly, scored with 4-fold cross-validation on the training set.
lgbm_regressor = lgbm.LGBMRegressor(
    boosting_type="gbdt", num_leaves=31, max_depth=-1, learning_rate=0.1, n_estimators=100,
    subsample_for_bin=200000, objective=None, class_weight=None, min_split_gain=0.0,
    min_child_weight=0.001, min_child_samples=20, subsample=1.0, subsample_freq=0,
    colsample_bytree=1.0, reg_alpha=0.0, reg_lambda=0.0, random_state=rand_seed, n_jobs=-1,
    silent=True, importance_type="split",
)

# No fit-time `eval_metric` is passed any more. The previous value was the
# scikit-learn scorer object, which is called as (estimator, X, y); LightGBM
# expects a metric name or a (y_true, y_pred) callable, and only evaluates it
# when an `eval_set` is supplied. Model selection is driven by `scoring`.
# error_score="raise" replaces the transitional "raise-deprecating" sentinel:
# a failing fold still raises.
scores = cross_val_score(
    lgbm_regressor, X_train, y_train, groups=None, scoring=log_rmse_scorer, cv=4,
    n_jobs=None, verbose=0, pre_dispatch="2*n_jobs", error_score="raise",
)

# The scorer returns negated losses, hence the sign flip. The reported value
# is the log-scale RMSE computed by log_rmse.
print("RMSE: " + "%.5f" % np.mean(-scores))

RMSE: 0.14236


In [None]:
# Template estimator for the search. RandomizedSearchCV fits *clones* of it,
# so this object itself is never fitted; use `clf` for predictions.
lgbm_regressor = lgbm.LGBMRegressor(
    boosting_type="gbdt", objective=None, class_weight=None, min_split_gain=0.0, subsample=1.0,
    subsample_freq=0, random_state=rand_seed, n_jobs=-1, silent=True, importance_type="split",
)

# Sampling distributions (scipy.stats):
#   randint(low, high)  -> integers in [low, high)
#   uniform(loc, scale) -> floats in [loc, loc + scale]
#   expon(scale=s)      -> non-negative floats with mean s
param_distributions = {
    "num_leaves": randint(12, 48),
    "max_depth": randint(5, 125),
    # NOTE(review): a mean of 2.0 puts most draws far above typical boosting
    # learning rates; confirm this prior is intentional.
    "learning_rate": expon(scale=2.0),
    "n_estimators": randint(50, 250),
    "subsample_for_bin": randint(50000, 400000),
    "min_child_weight": expon(scale=0.5),
    "min_child_samples": randint(5, 30),
    "colsample_bytree": uniform(0.1, 0.9),  # i.e. [0.1, 1.0]
    "reg_alpha": expon(),
    "reg_lambda": expon()
}

# `fit_params` (as a constructor argument) and `iid` were removed from
# scikit-learn, so they are no longer passed. The old fit_params value was
# also the scikit-learn scorer object, which is not a valid LightGBM
# `eval_metric` and was never evaluated without an `eval_set`.
# error_score=-100.0 gives a failing candidate a very poor score instead of
# aborting the whole search.
clf = RandomizedSearchCV(
    lgbm_regressor, param_distributions, n_iter=750, scoring=log_rmse_scorer, n_jobs=1,
    refit=True, cv=4, verbose=1, pre_dispatch="2*n_jobs",
    random_state=rand_seed, error_score=-100.0, return_train_score=True,
)
clf.fit(X_train, y_train)

Fitting 4 folds for each of 750 candidates, totalling 3000 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [None]:
# In-sample check of the refit best model. This is measured on the data it
# was trained on, so it is an optimistic estimate.
y_pred = clf.predict(X_train)
print("RMSE: {:.5f}".format(log_rmse(y_train, y_pred)))

In [None]:
# Persist the best estimator found by the search. The context manager closes
# the file even if pickling raises, and avoids leaving a handle bound to the
# name `file`.
with open("../models/lightgbm_regression.pkl", "wb") as model_file:
    pickle.dump(clf.best_estimator_, model_file)

In [None]:
# Predict with the tuned model. `clf` was created with refit=True, so
# clf.predict delegates to the best estimator refit on the full training set.
# `lgbm_regressor` is only the unfitted template the search cloned, so
# predicting with it fails on a fresh kernel.
y_pred = clf.predict(X_test)
submission_df = pd.DataFrame(data={"Id": test_id, "SalePrice": y_pred})
submission_df.to_csv("../submissions/lightgbm_model_24_12_2018.csv", index=False)