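This notebook trains a LightGBM regressor to predict `SalePrice`. The data comes from the Kaggle competition "House Prices: Advanced Regression Techniques": [https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data)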

In [23]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from hyperopt import Trials, fmin, STATUS_OK
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [92]:
# Read the training CSV and promote the "Id" column to the row index.
train = pd.read_csv("data/train.csv").set_index("Id")
train.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [93]:
# Split the regression target from the predictors WITHOUT mutating `train`.
# An in-place drop makes this cell fail with KeyError('SalePrice') on a second
# run and leaves the frame displayed earlier out of date.
target = train['SalePrice'].values
train_features = train.drop(columns='SalePrice')

# Despite the "_mask" suffix these are plain lists of column names.
numerical_features_mask = train_features.columns[
    (train_features.dtypes == "float64") | (train_features.dtypes == "int64")
].tolist()
label_features_mask = train_features.columns[(train_features.dtypes == "object")].tolist()

# Impute: column mean for numeric gaps, the literal string "None" for
# missing categorical values.
numerical_frame = train_features[numerical_features_mask]
numerical_frame = numerical_frame.fillna(numerical_frame.mean())
label_frame = train_features[label_features_mask].fillna("None")

# Keep the fitted transformers around: any data scored by the model later has
# to go through these same objects, not freshly fitted ones.
scaler = StandardScaler()
ordinal_encoder = OrdinalEncoder()

# NOTE: the means and scaling statistics are computed on every row, before the
# train/validation/test split, so held-out rows influence them.
numerical_features = scaler.fit_transform(numerical_frame)
label_features = ordinal_encoder.fit_transform(label_frame)

# Column layout of the model input: all numeric columns, then all label columns.
features = np.hstack([numerical_features, label_features])

In [94]:
# Both splits hold out 20% with the same fixed seed.
split_options = dict(test_size=0.2, random_state=1)

# First cut: 20% of all rows become the test set.
x_rest, x_test, y_rest, y_test = train_test_split(features, target, **split_options)

# Second cut: 20% of what is left becomes the validation set used during boosting.
x_train, x_val, y_train, y_val = train_test_split(x_rest, y_rest, **split_options)

In [95]:
# Wrap the training and validation folds in LightGBM Dataset objects.
lightgbm_hp_train = lgb.Dataset(data=x_train, label=y_train)
lightgbm_hp_val = lgb.Dataset(data=x_val, label=y_val)

In [104]:
# Regression objective evaluated with RMSE. 'application' is only an alias of
# 'objective', so it is set once here. Every other hyperparameter is left at
# LightGBM's default.
lgbm_best_m_parameters = {
    'objective': 'regression',
    'metric': 'rmse',
}

# valid_sets is documented as a list of Datasets; the validation RMSE is
# reported for each boosting round.
gbm = lgb.train(
    lgbm_best_m_parameters,
    lightgbm_hp_train,
    valid_sets=[lightgbm_hp_val],
)

[1]	valid_0's rmse: 68439.8
[2]	valid_0's rmse: 63326.8
[3]	valid_0's rmse: 58779.4
[4]	valid_0's rmse: 54852.6
[5]	valid_0's rmse: 51263
[6]	valid_0's rmse: 47919.1
[7]	valid_0's rmse: 45326.8
[8]	valid_0's rmse: 42775.3
[9]	valid_0's rmse: 40656.3
[10]	valid_0's rmse: 38633.6
[11]	valid_0's rmse: 36888.8
[12]	valid_0's rmse: 35302.2
[13]	valid_0's rmse: 33784
[14]	valid_0's rmse: 32825.9
[15]	valid_0's rmse: 31682.7
[16]	valid_0's rmse: 30884.2
[17]	valid_0's rmse: 30113
[18]	valid_0's rmse: 29235.8
[19]	valid_0's rmse: 28582.3
[20]	valid_0's rmse: 28012
[21]	valid_0's rmse: 27422.1
[22]	valid_0's rmse: 26997.2
[23]	valid_0's rmse: 26758.7
[24]	valid_0's rmse: 26386.3
[25]	valid_0's rmse: 26093.9
[26]	valid_0's rmse: 25881
[27]	valid_0's rmse: 25745.6
[28]	valid_0's rmse: 25578.9
[29]	valid_0's rmse: 25410.7
[30]	valid_0's rmse: 25226.8
[31]	valid_0's rmse: 25065.7
[32]	valid_0's rmse: 25047.4
[33]	valid_0's rmse: 24975.6
[34]	valid_0's rmse: 24819.3
[35]	valid_0's rmse: 24814
[36]	v

In [105]:
# R^2 on the rows the booster was fitted on.
y_train_pred = gbm.predict(data=x_train)
r2_score(y_true=y_train, y_pred=y_train_pred)

0.9756681263938893

In [106]:
# R^2 on the held-out test split, which took no part in boosting.
y_test_pred = gbm.predict(data=x_test)
r2_score(y_true=y_test, y_pred=y_test_pred)

0.8864495036436278

In [107]:
# Read the competition test CSV and promote the "Id" column to the row index.
test = pd.read_csv("data/test.csv").set_index("Id")
test.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Gar2,12500,6,2010,WD,Normal
1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,3,2010,WD,Normal
1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,144,0,,,,0,1,2010,WD,Normal


In [108]:
def process(train, scaler=None, ordinal_encoder=None,
            numerical_columns=None, label_columns=None):
    """Convert a raw feature frame into the 2-D numeric array fed to the model.

    Numeric columns are mean-imputed and standardised; object columns have
    missing values replaced by the string "None" and are ordinal-encoded. The
    result is ``[numeric columns..., label columns...]`` stacked horizontally.

    With only ``train`` given, the behaviour matches the original one-argument
    function: column lists are detected from dtypes and a fresh
    ``StandardScaler`` / ``OrdinalEncoder`` are fitted on ``train`` itself.

    To score new data with an already trained model, pass the transformers
    and column lists that were produced on the training data. Otherwise the
    scaled values and category codes will not mean the same thing as they did
    during training.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame to transform. It is not modified.
    scaler : fitted StandardScaler, optional
        If given, it is applied with ``transform`` and its ``mean_`` is used
        to fill missing numeric values (assumes the scaler was fitted with
        centring enabled, which is the default).
    ordinal_encoder : fitted OrdinalEncoder, optional
        If given, its ``categories_`` define the code of each category.
        Categories it has never seen are encoded as ``-1``.
    numerical_columns, label_columns : list of str, optional
        Column names, in the order used when the transformers were fitted.
        Detected from dtypes when omitted.

    Returns
    -------
    numpy.ndarray
        Array with one row per row of ``train``.

    Raises
    ------
    ValueError
        If ``ordinal_encoder`` was fitted on a different number of columns
        than ``label_columns`` contains.
    """
    frame = train.copy()  # never write imputed values back into the caller's frame

    if numerical_columns is None:
        numerical_columns = frame.columns[
            (frame.dtypes == "float64") | (frame.dtypes == "int64")
        ].tolist()
    if label_columns is None:
        label_columns = frame.columns[(frame.dtypes == "object")].tolist()

    numeric = frame[numerical_columns]
    labels = frame[label_columns].fillna("None")

    if scaler is None:
        numeric = numeric.fillna(numeric.mean())
        numerical_features = StandardScaler().fit_transform(numeric)
    else:
        # Impute with the statistics the scaler learned, not with this
        # frame's own means.
        fitted_means = pd.Series(scaler.mean_, index=numerical_columns)
        numerical_features = scaler.transform(numeric.fillna(fitted_means))

    if ordinal_encoder is None:
        label_features = OrdinalEncoder().fit_transform(labels)
    else:
        fitted_categories = ordinal_encoder.categories_
        if len(fitted_categories) != len(label_columns):
            raise ValueError(
                "ordinal_encoder was fitted on %d columns but %d label columns were given"
                % (len(fitted_categories), len(label_columns))
            )
        # Look every value up in the fitted category list. The position in
        # that list is the code the encoder assigned, and pandas reports -1
        # for values that are not in the list instead of raising.
        codes = [
            pd.Categorical(labels[column], categories=categories).codes
            for column, categories in zip(label_columns, fitted_categories)
        ]
        if codes:
            label_features = np.column_stack(codes).astype("float64")
        else:
            label_features = np.empty((len(frame), 0))

    features = np.hstack([numerical_features, label_features])
    return features

# Reuse the transformers and column lists fitted on the training data so the
# test matrix is on the same scale and uses the same category codes as the
# data the model was trained on.
# NOTE: this rebinds `features`, which previously held the training matrix.
features = process(
    test,
    scaler=scaler,
    ordinal_encoder=ordinal_encoder,
    numerical_columns=numerical_features_mask,
    label_columns=label_features_mask,
)

In [109]:
# Score the processed competition rows and display the shape of the result.
predictions = gbm.predict(data=features)
predictions.shape

(1459,)

In [110]:
# Start from the sample file so the Id index and column layout match what the
# competition expects, then overwrite its SalePrice column (row by row, in
# file order) with the model output.
sample_submission = pd.read_csv("data/sample_submission.csv", index_col="Id")
sample_submission = sample_submission.assign(SalePrice=predictions)

In [111]:
# Write the submission; the Id index is written as the first column.
sample_submission.to_csv("submission.csv", index=True)