In [39]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import gc
from sklearn.linear_model import LinearRegression
import random
import datetime as dt

# Tunable settings
# Share of the final blend given to the XGBoost predictions; the LightGBM
# predictions receive the remaining (1 - XGB_WEIGHT).
XGB_WEIGHT = 0.8


In [40]:
print("Reading data...")
# Property files for both years, stacked row-wise into one frame.
property_files = ('input/properties_2016.csv', 'input/properties_2017.csv')
list_p = [pd.read_csv(path) for path in property_files]
prop = pd.concat(list_p)

# Training files for both years, stacked the same way.
training_files = ('input/train_2016_v2.csv', 'input/train_2017.csv')
list_t = [pd.read_csv(path) for path in training_files]
train = pd.concat(list_t)
print("finished")

Reading data...


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


finished


In [41]:
print("drop out outliers...")
# Keep only the rows whose logerror lies strictly between the 1st and the
# 99th percentile of the training target.
logerror_values = train.logerror.values
ulimit = np.percentile(logerror_values, 99)
llimit = np.percentile(logerror_values, 1)
within_limits = (train.logerror < ulimit) & (train.logerror > llimit)
train = train[within_limits]

drop out outliers...


In [42]:
print( "Processing data for LightGBM..." )
# Downcast every float64 property column to float32 to reduce memory use.
# The conversion is done column by column so only one column is copied at a time.
for col in prop.columns[(prop.dtypes == np.float64).values]:
    prop[col] = prop[col].astype(np.float32)

# Attach the property attributes to each training row.
# NOTE(review): `prop` is the concatenation of the 2016 and 2017 property files
# (see the loading cell), so a parcelid may appear more than once and this merge
# may then yield several rows per training record -- confirm that is intended.
df_train = train.merge(prop, how='left', on='parcelid')

# Impute missing numeric values with the column median.
# numeric_only=True is required because the merged frame still contains
# non-numeric columns; without it recent pandas versions raise a TypeError
# instead of silently skipping those columns.
df_train = df_train.fillna(df_train.median(numeric_only=True))

# Features fed to the model (order matters: it is reused for the test set).
feature_columns = [
    'basementsqft', 'buildingqualitytypeid', 'threequarterbathnbr', 'finishedfloor1squarefeet',
    'finishedsquarefeet6', 'finishedsquarefeet15', 'fireplacecnt', 'fullbathcnt', 'garagecarcnt',
    'garagetotalsqft', 'lotsizesquarefeet', 'poolsizesum', 'roomcnt', 'yearbuilt', 'latitude', 'longitude',
]
x_train = df_train[feature_columns]
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)

Processing data for LightGBM...
(329058, 16) (329058,)


In [43]:
# Build the LightGBM training dataset.
# Remember the feature columns (and their order) so the test set can be
# sliced identically later on.
train_columns = x_train.columns

# Replace every object-typed feature by a boolean "equals True" flag.
object_columns = x_train.columns[(x_train.dtypes == object).values]
for col in object_columns:
    x_train[col] = x_train[col] == True

# LightGBM receives a plain float32 matrix plus the target vector.
x_train = x_train.values.astype(np.float32, copy=False)
d_train = lgb.Dataset(x_train, label=y_train)

In [44]:
# LightGBM hyper-parameters (comments give the canonical parameter names
# for the aliases used as keys).
params = {
    'max_bin': 10,
    'learning_rate': 0.0022,        # shrinkage_rate
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l1',
    'sub_feature': 0.35,            # feature_fraction (small values => use very different submodels)
    'bagging_fraction': 0.85,       # sub_row
    'bagging_freq': 40,
    'num_leaves': 512,              # num_leaf
    'min_data': 500,                # min_data_in_leaf
    'min_hessian': 0.05,            # min_sum_hessian_in_leaf
    'verbose': 0,
    'feature_fraction_seed': 2,
    'bagging_seed': 3,
}

# Fix the global NumPy and stdlib random generators before training.
np.random.seed(0)
random.seed(0)

print("Fitting LightGBM model ...")
# 430 boosting iterations.
lgb_model = lgb.train(params, d_train, num_boost_round=430)
print("finished")

Fitting LightGBM model ...
finished


In [45]:
# Persist the trained LightGBM booster to a text file (path is relative to the
# working directory) so it can be reloaded without retraining.
lgb_model.save_model('lgb_model_logerror.txt')

<lightgbm.basic.Booster at 0x1a2fa8b828>

In [46]:
# The training matrix and its LightGBM wrapper are no longer needed once the
# model is fitted; drop them and run the garbage collector after each delete.
del d_train
gc.collect()
del x_train
gc.collect()

0

In [47]:
# Rebuild the booster from the file written by the earlier save_model() call;
# from here on `lgb_model` is the reloaded model, not the in-memory one.
lgb_model = lgb.Booster(model_file='lgb_model_logerror.txt')

In [48]:
print("Preparing for LightGBM prediction ...")
print("Reading files ...")
# The sample submission supplies the list of parcels to predict for
# (its 'ParcelId' column is used as the merge key in the next cell).
sample = pd.read_csv('input/sample_submission.csv')

Preparing for LightGBM prediction ...
Reading files ...


In [49]:
print("...")
# Mirror the submission id under the lowercase key used for the merge below.
sample['parcelid'] = sample['ParcelId']
print("Merge with property data ...")
# Left join keeps every submission row even when no property record matches.
# NOTE(review): `prop` stacks two yearly property files, so a parcelid may match
# more than one property row here -- confirm the resulting row count is intended.
df_test = pd.merge(sample, prop, how='left', on='parcelid')
print("...")
# Both merge inputs can be released now.
del sample, prop
gc.collect()

...
Merge with property data ...
...


35

In [50]:
print("...")
# Keep exactly the feature columns (same order) that the training matrix was built from.
x_test = df_test[train_columns]
print("...")
# The merged frame is no longer needed once the features are extracted.
del df_test; gc.collect()

...
...


7

In [51]:
print("Preparing x_test...")
# Same treatment as for the training features: every object-typed column
# becomes a boolean "equals True" flag.
object_columns = x_test.columns[(x_test.dtypes == object).values]
for col in object_columns:
    x_test[col] = x_test[col] == True

Preparing x_test...


In [52]:
print("...")
# Convert the feature frame into a float32 NumPy array, matching the dtype of
# the training matrix; copy=False lets NumPy skip the copy when none is needed.
x_test = x_test.values.astype(np.float32, copy=False)

...


In [53]:
print("Start LightGBM prediction ...")
# One prediction per row of x_test, produced by the reloaded booster.
lgb_pred = lgb_model.predict(x_test)

Start LightGBM prediction ...


In [54]:
# Sanity check: show which container type predict() returned.
print(type(lgb_pred))

<class 'numpy.ndarray'>


In [55]:
# The test matrix has been scored; release it before the data is loaded again
# for the XGBoost pipeline.
del x_test; gc.collect()

# Preview the first few LightGBM predictions.
print( "first LightGBM predictions:" )
print( pd.DataFrame(lgb_pred).head())

first LightGBM predictions:
          0
0  0.012263
1  0.012263
2  0.012915
3  0.012915
4  0.012387


In [56]:
print("Reading data...")
# Second load of the raw inputs for the XGBoost pipeline: `prop` was deleted
# after the LightGBM merge and `train` is re-read so it starts unfiltered.
list_p = [
    pd.read_csv(csv_path)
    for csv_path in ('input/properties_2016.csv', 'input/properties_2017.csv')
]
prop = pd.concat(list_p)

list_t = [
    pd.read_csv(csv_path)
    for csv_path in ('input/train_2016_v2.csv', 'input/train_2017.csv')
]
train = pd.concat(list_t)
print("finished")

Reading data...


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


finished


In [57]:
print("drop out outliers...")
# Trim the target's tails: rows at or beyond the 1st / 99th percentile of
# logerror are discarded (both bounds are exclusive).
target_values = train.logerror.values
ulimit = np.percentile(target_values, 99)
llimit = np.percentile(target_values, 1)
keep_rows = (train.logerror < ulimit) & (train.logerror > llimit)
train = train[keep_rows]

drop out outliers...


In [58]:
print( "processing data for XGBoost ...")
# For every property column: missing values become -1, and object-typed
# columns are additionally mapped to integer codes with a LabelEncoder.
for col in prop.columns:
    prop[col] = prop[col].fillna(-1)
    if prop[col].dtype != 'object':
        continue
    # Materialise the values once; the same list is used to fit and to encode.
    raw_values = list(prop[col].values)
    lbl = LabelEncoder()
    lbl.fit(raw_values)
    prop[col] = lbl.transform(raw_values)
print("finished")

processing data for XGBoost ...
finished


In [59]:
# Attach the (already -1-filled, label-encoded) property attributes to each training row.
# NOTE(review): `prop` is the concatenation of the 2016 and 2017 property files
# (see the loading cell), so a parcelid may appear more than once and this merge
# may then yield several rows per training record -- confirm that is intended.
df_train = train.merge(prop, how='left', on='parcelid')

# Impute any remaining missing numeric values with the column median.
# numeric_only=True is required because the columns coming from `train` are not
# all numeric; without it recent pandas versions raise a TypeError instead of
# silently skipping the non-numeric columns.
df_train = df_train.fillna(df_train.median(numeric_only=True))

# Same feature list (and order) as used for the LightGBM model.
feature_columns = [
    'basementsqft', 'buildingqualitytypeid', 'threequarterbathnbr', 'finishedfloor1squarefeet',
    'finishedsquarefeet6', 'finishedsquarefeet15', 'fireplacecnt', 'fullbathcnt', 'garagecarcnt',
    'garagetotalsqft', 'lotsizesquarefeet', 'poolsizesum', 'roomcnt', 'yearbuilt', 'latitude', 'longitude',
]
x_train = df_train[feature_columns]
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)
# Mean of the target; used as XGBoost's base_score in the parameter cell.
y_mean = np.mean(y_train)

(329058, 16) (329058,)


In [77]:
# Set the XGBoost hyper-parameters and wrap the training data in a DMatrix.
print("setting up data for XGBoost...")
# xgboost params
# NOTE(review): newer XGBoost releases name the squared-error objective
# 'reg:squarederror' and use 'verbosity' instead of 'silent' -- confirm both
# keys against the installed version before upgrading.
xgb_params = {
    'eta': 0.038,
    'max_depth': 5,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'lambda': 0.8,   
    'alpha': 0.4, 
    # Start boosting from the mean of the training target.
    'base_score': y_mean,
    'silent': 1
}
# Number of boosting iterations passed to xgb.train.
num_boost_rounds = 250
# Training features and target in XGBoost's own data container.
dtrain = xgb.DMatrix(x_train, y_train)

setting up data for XGBoost...


In [78]:
# Fit the XGBoost model.
print("training xgboost...")
# Train on a copy of the parameter dict with 'silent' forced to 1, leaving
# xgb_params itself untouched.
train_params = dict(xgb_params)
train_params['silent'] = 1
xgb_model = xgb.train(train_params, dtrain, num_boost_round=num_boost_rounds)
print("finished")

training xgboost...
finished


In [62]:
# Persist the trained XGBoost booster to disk (path is relative to the working directory).
xgb_model.save_model('xgb_model_logerror.txt')


In [64]:
# This cell belongs to the XGBoost pipeline; the progress message previously
# said "LightGBM" (copy-paste from the earlier section), which made the log
# misleading about which model is being scored.
print("Preparing for XGBoost prediction ...")
print("Reading files ...")
# The sample submission supplies the list of parcels to predict for
# (its 'ParcelId' column is used as the merge key in the next cell).
sample = pd.read_csv('input/sample_submission.csv')


Preparing for LightGBM prediction ...
Reading files ...


In [65]:
print("...")
# Expose the submission id under the lowercase key the join below uses.
sample['parcelid'] = sample['ParcelId']
print("Merge with property data ...")
# Left join: every submission row is kept, matched to the encoded property data.
# NOTE(review): `prop` stacks two yearly property files, so a parcelid may match
# more than one property row here -- confirm the resulting row count is intended.
df_test = pd.merge(sample, prop, how='left', on='parcelid')
print("...")
# Release the join inputs.
del sample, prop
gc.collect()

...
Merge with property data ...
...


169

In [66]:
print("...")
# Reuse `train_columns`, captured when the LightGBM training matrix was built;
# the XGBoost training frame was sliced with the same feature list.
x_test = df_test[train_columns]
print("...")
# The merged frame is no longer needed once the features are extracted.
del df_test; gc.collect()

...
...


7

In [67]:
# Wrap the test features in XGBoost's DMatrix container for prediction.
dtest = xgb.DMatrix(x_test)

In [79]:
print( "start XGBoost prediction...")
# Score the test DMatrix with the trained XGBoost model.
xgb_pred = xgb_model.predict(dtest)


start XGBoost prediction...


In [80]:
# Preview the first few raw XGBoost predictions (before blending).
print( "first XGBoost predictions:" )
print( pd.DataFrame(xgb_pred).head() )

first XGBoost predictions:
          0
0  0.024022
1  0.024022
2  0.009763
3  0.009763
4  0.055549


In [81]:
# Blend the two models: XGB_WEIGHT goes to XGBoost, the remainder to LightGBM.
# Note that the blended result is stored back into `xgb_pred`, so re-running
# this cell would blend an already blended array again.
lgb_weight = 1 - XGB_WEIGHT
xgb_pred = XGB_WEIGHT * xgb_pred + lgb_weight * lgb_pred

print( "Combined XGBoost and Light GBM predictions:" )
print( pd.DataFrame(xgb_pred).head() )

Combined XGBoost and Light GBM predictions:
          0
0  0.021670
1  0.021670
2  0.010394
3  0.010394
4  0.046916
