In [1]:
import numpy as np
import pandas as pd
import os
from sklearn import preprocessing, model_selection, metrics
from lightgbm import LGBMRegressor, train, Dataset

* **First, import data from csv.**

In [2]:
data_dir = '../input/'
properties_2016 = pd.read_csv(data_dir + 'properties_2016.csv')
properties_2017 = pd.read_csv(data_dir + 'properties_2017.csv')
train_2016 = pd.read_csv(data_dir + 'train_2016_v2.csv')
train_2017 = pd.read_csv(data_dir + 'train_2017.csv')
sample_submission = pd.read_csv(data_dir + 'sample_submission.csv')

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [48]:
properties_2016.shape

(2985217, 58)

In [49]:
properties_2017.shape

(2985217, 58)

* **It is verified below that all properties appear in 2016 training data also appear in 2017. Since 2017 training data include additional information for some properties which is not available in 2016 data, according to explanation from Kagle, 2017 data is used for training here.**

In [3]:
properties_2016.loc[~properties_2016['parcelid'].isin(properties_2017['parcelid'])]

Unnamed: 0,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock


* **Here transaction information is joined with property features to form a complete set of data.**

In [4]:
data_2016 = train_2016.merge(properties_2017, on='parcelid', how='left')
data_2017 = train_2017.merge(properties_2017, on='parcelid', how='left')
data_2016[['transactionyear', 'transactionmonth', 'transactionday']] = data_2016['transactiondate'].str.split('-', expand=True)
data_2017[['transactionyear', 'transactionmonth', 'transactionday']] = data_2017['transactiondate'].str.split('-', expand=True)
data = pd.concat([data_2016, data_2017], ignore_index=True)

In [8]:
data

Unnamed: 0,parcelid,logerror,transactiondate,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,...,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,transactionyear,transactionmonth,transactionday
0,11016594,0.027600,2016-01-01,1.0,,,2.0,3.0,,8.0,...,545000.0,2016.0,327000.0,6735.88,,,6.037107e+13,2016,01,01
1,14366692,-0.168400,2016-01-01,,,,3.5,4.0,,,...,974900.0,2016.0,628442.0,10153.02,,,,2016,01,01
2,12098116,-0.004000,2016-01-01,1.0,,,3.0,2.0,,8.0,...,989500.0,2016.0,791600.0,11484.48,,,6.037464e+13,2016,01,01
3,12643413,0.021800,2016-01-02,1.0,,,2.0,2.0,,7.0,...,248613.0,2016.0,74480.0,3048.74,,,6.037296e+13,2016,01,02
4,14432541,-0.005000,2016-01-02,,,,2.5,4.0,,,...,441177.0,2016.0,269017.0,5488.96,,,6.059042e+13,2016,01,02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167883,10833991,-0.002245,2017-09-20,1.0,,,3.0,3.0,,8.0,...,379000.0,2016.0,114000.0,4685.34,,,6.037132e+13,2017,09,20
167884,11000655,0.020615,2017-09-20,,,,2.0,2.0,,6.0,...,354621.0,2016.0,283704.0,4478.43,,,6.037101e+13,2017,09,20
167885,17239384,0.013209,2017-09-21,,,,2.0,4.0,,,...,67205.0,2016.0,16522.0,1107.48,,,6.111008e+13,2017,09,21
167886,12773139,0.037129,2017-09-21,1.0,,,1.0,3.0,,4.0,...,49546.0,2016.0,16749.0,876.43,,,6.037434e+13,2017,09,21


* **The following step checks what kinds of features the data have.**

In [5]:
import collections
collections.Counter(data.dtypes[2:])

Counter({dtype('O'): 9, dtype('float64'): 52})

* **There are numerical and categorical features. For the sake of convenience the data are split into 2 parts in terms of numerical features and categorical features.**

In [7]:
# categorical_properties = np.array([column for column in data_2017 if str(data_2017[column].dtypes) == 'object'])
# numerical_properties = np.array([column for column in data_2017 if str(data_2017[column].dtypes) == 'float64'])[1:]

In [6]:
categorical_properties = [p for p in data if str(data[p].dtypes) == 'object' or ('id' in p and 'parcelid' not in p) or 'fips' in p or 'censustractandblock' in p]
numerical_properties = [p for p in data if p not in categorical_properties and p != 'parcelid' and p!= 'logerror']

In [7]:
numerical_properties[:10]

['basementsqft',
 'bathroomcnt',
 'bedroomcnt',
 'calculatedbathnbr',
 'finishedfloor1squarefeet',
 'calculatedfinishedsquarefeet',
 'finishedsquarefeet12',
 'finishedsquarefeet13',
 'finishedsquarefeet15',
 'finishedsquarefeet50']

In [8]:
categorical_properties[:10]

['transactiondate',
 'airconditioningtypeid',
 'architecturalstyletypeid',
 'buildingclasstypeid',
 'buildingqualitytypeid',
 'decktypeid',
 'fips',
 'hashottuborspa',
 'heatingorsystemtypeid',
 'pooltypeid10']

In [10]:
data_cat = data[np.append(['parcelid', 'logerror'], categorical_properties)]
data_num = data[np.append(['parcelid', 'logerror'], numerical_properties)]

* **NaNs in numerical data are replaced with mean value of the corresponding feature; categorical with mode.**

In [11]:
data_num_filled = data_num.fillna(data_num.mean())
data_cat_filled = data_cat.fillna(data_cat.mode().iloc[0])

* **Correlation between each individual numerical feature and logerror are calculated. TopN most correlated features are selected.**

In [12]:
cors = {column: data_num_filled['logerror'].corr(data_num_filled[column]) for column in numerical_properties[1:]}
cors = {key:value for (key, value) in cors.items() if not np.isnan(value)} # removing nan values
cors_sorted = dict(sorted(cors.items(), key=lambda item:np.abs(item[1]), reverse=True))

In [13]:
topN = 10
features_num = set(dict(sorted(cors_sorted.items(), key=lambda item:np.abs(item[1]), reverse=True)[:topN]).keys())
features_num.add('transactionyear')
# transaction year is included here because it will be used in prediction

In [14]:
{k:cors[k] for k in features_num if k in cors}

{'garagecarcnt': 0.014458426137687325,
 'calculatedfinishedsquarefeet': 0.038283474667789044,
 'taxdelinquencyyear': -0.010751199215757783,
 'finishedsquarefeet12': 0.040101913730560615,
 'fullbathcnt': 0.026341058203474488,
 'yearbuilt': 0.009823573845404594,
 'garagetotalsqft': 0.015742737146362422,
 'calculatedbathnbr': 0.02755502072646444,
 'bedroomcnt': 0.027185755309712675,
 'bathroomcnt': 0.025053101781299277}

* **The same is done with categorical features.**

In [15]:
le = preprocessing.LabelEncoder()
cors_cat = {}
cors_cat = {column: data_cat_filled['logerror'].corr(pd.Series(le.fit_transform(data_cat_filled[column]))) for column in categorical_properties}
cors_cat = {key:value for (key, value) in cors_cat.items() if not np.isnan(value)} # removing nan values
cors_cat_sorted = dict(sorted(cors_cat.items(), key=lambda item:np.abs(item[1]), reverse=True))

# for feature in categorical_properties:
#     x = pd.Series(le.fit_transform(data_cat_2016_filled[feature]))
#     y = data_2016[data_2016[feature].notna()]['logerror']
#     cors_cat[feature] = x.corr(y)

In [16]:
topN_cat = 12
features_cat = set(dict(sorted(cors_cat_sorted.items(), key=lambda item:np.abs(item[1]), reverse=True)[:topN_cat]).keys())
features_cat.add('transactionmonth')
features_cat.discard('transactiondate')
features_cat.discard('transactionyear')
# transaction month is included here because it will be used in prediction

In [17]:
{k:cors_cat[k] for k in features_cat}

{'fips': 0.00825309457357788,
 'regionidneighborhood': -0.00706241284358163,
 'censustractandblock': 0.005994429749113647,
 'heatingorsystemtypeid': -0.015668568378374294,
 'propertylandusetypeid': -0.00632961765610038,
 'buildingclasstypeid': 0.00605592739469304,
 'propertyzoningdesc': -0.006380099021812083,
 'regionidzip': 0.01603424271883758,
 'propertycountylandusecode': 0.005540343271338882,
 'regionidcounty': -0.007488740973833986,
 'transactionmonth': 0.003795931770945203}

* **Numerical and categorical features combined forms our features of interest.**

In [18]:
features = features_num.union(features_cat)

In [22]:
features

{'bathroomcnt',
 'bedroomcnt',
 'buildingclasstypeid',
 'calculatedbathnbr',
 'calculatedfinishedsquarefeet',
 'censustractandblock',
 'finishedsquarefeet12',
 'fips',
 'fullbathcnt',
 'garagecarcnt',
 'garagetotalsqft',
 'heatingorsystemtypeid',
 'propertycountylandusecode',
 'propertylandusetypeid',
 'propertyzoningdesc',
 'regionidcounty',
 'regionidneighborhood',
 'regionidzip',
 'taxdelinquencyyear',
 'transactionmonth',
 'transactionyear',
 'yearbuilt'}

In [19]:
processed_data = data[{'logerror'}.union(features)]
# data with interested features

In [20]:
processed_data_filled = processed_data.copy()

In [21]:
processed_data_filled.loc[:, features_cat] = data_cat_filled
processed_data_filled.loc[:, features_num] = data_num_filled
# processed_data_filled are data with NaN filled

* **Since according to "Advanced Topics - Categorical Feature Support" (on https://github.com/Microsoft/LightGBM/blob/master/docs/Advanced-Topics.rst#missing-value-handle), categorical features with high cardinality usually is best treated as numerical features. Therefore, based on the output of the following cell, "censustractandblock" is to be considered as a numerical.**

In [22]:
processed_data[features_cat].nunique()

fips                             3
regionidneighborhood           505
censustractandblock          57542
heatingorsystemtypeid           12
propertylandusetypeid           14
buildingclasstypeid              2
propertyzoningdesc            2346
regionidzip                    390
propertycountylandusecode       84
regionidcounty                   3
transactionmonth                12
dtype: int64

In [23]:
processed_data['censustractandblock'].dtypes

dtype('float64')

This feature is already treated as numerical in the dataframe. Therefore no encoding is needed. It just need to be moved from categorical features to numerical features. 

In [24]:
features_num.add('censustractandblock')
features_cat.discard('censustractandblock')

* **Check data type of features and encode categories.**

In [25]:
processed_data[features].dtypes

fips                            float64
garagecarcnt                    float64
regionidneighborhood            float64
censustractandblock             float64
taxdelinquencyyear              float64
heatingorsystemtypeid           float64
propertylandusetypeid           float64
finishedsquarefeet12            float64
fullbathcnt                     float64
yearbuilt                       float64
garagetotalsqft                 float64
bedroomcnt                      float64
bathroomcnt                     float64
propertycountylandusecode        object
regionidcounty                  float64
transactionmonth                 object
calculatedfinishedsquarefeet    float64
transactionyear                  object
buildingclasstypeid             float64
propertyzoningdesc               object
calculatedbathnbr               float64
regionidzip                     float64
dtype: object

In [26]:
le_pluc = preprocessing.LabelEncoder()
le_pzd = preprocessing.LabelEncoder()
# propertycountylandusecode
processed_data.loc[processed_data['propertycountylandusecode'].notna(), 'propertycountylandusecode'] = le_pluc.fit_transform(processed_data.loc[processed_data['propertycountylandusecode'].notna(), 'propertycountylandusecode'])
processed_data.loc[processed_data['propertyzoningdesc'].notna(), 'propertyzoningdesc'] = le_pzd.fit_transform(processed_data.loc[processed_data['propertyzoningdesc'].notna(), 'propertyzoningdesc'])
processed_data.loc[:, ['transactionmonth', 'transactionyear', 'propertycountylandusecode', 'propertyzoningdesc']] = processed_data.loc[:, ['transactionmonth', 'transactionyear', 'propertycountylandusecode', 'propertyzoningdesc']].astype('str').astype('float')
for feature in features_cat:
    processed_data.loc[:,feature] = processed_data[feature].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


* **Steps above are the preprocessing of data. Now it's time to train the model. First split data into training and test sets.**

In [27]:
training_data, test_data = model_selection.train_test_split(processed_data, test_size=0.2, random_state=6911)

* **Training the model (parameters are yet to be tuned)**

In [28]:
lgbm_model = LGBMRegressor(boosting_type='gbdt', num_leaves=31, max_depth=- 1, learning_rate=0.1, \
                             n_estimators=100, subsample_for_bin=200000, objective=None, class_weight=None, \
                             min_split_gain=0.0, min_child_weight=0.001, min_child_samples=20, subsample=1.0, \
                             subsample_freq=0, colsample_bytree=1.0, reg_alpha=0.0, reg_lambda=0.0, \
                             random_state=None, n_jobs=- 1, silent=True, importance_type='split')
lgbm_model.fit(training_data[features], training_data['logerror'])
# training_set = Dataset(data=training_data[features], label=training_data['logerror'], categorical_feature=features_cat)
# test_set = Dataset(data=test_data[features], label=test_data['logerror'], categorical_feature=features_cat)
# trained = train({}, train_set=training_set, valid_sets=test_set)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

* **Testing the model**

In [33]:
prediction = lgbm_model.predict(test_data[features])

In [34]:
metrics.mean_absolute_error(test_data['logerror'], prediction)

0.07153428394436478

* **Generating prediction for submission**

In [29]:
sample_submission = sample_submission.rename(columns={'ParcelId': 'parcelid'})
sample_submission

Unnamed: 0,parcelid,201610,201611,201612,201710,201711,201712
0,10754147,0,0,0,0,0,0
1,10759547,0,0,0,0,0,0
2,10843547,0,0,0,0,0,0
3,10859147,0,0,0,0,0,0
4,10879947,0,0,0,0,0,0
...,...,...,...,...,...,...,...
2985212,168176230,0,0,0,0,0,0
2985213,14273630,0,0,0,0,0,0
2985214,168040630,0,0,0,0,0,0
2985215,168040830,0,0,0,0,0,0


* **Generate test data with parcelid specified in sample_submission.csv (encode label, set data type, select features)**

In [30]:
data_sub = pd.DataFrame(sample_submission['parcelid']).merge(properties_2017, on='parcelid', how='left')
data_sub.loc[~data_sub['propertycountylandusecode'].isin(le_pluc.classes_), 'propertycountylandusecode'] = np.nan
data_sub.loc[~data_sub['propertyzoningdesc'].isin(le_pzd.classes_), 'propertyzoningdesc'] = np.nan
data_sub.loc[data_sub['propertycountylandusecode'].notna(), 'propertycountylandusecode'] = le_pluc.transform(data_sub.loc[data_sub['propertycountylandusecode'].notna(), 'propertycountylandusecode'])
data_sub.loc[data_sub['propertyzoningdesc'].notna(), 'propertyzoningdesc'] = le_pzd.transform(data_sub.loc[data_sub['propertyzoningdesc'].notna(), 'propertyzoningdesc'])
data_sub.loc[:, ['propertycountylandusecode', 'propertyzoningdesc']] = data_sub.loc[:, ['propertycountylandusecode', 'propertyzoningdesc']].astype('str').astype('float')

In [40]:
data_201610.shape

(2985217, 22)

In [39]:
data_201610 = data_sub.copy()
data_201610['transactionyear'] = 2016
data_201610['transactionmonth'] = 10
data_201610 = data_201610[features]
for feature in features_cat:
    data_201610.loc[:,feature] = data_201610[feature].astype('category')

# data_201611 = data_sub.copy()
# data_201611['transactionyear'] = 2016
# data_201611['transactionmonth'] = 11
# data_201611 = data_201611[features]
# for feature in features_cat:
#     data_201611.loc[:,feature] = data_201611[feature].astype('category')

# data_201612 = data_sub.copy()
# data_201612['transactionyear'] = 2016
# data_201612['transactionmonth'] = 12
# data_201612 = data_201612[features]
# for feature in features_cat:
#     data_201612.loc[:,feature] = data_201612[feature].astype('category')

# data_201710 = data_sub.copy()
# data_201710['transactionyear'] = 2017
# data_201710['transactionmonth'] = 10
# data_201710 = data_201710[features]
# for feature in features_cat:
#     data_201710.loc[:,feature] = data_201710[feature].astype('category')

# data_201711 = data_sub.copy()
# data_201711['transactionyear'] = 2017
# data_201711['transactionmonth'] = 11
# data_201711 = data_201711[features]
# for feature in features_cat:
#     data_201711.loc[:,feature] = data_201711[feature].astype('category')

# data_201712 = data_sub.copy()
# data_201712['transactionyear'] = 2017
# data_201712['transactionmonth'] = 12
# data_201712 = data_201712[features]
# for feature in features_cat:
#     data_201712.loc[:,feature] = data_201712[feature].astype('category')

In [42]:
data_201712.columns

Index(['fips', 'garagecarcnt', 'regionidneighborhood', 'censustractandblock',
       'taxdelinquencyyear', 'heatingorsystemtypeid', 'propertylandusetypeid',
       'finishedsquarefeet12', 'fullbathcnt', 'yearbuilt', 'garagetotalsqft',
       'bedroomcnt', 'bathroomcnt', 'propertycountylandusecode',
       'regionidcounty', 'transactionmonth', 'calculatedfinishedsquarefeet',
       'transactionyear', 'buildingclasstypeid', 'propertyzoningdesc',
       'calculatedbathnbr', 'regionidzip'],
      dtype='object')

* **Generate prediction and write to file.**

In [32]:
lgbm_model_2 = LGBMRegressor()
lgbm_model_2.fit(processed_data[features], processed_data['logerror'])
sample_submission['201610'] = lgbm_model_2.predict(data_201610)
sample_submission['201611'] = lgbm_model_2.predict(data_201611)
sample_submission['201612'] = lgbm_model_2.predict(data_201612)
sample_submission['201710'] = lgbm_model_2.predict(data_201710)
sample_submission['201711'] = lgbm_model_2.predict(data_201711)
sample_submission['201712'] = lgbm_model_2.predict(data_201712)

In [33]:
sample_submission

Unnamed: 0,parcelid,201610,201611,201612,201710,201711,201712
0,10754147,-0.014497,-0.013062,-0.013900,-0.012003,-0.010568,-0.011406
1,10759547,0.685942,0.631506,0.692798,0.687986,0.633550,0.694842
2,10843547,0.258394,0.158115,0.238358,0.227654,0.127375,0.208213
3,10859147,0.098358,0.054350,0.097704,0.096205,0.052198,0.096148
4,10879947,-0.057821,-0.060834,-0.062593,-0.068021,-0.071034,-0.072793
...,...,...,...,...,...,...,...
2985212,168176230,0.003736,0.000077,0.003679,0.007726,0.004067,0.007668
2985213,14273630,0.009768,0.000841,0.015440,0.013758,0.004831,0.019430
2985214,168040630,-0.005026,-0.008316,-0.005084,-0.001037,-0.004326,-0.001094
2985215,168040830,-0.016286,-0.019575,-0.016343,-0.012296,-0.015586,-0.012353


In [40]:
# sample_submission.to_csv(data_dir + 'sample_submission_output.csv')

#### Parameter Tuning


In [41]:
from hyperopt import hp, tpe, fmin, Trials
from sklearn.model_selection import KFold
no_fold = 4

# def objective_fun(boosting_type, num_leaves, max_depth, learning_rate, n_estimators, \
#                   min_split_gain, min_child_weight, min_child_samples, subsample, subsample_freq, \
#                   subsample_freq, colsample_bytree, reg_alpha, reg_lambda, random_state):
#     model = LGBMRegressor(boosting_type, num_leaves, max_depth, learning_rate, n_estimators, \
#                   min_split_gain, min_child_weight, min_child_samples, \
#                   subsample_freq, colsample_bytree, reg_alpha, reg_lambda, random_state)
def objective_fun(args):
#     boosting_type, num_leaves, max_depth, learning_rate, n_estimators, min_split_gain, min_child_weight, \
#     min_child_samples, subsample, subsample_freq, colsample_bytree, reg_alpha, reg_lambda, random_state = args
#     model = LGBMRegressor(boosting_type=boosting_type, num_leaves=num_leaves, max_depth=max_depth, \
#                           learning_rate=learning_rate, n_estimators=n_estimators, \
#                           min_split_gain=min_split_gain, min_child_weight=min_child_weight, \
#                           min_child_samples=min_child_samples, subsample_freq, \
#                           subsample_freq=subsample_freq, colsample_bytree=colsample_bytree, \
#                           reg_alpha=reg_alpha, reg_lambda=reg_lambda, random_state=random_state)
    model = LGBMRegressor(num_leaves=int(args['num_leaves']), \
                          max_depth=int(args['max_depth']), learning_rate=args['learning_rate'], \
                          n_estimators=int(args['n_estimators']), min_split_gain=args['min_split_gain'], \
                          min_child_weight=args['min_child_weight'], min_child_samples=int(args['min_child_samples']), \
                          subsample=args['subsample'], subsample_freq=int(args['subsample_freq']), \
                          colsample_bytree=args['colsample_bytree'], reg_alpha=args['reg_alpha'], \
                          reg_lambda=args['reg_lambda'], random_state=args['random_state'])
    kf = KFold(n_splits=no_fold)
    loss = 0
    for index_tr, index_val in kf.split(training_data[features], training_data['logerror']):
        data_tr = training_data.iloc[index_tr]
        data_val = training_data.iloc[index_val]
        model.fit(data_tr[features], data_tr['logerror'])
        prediction = model.predict(data_val[features])
        loss += metrics.mean_absolute_error(data_val['logerror'], prediction) / no_fold
    return {
        'status' : 'ok',
        'loss' : loss
    }

hp_space = {
#    'boosting_type' : hp.choice('boosting_type', ['gbdt', 'dart', 'goss', 'rf']),
    'num_leaves' : hp.quniform('num_leaves', 10, 500, 1),
    'max_depth' : hp.quniform('max_depth', 10, 500, 1),
    'learning_rate' : hp.uniform('learning_rate', 0.0001, 0.1),
    'n_estimators' : hp.quniform('n_estimators', 10, 1000, 1),
    'min_split_gain' : hp.uniform('min_split_gain', 0, 1),
    'min_child_weight' : hp.uniform('min_child_weight', 0.0001, 1),
    'min_child_samples' : hp.quniform('min_child_samples', 10, 500, 1),
    'subsample' : hp.uniform('subsample', 0, 1),
    'subsample_freq' : hp.quniform('subsample_freq', 0, 5000, 1),
    'colsample_bytree' : hp.uniform('colsample_bytree', 0, 1),
    'reg_alpha' : hp.uniform('reg_alpha', 0, 5),
    'reg_lambda' : hp.uniform('reg_lambda', 0, 5),
    'random_state' :  None
}

trials = Trials()

best = fmin(fn=objective_fun, space=hp_space, algo=tpe.suggest, max_evals=1000, trials=trials)

import pickle
with open('trials', 'wb') as file:
    pickle.dump(best, file)
    pickle.dump(trials, file)

ModuleNotFoundError: No module named 'hyperopt'

In [43]:
best ={'colsample_bytree': 0.2147467656099759,
 'learning_rate': 0.007138343861070687,
 'max_depth': 442.0,
 'min_child_samples': 23.0,
 'min_child_weight': 0.8464143518606242,
 'min_split_gain': 0.000172647682586337,
 'n_estimators': 149.0,
 'num_leaves': 188.0,
 'reg_alpha': 2.6763347204500403,
 'reg_lambda': 2.5674135399163243,
 'subsample': 0.9996855816069476,
 'subsample_freq': 2629.0}

In [None]:
best

In [44]:
tuned_model = LGBMRegressor(colsample_bytree=best['colsample_bytree'], learning_rate=best['learning_rate'], \
                            max_depth=int(best['max_depth']), min_child_samples=int(best['min_child_samples']), \
                            min_child_weight=best['min_child_weight'], min_split_gain=best['min_split_gain'], \
                            n_estimators=int(best['n_estimators']), num_leaves=int(best['num_leaves']), \
                            reg_alpha=best['reg_alpha'], reg_lambda=best['reg_lambda'], \
                            subsample=best['subsample'], subsample_freq=int(best['subsample_freq']))

In [45]:
tuned_model.fit(training_data[features], training_data['logerror'])
prediction = tuned_model.predict(test_data[features])
metrics.mean_absolute_error(test_data['logerror'], prediction)

0.06880767103493073

* **Generate prediction using tuned model and write to file.**

In [46]:
tuned_model.fit(processed_data[features], processed_data['logerror'])
sample_submission['201610'] = tuned_model.predict(data_201610)
sample_submission['201611'] = tuned_model.predict(data_201611)
sample_submission['201612'] = tuned_model.predict(data_201612)
sample_submission['201710'] = tuned_model.predict(data_201710)
sample_submission['201711'] = tuned_model.predict(data_201711)
sample_submission['201712'] = tuned_model.predict(data_201712)
sample_submission = sample_submission.rename(columns={'parcelid': 'ParcelId'})

In [47]:
sample_submission.to_csv(data_dir + 'sample_submission_output.csv', index=False)