# Kaggle Zillow Test 1 - XGBoost

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

In [2]:
properties = pd.read_csv('../data/properties_2016.csv')
train = pd.read_csv('../data/train_2016_v2.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
for c in properties.columns:
    properties[c]=properties[c].fillna(-1)
    if properties[c].dtype == 'object':
        print(c)
        lbl = LabelEncoder()
        lbl.fit(list(properties[c].values))
        properties[c] = lbl.transform(list(properties[c].values))

train_df = train.merge(properties, how='left', on='parcelid')
x_train = train_df.drop(['parcelid', 'logerror','transactiondate'], axis=1)
x_test = properties.drop(['parcelid'], axis=1)
# shape        
print('Shape train: {}\nShape test: {}'.format(x_train.shape, x_test.shape))

hashottuborspa
propertycountylandusecode
propertyzoningdesc
fireplaceflag
taxdelinquencyflag
Shape train: (90275, 57)
Shape test: (2985217, 57)


In [4]:
# drop out ouliers
train_df=train_df[ train_df.logerror > -0.4 ]
train_df=train_df[ train_df.logerror < 0.4 ]
x_train=train_df.drop(['parcelid', 'logerror','transactiondate'], axis=1)
y_train = train_df["logerror"].values.astype(np.float32)
y_mean = np.mean(y_train)

print('After removing outliers:')     
print('Shape train: {}\nShape test: {}'.format(x_train.shape, x_test.shape))

After removing outliers:
Shape train: (88431, 57)
Shape test: (2985217, 57)


In [5]:
# xgboost params
xgb_params = {
    'eta': 0.03,
    'max_depth': 6,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'base_score': y_mean,
    'silent': 1
}

In [6]:
dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test)

In [8]:
# cross-validation
cv_result = xgb.cv(xgb_params, 
                   dtrain, 
                   nfold=5,
                   num_boost_round=100,
                   early_stopping_rounds=5,
                   verbose_eval=10, 
                   show_stdv=False
                  )
num_boost_rounds = len(cv_result)
print(num_boost_rounds)

num_boost = cv_result['test-mae-mean'].argmin()
mean_mae = cv_result['test-mae-mean'].min()
print("\n\tMAE {} for {} rounds".format(mean_mae, num_boost))

# train model
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)
pred = model.predict(dtest)
y_pred=[]

[0]	train-mae:0.0528664	test-mae:0.0528782
[10]	train-mae:0.0525124	test-mae:0.0526496
[20]	train-mae:0.0522758	test-mae:0.0525202
[30]	train-mae:0.0520954	test-mae:0.0524428
[40]	train-mae:0.051948	test-mae:0.05239
[50]	train-mae:0.0518224	test-mae:0.0523592
[60]	train-mae:0.0517142	test-mae:0.0523376
[70]	train-mae:0.0516228	test-mae:0.0523228
[80]	train-mae:0.0515446	test-mae:0.0523106
[90]	train-mae:0.0514622	test-mae:0.052303
[99]	train-mae:0.0513978	test-mae:0.0522958
100

	MAE 0.0522958 for 97 rounds


In [None]:
MAE 0.0522958 for 97 rounds

In [40]:
for i,predict in enumerate(pred):
    y_pred.append(str(round(predict,10)))
y_pred=np.array(y_pred)

output = pd.DataFrame({'ParcelId': properties['parcelid'].astype(np.int32),
        '201610': y_pred, '201611': y_pred, '201612': y_pred,
        '201710': y_pred, '201711': y_pred, '201712': y_pred})
# set col 'ParceID' to first col
cols = output.columns.tolist()
cols = cols[-1:] + cols[:-1]
output = output[cols]
from datetime import datetime
output.to_csv('sub{}.csv'.format(datetime.now().strftime('%Y%m%d_%H%M%S')), index=False, float_format='%.4f')
output 

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,-0.0646082,-0.0646082,-0.0646082,-0.0646082,-0.0646082,-0.0646082
1,10759547,-0.0197657,-0.0197657,-0.0197657,-0.0197657,-0.0197657,-0.0197657
2,10843547,0.0415127,0.0415127,0.0415127,0.0415127,0.0415127,0.0415127
3,10859147,0.0600102,0.0600102,0.0600102,0.0600102,0.0600102,0.0600102
4,10879947,0.010414,0.010414,0.010414,0.010414,0.010414,0.010414
5,10898347,0.0123633,0.0123633,0.0123633,0.0123633,0.0123633,0.0123633
6,10933547,-0.00389507,-0.00389507,-0.00389507,-0.00389507,-0.00389507,-0.00389507
7,10940747,0.0303052,0.0303052,0.0303052,0.0303052,0.0303052,0.0303052
8,10954547,-0.0640804,-0.0640804,-0.0640804,-0.0640804,-0.0640804,-0.0640804
9,10976347,0.0247395,0.0247395,0.0247395,0.0247395,0.0247395,0.0247395
