In [161]:
import numpy as np
import pandas as pd
import xgboost as xgb
import gc

print('Loading data ...')

train = pd.read_csv('../data/train_2016_v2.csv/train_2016_v2.csv')
prop = pd.read_csv('../data/properties_2016.csv/properties_2016.csv')
sample = pd.read_csv('../data/sample_submission.csv')

#print('Binding to float32')

for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)

print('Creating training set ...')
# print('train shape:', train.shape)
df_data = train.merge(prop, how='left', on='parcelid')
# print('df_train.shape: ', df_train.shape)
# print(df_train[['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 'propertycountylandusecode']])




Loading data ...
Creating training set ...


In [168]:
ulimit = np.percentile(df_data.logerror.values, 99)
llimit = np.percentile(df_data.logerror.values, 1)
df_data['logerror'].ix[df_data['logerror']>ulimit] = ulimit
df_data['logerror'].ix[df_data['logerror']<llimit] = llimit




.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  This is separate from the ipykernel package so we can avoid doing imports until


In [169]:
length = df_data.shape[0]
missing_df = df_data.isnull().sum(axis=0).reset_index()
missing_df.columns = ['column_name', 'missing_count']

missing_df = missing_df.sort_values(by='missing_count', ascending=False)
missing_df['ratio'] = missing_df['missing_count'] * 1.0 / length

mask = (missing_df['ratio'] < 0.1)
missing_df = missing_df[mask]
missing_cols = list(missing_df.column_name.values)
# missing_cols.append('logerror')
print missing_df


                     column_name  missing_count     ratio
0           finishedsquarefeet12           4679  0.051831
1                   regionidcity           1803  0.019972
2                    fullbathcnt           1182  0.013093
3              calculatedbathnbr           1182  0.013093
4                      yearbuilt            756  0.008374
5   calculatedfinishedsquarefeet            661  0.007322
6            censustractandblock            605  0.006702
7     structuretaxvaluedollarcnt            380  0.004209
8                    regionidzip             35  0.000388
9                      taxamount              6  0.000066
10     propertycountylandusecode              1  0.000011
11         landtaxvaluedollarcnt              1  0.000011
12             taxvaluedollarcnt              1  0.000011
19                      latitude              0  0.000000
24                regionidcounty              0  0.000000
23        rawcensustractandblock              0  0.000000
22         pro

In [170]:
df_data = df_data[missing_cols]
df_data.head()

Unnamed: 0,finishedsquarefeet12,regionidcity,fullbathcnt,calculatedbathnbr,yearbuilt,calculatedfinishedsquarefeet,censustractandblock,structuretaxvaluedollarcnt,regionidzip,taxamount,...,propertylandusetypeid,logerror,longitude,bedroomcnt,roomcnt,fips,assessmentyear,bathroomcnt,transactiondate,parcelid
0,1684.0,12447.0,2.0,2.0,1959.0,1684.0,60371070000000.0,122754.0,96370.0,6735.879883,...,261.0,0.0276,-118488536.0,3.0,0.0,6037.0,2015.0,2.0,2016-01-01,11016594
1,2263.0,32380.0,3.0,3.5,2014.0,2263.0,,346458.0,96962.0,10153.019531,...,261.0,-0.1684,-117677552.0,4.0,0.0,6059.0,2015.0,3.5,2016-01-01,14366692
2,2217.0,47019.0,3.0,3.0,1940.0,2217.0,60374640000000.0,61994.0,96293.0,11484.480469,...,261.0,-0.004,-118175032.0,2.0,0.0,6037.0,2015.0,3.0,2016-01-01,12098116
3,839.0,12447.0,2.0,2.0,1987.0,839.0,60372960000000.0,171518.0,96222.0,3048.73999,...,266.0,0.0218,-118309000.0,2.0,0.0,6037.0,2015.0,2.0,2016-01-02,12643413
4,2283.0,17686.0,2.0,2.5,1981.0,2283.0,60590420000000.0,169574.0,96961.0,5488.959961,...,261.0,-0.005,-117700232.0,4.0,8.0,6059.0,2015.0,2.5,2016-01-02,14432541


In [171]:
df_data['transactiondate'] = pd.to_datetime(df_data['transactiondate'], format='%Y-%m-%d')
mask = (df_data['transactiondate'].dt.month.isin([1, 2, 3]))
df_valid = df_data[mask]
df_train = df_data[~mask]
x_valid = df_valid.drop(['parcelid', 'logerror', 'transactiondate', 'propertycountylandusecode'], axis=1)
y_valid = df_valid['logerror'].values
print(x_valid.shape, y_valid.shape)

train_columns = x_train.columns


for c in x_valid.dtypes[x_valid.dtypes == object].index.values:
    # print('x_train[c]', c, x_train[c].head(10))
    x_valid[c] = (x_valid[c] == True)
    # print('x_train[c]', c, x_train[c].head(10))


((21541, 22), (21541,))


In [172]:
# x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 'propertycountylandusecode'], axis=1)
x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertycountylandusecode'], axis=1)
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)

train_columns = x_train.columns


for c in x_train.dtypes[x_train.dtypes == object].index.values:
    # print('x_train[c]', c, x_train[c].head(10))
    x_train[c] = (x_train[c] == True)
    # print('x_train[c]', c, x_train[c].head(10))

# del df_train; gc.collect()

((68734, 22), (68734,))


In [174]:
# split = 80000
# x_train, y_train, x_valid, y_valid = x_train[:split], y_train[:split], x_train[split:], y_train[split:]

print('Building DMatrix...')

d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)



print('Training ...')

params = {}
params['eta'] = 0.02
params['objective'] = 'reg:linear'
params['eval_metric'] = 'mae'
params['max_depth'] = 4
params['silent'] = 1

watchlist = [(d_train, 'train'), (d_valid, 'valid')]
clf = xgb.train(params, d_train, 10000, watchlist, early_stopping_rounds=100, verbose_eval=10)



print('Building test set ...')

sample['parcelid'] = sample['ParcelId']
df_test = sample.merge(prop, on='parcelid', how='left')


x_test = df_test[train_columns]
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)



d_test = xgb.DMatrix(x_test)


print('Predicting on test ...')

p_test = clf.predict(d_test)



sub = pd.read_csv('../data/sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    print('c', c)
    sub[c] = p_test

print('Writing csv ...')
# sub.to_csv('../data/xgb_starter.csv', index=False, float_format='%.4f') # Thanks to @inversion

Building DMatrix...
Training ...
[0]	train-mae:0.480035	valid-mae:0.479034
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 100 rounds.
[10]	train-mae:0.39348	valid-mae:0.392673
[20]	train-mae:0.323734	valid-mae:0.323293
[30]	train-mae:0.267398	valid-mae:0.267338
[40]	train-mae:0.222048	valid-mae:0.222386
[50]	train-mae:0.185699	valid-mae:0.18637
[60]	train-mae:0.156667	valid-mae:0.157722
[70]	train-mae:0.133584	valid-mae:0.135006
[80]	train-mae:0.115383	valid-mae:0.117202
[90]	train-mae:0.10115	valid-mae:0.103409
[100]	train-mae:0.090155	valid-mae:0.092804
[110]	train-mae:0.081777	valid-mae:0.084805
[120]	train-mae:0.075471	valid-mae:0.07884
[130]	train-mae:0.070805	valid-mae:0.074455
[140]	train-mae:0.067382	valid-mae:0.071269
[150]	train-mae:0.064896	valid-mae:0.068988
[160]	train-mae:0.063106	valid-mae:0.067358
[170]	train-mae:0.061818	valid-mae:0.066188
[180]	train-mae:0.060892	valid-mae:0.065359
[1

In [137]:
sub.to_csv('../data/xgb_starter.csv', index=False, float_format='%.4f') # Thanks to @inversion