In [70]:
import numpy as np
import pandas as pd
import xgboost as xgb
import gc

print('Loading data ...')

train = pd.read_csv('../data/train_2016_v2.csv/train_2016_v2.csv')
prop = pd.read_csv('../data/properties_2016.csv/properties_2016.csv')
sample = pd.read_csv('../data/sample_submission.csv')

#print('Binding to float32')

for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)

print('Creating training set ...')
# print('train shape:', train.shape)
df_data = train.merge(prop, how='left', on='parcelid')
# print('df_train.shape: ', df_train.shape)
# print(df_train[['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 'propertycountylandusecode']])




Loading data ...
Creating training set ...


In [71]:
ulimit = np.percentile(df_data.logerror.values, 99)
llimit = np.percentile(df_data.logerror.values, 1)
df_data['logerror'].ix[df_data['logerror']>ulimit] = ulimit
df_data['logerror'].ix[df_data['logerror']<llimit] = llimit




.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  This is separate from the ipykernel package so we can avoid doing imports until
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  after removing the cwd from sys.path.


In [72]:
length = df_data.shape[0]
missing_df = df_data.isnull().sum(axis=0).reset_index()
missing_df.columns = ['column_name', 'missing_count']

missing_df = missing_df.sort_values(by='missing_count', ascending=False)
missing_df['ratio'] = missing_df['missing_count'] * 1.0 / length

mask = (missing_df['ratio'] < 0.1)
missing_df = missing_df[mask]
missing_cols = list(missing_df.column_name.values)
# missing_cols.append('logerror')
print missing_df


                     column_name  missing_count     ratio
14          finishedsquarefeet12           4679  0.051831
38                  regionidcity           1803  0.019972
21                   fullbathcnt           1182  0.013093
10             calculatedbathnbr           1182  0.013093
49                     yearbuilt            756  0.008374
13  calculatedfinishedsquarefeet            661  0.007322
59           censustractandblock            605  0.006702
52    structuretaxvaluedollarcnt            380  0.004209
41                   regionidzip             35  0.000388
56                     taxamount              6  0.000066
34     propertycountylandusecode              1  0.000011
55         landtaxvaluedollarcnt              1  0.000011
53             taxvaluedollarcnt              1  0.000011
7                     bedroomcnt              0  0.000000
2                transactiondate              0  0.000000
6                    bathroomcnt              0  0.000000
54            

In [79]:
df_data = df_data[missing_cols]
df_data.head()

Unnamed: 0,finishedsquarefeet12,regionidcity,fullbathcnt,calculatedbathnbr,yearbuilt,calculatedfinishedsquarefeet,censustractandblock,structuretaxvaluedollarcnt,regionidzip,taxamount,...,assessmentyear,fips,roomcnt,latitude,longitude,logerror,propertylandusetypeid,rawcensustractandblock,regionidcounty,parcelid
0,1684.0,12447.0,2.0,2.0,1959.0,1684.0,60371070000000.0,122754.0,96370.0,6735.879883,...,2015.0,6037.0,0.0,34280992.0,-118488536.0,0.0276,261.0,60371068.0,3101.0,11016594
1,2263.0,32380.0,3.0,3.5,2014.0,2263.0,,346458.0,96962.0,10153.019531,...,2015.0,6059.0,0.0,33668120.0,-117677552.0,-0.1684,261.0,60590524.0,1286.0,14366692
2,2217.0,47019.0,3.0,3.0,1940.0,2217.0,60374640000000.0,61994.0,96293.0,11484.480469,...,2015.0,6037.0,0.0,34136312.0,-118175032.0,-0.004,261.0,60374640.0,3101.0,12098116
3,839.0,12447.0,2.0,2.0,1987.0,839.0,60372960000000.0,171518.0,96222.0,3048.73999,...,2015.0,6037.0,0.0,33755800.0,-118309000.0,0.0218,266.0,60372964.0,3101.0,12643413
4,2283.0,17686.0,2.0,2.5,1981.0,2283.0,60590420000000.0,169574.0,96961.0,5488.959961,...,2015.0,6059.0,8.0,33485644.0,-117700232.0,-0.005,261.0,60590424.0,1286.0,14432541


In [103]:
df_data['transactiondate'] = pd.to_datetime(df_data['transactiondate'], format='%Y-%m-%d')
mask = (df_data['transactiondate'].dt.month.isin([1, 2, 3]))
df_valid = df_data[mask]
df_train = df_data[~mask]
x_valid = df_valid.drop(['parcelid', 'logerror', 'transactiondate', 'propertycountylandusecode'], axis=1)
y_valid = df_valid['logerror'].values
print(x_valid.shape, y_valid.shape)

# train_columns = x_train.columns


for c in x_valid.dtypes[x_valid.dtypes == object].index.values:
    # print('x_train[c]', c, x_train[c].head(10))
    x_valid[c] = (x_valid[c] == True)
    # print('x_train[c]', c, x_train[c].head(10))


((21541, 22), (21541,))


In [104]:
# x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 'propertycountylandusecode'], axis=1)
x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertycountylandusecode'], axis=1)
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)

train_columns = x_train.columns


for c in x_train.dtypes[x_train.dtypes == object].index.values:
    # print('x_train[c]', c, x_train[c].head(10))
    x_train[c] = (x_train[c] == True)
    # print('x_train[c]', c, x_train[c].head(10))

# del df_train; gc.collect()

((68734, 22), (68734,))


In [105]:
# split = 80000
# x_train, y_train, x_valid, y_valid = x_train[:split], y_train[:split], x_train[split:], y_train[split:]

print('Building DMatrix...')

d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

del x_train, x_valid; gc.collect()

print('Training ...')

params = {}
params['eta'] = 0.02
params['objective'] = 'reg:linear'
params['eval_metric'] = 'mae'
params['max_depth'] = 4
params['silent'] = 1

watchlist = [(d_train, 'train'), (d_valid, 'valid')]
clf = xgb.train(params, d_train, 10000, watchlist, early_stopping_rounds=100, verbose_eval=10)

del d_train, d_valid

print('Building test set ...')

sample['parcelid'] = sample['ParcelId']
df_test = sample.merge(prop, on='parcelid', how='left')

del prop; gc.collect()

x_test = df_test[train_columns]
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)

del df_test, sample; gc.collect()

d_test = xgb.DMatrix(x_test)

del x_test; gc.collect()

print('Predicting on test ...')

p_test = clf.predict(d_test)

del d_test; gc.collect()

sub = pd.read_csv('../data/sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    print('c', c)
    sub[c] = p_test

print('Writing csv ...')
# sub.to_csv('../data/xgb_starter.csv', index=False, float_format='%.4f') # Thanks to @inversion

Building DMatrix...
Training ...
[0]	train-mae:0.480034	valid-mae:0.479032
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 100 rounds.
[10]	train-mae:0.393481	valid-mae:0.392675
[20]	train-mae:0.323736	valid-mae:0.323295
[30]	train-mae:0.2674	valid-mae:0.267341
[40]	train-mae:0.22205	valid-mae:0.222388
[50]	train-mae:0.185701	valid-mae:0.186373
[60]	train-mae:0.156669	valid-mae:0.157725
[70]	train-mae:0.133586	valid-mae:0.135008
[80]	train-mae:0.115386	valid-mae:0.117217
[90]	train-mae:0.101152	valid-mae:0.103412
[100]	train-mae:0.090156	valid-mae:0.092813
[110]	train-mae:0.081776	valid-mae:0.084809
[120]	train-mae:0.075469	valid-mae:0.078835
[130]	train-mae:0.070802	valid-mae:0.074455
[140]	train-mae:0.067382	valid-mae:0.07127
[150]	train-mae:0.064895	valid-mae:0.068993
[160]	train-mae:0.063106	valid-mae:0.067362
[170]	train-mae:0.06182	valid-mae:0.066195
[180]	train-mae:0.060896	valid-mae:0.065363
[19

NameError: name 'sample' is not defined

In [106]:
sub.to_csv('../data/xgb_starter.csv', index=False, float_format='%.4f') # Thanks to @inversion