# Kaggle Zillow Preprocessing + XGBoost

###### 0.0660352

In [1]:
import numpy as np
import pandas as pd
pd.options.display.max_columns = 999
import time
from datetime import datetime

from sami_function import missing_ratio
from zillow_functions import create_newFeatures, data_preprocessing, memory_reduce, create_special_feature
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb
import gc
from sklearn import linear_model
from sklearn.externals import joblib

%matplotlib inline
seed = 42
n_features = 100

### XGBoost

In [2]:
%%time
print('\nLoading data ...')

train =  pd.read_csv('../data/train_2016_v2.csv')
train17 =  pd.read_csv('../data/train_2017.csv')
prop = pd.read_csv('../data/properties_2016.csv')
sample = pd.read_csv('../data/sample_submission.csv')
prop17 = pd.read_csv('../data/properties_2017.csv', low_memory = False)

train = pd.concat([train, train17])
prop = pd.concat([prop, prop17])
df_train = pd.merge(train, prop, on='parcelid', how='left')
print('Shape train: {}'.format(df_train.shape))

del train; gc.collect()

print('\nData preprocessing ...')

df_train = data_preprocessing(df_train)


print('\nCreating new features ...')

df_train = create_newFeatures(df_train)

# New special feature
dates_model_col = ['transaction_year', 'transaction_month', 'yearbuilt', 'house_age']
df_train['spe_feature'], datesFeature_mod = create_special_feature(df_train[dates_model_col], df_train['logerror'].values)

# Month feature
month_avgs = df_train.groupby('transaction_month').agg(['mean'])['logerror', 'mean'].values - df_train['logerror'].mean()
from sklearn.linear_model import LinearRegression
month_model = LinearRegression().fit(np.arange(4, 13, 1).reshape(-1, 1), 
                                     month_avgs[3:].reshape(-1, 1))                       
df_train['super_month'] = month_model.predict(df_train['transaction_month'].values.reshape(-1, 1))


print('\nReducing consumption memory ...')

df_train = memory_reduce(df_train)

# MAE 0.05255990000000001 for 50 rounds  [:39]

print('\nCreating training set ...')

x_train = df_train.drop(['parcelid', 'logerror'], axis=1)
y_train = df_train['logerror'].values

print(x_train.shape, y_train.shape)


print('\nFeatures selection ...')
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

x_col = list(x_train.columns)
lr = LinearRegression()
rfe = RFE(lr)
rfe.fit(x_train, y_train)
x_val = [x[1] for x in [x for x in sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), x_col), reverse=True) if x[0] > 0]]

train_columns = x_val[:n_features]
x_train = x_train[train_columns]
y_mean = np.mean(y_train)


#month_values = df_train['transaction_month'].values
#x_train = np.hstack([x_train, month_model.predict(month_values.reshape(-1, 1))])

sc = StandardScaler()
x_train = sc.fit_transform(x_train)

del df_train; gc.collect()

print('\nBuilding DMatrix...')

d_train = xgb.DMatrix(x_train, y_train)

del x_train; gc.collect()

print('\nTraining ...')

params = {
    'learning_rate': 0.031,
    'max_depth': 9,
    'min_child_weight': 0,
    'gamma': 0.2,
    'subsample': 0.80,
    'n_estimators': 1000,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'base_score': y_mean,
    'seed': seed
}

# params = {
#     'colsample_bytree': 0.4603, 
#     'gamma': 0.0468, 
#     'learning_rate': 0.05, 
#     'max_depth': 3, 
#     'min_child_weight': 1.7817, 
#     'n_estimators': 2200,
#     'reg_alpha': 0.4640, 
#     'reg_lambda': 0.8571,
#     'subsample': 0.5213, 
#     'silent': 1,
#     'random_state': 7, 
#     'nthread': -1
# }

#--- cross-validation ---
cv_result = xgb.cv(
                    params, 
                    d_train, 
                    nfold=10,
                    num_boost_round=1000,
                    early_stopping_rounds=100,
                    verbose_eval=10, 
                    show_stdv=False
                  )

num_boost_rounds = cv_result['test-mae-mean'].argmin()
mean_mae = cv_result['test-mae-mean'].min()

print("\n\tMAE {} for {} rounds".format(mean_mae, num_boost_rounds))

#--- load best params GridSearch ---
# grid_clf = joblib.load('xgboost_model.pkl')


print('\nTraining model ...')
clf = xgb.train(dict(params), d_train, num_boost_round=num_boost_rounds)



Loading data ...




Shape train: (335776, 60)

Data preprocessing ...

	Outliers treated ...

Creating new features ...

Reducing consumption memory ...
	Initial size 174.91 MB
	There are 0 columns that cannot be reduced
	There are 80 columns reduced
	Final size 66.04 MB

Creating training set ...
(335776, 78) (335776,)

Features selection ...

Building DMatrix...

Training ...
[0]	train-mae:0.0608598	test-mae:0.0608757
[10]	train-mae:0.0603333	test-mae:0.0604902
[20]	train-mae:0.0599594	test-mae:0.0602459
[30]	train-mae:0.0596873	test-mae:0.0600795
[40]	train-mae:0.0594723	test-mae:0.0599549
[50]	train-mae:0.0592927	test-mae:0.0598577
[60]	train-mae:0.0591338	test-mae:0.0597723
[70]	train-mae:0.0589947	test-mae:0.0597032
[80]	train-mae:0.0588748	test-mae:0.0596425
[90]	train-mae:0.058765	test-mae:0.0595884
[100]	train-mae:0.058665	test-mae:0.0595386


KeyboardInterrupt: 

In [None]:
%%time

# watchlist = [(d_train, 'train'), (d_valid, 'valid')]
# clf = xgb.train(params, d_train, 10000, watchlist, early_stopping_rounds=100, verbose_eval=10)

del d_train

print('\nBuilding test set ...')

sample['parcelid'] = sample['ParcelId']
df_test = sample.merge(prop, on='parcelid', how='left')

del prop, sample; gc.collect()

p_test = []
batch_size = 100000
for batch in range(batch_size, df_test.shape[0]+batch_size, batch_size):
    
    print('\nWorking batch {}'.format(batch))
    
    df_test_batch = df_test[batch-batch_size:batch].copy()
    
    print('\nData preprocessing ...')
    
    df_test_batch['rawcensustractandblock'] = df_test_batch.rawcensustractandblock.fillna(df_test.rawcensustractandblock.mode()[0])
    df_test_batch = data_preprocessing(df_test_batch)
    df_test_batch = df_test_batch.fillna(1)
    
    print('\nCreating new features ...')
    
    df_test_batch = create_newFeatures(df_test_batch)
    df_test_batch['spe_feature'], nawFeature_mod = create_special_feature(df_test_batch[dates_model_col], model=datesFeature_mod)
    df_test_batch['super_month'] = month_model.predict(df_test_batch['transaction_month'].values.reshape(-1, 1))
    
    print('\nReducing consumption memory ...')
    
    df_test_batch = memory_reduce(df_test_batch)

    x_test_batch = df_test_batch[train_columns]
    #x_test_batch = np.hstack([x_test_batch, np.zeros((x_test_batch.shape[0], 1))])
    x_test_batch = sc.transform(x_test_batch)
    
    del df_test_batch; gc.collect()

    d_test = xgb.DMatrix(x_test_batch)

    del x_test_batch; gc.collect()

    print('\nPredicting on test ...')

    p_test_batch = clf.predict(d_test)

    del d_test; gc.collect()
    
    [p_test.append(p) for p in p_test_batch]

i = 0
sub = pd.read_csv('../data/sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = p_test[i::6]
    i = i + 1

print('\nWriting csv ...')
sub.to_csv('../submissions/xgb_{}.csv'.format(datetime.now().strftime('%Y%m%d_%H%M%S')), index=False, float_format='%.4f')

print('\nPrediction available !!!')

In [6]:
train =  pd.read_csv('../data/train_2016_v2.csv')
train17 =  pd.read_csv('../data/train_2017.csv')
prop = pd.read_csv('../data/properties_2016.csv')
sample = pd.read_csv('../data/sample_submission.csv')
prop17 = pd.read_csv('../data/properties_2017.csv', low_memory = False)

train = pd.concat([train, train17])
prop = pd.concat([prop, prop17])
df_train = pd.merge(train, prop, on='parcelid', how='left')
print('Shape train: {}'.format(df_train.shape))

  interactivity=interactivity, compiler=compiler, result=result)


Shape train: (335776, 60)


In [None]:
sub

In [None]:
MAE 0.05216269999999999 for 89 rounds # waiting for submission

In [None]:
MAE 0.0521541 for 104 rounds # with special feature 

In [None]:
MAE 0.05215620000000001 for 72 rounds # standadScaler

In [None]:
MAE 0.052151499999999996 for 96 rounds # standadScaler with special feature 

In [None]:
MAE 0.052151499999999996 for 96 rounds