# Kaggle Zillow Algo Keras Neural Network

In [1]:
import time
from datetime import datetime
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from keras.models import Sequential, load_model
from keras.layers import Dropout, BatchNormalization, Dense
from keras.wrappers.scikit_learn import KerasRegressor

import gc
from zillow_functions import create_newFeatures, data_preprocessing, memory_reduce
from sami_function import missing_ratio

Using TensorFlow backend.


In [2]:
%%time
print('Loading train, prop and sample data')
train = pd.read_csv("../data/train_2016_v2.csv", parse_dates=["transactiondate"])
prop = pd.read_csv('../data/properties_2016.csv')
sample = pd.read_csv('../data/sample_submission.csv')
 
#df_train =df_train[ df_train.logerror > -0.4005 ]
#df_train=df_train[ df_train.logerror < 0.412 ]

df_train = pd.merge(train, prop, on='parcelid', how='left')
print('\tShape train : {}'.format(df_train.shape))

del train; gc.collect()

print('\nData preprocessing ...')
df_train = data_preprocessing(df_train)


print('\nCreating new features ...')
df_train = create_newFeatures(df_train)
# New special feature
# df_train['spe_feature'], nawFeature_mod = creature_special_feature(df_train[['transaction_year', 'transaction_month', 'yearbuilt', 'house_age']], df_train['logerror'].values)

print('\nReducing consumption memory ...')
df_train = memory_reduce(df_train)


print('\nBuilding train set ...')
x_train = df_train.drop(['parcelid', 'logerror'], axis=1)  
y_train = df_train["logerror"]
y_mean = np.mean(y_train)
print('\tShape train : {} Labels : {}'.format(x_train.shape, y_train.shape))
train_columns = x_train.columns

sc = StandardScaler()
x_train = sc.fit_transform(x_train)

len_x = int(x_train.shape[1])
print('\tlen_x is : {}'.format(len_x))

print('\nBuilding Neural Network ...')

# Feed-forward regressor: a tanh input block, then three
# Dense(relu) -> BatchNormalization -> Dropout blocks, then a single-unit
# output layer with no activation specified.
# Each entry is (units, dropout rate) for one relu block, in order.
hidden_blocks = [(150, .4), (60, .32), (25, .22)]

nn = Sequential()
nn.add(Dense(units=360, kernel_initializer='normal', activation='tanh', input_dim=len_x))
nn.add(Dropout(.17))
for n_units, drop_rate in hidden_blocks:
    nn.add(Dense(units=n_units, kernel_initializer='normal', activation='relu'))
    nn.add(BatchNormalization())
    nn.add(Dropout(drop_rate))
nn.add(Dense(1, kernel_initializer='normal'))

# Mean absolute error loss, Adam optimizer.
# NOTE: an rmsprop-based compile call with 'mae'/'accuracy' metrics is
# currently disabled.
nn.compile(loss='mae', optimizer='adam')


print('\nTraining Neural Network ...')
# No validation data is passed, so only the training loss is reported.
nn.fit(
    x=np.array(x_train),
    y=np.array(y_train),
    batch_size=32,
    epochs=60,
    verbose=2,
)
# Persist the trained model next to the notebook.
nn.save('keras_model.h5')


print('\nBuilding test set ...')
# The submission template names its key column 'ParcelId'; add a lower-case
# copy so it can be joined with the property table on 'parcelid'.
sample['parcelid'] = sample['ParcelId']
df_test = sample.merge(prop, on='parcelid', how='left')

del prop, sample; gc.collect()

# Most frequent 'rawcensustractandblock' value(s) over the whole test frame,
# used to fill missing values in each batch. df_test is never modified below
# (each batch is a copy), so this is computed once instead of once per batch.
# Indexing with [0] stays inside the loop so an empty test frame still
# simply skips the loop.
census_block_modes = df_test.rawcensustractandblock.mode()

# Flat list of predictions, in the order the batches produce them.
p_test = []
batch_size = 100000
# `batch` is the exclusive end row of the current slice; the range runs one
# step past the frame length so the final, shorter slice is included.
for batch in range(batch_size, df_test.shape[0]+batch_size, batch_size):

    print('\nWorking batch {}'.format(batch))
    df_test_batch = df_test[batch-batch_size:batch].copy()

    print('\nData preprocessing ...')
    df_test_batch['rawcensustractandblock'] = df_test_batch.rawcensustractandblock.fillna(census_block_modes[0])
    df_test_batch = data_preprocessing(df_test_batch)
    df_test_batch = df_test_batch.fillna(-1)

    print('\nCreating new features ...')

    df_test_batch = create_newFeatures(df_test_batch)
    # NOTE: the extra 'spe_feature' column (creature_special_feature) is
    # currently disabled.

    print('\nReducing consumption memory ...')

    df_test_batch = memory_reduce(df_test_batch)

    # Select the training feature columns in training order, then apply the
    # scaler that was fitted on the training set.
    x_test_batch = df_test_batch[train_columns]
    x_test_batch = sc.transform(x_test_batch)

    del df_test_batch; gc.collect()

    print('\tShape test batch : {}'.format(x_test_batch.shape))

    print('\nPredicting on batch test ...')

    y_pred_ann = nn.predict(x_test_batch)
    y_pred_batch = y_pred_ann.flatten()

    del x_test_batch, y_pred_ann; gc.collect()

    # extend() appends the same elements in the same order as the previous
    # side-effect list comprehension, without building a throwaway list.
    p_test.extend(y_pred_batch)
    
    
print( "\nPreparing results for write ..." )

# Reload the submission template; every column except 'ParcelId' receives
# predictions.
sub = pd.read_csv('../data/sample_submission.csv')
pred_columns = sub.columns[sub.columns != 'ParcelId']

# Predictions are read back with a stride of 6: column number i takes
# elements i, i+6, i+12, ... of p_test.
# NOTE(review): this assumes the preprocessing helpers emit 6 consecutive
# rows per parcel, in the same order as the prediction columns. The recorded
# output (600000 rows for a 100000-row batch) is consistent with a 6x
# expansion, but the row ordering cannot be checked from here - confirm.
preds_per_parcel = 6

# Fail early with an explicit message instead of pandas' generic
# length-mismatch error raised on the first column assignment.
expected_len = preds_per_parcel * len(sub)
if len(p_test) != expected_len:
    raise ValueError(
        'Expected {} predictions ({} per parcel for {} parcels) but got {}'.format(
            expected_len, preds_per_parcel, len(sub), len(p_test)))

for i, c in enumerate(pred_columns):
    sub[c] = p_test[i::preds_per_parcel]

print('\nWriting results ...')
# Timestamped file name so successive runs do not overwrite each other.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
sub.to_csv('../submissions/keras_nn_{}.csv'.format(timestamp), index=False, float_format='%.4f')

print('\nPrediction available !!!')

Loading train, prop and sample data




	Shape train : (90275, 60)

Data preprocessing ...

	Outliers treated ...

Creating new features ...

Reducing consumption memory ...
	Initial size 43.99 MB
	There are 0 columns that cannot be reduced
	There are 76 columns reduced
	Final size 14.64 MB

Building train set ...
	Shape train : (90275, 75) Labels : (90275,)
	len_x is : 75

Building Neural Network ...

Training Neural Network ...
Epoch 1/60
32s - loss: 0.0625
Epoch 2/60
31s - loss: 0.0605
Epoch 3/60
30s - loss: 0.0601
Epoch 4/60
31s - loss: 0.0600
Epoch 5/60
31s - loss: 0.0599
Epoch 6/60
31s - loss: 0.0599
Epoch 7/60
31s - loss: 0.0598
Epoch 8/60
31s - loss: 0.0598
Epoch 9/60
31s - loss: 0.0597
Epoch 10/60
30s - loss: 0.0597
Epoch 11/60
31s - loss: 0.0597
Epoch 12/60
30s - loss: 0.0597
Epoch 13/60
31s - loss: 0.0596
Epoch 14/60
30s - loss: 0.0596
Epoch 15/60
30s - loss: 0.0596
Epoch 16/60
30s - loss: 0.0595
Epoch 17/60
31s - loss: 0.0595
Epoch 18/60
30s - loss: 0.0595
Epoch 19/60
30s - loss: 0.0594
Epoch 20/60
30s - loss: 0.

	There are 82 columns reduced
	Final size 104.14 MB
	Shape test batch : (600000, 75)

Predicting on batch test ...

Working batch 2200000

Data preprocessing ...

Creating new features ...

Reducing consumption memory ...
	Initial size 327.30 MB
	There are 0 columns that cannot be reduced
	There are 82 columns reduced
	Final size 104.14 MB
	Shape test batch : (600000, 75)

Predicting on batch test ...

Working batch 2300000

Data preprocessing ...

Creating new features ...

Reducing consumption memory ...
	Initial size 327.30 MB
	There are 0 columns that cannot be reduced
	There are 82 columns reduced
	Final size 105.29 MB
	Shape test batch : (600000, 75)

Predicting on batch test ...

Working batch 2400000

Data preprocessing ...

Creating new features ...

Reducing consumption memory ...
	Initial size 327.30 MB
	There are 0 columns that cannot be reduced
	There are 82 columns reduced
	Final size 106.43 MB
	Shape test batch : (600000, 75)

Predicting on batch test ...

Working batch 