# Kaggle Zillow Algo LightGBM

###### 0.0644038

In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import gc
import datetime as dt
import time
from datetime import datetime

from zillow_functions import create_newFeatures, data_preprocessing, memory_reduce
from sami_function import missing_ratio

In [9]:
%%time

## Version 3 - LB 0.0644042
# Train month averages for test predictions seem to work better than their linear fit,
# so I changed it (overfitting test data as hell... but who doesn't here? ;))

## Version 2 - LB 0.0644120
# LGBM performs much better, so I now use it on its own

## Version 1 - LB 0.0644711
# Both models have the same weight, which is based on cross-validation results, but
# the XGB model seems to be worse on the public LB: on its own it scores 0.0646474,
# which is much worse than the score of the combination. I reached the submission limit,
# so I will check how LGBM alone performs tomorrow. Check it out for yourself ;)


# --- Load the raw competition files ---------------------------------------
print('Loading data...')
prop = pd.read_csv('../data/properties_2016.csv', low_memory=False)
train = pd.read_csv('../data/train_2016_v2.csv')
sample = pd.read_csv('../data/sample_submission.csv', low_memory=False)

# Left join on parcelid: every row of `train` is kept and gets the matching
# columns from `prop` attached.
df_train = train.merge(prop, how='left', on='parcelid')
print('\tShape train : {}'.format(df_train.shape))

# `train` is not referenced again below; drop it and collect right away.
del train
gc.collect()

# Run the project helpers over the training frame in a fixed order,
# announcing each stage before it starts.
for stage_msg, stage_fn in (
    ('\nData preprocessing ...', data_preprocessing),
    ('\nCreating new features ...', create_newFeatures),
    ('\nReducing consumption memory ...', memory_reduce),
):
    print(stage_msg)
    df_train = stage_fn(df_train)

print('\nBuilding train set ...')

# Per-month mean logerror, centred on the overall mean logerror.
# The 'logerror' column is selected *before* aggregating: averaging every
# column of df_train just to read one of them back is wasted work and fails
# on non-numeric columns in recent pandas versions.
month_avgs = (df_train.groupby('transaction_month')['logerror'].mean().values
              - df_train['logerror'].mean())

# Linear trend of the centred monthly means, fitted on x = 4..12 against
# month_avgs[3:] (the first three entries are left out of the fit).
# NOTE(review): this pairing assumes the groupby yields exactly the twelve
# months 1..12 in ascending order -- confirm against the training data.
from sklearn.linear_model import LinearRegression
month_model = LinearRegression().fit(np.arange(4, 13, 1).reshape(-1, 1),
                                     month_avgs[3:].reshape(-1, 1))

# The target was fitted as a column vector, so predict() returns an (n, 1)
# array; flatten it so a plain 1-D column is stored.
df_train['super_month'] = month_model.predict(
    df_train['transaction_month'].values.reshape(-1, 1)).ravel()

# Positional layout: columns 0 and 1 are excluded from the features, and
# column 1 is used as the training label below.  Everything from column 2 on
# (including the super_month column just appended) is a feature.
# NOTE(review): presumably columns 0/1 are parcelid/logerror -- verify after
# data_preprocessing, since the selection is by position, not by name.
feature_names = df_train.columns[2:]

print('Preparing arrays ...')
X_train = df_train[feature_names].values
y_train = df_train.iloc[:, 1].values

print('Training LGBM model...')
ltrain = lgb.Dataset(X_train, label=y_train)

# Booster configuration, passed unchanged to lgb.train.
params = {
    'metric': 'mae',
    'max_depth': 100,
    'num_leaves': 32,
    'feature_fraction': .85,
    'bagging_fraction': .95,
    'bagging_freq': 8,
    'learning_rate': 0.0025,
    'verbosity': 10,
}

# Fixed number of boosting rounds; no validation set is supplied.
# NOTE(review): `verbose_eval` is no longer accepted by lgb.train in
# LightGBM >= 4.0 -- confirm the installed version before re-running.
lgb_model = lgb.train(params, ltrain, num_boost_round=2930, verbose_eval=0)
                  

print('\nBuilding test set ...')

# The submission file spells the key 'ParcelId'; duplicate it under the
# lower-case name so it can be joined with the properties table.
sample['parcelid'] = sample['ParcelId']
df_test = sample.merge(prop, on='parcelid', how='left')

del prop; gc.collect()

results = pd.DataFrame()  # not used in this cell; kept in case other cells read it
p_test = []               # flat list of predictions, in batch order
batch_size = 100000

# Fill value for missing rawcensustractandblock entries: the most frequent
# value over the whole test frame.  df_test is never modified inside the loop
# (each batch is a copy), so this is computed once instead of once per batch.
census_fill = df_test.rawcensustractandblock.mode()[0]

# `batch` is the exclusive end row of the current slice, so the last slice
# may be shorter than batch_size.
for batch in range(batch_size, df_test.shape[0]+batch_size, batch_size):

    print('\nWorking batch {}'.format(batch))

    df_test_batch = df_test[batch-batch_size:batch].copy()

    print('\nData preprocessing ...')

    df_test_batch['rawcensustractandblock'] = df_test_batch.rawcensustractandblock.fillna(census_fill)
    df_test_batch = data_preprocessing(df_test_batch)
    # Any value still missing after preprocessing is replaced by the constant 1.
    df_test_batch = df_test_batch.fillna(1)

    print('\nCreating new features ...')

    df_test_batch = create_newFeatures(df_test_batch)

    # Same month feature as on the training frame, from the fitted month model.
    df_test_batch['super_month'] = month_model.predict(df_test_batch['transaction_month'].values.reshape(-1, 1))

    print('\nReducing consumption memory ...')

    df_test_batch = memory_reduce(df_test_batch)

    # Select the training feature columns by name, in training order.
    x_test_batch = df_test_batch[feature_names]

    del df_test_batch; gc.collect()

    print('\nMaking predictions ...')

    y_pred = lgb_model.predict(x_test_batch)

    # Append this batch's predictions; extend() replaces a list comprehension
    # that was run only for its side effect and built a throwaway list.
    p_test.extend(y_pred)

        
import os

# Re-read the submission template so the output has exactly its original
# columns.
sub = pd.read_csv('../data/sample_submission.csv')

# Every column except 'ParcelId' receives predictions.  p_test is read with a
# stride of 6: the k-th prediction column takes elements k, k+6, k+12, ...
# NOTE(review): presumably the preprocessing helpers emit six consecutive
# rows per parcel, one per prediction column -- confirm in zillow_functions.
for i, c in enumerate(sub.columns[sub.columns != 'ParcelId']):
    sub[c] = p_test[i::6]


print('\nSaving predictions...')
# Create the output folder if needed so a long run is not lost at the final
# write because the directory is missing.
os.makedirs('../submissions', exist_ok=True)
sub.to_csv('../submissions/light-gbm_{}.csv'.format(datetime.now().strftime('%Y%m%d_%H%M%S')), index=False, float_format='%.4f')
print('\nDone!')


Loading data...
	Shape train : (90275, 60)

Data preprocessing ...

	Outliers treated ...

Creating new features ...

Reducing consumption memory ...
	Initial size 44.68 MB
	There are 0 columns that cannot be reduced
	There are 78 columns reduced
	Final size 14.72 MB

Building train set ...
Preparing arrays ...
Training LGBM model...

Building test set ...

Working batch 100000

Data preprocessing ...

Creating new features ...

Reducing consumption memory ...
	Initial size 340.46 MB
	There are 0 columns that cannot be reduced
	There are 85 columns reduced
	Final size 107.00 MB
Making predictions and praying for good results...

Working batch 200000

Data preprocessing ...

Creating new features ...

Reducing consumption memory ...
	Initial size 340.46 MB
	There are 0 columns that cannot be reduced
	There are 85 columns reduced
	Final size 107.00 MB
Making predictions and praying for good results...

Working batch 300000

Data preprocessing ...

Creating new features ...

Reducing cons


Reducing consumption memory ...
	Initial size 340.46 MB
	There are 0 columns that cannot be reduced
	There are 85 columns reduced
	Final size 107.00 MB
Making predictions and praying for good results...

Working batch 3000000

Data preprocessing ...

Creating new features ...

Reducing consumption memory ...
	Initial size 290.13 MB
	There are 0 columns that cannot be reduced
	There are 85 columns reduced
	Final size 90.21 MB
Making predictions and praying for good results...
Saving predictions...
Done!
CPU times: user 3h 29min 12s, sys: 4min 20s, total: 3h 33min 32s
Wall time: 1h 14min 57s
