In [2]:
import sys
sys.path.insert(0, './Modules')

from sklearn.externals import joblib
from helpers import read_in_dataset
import pandas as pd
import numpy as np
import gc

# Rows per prediction batch; passed as `chunksize` to make_sub_file in the cells below.
CHUNK_SIZE = 50000

In [3]:
# Load the serialized model that is used for the model submission further down.
# NOTE(review): `sklearn.externals.joblib` (imported in the first cell) was deprecated in
# scikit-learn 0.21 and removed in 0.23; on newer versions import the standalone `joblib`
# package instead.
# joblib.load unpickles the file, so only load model files that come from a trusted source.
my_model = joblib.load('models/model.pkl')

In [4]:
class medianPredictor:
    """Baseline model that predicts one constant, the median seen in ``fit``, for every row.

    Exposes ``predict`` so an instance can be passed to ``make_sub_file`` like a real model.
    """

    def fit(self, X, y=None):
        """Store ``X.median()`` as the constant prediction.

        Parameters
        ----------
        X : object exposing ``.median()`` (the notebook passes a pandas Series).
        y : unused; accepted so the signature matches the usual ``fit(X, y)`` shape.

        Returns
        -------
        medianPredictor
            ``self``, so calls can be chained (consistent with ``zeroPredictor.fit``).
        """
        self.med = X.median()
        return self

    def predict(self, X, y=None):
        """Return an array of ``len(X)`` copies of the stored median.

        Only the length of ``X`` is used; ``y`` is ignored.
        Raises ``AttributeError`` if ``fit`` has not been called yet.
        """
        return np.array([self.med] * len(X))
    
class zeroPredictor:
    """Baseline model whose prediction is 0 for every row."""

    def fit(self, X, y=None):
        """Nothing is learned; the arguments are ignored and ``self`` is returned."""
        return self

    def predict(self, X, y=None):
        """Return an array holding ``len(X)`` zeros; only the length of ``X`` is used."""
        n_rows = len(X)
        zeros = [0 for _ in range(n_rows)]
        return np.array(zeros)

In [5]:
def make_chunks(df, chunksize):
    """Yield consecutive positional slices of ``df`` holding at most ``chunksize`` rows each.

    Chunks are taken with ``df.iloc`` in row order, so concatenating them reproduces ``df``.
    Only non-empty chunks are produced: when ``len(df)`` is an exact multiple of
    ``chunksize`` there is no trailing empty slice, and an empty ``df`` yields nothing.

    Parameters
    ----------
    df : pandas.DataFrame (anything supporting ``len`` and ``.iloc`` slicing).
    chunksize : int
        Maximum number of rows per chunk; must be positive.

    Raises
    ------
    ValueError
        If ``chunksize`` is not positive (raised when iteration starts).
    """
    if chunksize <= 0:
        raise ValueError('chunksize must be a positive integer, got {0!r}'.format(chunksize))
    # iloc slicing clips at the end of the frame, so the last chunk may simply be shorter.
    for start in range(0, len(df), chunksize):
        yield df.iloc[start:start + chunksize]

In [6]:
def add_date(df, dt):
    """Set the ``transactiondate`` column of ``df`` to one date for every row, in place.

    ``dt`` is parsed with ``pd.to_datetime``. The column is created if missing and
    overwritten otherwise. The same (mutated) frame object is returned.
    """
    parsed_date = pd.to_datetime(dt)
    df['transactiondate'] = parsed_date
    return df

In [7]:
def make_sub_file(model, chunksize):
    """Build the submission frame: one prediction column per date in ``dates``.

    Reads the ``properties_2016`` dataset once. For each date, that date is stamped onto the
    frame as ``transactiondate`` and ``model.predict`` is run on the frame in batches of
    ``chunksize`` rows. Prints a progress line after each date.

    Parameters
    ----------
    model : object with a ``predict(X)`` method returning one value per row of ``X``.
    chunksize : int
        Number of rows per prediction batch (forwarded to ``make_chunks``).

    Returns
    -------
    pandas.DataFrame
        ``parcelid`` as a regular column plus one ``YYYYMM`` column per date, with
        predictions rounded to 4 decimals.
    """
    dates = ['2016-10-01', '2016-11-01', '2016-12-01', '2017-10-01', '2017-11-01', '2017-12-01']
    props = read_in_dataset('properties_2016')

    # Indexed by parcelid so each batch of predictions can be written back by label.
    submission_df = pd.DataFrame(index=props.parcelid)

    for d in dates:
        # Computed once per date instead of once per chunk. strftime zero-pads the month,
        # so the label stays YYYYMM even for months below 10.
        column = pd.to_datetime(d).strftime('%Y%m')
        props = add_date(props, d)
        for x in make_chunks(props, chunksize):
            preds = model.predict(x)
            submission_df.loc[x.parcelid, column] = preds
        print('processed date {0}'.format(d))

    # Drop the reference to the properties frame before round() builds the result.
    del props

    return submission_df.round(4).reset_index()

In [8]:
# Create the output directory for the submission CSVs written by the cells below.
# os.makedirs works on every platform (the previous `!md` shell-out relies on the Windows
# `md` command), and exist_ok=True keeps the cell from failing when it is re-run.
import os
os.makedirs('submissions', exist_ok=True)

In [9]:
# Baseline 1: predict the median of the `logerror` column of `train_2016` for every row and date.
mp = medianPredictor()
mp.fit(read_in_dataset('train_2016').logerror)
# The frame returned by make_sub_file is written straight to CSV and never bound to a name.
make_sub_file(mp, CHUNK_SIZE).to_csv('submissions/median_submission.csv', index=False)
gc.collect() # because of memory issues, garbage collect

  exec(code_obj, self.user_global_ns, self.user_ns)


processed date 2016-10-01
processed date 2016-11-01
processed date 2016-12-01
processed date 2017-10-01
processed date 2017-11-01
processed date 2017-12-01


0

In [10]:
# Baseline 2: predict 0 for every row and date.
zp = zeroPredictor()
# don't need to fit
# The frame returned by make_sub_file is written straight to CSV and never bound to a name.
make_sub_file(zp, CHUNK_SIZE).to_csv('submissions/zero_submission.csv', index=False)
gc.collect() # because of memory issues, garbage collect

processed date 2016-10-01
processed date 2016-11-01
processed date 2016-12-01
processed date 2017-10-01
processed date 2017-11-01
processed date 2017-12-01


0

In [11]:
# Submission from `my_model`, the model loaded from 'models/model.pkl' earlier in the notebook.
# The frame returned by make_sub_file is written straight to CSV and never bound to a name.
make_sub_file(my_model, CHUNK_SIZE).to_csv('submissions/model_submission.csv', index=False)
gc.collect() # because of memory issues, garbage collect

processed date 2016-10-01
processed date 2016-11-01
processed date 2016-12-01
processed date 2017-10-01
processed date 2017-11-01
processed date 2017-12-01


0