**Imports**

In [1]:

import json
import math
import os
import pathlib

import numpy as np
import pandas as pd
import sklearn as sk
import sklearn.ensemble
import sklearn.pipeline

import pyplotterlib.standard.plotters as ppl

# Local helpers. project_path is imported first; presumably it makes
# preproc_pipes importable - keep this order.
import project_path
import preproc_pipes as preProcPipeHelp

**Configuration Variables**

In [2]:
# Raw input CSVs, addressed relative to the working directory.
TRAIN_PATH, TEST_PATH = (
    os.path.join("..", "..", "raw_data", _fileName)
    for _fileName in ("train.csv", "test.csv")
)

# Submission output location, anchored on the current working directory.
SAVE_FOLDER = os.path.join(os.getcwd(), *("..", "..", "submissions", "submit_rf_a"))
SUBMIT_PATH = os.path.join(SAVE_FOLDER, "submission.csv")

# Only the most recent N_DAYS_USE days of data are used.
N_DAYS_USE = 400

# Lag sizes: LAG_VALS (16..45) for the default lag features,
# ON_PROM_LAGS for the onpromotion column.
LAG_VALS = list(range(16, 46))
ON_PROM_LAGS = list(range(1, 8)) + list(range(14, 19))

# Moving-average windows for onpromotion. The obvious lag to pair these with
# is 0, since today's (and even future days') promotion values are always known.
ON_PROM_WINDOW_VALS = list(range(2, 8)) + [14, 21]

# Moving-average windows for sales. The obvious lag to pair these with is 16;
# that gives a recent average sales number. 7 is the most sensible window.
SALE_WINDOW_VALS = list(range(2, 8)) + [14, 21]

#
# Keyword options forwarded to pyplotterlib's RectMultiPlotter.
_currKwargs = dict(
    constrainedLayout=True,
    figHeightPerRow=4,
    figWidthPerCol=6,
    nColsGrid=4,
)
RECT_MULTI_PLOTTER = ppl.RectMultiPlotter(**_currKwargs)



In [3]:
# Seed numpy's global random number generator with a fixed value.
# NOTE(review): no random_state is passed to the random forest further down, so
# this global seed is presumably what makes the fit repeatable - confirm.
np.random.seed(52342)

**Import data**

In [4]:
# Load the raw training and test tables exactly as stored on disk.
RAW_TRAIN = pd.read_csv(filepath_or_buffer=TRAIN_PATH)
RAW_TEST = pd.read_csv(filepath_or_buffer=TEST_PATH)

In [5]:
# Peek at the last three training rows (columns and final date).
RAW_TRAIN.tail(3)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
3000885,3000885,2017-08-15,9,PRODUCE,2419.729,148
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.0,8
3000887,3000887,2017-08-15,9,SEAFOOD,16.0,0


In [6]:
# Peek at the last three test rows; unlike the training data there is no sales column.
RAW_TEST.tail(3)

Unnamed: 0,id,date,store_nbr,family,onpromotion
28509,3029397,2017-08-31,9,PRODUCE,1
28510,3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,9
28511,3029399,2017-08-31,9,SEAFOOD,0


**Temporarily merge the test and training frames to apply various transformations**

In [7]:
# Stack the training rows on top of the test rows so both go through the same preprocessing.
# NOTE(review): no ignore_index is passed, so both frames keep their own row labels and
# the merged index presumably contains duplicates - confirm the pipeline does not rely on it.
TEST_TRAIN_MERGED = pd.concat([RAW_TRAIN,RAW_TEST])

In [8]:
# Largest id in the training data; used later as the train/test boundary.
RAW_TRAIN["id"].max()

3000887

In [9]:
# Smallest id in the test data; it should come straight after the largest training id.
RAW_TEST["id"].min()

3000888

**Add a pipeline to process the data**

In [10]:
# Distinct product-family labels, taken from the training data only.
_uniqueFams = RAW_TRAIN["family"].unique()
# Lag features on the onpromotion column, using the lags in ON_PROM_LAGS.
_lagPromPipe = preProcPipeHelp.AddLagFeats(ON_PROM_LAGS, targCol="onpromotion")
# Moving-average features. NOTE(review): the second positional argument looks like the
# list of lags the averages start from ([0] for onpromotion, [16] for sales) - confirm
# against preproc_pipes.
_maPromPipe = preProcPipeHelp.AddLeftMovingAverage(ON_PROM_WINDOW_VALS, [0], targCol="onpromotion")
_maSalePipe = preProcPipeHelp.AddLeftMovingAverage(SALE_WINDOW_VALS, [16], targCol="sales")

In [11]:
# Step that drops rows more than N_DAYS_USE days behind the latest date
# (going by the helper's name; its implementation is not shown here).
_removeNDaysBehindPipe = preProcPipeHelp.RemoveDatesNDaysBehindMax(N_DAYS_USE)

# Pipeline steps, run in list order.
# NOTE(review): the lag / store-sum / moving-average steps come before "Remove earlier data",
# presumably so they can still see the older rows when they are computed - confirm.
_pipeComps = [ ("Add lag features", preProcPipeHelp.AddLagFeats(LAG_VALS)),
               ("Add lag onpromotion feature", _lagPromPipe ),
               ("Add store wide sums", preProcPipeHelp.AddStoreWideSums() ),
               ("Add moving average sale values", _maSalePipe ),
               ("Add moving average promotion values", _maPromPipe),
               ("Remove earlier data", _removeNDaysBehindPipe ),
               ("Add a log1p target column", preProcPipeHelp.AddLog1pSales() ),
               ("Add fractional number of promotions", preProcPipeHelp.AddFractProm() ),
               ("Add oil prices", preProcPipeHelp.AddOilPriceData() ),
               ("Add basic store info", preProcPipeHelp.AddStoreInfoData()),
               ("Add ordinal encoding for store type", preProcPipeHelp.OrdEncodeStoreType() ),
               ("Add ordinal encoding for store state", preProcPipeHelp.StoreStateOrdEncode() ),
               ("Add ordinal encoding for store city", preProcPipeHelp.StoreCityOrdEncode() ),
               ("Encode family ordinally", preProcPipeHelp.EncodeFamilyArbitrary(_uniqueFams)  ),
               ("Add day of week", preProcPipeHelp.AddDayOfWeekFeat() ),
               ("Add day of month", preProcPipeHelp.AddDayOfMonth() ),
               ("Add day of year sin/cos", preProcPipeHelp.AddDayOfYearSinCos() )
             ]

# `sk.pipeline` only resolves once the sklearn.pipeline submodule has been imported;
# the imports cell imports it explicitly instead of relying on another module to do so.
preProcPipe = sk.pipeline.Pipeline(_pipeComps)

**Apply the pipeline preprocessing**

In [12]:
%%prun
# Profile recorded on an earlier run of this cell:
#          190067252 function calls (189465357 primitive calls) in 97.353 seconds

# Fit every preprocessing step on the merged train+test frame and apply it.
PROC_MERGED = preProcPipe.fit_transform(TEST_TRAIN_MERGED)

  summedFrame = useX.groupby(["date","store_nbr"]).sum().reset_index()


 

**Split the training and test frames up again**

In [13]:
# Rows are told apart by id: everything up to the last training id is training data,
# everything from the first test id onwards is test data.
TRAIN_FRAME = PROC_MERGED.loc[PROC_MERGED["id"].le(RAW_TRAIN["id"].max())]
TEST_FRAME = PROC_MERGED.loc[PROC_MERGED["id"].ge(RAW_TEST["id"].min())]

**Apply the target encoding**

In [14]:
# Fit the family/store target encoder on the training rows only, so no test
# information is used when it is fitted.
targEncoder =  preProcPipeHelp.TargEncodeFamilyStore()
targEncoder.fit(TRAIN_FRAME)

# Apply the fitted encoder to both frames.
# NOTE(review): the training rows are encoded with statistics that were computed from
# those same rows, so the training error below is presumably optimistic - confirm.
TRAIN_FRAME = targEncoder.transform(TRAIN_FRAME)
TEST_FRAME = targEncoder.transform(TEST_FRAME)

  self.meanFamDict = useX.groupby("family_enc").mean()["sales"].to_dict()


**Create a simple wrapper class for the sklearn random forest, so that it can be used directly with the dataframes**

In [15]:
class RandomForestWrapper():
    """Adapter that lets a sklearn RandomForestRegressor work directly on dataframes.

    The feature columns and the target column are picked out of the frame by name,
    so callers pass a single dataframe to both ``fit`` and ``predict``.
    """

    def __init__(self, useFeats, targFeat="sales", rfKwargs=None):
        """
        Args:
            useFeats: Names of the columns used as model inputs.
            targFeat: Name of the column used as the regression target.
            rfKwargs: Optional dict of keyword arguments forwarded to
                RandomForestRegressor; None means the regressor's defaults.
        """
        self.useFeats = useFeats
        self.targFeat = targFeat
        rfKwargs = dict() if rfKwargs is None else rfKwargs
        self.regressor = sk.ensemble.RandomForestRegressor(**rfKwargs)

    def fit(self, inpX, inpY=None):
        """Fit the regressor on ``inpX``.

        Args:
            inpX: Dataframe holding both the feature columns and the target column.
            inpY: Ignored; the target is read from ``inpX``. Accepted only so the
                signature matches the usual ``fit(X, y)`` shape.

        Returns:
            self, to allow call chaining.
        """
        # Read the feature list from the instance, not from a notebook-level global,
        # so fit and predict are guaranteed to use the same columns.
        _trainX = inpX[self.useFeats]
        _trainY = inpX[self.targFeat]
        self.regressor.fit(_trainX, _trainY)
        return self

    def predict(self, inpX):
        """Return the regressor's predictions for the feature columns of ``inpX``."""
        _useX = inpX[self.useFeats]
        return self.regressor.predict(_useX)


**Create a function to compute the squared log error, which is used in the evaluation metric**

In [16]:
def addSqrLogErrorToFrame(inpFrame, predCol="pred_A", actCol="sales", outCol="sqr_log_error"):
    """Add a per-row squared log error column to ``inpFrame`` (modified in place).

    For every row the value written is ``(log(pred + 1) - log(actual + 1)) ** 2``.

    Args:
        inpFrame: Dataframe containing the prediction and actual-value columns.
        predCol: Name of the column holding predicted values.
        actCol: Name of the column holding actual values.
        outCol: Name of the column to create (or overwrite) with the result.

    Returns:
        None; the result is stored in ``inpFrame[outCol]``.

    Raises:
        ValueError: If any predicted or actual value is <= -1, for which the
            logarithm is undefined.
    """
    _pred = inpFrame[predCol]
    _act = inpFrame[actCol]

    # Fail loudly on out-of-domain values rather than letting numpy write NaN/-inf.
    if ((_pred <= -1) | (_act <= -1)).any():
        raise ValueError(
            "math domain error: values in '{}' and '{}' must be greater than -1".format(predCol, actCol)
        )

    # Vectorised over the whole column; avoids a Python-level call per row.
    inpFrame[outCol] = (np.log1p(_pred) - np.log1p(_act)) ** 2


**Create and fit a random forest**

In [17]:
# Lagged sales columns for the seven lags 16..22.
_currLags = ["sales_l" + str(_lag) for _lag in range(16, 23)]

# Model inputs; the lagged sales columns go last.
_useFeats = [
    "family_enc", "store_nbr", "store_cluster", "fam_store_mean_enc", "store_type_ordE",
    "store_state_ordE", "day_of_week", "onpromotion",
    "sin_day_of_year", "cos_day_of_year", "store_promotions", "fract_promotions",
    "oil_price_w10", "sales_l16_ma7", "sales_l16_ma14", "sales_l16_ma21",
    "onpromotion_l0_ma21", "onpromotion_l0_ma14", "onpromotion_l0_ma7",
] + _currLags

# The model is trained on the log1p-transformed sales column.
targCol = "sales_log1p"

# Unlikely to be optimal settings, but good enough for a first attempt.
_rfKwargs = dict(max_depth=12, max_samples=0.1)

MODEL = RandomForestWrapper(useFeats=_useFeats, targFeat=targCol, rfKwargs=_rfKwargs)


In [18]:
%%prun
# Profile recorded on an earlier run of this cell:
#170726 function calls (169562 primitive calls) in 251.898 seconds

# Fit the random forest on the training frame (features and target are selected by the wrapper).
MODEL.fit(TRAIN_FRAME)

 

**Look at the error on the training data**

In [19]:
# The model predicts log1p(sales); expm1 converts the predictions back to sales units.
TRAIN_FRAME["pred_A"] = np.expm1(MODEL.predict(TRAIN_FRAME))
# Adds the per-row squared log error between the predictions and the actual sales.
addSqrLogErrorToFrame(TRAIN_FRAME, predCol="pred_A", actCol="sales", outCol="sqr_log_error")

In [20]:
# Root of the mean squared log error over the training rows (an in-sample figure).
math.sqrt( TRAIN_FRAME["sqr_log_error"].mean() )

0.4311218346618908

**Apply the model to our test data**

In [21]:
# Predict on the test rows and convert from log1p space back to sales units.
TEST_FRAME["sales"] = np.expm1( MODEL.predict(TEST_FRAME) )

**Prepare our submission file**

In [22]:
# Create the submission folder (and any missing parents); no error if it already exists.
pathlib.Path(SAVE_FOLDER).mkdir(parents=True, exist_ok=True)

In [23]:
# Keep only the id and predicted sales columns, ordered by id.
output = TEST_FRAME[["id","sales"]].sort_values(by="id")

In [24]:
# Write the submission file without the dataframe index column.
output.to_csv(SUBMIT_PATH, index=False)

In [25]:
# Display the submission frame as a final visual check.
output

Unnamed: 0,id,sales
384,3000888,3.206558
784,3000889,0.000738
1184,3000890,4.256623
1584,3000891,2111.372449
1984,3000892,0.207503
...,...,...
117199,3029395,342.175531
117599,3029396,102.261545
117999,3029397,1212.678790
118399,3029398,36.941392


**Kaggle score = 0.44733**