**Imports**

In [1]:

import math
import os

import numpy as np
import pandas as pd
import sklearn as sk
import sklearn.feature_selection
import sklearn.pipeline

import pyplotterlib.standard.plotters as ppl

# Keep project_path ahead of preproc_pipes
# NOTE(review): presumably project_path makes the project modules importable - confirm
import project_path
import preproc_pipes as preProcPipeHelp

**Configuration Variables**

In [2]:
# Location of the raw training data, relative to this notebook
TRAIN_PATH = os.path.join("..", "..", "raw_data", "train.csv")

# Only the most recent N_DAYS_USE days of data are kept
# (swap in a small value such as 50 for a quick trial run)
N_DAYS_USE = 400


# Lag values (in days) for sales and for the number of items on promotion
SALE_LAG_VALS = list(range(16, 30))
ON_PROM_LAGS = [*range(1, 8), *range(14, 19)]

# Promotion counts are always known for today (and even for future days),
# so the promotion moving-average windows are used with lag 0
ON_PROM_WINDOW_VALS = [*range(2, 8), 14, 21]

# Predictions only need to reach 15 days into the future, so the obvious lag is 16;
# the sales moving-average windows are therefore used with sales lag 16
SALE_WINDOW_VALS = [*range(2, 8), 14, 21]

#
# Keyword options for the shared RectMultiPlotter instance
_currKwargs = dict(
    constrainedLayout=True,
    figHeightPerRow=4,
    figWidthPerCol=6,
    nColsGrid=4,
)
RECT_MULTI_PLOTTER = ppl.RectMultiPlotter(**_currKwargs)


**Import data**

In [3]:
# Read the training CSV at TRAIN_PATH into a DataFrame, using read_csv's default parsing
RAW_DF = pd.read_csv(TRAIN_PATH)

In [4]:
# Show the last three rows of the raw data as a quick sanity check
RAW_DF.tail(3)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
3000885,3000885,2017-08-15,9,PRODUCE,2419.729,148
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.0,8
3000887,3000887,2017-08-15,9,SEAFOOD,16.0,0


**Add some features via imported pipelines**

In [5]:
# Distinct values of the "family" column; handed to EncodeFamilyArbitrary when the pipeline is assembled
_uniqueFams = RAW_DF["family"].unique()

In [6]:
# Lag-feature step targeting the "onpromotion" column, built from the ON_PROM_LAGS values
_lagPromPipe = preProcPipeHelp.AddLagFeats(ON_PROM_LAGS, targCol="onpromotion")

In [7]:
# Moving-average steps for promotions and for sales.
# NOTE(review): the second argument ([0] / [16]) looks like the list of lags the windows are
# applied at (cf. the "onpromotion_l0_ma*" / "sales_l16_ma*" column names) - confirm in preproc_pipes
_maPromPipe = preProcPipeHelp.AddLeftMovingAverage(ON_PROM_WINDOW_VALS, [0], targCol="onpromotion")
_maSalePipe = preProcPipeHelp.AddLeftMovingAverage(SALE_WINDOW_VALS, [16], targCol="sales")

In [8]:
# Step that restricts the data using N_DAYS_USE
_removeNDaysBehindPipe = preProcPipeHelp.RemoveDatesNDaysBehindMax(N_DAYS_USE)

# (name, transformer) pairs; the sklearn Pipeline applies them in exactly this order
_pipeComps = [ ("Add lag features", preProcPipeHelp.AddLagFeats(SALE_LAG_VALS)),
               ("Add lag onpromotion feature", _lagPromPipe ),
               ("Add transactions data", preProcPipeHelp.AddNumbTransactionsData() ),
               ("Add store wide sums", preProcPipeHelp.AddStoreWideSums() ),
               ("Add moving average sale values", _maSalePipe ),
               ("Add moving average promotion values", _maPromPipe),
               # NOTE(review): the date trimming sits after the lag / moving-average steps,
               # presumably so those features can still draw on the older rows - confirm
               ("Remove earlier data", _removeNDaysBehindPipe ),
               ("Add fractional number of promotions", preProcPipeHelp.AddFractProm() ),
               ("Add oil prices", preProcPipeHelp.AddOilPriceData() ),
               ("Add basic store info", preProcPipeHelp.AddStoreInfoData()),
               ("Add ordinal encoding for store type", preProcPipeHelp.OrdEncodeStoreType() ),
               ("Add ordinal encoding for store state", preProcPipeHelp.StoreStateOrdEncode() ),
               ("Add ordinal encoding for store city", preProcPipeHelp.StoreCityOrdEncode() ),
               ("Encode family ordinally", preProcPipeHelp.EncodeFamilyArbitrary(_uniqueFams)  ),
               ("Target encode combination of store-nbr AND family", preProcPipeHelp.TargEncodeFamilyStore() ),
               ("Add day of week", preProcPipeHelp.AddDayOfWeekFeat() ),
               ("Add day of month", preProcPipeHelp.AddDayOfMonth() ),
               ("Add day of year sin/cos", preProcPipeHelp.AddDayOfYearSinCos() )
             ]

# Chain all steps into a single preprocessing pipeline
preProcPipe = sk.pipeline.Pipeline(_pipeComps)

**Apply the pipeline preprocessing**

In [9]:
%%prun
# Profiler summary recorded from an earlier run of this cell:
#          369375860 function calls (368413681 primitive calls) in 176.059 seconds
# Fit every pipeline step on the raw frame and keep the transformed result
PROC_FRAME = preProcPipe.fit_transform(RAW_DF)

  summedFrame = useX.groupby(["date","store_nbr"]).sum().reset_index()
  meanDict = transFrame.groupby(["store_nbr"]).mean()["transactions"].to_dict()
  summedFrame = useX.groupby(["date","store_nbr"]).sum().reset_index()
  self.meanFamDict = useX.groupby("family_enc").mean()["sales"].to_dict()


 

**Calculate simple mutual information**

In [10]:
# Column names of the engineered lag / moving-average features, derived from the configuration lists
_onPromLags = [f"onpromotion_l{int(lag)}" for lag in ON_PROM_LAGS]
_saleMaFeats = [f"sales_l16_ma{int(window)}" for window in SALE_WINDOW_VALS]
_onPromMaFeats = [f"onpromotion_l0_ma{int(window)}" for window in ON_PROM_WINDOW_VALS]

In [11]:
# Columns to be flagged as discrete for the mutual-information estimate
cateVarKeys = [
    "store_nbr", "family_enc", "onpromotion", "day_of_week", "day_of_month", "store_type",
    "store_type_ordE", "store_cluster", "store_state_ordE", "store_city_ordE", "store_promotions",
] + _onPromLags

# Every feature column scored against the target, in a fixed order
useX = (
    [
        "store_nbr", "family_enc", "onpromotion", "day_of_week", "day_of_month",
        "sin_day_of_year", "cos_day_of_year", "fam_store_mean_enc", "oil_price_w10",
        "store_type_ordE", "store_cluster", "store_state_ordE", "store_city_ordE",
        "store_promotions", "fract_promotions", "transactions",
    ]
    + ["sales_l16", "sales_l17", "sales_l18", "sales_l19"]
    + _onPromLags
    + _saleMaFeats
    + _onPromMaFeats
)

# One boolean per entry of useX (same order): True when that feature is in cateVarKeys
discFeats = [featName in cateVarKeys for featName in useX]


In [12]:
# Allow up to 80 columns to be displayed so the wide processed frame is not truncated;
# set_option changes the pandas setting for the rest of the session, not just this cell
pd.set_option("display.max_columns",80)
PROC_FRAME.tail(3)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,sales_l16,sales_l17,sales_l18,sales_l19,sales_l20,sales_l21,sales_l22,sales_l23,sales_l24,sales_l25,sales_l26,sales_l27,sales_l28,sales_l29,onpromotion_l1,onpromotion_l2,onpromotion_l3,onpromotion_l4,onpromotion_l5,onpromotion_l6,onpromotion_l7,onpromotion_l14,onpromotion_l15,onpromotion_l16,onpromotion_l17,onpromotion_l18,transactions,store_sales,store_promotions,store_sales_y,store_sales_y.1,sales_l16_ma2,sales_l16_ma3,sales_l16_ma4,sales_l16_ma5,sales_l16_ma6,sales_l16_ma7,sales_l16_ma14,sales_l16_ma21,onpromotion_l0_ma2,onpromotion_l0_ma3,onpromotion_l0_ma4,onpromotion_l0_ma5,onpromotion_l0_ma6,onpromotion_l0_ma7,onpromotion_l0_ma14,onpromotion_l0_ma21,fract_promotions,oil_price_w10,store_city,store_state,store_type,store_cluster,store_type_ordE,store_state_ordE,store_city_ordE,family_enc,fam_store_mean_enc,day_of_week,day_of_month,sin_day_of_year,cos_day_of_year
712797,2997191.0,2017-08-13,54.0,SEAFOOD,2.0,0.0,4.0,2.0,3.0,7.0,1.0,5.0,3.0,0.0,3.0,0.0,6.0,3.0,5.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,1054.0,14246.827996,173.0,14246.827996,470145.323868,3.0,3.0,4.0,3.4,3.666667,3.571429,3.142857,3.47619,0.5,0.333333,0.25,0.2,0.166667,0.142857,0.142857,0.285714,0.0,49.158333,El Carmen,Manabi,C,3,2,15,21,32,1.9225,6,13,-0.666089,-0.745872
712798,2998973.0,2017-08-14,54.0,SEAFOOD,0.0,0.0,4.0,4.0,2.0,3.0,7.0,1.0,5.0,3.0,0.0,3.0,0.0,6.0,3.0,5.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,818.0,11882.994,126.0,11882.994,392138.802,4.0,3.333333,3.25,4.0,3.5,3.714286,3.285714,3.238095,0.0,0.333333,0.25,0.2,0.166667,0.142857,0.142857,0.238095,0.0,48.828333,El Carmen,Manabi,C,3,2,15,21,32,1.9225,0,14,-0.67882,-0.734304
712799,3000755.0,2017-08-15,54.0,SEAFOOD,3.0,0.0,4.0,4.0,4.0,2.0,3.0,7.0,1.0,5.0,3.0,0.0,3.0,0.0,6.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,802.0,12666.858,204.0,12666.858,418006.314,4.0,4.0,3.5,3.4,4.0,3.571429,3.214286,3.142857,0.0,0.0,0.25,0.2,0.166667,0.142857,0.142857,0.238095,0.0,48.648571,El Carmen,Manabi,C,3,2,15,21,32,1.9225,1,15,-0.691351,-0.722519


In [13]:
%%prun
#          3069114 function calls (3049317 primitive calls) in 134.830 seconds
_currArgs = [ PROC_FRAME[useX] , PROC_FRAME["sales"].to_numpy() ]
mutualInfoVals = sk.feature_selection.mutual_info_regression( *_currArgs, discrete_features=discFeats)

 

In [14]:
# Pair each feature name with its score, then rank the [name, score] pairs from highest to lowest score
miList = sorted(map(list, zip(useX, mutualInfoVals)), key=lambda pair: pair[1], reverse=True)
miList

[['fam_store_mean_enc', 1.7074228523285404],
 ['sales_l16_ma21', 1.5901419134754784],
 ['sales_l16_ma14', 1.5867878827011648],
 ['sales_l16_ma7', 1.5683818774102578],
 ['sales_l16_ma6', 1.5496242071697237],
 ['sales_l16_ma5', 1.4916255914487335],
 ['sales_l16_ma4', 1.4467767065209731],
 ['sales_l16_ma3', 1.413973984488627],
 ['sales_l16_ma2', 1.3856893724874473],
 ['sales_l16', 1.3239437716419573],
 ['sales_l19', 1.3174075679676918],
 ['sales_l17', 1.290833397780296],
 ['sales_l18', 1.2881916267235125],
 ['family_enc', 1.19054370710845],
 ['onpromotion_l0_ma21', 0.8196941805615019],
 ['onpromotion_l0_ma14', 0.8063730587498474],
 ['onpromotion_l0_ma7', 0.792940691799549],
 ['onpromotion_l0_ma6', 0.7421286693255622],
 ['onpromotion_l0_ma5', 0.6953963170251498],
 ['onpromotion_l0_ma4', 0.6452452747673973],
 ['onpromotion_l0_ma3', 0.5971206903308488],
 ['onpromotion_l0_ma2', 0.5476185865233454],
 ['onpromotion', 0.5129246196280572],
 ['onpromotion_l7', 0.4816921805194996],
 ['fract_promoti

**Pragmatic estimate of the mutual information score for deviations from the MEAN sales of each store/family combination**

In [15]:
# Alternative target: sales minus the store/family mean encoding (adds a column to PROC_FRAME in place)
PROC_FRAME["delta_sales"] = PROC_FRAME["sales"] - PROC_FRAME["fam_store_mean_enc"]

In [16]:
# Mutual information between every feature in useX and the delta_sales target.
# random_state is pinned because the estimator adds small random noise to continuous
# features; without a fixed seed the scores change slightly on every run.
_currArgs = [ PROC_FRAME[useX] , PROC_FRAME["delta_sales"].to_numpy() ]
miDeltaSaleVals = sk.feature_selection.mutual_info_regression( *_currArgs, discrete_features=discFeats, random_state=0)

In [17]:
# Pair each feature name with its delta-sales score, then rank the [name, score] pairs from highest to lowest
miListDelta = sorted(map(list, zip(useX, miDeltaSaleVals)), key=lambda pair: pair[1], reverse=True)
miListDelta

[['fam_store_mean_enc', 4.265533820790247],
 ['family_enc', 1.9976333965119002],
 ['store_nbr', 1.8591293785953105],
 ['store_cluster', 1.1456632932971327],
 ['store_city_ordE', 1.1077868965689586],
 ['sales_l16_ma21', 1.0921393790340534],
 ['sales_l16_ma14', 1.0757896946812302],
 ['sales_l16_ma7', 1.0509583413290633],
 ['sales_l16_ma6', 1.0423353565460056],
 ['sales_l16_ma5', 1.0315516505790674],
 ['sales_l16_ma4', 1.016628819010962],
 ['sales_l16_ma3', 0.9942546482263301],
 ['sales_l16_ma2', 0.953859970704384],
 ['store_state_ordE', 0.94616198413435],
 ['sales_l18', 0.8995464335642129],
 ['sales_l17', 0.893556398253514],
 ['sales_l19', 0.8890593969364922],
 ['sales_l16', 0.8848629138502098],
 ['onpromotion_l0_ma21', 0.6788778679007628],
 ['onpromotion_l0_ma14', 0.6626947122637556],
 ['onpromotion_l0_ma7', 0.6368055401469954],
 ['onpromotion_l0_ma6', 0.590459138979222],
 ['store_type_ordE', 0.584051467764908],
 ['onpromotion_l0_ma5', 0.5515716642321822],
 ['onpromotion_l0_ma4', 0.5078

**Look at the naive forecast average score as an interesting baseline**

In [18]:
def getScoreForNaive(inpFrame, predCol="sales_l16", actCol="sales"):
    """Root-mean-squared-log-error obtained when one column is used as the forecast for another.

    Args:
        inpFrame: DataFrame containing both predCol and actCol; it is not modified.
        predCol: Name of the column treated as the prediction.
        actCol: Name of the column holding the actual values.

    Returns:
        float: sqrt(mean((log(pred + 1) - log(actual + 1))**2)); nan for an empty frame.

    Raises:
        ValueError: If either column contains a value <= -1 (log(x + 1) is undefined there).
    """
    # Copy only the columns that are needed rather than the whole frame;
    # dict.fromkeys drops the duplicate name when predCol == actCol
    neededCols = list(dict.fromkeys([predCol, actCol]))
    useFrame = inpFrame[neededCols].copy()
    addSqrLogErrorToFrame(useFrame, predCol=predCol, actCol=actCol)
    return math.sqrt(useFrame["sqr_log_error"].mean())

def addSqrLogErrorToFrame(inpFrame, predCol="pred_A", actCol="sales", outCol="sqr_log_error"):
    """Add the per-row squared log error between predCol and actCol to inpFrame (in place).

    The new column holds (log(pred + 1) - log(actual + 1))**2, computed on whole columns
    at once instead of row by row.

    Args:
        inpFrame: DataFrame to modify; gains (or overwrites) the column outCol.
        predCol: Name of the prediction column.
        actCol: Name of the actual-value column.
        outCol: Name of the column that receives the squared log errors.

    Raises:
        ValueError: If either input column contains a value <= -1.
    """
    logVals = {}
    for colName in (predCol, actCol):
        colVals = inpFrame[colName].to_numpy(dtype=float)
        # math.log raised for these inputs; keep failing loudly rather than yielding nan/-inf
        if (colVals <= -1).any():
            raise ValueError("column '{}' holds values <= -1; log(x + 1) is undefined".format(colName))
        logVals[colName] = np.log1p(colVals)
    inpFrame[outCol] = (logVals[predCol] - logVals[actCol]) ** 2


In [19]:
# Baseline score using the function's default prediction column, "sales_l16"
getScoreForNaive(PROC_FRAME)

0.7857449544893435

In [20]:
# Baseline score using the "fam_store_mean_enc" column as the prediction
getScoreForNaive(PROC_FRAME,predCol="fam_store_mean_enc")

0.8248791652245016