In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
import operator
import sklearn
import matplotlib
matplotlib.use("Agg") #Needed to save figures
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
%matplotlib inline

In [9]:
# Gather some features
def build_features(features, data):
    # remove NaNs
    data.fillna(0, inplace=True)
    data.loc[data.Open.isnull(), 'Open'] = 1
    # Use some properties directly
    features.extend(['Store', 'CompetitionDistance', 'Promo', 'Promo2', 'SchoolHoliday'])

    # Label encode some features
    features.extend(['StoreType', 'Assortment', 'StateHoliday'])
    mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}
    data.StoreType.replace(mappings, inplace=True)
    data.Assortment.replace(mappings, inplace=True)
    data.StateHoliday.replace(mappings, inplace=True)

    features.extend(['DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear'])
    data['Year'] = data.Date.dt.year
    data['Month'] = data.Date.dt.month
    data['Day'] = data.Date.dt.day
    data['DayOfWeek'] = data.Date.dt.dayofweek
    data['WeekOfYear'] = data.Date.dt.weekofyear
    # Calculate time competition open time in months
    features.append('CompetitionOpen')
    data['CompetitionOpen'] = 12 * (data.Year - data.CompetitionOpenSinceYear) + \
        (data.Month - data.CompetitionOpenSinceMonth)
    # Promo open time in months
    features.append('PromoOpen')
    data['PromoOpen'] = 12 * (data.Year - data.Promo2SinceYear) + \
        (data.WeekOfYear - data.Promo2SinceWeek) / 4.0
    data['PromoOpen'] = data.PromoOpen.apply(lambda x: x if x > 0 else 0)
    data.loc[data.Promo2SinceYear == 0, 'PromoOpen'] = 0
    # Indicate that sales on that day are in promo interval
    features.append('IsPromoMonth')
    month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', \
             7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}
    data['monthStr'] = data.Month.map(month2str)
    data.loc[data.PromoInterval == 0, 'PromoInterval'] = ''
    data['IsPromoMonth'] = 0
    for interval in data.PromoInterval.unique():
        if interval != '':
            for month in interval.split(','):
                data.loc[(data.monthStr == month) & (data.PromoInterval == interval), 'IsPromoMonth'] = 1

    return data

In [10]:
print("Load the training, test and store data using pandas")
types = {'CompetitionOpenSinceYear': np.dtype(int),
         'CompetitionOpenSinceMonth': np.dtype(int),
         'StateHoliday': np.dtype(str),
         'Promo2SinceWeek': np.dtype(int),
         'SchoolHoliday': np.dtype(float),
         'PromoInterval': np.dtype(str)}
train = pd.read_csv("C:/Forcasting/XGBOOST/train.csv", parse_dates=[2], dtype=types)
test = pd.read_csv("C:/Forcasting/XGBOOST/test.csv", parse_dates=[3], dtype=types)
store = pd.read_csv("C:/Forcasting/XGBOOST/store.csv")

Load the training, test and store data using pandas


In [16]:
train

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,...,Promo2SinceYear,PromoInterval,Year,Month,Day,WeekOfYear,CompetitionOpen,PromoOpen,monthStr,IsPromoMonth
0,1,4,2015-07-31,5263,555,1,1,0,1.0,3,...,0.0,,2015,7,31,31,82.0,0.0,Jul,0
1,1,3,2015-07-30,5020,546,1,1,0,1.0,3,...,0.0,,2015,7,30,31,82.0,0.0,Jul,0
2,1,2,2015-07-29,4782,523,1,1,0,1.0,3,...,0.0,,2015,7,29,31,82.0,0.0,Jul,0
3,1,1,2015-07-28,5011,560,1,1,0,1.0,3,...,0.0,,2015,7,28,31,82.0,0.0,Jul,0
4,1,0,2015-07-27,6102,612,1,1,0,1.0,3,...,0.0,,2015,7,27,31,82.0,0.0,Jul,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
844333,292,0,2013-01-07,9291,1002,1,1,0,0.0,1,...,0.0,,2013,1,7,2,43.0,0.0,Jan,0
844334,292,5,2013-01-05,2748,340,1,0,0,0.0,1,...,0.0,,2013,1,5,1,43.0,0.0,Jan,0
844335,292,4,2013-01-04,4202,560,1,0,0,1.0,1,...,0.0,,2013,1,4,1,43.0,0.0,Jan,0
844336,292,3,2013-01-03,4580,662,1,0,0,1.0,1,...,0.0,,2013,1,3,1,43.0,0.0,Jan,0


In [11]:
print("Assume store open, if not provided")
train.fillna(1, inplace=True)
test.fillna(1, inplace=True)

print("Consider only open stores for training. Closed stores wont count into the score.")
train = train[train["Open"] != 0]
print("Use only Sales bigger then zero. Simplifies calculation of rmspe")
train = train[train["Sales"] > 0]

print("Join with store")
train = pd.merge(train, store, on='Store')
test = pd.merge(test, store, on='Store')

Assume store open, if not provided
Consider only open stores for training. Closed stores wont count into the score.
Use only Sales bigger then zero. Simplifies calculation of rmspe
Join with store


In [12]:
features = []

print("augment features")
build_features(features, train)
build_features([], test)
print(features)

print('training data processed')

augment features
['Store', 'CompetitionDistance', 'Promo', 'Promo2', 'SchoolHoliday', 'StoreType', 'Assortment', 'StateHoliday', 'DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear', 'CompetitionOpen', 'PromoOpen', 'IsPromoMonth']
training data processed


In [13]:
def rmspe(y, yhat):
    return np.sqrt(np.mean((yhat/y-1) ** 2))

def rmspe_xg(yhat, y):
    y = np.expm1(y.get_label())
    yhat = np.expm1(yhat)
    return "rmspe", rmspe(y,yhat)


In [15]:
train[features]

Unnamed: 0,Store,CompetitionDistance,Promo,Promo2,SchoolHoliday,StoreType,Assortment,StateHoliday,DayOfWeek,Month,Day,Year,WeekOfYear,CompetitionOpen,PromoOpen,IsPromoMonth
0,1,1270.0,1,0,1.0,3,1,0,4,7,31,2015,31,82.0,0.0,0
1,1,1270.0,1,0,1.0,3,1,0,3,7,30,2015,31,82.0,0.0,0
2,1,1270.0,1,0,1.0,3,1,0,2,7,29,2015,31,82.0,0.0,0
3,1,1270.0,1,0,1.0,3,1,0,1,7,28,2015,31,82.0,0.0,0
4,1,1270.0,1,0,1.0,3,1,0,0,7,27,2015,31,82.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
844333,292,1100.0,1,0,0.0,1,1,0,0,1,7,2013,2,43.0,0.0,0
844334,292,1100.0,0,0,0.0,1,1,0,5,1,5,2013,1,43.0,0.0,0
844335,292,1100.0,0,0,1.0,1,1,0,4,1,4,2013,1,43.0,0.0,0
844336,292,1100.0,0,0,1.0,1,1,0,3,1,3,2013,1,43.0,0.0,0


In [20]:
new_feature =['Store', 'CompetitionDistance', 'Promo', 'Promo2', 'SchoolHoliday', 'StoreType', 'Assortment', 'StateHoliday', 'DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear', 'CompetitionOpen', 'PromoOpen', 'IsPromoMonth','Sales',"Date"]

In [21]:
train[new_feature]

Unnamed: 0,Store,CompetitionDistance,Promo,Promo2,SchoolHoliday,StoreType,Assortment,StateHoliday,DayOfWeek,Month,Day,Year,WeekOfYear,CompetitionOpen,PromoOpen,IsPromoMonth,Sales,Date
0,1,1270.0,1,0,1.0,3,1,0,4,7,31,2015,31,82.0,0.0,0,5263,2015-07-31
1,1,1270.0,1,0,1.0,3,1,0,3,7,30,2015,31,82.0,0.0,0,5020,2015-07-30
2,1,1270.0,1,0,1.0,3,1,0,2,7,29,2015,31,82.0,0.0,0,4782,2015-07-29
3,1,1270.0,1,0,1.0,3,1,0,1,7,28,2015,31,82.0,0.0,0,5011,2015-07-28
4,1,1270.0,1,0,1.0,3,1,0,0,7,27,2015,31,82.0,0.0,0,6102,2015-07-27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
844333,292,1100.0,1,0,0.0,1,1,0,0,1,7,2013,2,43.0,0.0,0,9291,2013-01-07
844334,292,1100.0,0,0,0.0,1,1,0,5,1,5,2013,1,43.0,0.0,0,2748,2013-01-05
844335,292,1100.0,0,0,1.0,1,1,0,4,1,4,2013,1,43.0,0.0,0,4202,2013-01-04
844336,292,1100.0,0,0,1.0,1,1,0,3,1,3,2013,1,43.0,0.0,0,4580,2013-01-03


In [22]:
train[new_feature].to_csv("final_features_for.csv")

In [7]:
params = {"objective": "reg:linear",
          "booster" : "gbtree",
          "eta": 0.3,
          "max_depth": 10,
          "subsample": 0.9,
          "colsample_bytree": 0.7,
          "silent": 1,
          "seed": 1301
          }
num_boost_round = 300

print("Train a XGBoost model")
X_train, X_valid = train_test_split(train, test_size=0.012, random_state=10)
y_train = np.log1p(X_train.Sales)
y_valid = np.log1p(X_valid.Sales)
dtrain = xgb.DMatrix(X_train[features], y_train)
dvalid = xgb.DMatrix(X_valid[features], y_valid)

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, \
early_stopping_rounds=100, feval=rmspe_xg, verbose_eval=True)

Train a XGBoost model
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-rmse:5.79445	eval-rmse:5.79367	train-rmspe:0.99683	eval-rmspe:0.99682
Multiple eval metrics have been passed: 'eval-rmspe' will be used for early stopping.

Will train until eval-rmspe hasn't improved in 100 rounds.
[1]	train-rmse:4.06392	eval-rmse:4.06338	train-rmspe:0.98147	eval-rmspe:0.98144
[2]	train-rmse:2.85304	eval-rmse:2.85290	train-rmspe:0.93818	eval-rmspe:0.93810
[3]	train-rmse:2.00947	eval-rmse:2.00981	train-rmspe:0.85676	eval-rmspe:0.85652
[4]	train-rmse:1.42327	eval-rmse:1.42395	train-rmspe:0.74394	eval-rmspe:0.74311
[5]	train-rmse:1.01735	eval-rmse:1.01835	train-rmspe:0.61942	eval-rmspe:0.61711
[6]	train-rmse:0.74050	eval-rmse:0.74180	train-rmspe:0.50418	eval-rmsp

[93]	train-rmse:0.10290	eval-rmse:0.10831	train-rmspe:0.11791	eval-rmspe:0.11384
[94]	train-rmse:0.10246	eval-rmse:0.10794	train-rmspe:0.11743	eval-rmspe:0.11348
[95]	train-rmse:0.10230	eval-rmse:0.10780	train-rmspe:0.11729	eval-rmspe:0.11336
[96]	train-rmse:0.10196	eval-rmse:0.10759	train-rmspe:0.11596	eval-rmspe:0.11312
[97]	train-rmse:0.10172	eval-rmse:0.10736	train-rmspe:0.11569	eval-rmspe:0.11288
[98]	train-rmse:0.10138	eval-rmse:0.10716	train-rmspe:0.11519	eval-rmspe:0.11269
[99]	train-rmse:0.10108	eval-rmse:0.10696	train-rmspe:0.11480	eval-rmspe:0.11249
[100]	train-rmse:0.10080	eval-rmse:0.10684	train-rmspe:0.11452	eval-rmspe:0.11238
[101]	train-rmse:0.10043	eval-rmse:0.10652	train-rmspe:0.11414	eval-rmspe:0.11205
[102]	train-rmse:0.10007	eval-rmse:0.10628	train-rmspe:0.11374	eval-rmspe:0.11178
[103]	train-rmse:0.09983	eval-rmse:0.10617	train-rmspe:0.11345	eval-rmspe:0.11164
[104]	train-rmse:0.09966	eval-rmse:0.10608	train-rmspe:0.11330	eval-rmspe:0.11158
[105]	train-rmse:0.0994

[193]	train-rmse:0.08424	eval-rmse:0.09593	train-rmspe:0.09293	eval-rmspe:0.10026
[194]	train-rmse:0.08413	eval-rmse:0.09587	train-rmspe:0.09279	eval-rmspe:0.10018
[195]	train-rmse:0.08403	eval-rmse:0.09581	train-rmspe:0.09262	eval-rmspe:0.10010
[196]	train-rmse:0.08394	eval-rmse:0.09580	train-rmspe:0.09252	eval-rmspe:0.10010
[197]	train-rmse:0.08391	eval-rmse:0.09579	train-rmspe:0.09249	eval-rmspe:0.10010
[198]	train-rmse:0.08382	eval-rmse:0.09573	train-rmspe:0.09234	eval-rmspe:0.10002
[199]	train-rmse:0.08361	eval-rmse:0.09555	train-rmspe:0.09208	eval-rmspe:0.09976
[200]	train-rmse:0.08348	eval-rmse:0.09548	train-rmspe:0.09197	eval-rmspe:0.09970
[201]	train-rmse:0.08341	eval-rmse:0.09543	train-rmspe:0.09174	eval-rmspe:0.09965
[202]	train-rmse:0.08329	eval-rmse:0.09542	train-rmspe:0.09162	eval-rmspe:0.09963
[203]	train-rmse:0.08321	eval-rmse:0.09539	train-rmspe:0.09153	eval-rmspe:0.09961
[204]	train-rmse:0.08311	eval-rmse:0.09536	train-rmspe:0.09140	eval-rmspe:0.09958
[205]	train-rmse

[293]	train-rmse:0.07538	eval-rmse:0.09204	train-rmspe:0.08098	eval-rmspe:0.09613
[294]	train-rmse:0.07532	eval-rmse:0.09204	train-rmspe:0.08091	eval-rmspe:0.09612
[295]	train-rmse:0.07523	eval-rmse:0.09200	train-rmspe:0.08079	eval-rmspe:0.09607
[296]	train-rmse:0.07516	eval-rmse:0.09200	train-rmspe:0.08071	eval-rmspe:0.09606
[297]	train-rmse:0.07507	eval-rmse:0.09197	train-rmspe:0.08061	eval-rmspe:0.09604
[298]	train-rmse:0.07500	eval-rmse:0.09196	train-rmspe:0.08054	eval-rmspe:0.09605
[299]	train-rmse:0.07495	eval-rmse:0.09195	train-rmspe:0.08049	eval-rmspe:0.09606


In [24]:
print("Validating")
yhat = gbm.predict(xgb.DMatrix(X_valid[features]))
error = rmspe(X_valid.Sales.values, np.expm1(yhat))
print('RMSPE: {:.6f}'.format(error))

Validating
RMSPE: 0.096058


In [25]:
np.expm1(yhat)

array([ 4044.6326,  5731.406 ,  6150.7925, ...,  6417.1167,  7640.0034,
       10487.21  ], dtype=float32)

In [26]:
X_valid.Sales.values

array([4090, 5903, 6062, ..., 6439, 7812, 9799], dtype=int64)

In [42]:
X_valid[features][52:60]

Unnamed: 0,Store,CompetitionDistance,Promo,Promo2,SchoolHoliday,StoreType,Assortment,StateHoliday,DayOfWeek,Month,Day,Year,WeekOfYear,CompetitionOpen,PromoOpen,IsPromoMonth
702443,931,1480.0,1,1,1.0,1,3,0,1,4,29,2014,18,31.0,28.25,1
320652,425,1460.0,0,0,0.0,4,3,0,5,1,17,2015,3,24181.0,0.0,0
121320,162,5340.0,0,1,0.0,4,3,0,2,3,12,2014,11,24.0,47.5,0
378921,502,220.0,1,1,0.0,1,1,0,2,11,5,2014,45,153.0,62.0,0
231684,309,8740.0,0,1,0.0,4,1,0,5,5,24,2014,21,24173.0,56.0,1
157911,211,350.0,1,0,0.0,1,3,0,4,3,7,2014,10,88.0,0.0,0
286981,380,2240.0,0,1,0.0,1,1,0,2,1,16,2013,3,-4.0,0.0,0
705528,935,22350.0,0,0,0.0,1,3,0,5,1,25,2014,4,43.0,0.0,0


In [41]:
X_valid.Sales.values[39:59]

array([12698,  9408,  4185,  7878,  2116,  5218,  5385,  9081,  3938,
        4783,  3818,  5984,  6266,  4493,  1946,  4641, 11533,  5672,
       13330, 10302], dtype=int64)

In [None]:
12 * (data.Year - data.Promo2SinceYear) + \
        (data.WeekOfYear - data.Promo2SinceWeek) / 4.0

In [38]:
(2012 -2010)+(23-10)/4


5.25

In [33]:
12 *(2012 -2010)

24

In [34]:
(23-10)/4

3.25

In [36]:
24+3.25

27.25