In [48]:
import pandas as pd
#from pandas_profiling import ProfileReport

In [49]:
train_data = pd.read_csv("../minicomp-rossman/data/train.csv").reset_index(drop=True)
store_data = pd.read_csv("../minicomp-rossman/data/store.csv").reset_index(drop=True)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [52]:
#profile = ProfileReport(store_data)
#profile

In [53]:
def add_time_features(df):
    df["Date"] = pd.to_datetime(df["Date"])
    df['Year'] = df.Date.dt.year - 2013
    df['Month'] = df.Date.dt.month
    df['Day'] = df.Date.dt.day
    df['DayOfWeek'] = df.Date.dt.dayofweek
    df['WeekOfYear'] = df.Date.dt.isocalendar().week.astype(int)
    df = df.drop(labels = "Date", axis=1)
    return df

In [54]:
def drop_nans_nulls_closed(df):
    df = df[df["Open"]==1]
    df = df[df["Sales"]!=0]
    df = df.dropna(subset = ["Store", "Open", "Promo", "StateHoliday", "SchoolHoliday", "Sales"])
    df["StateHoliday"].replace({0.0: "0"}, inplace=True)
    return df

In [55]:
def drop_cols(df):
    df = df.drop(labels = ["Customers"], axis=1)
    return df

In [56]:
train_data = add_time_features(train_data)
train_data = drop_nans_nulls_closed(train_data)
train_data = drop_cols(train_data)

In [57]:
merged_train = pd.merge(train_data, store_data, how='left', on="Store")

In [58]:
def recode(df, var_list=[]):
    map_dict = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}
    for v in var_list:
        df[v].replace(map_dict, inplace=True)
    return df

merged_train = recode(merged_train, ['StoreType', 'Assortment', 'StateHoliday'])

In [60]:
def month_to_str(df):
    map_dict = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', \
             7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}
    df['month_str'] = df.Month.map(map_dict)
    return df

merged_train = month_to_str(merged_train)

In [61]:
def check_promo(row):
    if isinstance(row['PromoInterval'],str) and row['month_str'] in row['PromoInterval']:
            return 1
    else:
            return 0

In [62]:
merged_train['PromoMonth'] =  merged_train.apply(lambda row: check_promo(row),axis=1)    

In [63]:
merged_train.head()

Unnamed: 0,Store,DayOfWeek,Sales,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,Day,...,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,month_str,PromoMonth
0,353.0,1,3139.0,1.0,0.0,1,1.0,0,1,1,...,2,900.0,,,1,14.0,2013.0,"Feb,May,Aug,Nov",Jan,0
1,335.0,1,2401.0,1.0,0.0,1,1.0,0,1,1,...,1,90.0,,,1,31.0,2013.0,"Jan,Apr,Jul,Oct",Jan,1
2,512.0,1,2646.0,1.0,0.0,1,1.0,0,1,1,...,2,590.0,,,1,5.0,2013.0,"Mar,Jun,Sept,Dec",Jan,0
3,494.0,1,3113.0,1.0,0.0,1,1.0,0,1,1,...,1,1260.0,6.0,2011.0,0,,,,Jan,0
4,530.0,1,2907.0,1.0,0.0,1,1.0,0,1,1,...,3,18160.0,,,0,,,,Jan,0


In [64]:
train = merged_train.iloc[:int(0.9*train_data.shape[0]), :]
eva = merged_train.iloc[int(0.9*train_data.shape[0]):int(0.95*train_data.shape[0]), :]
test = merged_train.iloc[int(0.95*train_data.shape[0]):, :]

In [65]:
import xgboost as xgb
from xgboost import plot_importance

In [66]:
target = 'Sales'

my_features = ['Year', 'Month', 'Day', 'WeekOfYear', 'DayOfWeek','StateHoliday', 'SchoolHoliday', 
                           'CompetitionDistance', 'Promo2', 'StoreType', 'Assortment', 'Open', 'Promo', "PromoMonth"]

In [67]:
params = {"objective": "reg:squarederror", 
          "booster" : "gbtree", 
          "seed": 10 }

In [68]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def compute_rmspe(actual, prediction):
    """
    Computs RMSPE (root mean squared percentage error) between predictions from a model
    and the actual values of the target variable.
    """
    
    rmspe = sqrt(mean_squared_error(actual, prediction)) / prediction.mean()
    
    # rounding to 2 decimal places
    print('RMSPE is ', round(rmspe,2))
    
    return rmspe

In [69]:
def xgboost_experiment(vars_list, experiment_name, params, num_boost_round):
    dtrain = xgb.DMatrix(train[vars_list], label=train['Sales'], enable_categorical=True)
    deva = xgb.DMatrix(eva[vars_list], label=eva['Sales'])
    dtest = xgb.DMatrix(test[vars_list], label=test['Sales'], enable_categorical=True)
    
    #train
    xgb_model = xgb.train(params, dtrain, num_boost_round=num_boost_round, 
                      early_stopping_rounds=100, evals=[(deva, "Eval")], verbose_eval=False)

    # make prediction
    print('+++++ Results for experiment: ', experiment_name)
    pred = xgb_model.predict(dtest)
    print("Testerror")
    print(compute_rmspe(test[target], pred) / test[target].mean())
    pred = xgb_model.predict(dtrain)
    print("Trainerror")
    compute_rmspe(train[target], pred) 
    return xgb_model

In [None]:
#num_boost_round_list = [100, 500, 1000]

#for n in num_boost_round_list:
#    print('### Experiment with ', str(n), ' boosting rounds')
#   xgboost_experiment(my_features, "my_features", params, n)

In [77]:
my_model = xgboost_experiment(my_features, "final", params, 1000)

+++++ Results for experiment:  final
Testerror
RMSPE is  0.2
2.7841191932995542e-05
Trainerror
RMSPE is  0.15


In [78]:
plot_importance(my_model)

<AxesSubplot:title={'center':'Feature importance'}, xlabel='F score', ylabel='Features'>