In [1]:
import pandas as pd
#from pandas_profiling import ProfileReport

In [2]:
train_data = pd.read_csv("../minicomp-rossman/data/train.csv").reset_index(drop=True)
store_data = pd.read_csv("../minicomp-rossman/data/store.csv").reset_index(drop=True)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
#profile = ProfileReport(store_data)
#profile

In [4]:
def add_time_features(df):
    df["Date"] = pd.to_datetime(df["Date"])
    df['Year'] = df.Date.dt.year - 2013
    df['Month'] = df.Date.dt.month
    df['Day'] = df.Date.dt.day
    df['DayOfWeek'] = df.Date.dt.dayofweek
    df['WeekOfYear'] = df.Date.dt.isocalendar().week.astype(int)
    df = df.drop(labels = "Date", axis=1)
    return df

In [5]:
def drop_nans_nulls_closed(df):
    df = df[df["Open"]==1]
    df = df[df["Sales"]!=0]
    df = df.dropna(subset = ["Store", "Open", "Promo", "StateHoliday", "SchoolHoliday", "Sales"])
    df["StateHoliday"].replace({0.0: "0"}, inplace=True)
    return df

In [6]:
def drop_cols(df):
    df = df.drop(labels = ["Customers"], axis=1)
    return df

In [7]:
train_data = add_time_features(train_data)
train_data = drop_nans_nulls_closed(train_data)
train_data = drop_cols(train_data)

In [8]:
merged_train = pd.merge(train_data, store_data, how='left', on="Store")

In [9]:
def recode(df, var_list=[]):
    map_dict = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}
    for v in var_list:
        df[v].replace(map_dict, inplace=True)
    return df

merged_train = recode(merged_train, ['StoreType', 'Assortment', 'StateHoliday'])

In [10]:
def month_to_str(df):
    map_dict = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', \
             7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}
    df['month_str'] = df.Month.map(map_dict)
    return df

merged_train = month_to_str(merged_train)

In [11]:
def check_promo(row):
    if isinstance(row['PromoInterval'],str) and row['month_str'] in row['PromoInterval']:
            return 1
    else:
            return 0

In [12]:
merged_train['PromoMonth'] =  merged_train.apply(lambda row: check_promo(row),axis=1)    

## load and prepare Holdout_data 

In [13]:
holdout_data = pd.read_csv("../minicomp-rossman/data/holdout_b29.csv")

In [14]:
holdout_data = holdout_data.rename(columns={"Unnamed: 0": "Id"})
holdout_data["Id"] = holdout_data["Id"] + 1

In [15]:
def drop_nn(df):
    df = df[df["Open"]==1]
    df = df.dropna(subset = ["Store", "Open", "Promo", "StateHoliday", "SchoolHoliday"])
    df["StateHoliday"].replace({0.0: "0"}, inplace=True)
    return df

In [16]:
holdout_data = add_time_features(holdout_data)
holdout_data = drop_nn(holdout_data)
holdout_data = drop_cols(holdout_data)

In [27]:
merged_holdout = pd.merge(holdout_data, store_data, how='left', on="Store")
merged_holdout = recode(merged_holdout, ['StoreType', 'Assortment', 'StateHoliday'])
merged_holdout = month_to_str(merged_holdout)
merged_holdout['PromoMonth'] =  merged_holdout.apply(lambda row: check_promo(row),axis=1)  

In [17]:
train = merged_train.iloc[:int(0.9*train_data.shape[0]), :]
eva = merged_train.iloc[int(0.9*train_data.shape[0]):int(0.95*train_data.shape[0]), :]
test = merged_train.iloc[int(0.95*train_data.shape[0]):, :]

In [18]:
import xgboost as xgb
from xgboost import plot_importance

In [20]:
target = 'Sales'

my_features = ["Store",'Year', 'Month', 'Day', 'WeekOfYear', 'DayOfWeek','StateHoliday', 'SchoolHoliday', 
               'CompetitionDistance', 'Promo2', 'StoreType', 'Assortment', 'Open', 'Promo', "PromoMonth"]

In [21]:
merged_train[my_features].head()

Unnamed: 0,Store,Year,Month,Day,WeekOfYear,DayOfWeek,StateHoliday,SchoolHoliday,CompetitionDistance,Promo2,StoreType,Assortment,Open,Promo,PromoMonth
0,353.0,0,1,1,1,1,1,1.0,900.0,1,2,2,1.0,0.0,0
1,335.0,0,1,1,1,1,1,1.0,90.0,1,2,1,1.0,0.0,1
2,512.0,0,1,1,1,1,1,1.0,590.0,1,2,2,1.0,0.0,0
3,494.0,0,1,1,1,1,1,1.0,1260.0,0,2,1,1.0,0.0,0
4,530.0,0,1,1,1,1,1,1.0,18160.0,0,1,3,1.0,0.0,0


In [22]:
params = {"objective": "reg:squarederror", 
          "booster" : "gbtree", 
          "seed": 10 }

In [23]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def compute_rmspe(actual, prediction):
    """
    Computs RMSPE (root mean squared percentage error) between predictions from a model
    and the actual values of the target variable.
    """
    
    rmspe = sqrt(mean_squared_error(actual, prediction)) / prediction.mean()
    
    # rounding to 2 decimal places
    print('RMSPE is ', round(rmspe,2))
    
    return rmspe

In [24]:
def xgboost_experiment(vars_list, experiment_name, params, num_boost_round):
    dtrain = xgb.DMatrix(train[vars_list], label=train['Sales'], enable_categorical=True)
    deva = xgb.DMatrix(eva[vars_list], label=eva['Sales'])
    dtest = xgb.DMatrix(test[vars_list], label=test['Sales'], enable_categorical=True)
    
    #train
    xgb_model = xgb.train(params, dtrain, num_boost_round=num_boost_round, 
                      early_stopping_rounds=100, evals=[(deva, "Eval")], verbose_eval=False)

    # make prediction
    print('+++++ Results for experiment: ', experiment_name)
    pred = xgb_model.predict(dtest)
    print("Testerror")
    print(compute_rmspe(test[target], pred) / test[target].mean())
    pred = xgb_model.predict(dtrain)
    print("Trainerror")
    compute_rmspe(train[target], pred) 
    return xgb_model

In [215]:
#num_boost_round_list = [100, 500, 1000]

#for n in num_boost_round_list:
#    print('### Experiment with ', str(n), ' boosting rounds')
#   xgboost_experiment(my_features, "my_features", params, n)

In [222]:
train[my_features].head()

Unnamed: 0,Store,Year,Month,Day,WeekOfYear,DayOfWeek,StateHoliday,SchoolHoliday,CompetitionDistance,Promo2,StoreType,Assortment,Open,Promo,PromoMonth,DayS,DayOfWeekS,StoreM
0,5160.810638,0,6399.393606,1,1,1,1,1.0,900.0,1,9902.088827,8141.902433,1.0,0.0,0,7234.412655,6896.934074,6842.080243
1,12959.368313,0,6399.393606,1,1,1,1,1.0,90.0,1,9902.088827,6567.665908,1.0,0.0,1,7234.412655,6896.934074,6854.443247
2,5173.130734,0,6399.393606,1,1,1,1,1.0,590.0,1,9902.088827,8141.902433,1.0,0.0,0,7234.412655,6896.934074,6848.775611
3,7479.6841,0,6399.393606,1,1,1,1,1.0,1260.0,0,9902.088827,6567.665908,1.0,0.0,0,7234.412655,6896.934074,6848.258186
4,4425.829741,0,6399.393606,1,1,1,1,1.0,18160.0,0,6841.67834,7118.831048,1.0,0.0,0,7234.412655,6896.934074,6847.086991


In [25]:
my_model = xgboost_experiment(my_features, "final", params, 1000)

+++++ Results for experiment:  final
Testerror
RMSPE is  0.16
2.267132162128101e-05
Trainerror
RMSPE is  0.09


In [28]:
merged_holdout[my_features].head(5)

Unnamed: 0,Store,Year,Month,Day,WeekOfYear,DayOfWeek,StateHoliday,SchoolHoliday,CompetitionDistance,Promo2,StoreType,Assortment,Open,Promo,PromoMonth
0,371,1,8,1,31,4,0,1,1970.0,1,4,3,1,1,1
1,372,1,8,1,31,4,0,1,4880.0,1,4,3,1,1,0
2,373,1,8,1,31,4,0,1,11120.0,1,4,3,1,1,0
3,380,1,8,1,31,4,0,1,2240.0,1,1,1,1,1,0
4,374,1,8,1,31,4,0,1,1150.0,0,1,1,1,1,0


In [29]:
xgtest = xgb.DMatrix(merged_holdout[my_features])

In [30]:
pred = my_model.predict(xgtest)

In [31]:
output = pd.DataFrame({'Id': holdout_data.Id, 'Sales': pred})
output.to_csv('Submission2.csv', index=False)

In [32]:
sub = pd.read_csv("Submission2.csv")
sub.head()

Unnamed: 0,Id,Sales
0,1,8119.111
1,2,10948.43
2,3,6395.9316
3,4,20013.13
4,5,10836.173
