Goal: Run Sina_day 2 boosted trees model with Xgboost but with preprocessing of data as defined in the Alo notebook:

X = Store, Day of week, Open, Promo, State holiday, School holiday (dropped columns: Date, Customers)
y = Sales

Encoding: BaseN

In [1]:
from math import sqrt

import category_encoders as ce
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from xgboost import plot_importance

## Import Data

In [2]:
# Load the raw competition tables.
# NOTE(review): the original load emitted a warning (see the cell output below),
# presumably pandas' mixed-type DtypeWarning -- TODO confirm. StateHoliday holds
# letter codes and is later fed to a categorical encoder, so it is read as text
# to guarantee that the same holiday code cannot show up both as a number and
# as a string (which the encoder would treat as two different categories).
# Missing values are still read as NaN and are removed by the later dropna().
train_data_original = pd.read_csv(
    "../minicomp-rossman/data/train.csv",
    dtype={"StateHoliday": str},
)
# Store-level metadata; loaded for completeness, not used by the cells visible here.
store_data_original = pd.read_csv("../minicomp-rossman/data/store.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


## Preprocessing

In [3]:
# Work on an independent (deep) copy so the frame loaded from disk stays untouched.
train_data = train_data_original.copy()

In [4]:
# Keep only rows with a usable target.
# Step 1: discard rows whose Sales is exactly 0 (NaN != 0 is True, so rows with
# a missing target survive this filter and are handled in step 2).
train_data = train_data[train_data["Sales"] != 0]

# Step 2: collect the rows with a missing target, then remove them by index label.
null_sales = train_data[train_data["Sales"].isna()]
train_data = train_data.drop(index=null_sales.index)

In [5]:
# Remove the Customers and Date columns, then discard every row that still
# contains a missing value in any remaining column.
train_data = train_data.drop(columns=["Customers", "Date"]).dropna()

## Split train and test

In [6]:
# train = train_data.iloc[: int(0.8 * train_data.shape[0]), :]
# eva = merged_train.iloc[int(0.8*train_data.shape[0]):int(0.8*train_data.shape[0]), :]
# test = train_data.iloc[int(0.8 * train_data.shape[0]) :, :]

In [7]:
# Positional (unshuffled) split in row order:
# first 90% -> train, next 5% -> eva (early-stopping set), last 5% -> test.
train = train_data.iloc[: int(0.9 * len(train_data))]
eva = train_data.iloc[int(0.9 * len(train_data)) : int(0.95 * len(train_data))]
test = train_data.iloc[int(0.95 * len(train_data)) :]

In [8]:
# Separate predictors (X_*) from the Sales target (y_*) for each split.
X_train = train.drop(columns=["Sales"])
y_train = train.loc[:, "Sales"]

# The evaluation set must come from the held-out `eva` slice, not from `train`:
# it drives early stopping, and validating on the training rows would make the
# stopping criterion track training error instead of generalisation error.
X_eva = eva.drop(columns=["Sales"])
y_eva = eva.loc[:, "Sales"]

X_test = test.drop(columns=["Sales"])
y_test = test.loc[:, "Sales"]

In [9]:
# Preview the first rows of the cleaned frame (zero/null Sales rows removed,
# Customers and Date columns dropped); Sales is still present here.
train_data.head()

Unnamed: 0,Store,DayOfWeek,Sales,Open,Promo,StateHoliday,SchoolHoliday
27,353.0,2.0,3139.0,1.0,0.0,a,1.0
115,335.0,2.0,2401.0,1.0,0.0,a,1.0
147,512.0,2.0,2646.0,1.0,0.0,a,1.0
162,494.0,2.0,3113.0,1.0,0.0,a,1.0
199,530.0,2.0,2907.0,1.0,0.0,a,1.0


In [10]:
# BaseN-encode the StateHoliday column.
# The encoder is fitted on the training predictors only and the fitted mapping
# is then applied unchanged to every split.
ce_basen = ce.BaseNEncoder(cols=["StateHoliday"])
ce_basen.fit(X_train, y_train)

X_train, X_test, X_eva = (
    ce_basen.transform(split) for split in (X_train, X_test, X_eva)
)

In [11]:
# Preview the encoded training predictors: StateHoliday has been replaced by
# the encoder's StateHoliday_* columns.
X_train.head()

Unnamed: 0,Store,DayOfWeek,Open,Promo,StateHoliday_0,StateHoliday_1,StateHoliday_2,SchoolHoliday
27,353.0,2.0,1.0,0.0,0,0,1,1.0
115,335.0,2.0,1.0,0.0,0,0,1,1.0
147,512.0,2.0,1.0,0.0,0,0,1,1.0
162,494.0,2.0,1.0,0.0,0,0,1,1.0
199,530.0,2.0,1.0,0.0,0,0,1,1.0


## Gradient Boosted Tree

### Define metric

In [12]:
def compute_rmse(actual, prediction):
    """
    Compute the RMSE (root mean squared error) between a model's predictions
    and the actual values of the target variable.

    The value, rounded to 2 decimal places, is printed as a side effect;
    the unrounded value is returned.
    """
    mse = mean_squared_error(actual, prediction)
    rmse = sqrt(mse)

    # Only the printed figure is rounded; callers receive full precision.
    print("RMSE is ", round(rmse, 2))
    return rmse

In [13]:
# Name of the column the model predicts.
target = "Sales"

# Predictor columns, in the order they appear in the encoded frame:
# the raw columns plus the three StateHoliday_* columns created by the encoder.
features = [
    "Store",
    "DayOfWeek",
    "Open",
    "Promo",
    *(f"StateHoliday_{bit}" for bit in range(3)),
    "SchoolHoliday",
]

In [14]:
# XGBoost training parameters: squared-error regression with tree boosters,
# and a fixed seed so repeated runs are comparable.
params = dict(
    objective="reg:squarederror",
    booster="gbtree",
    seed=10,
)

In [15]:
def xgboost_experiment(features_list, experiment_name, params, num_boost_round):
    """
    Train an XGBoost model on the notebook's training split and report errors.

    The data is read from the notebook-level variables X_train / y_train,
    X_eva / y_eva and X_test / y_test.

    Parameters
    ----------
    features_list : list of str or None
        Columns of the X_* frames to train on. If empty or None, all columns
        are used.
    experiment_name : str
        Label printed in the results header.
    params : dict
        Booster parameters passed to ``xgb.train``.
    num_boost_round : int
        Maximum number of boosting rounds; training stops earlier if the
        evaluation metric has not improved for 100 rounds.

    Returns
    -------
    The trained booster returned by ``xgb.train``.

    Raises
    ------
    KeyError
        If a name in ``features_list`` is not a column of the X_* frames.
    """
    # Honour the requested feature subset; previously this argument was ignored
    # and every column was always used regardless of what the caller asked for.
    columns = list(features_list) if features_list else list(X_train.columns)

    # All three matrices are built the same way so the splits are treated alike.
    dtrain = xgb.DMatrix(X_train[columns], label=y_train, enable_categorical=True)
    deva = xgb.DMatrix(X_eva[columns], label=y_eva, enable_categorical=True)
    dtest = xgb.DMatrix(X_test[columns], label=y_test, enable_categorical=True)

    # train, using the evaluation split for early stopping
    xgb_model = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        early_stopping_rounds=100,
        evals=[(deva, "Eval")],
        verbose_eval=False,
    )

    # NOTE(review): whether predict() uses only the trees up to the best
    # early-stopping iteration or all trained trees depends on the xgboost
    # version -- confirm against the installed release.
    print('+++++ Results for experiment: ', experiment_name)

    # compute_rmse prints the absolute RMSE; the second print reports it
    # relative to the mean of the target so train and test are comparable.
    pred = xgb_model.predict(dtest)
    print("Testerror")
    print(compute_rmse(y_test, pred) / y_test.mean())

    pred = xgb_model.predict(dtrain)
    print("Trainerror")
    print(compute_rmse(y_train, pred) / y_train.mean())

    return xgb_model

In [None]:
# Label shown in each experiment's results header.
f = 'train table features'
# Upper bounds on the number of boosting rounds to compare.
num_boost_round_list = [100, 1000]

for n in num_boost_round_list:
    print('### Experiment with ', n, ' boosting rounds')
    xgboost_experiment(
        features_list=features,
        experiment_name=f,
        params=params,
        num_boost_round=n,
    )