In [3]:
import pandas as pd
import numpy as np


# visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
# NOTE(review): the palette is captured *before* the 'bmh' style is applied,
# so it presumably holds the pre-style default colors — confirm this is intended.
color_pal = sns.color_palette()

# Global matplotlib look: 'bmh' base style, then figure- and axes-level
# overrides (auto layout, default figure size, bold/larger titles and labels).
plt.style.use('bmh')
plt.rcParams.update({
    # figure-level defaults
    "figure.autolayout": True,
    "figure.figsize": (11, 4),
    "figure.titlesize": 18,
    "figure.titleweight": 'bold',
    # axes-level defaults
    "axes.labelweight": "bold",
    "axes.labelsize": "large",
    "axes.titleweight": "bold",
    "axes.titlesize": 16,
    "axes.titlepad": 10,
})

#machine learning libraries
import xgboost as xgb
import catboost as cb 
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import make_scorer

from matplotlib import patheffects


import os


#project modules
import src.wrangle as wr
import src.summaries as s
import src.explore as ex


# Render floats in pandas output with thousands separators and two decimals
# (e.g. 171,192.78); affects display only, not the stored values.
pd.options.display.float_format = '{:,.2f}'.format
from importlib import reload

import warnings
# NOTE(review): this silences *every* warning for the rest of the session,
# including deprecation notices from pandas/xgboost — consider narrowing the filter.
warnings.filterwarnings('ignore')
# Ask the inline backend for high-resolution ('retina') figure output.
%config InlineBackend.figure_format = 'retina'


In [7]:
# Load the cleaned data and build the summary frame
# (presumably restricted to 2018 onward via start2018=True — confirm in src.wrangle).
df = s.get_summary_df(wr.get_clean_data(start2018=True))

# Split into train / validate / test: the held-out part returned by
# split_data is cut again — rows through June 2022 become the validation
# set, rows from July 2022 onward remain the test set.
train, test = wr.split_data(df)
validate = test.loc[:'2022-06'].copy()
test = test.loc['2022-07':].copy()

# Daily time series: total purchase amount per calendar day for each split
X_train_ts = train['purchase_amount'].resample('D').sum()
X_validate_ts = validate['purchase_amount'].resample('D').sum()
X_test_ts = test['purchase_amount'].resample('D').sum()

# Single-column data frames built from the daily series
X_train, X_validate, X_test = (
    daily.to_frame() for daily in (X_train_ts, X_validate_ts, X_test_ts)
)

In [8]:
# Run every split through the same date-feature helper from src.wrangle
X_train, X_validate, X_test = (
    wr.add_date_features(frame) for frame in (X_train, X_validate, X_test)
)

In [10]:
# Calendar columns used as model inputs
features = [
    'month',
    'week',
    'day_of_week',
    'year',
    'day_of_year',
]
# Column the models are trained to predict
target = 'purchase_amount'

In [11]:
# Show the first row to inspect the columns present after add_date_features
X_train.head(1)

Unnamed: 0_level_0,purchase_amount,year,quarter,month,week,day_of_week,day_of_year,month_name,day_name
order_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-01-02,171192.78,2018,1,1,1,1,2,January,Tuesday


In [12]:
# Pull the target series out of each split, then narrow the frames to the
# model inputs. The column is looked up through `target` (defined alongside
# `features`) instead of a hard-coded attribute, so changing the target name
# in one place is enough.
#
# NOTE: this cell overwrites X_train / X_validate / X_test with frames that
# no longer contain the target column, so it cannot be re-run on its own —
# re-run the cells that build the frames first.
y_train = X_train[target]
y_validate = X_validate[target]
y_test = X_test[target]

X_train = X_train[features]
X_validate = X_validate[features]
X_test = X_test[features]

In [13]:
# Show the first row again: only the columns listed in `features` should remain
X_train.head(1)

Unnamed: 0_level_0,month,week,day_of_week,year,day_of_year
order_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-02,1,1,1,2018,2


In [14]:
# Empty frame used to keep the scores of each model (train vs. validate)
scores = pd.DataFrame(
    data=None,
    columns=['model_name', 'train_score', 'validate_score'],
)

In [15]:
# Display the training target series (pandas truncates the middle of long output)
y_train

order_date
2018-01-02     171,192.78
2018-01-03     102,080.14
2018-01-04     181,355.01
2018-01-05   1,029,700.32
2018-01-06      41,740.42
                 ...     
2021-12-27   2,016,181.91
2021-12-28      88,262.12
2021-12-29     110,882.66
2021-12-30      54,270.69
2021-12-31         712.76
Freq: D, Name: purchase_amount, Length: 1460, dtype: float64

In [None]:
# Baseline XGBoost regressor: up to 1000 trees with a small learning rate,
# with early stopping configured at 50 rounds.
xgboost_regular_model = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    early_stopping_rounds=50,
)

# Track the metric on both the training and the validation split while
# fitting. Per the XGBoost docs the last eval_set entry (validation here)
# drives early stopping and verbose=10 logs every 10th round — verify
# against the installed version.
xgboost_regular_model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_validate, y_validate)],
    verbose=10,
)