<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">XGBoost Model</h1>
</div>

# References

- https://www.kaggle.com/zhangcheche/tps-2022-jan-xgboost-baseline
- https://www.kaggle.com/cv13j0/tps-jan22-quick-eda-xgboost
- [🎢 Introduction to Exploratory Data Analysis](https://www.kaggle.com/robikscube/introduction-to-exploratory-data-analysis/)
- [The Ultimate Pandas Introduction [2022]](https://www.kaggle.com/robikscube/the-ultimate-pandas-introduction-2022)


<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Import Libraries</h1>
</div>

In [None]:
import pandas as pd
import numpy as np
import time

import matplotlib.pylab as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
# from sklearn.metrics import roc_auc_score
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder

from xgboost import XGBClassifier

from pathlib import Path

import optuna
from optuna.samplers import TPESampler

import holidays

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Configuration</h1>
</div>


In [None]:
# Notebook-wide display settings: two-decimal floats with thousands
# separators, at most 15 DataFrame columns shown, ggplot style for figures.
pd.set_option('display.float_format', '{:,.2f}'.format)
pd.set_option('display.max_columns', 15)
plt.style.use('ggplot')

In [None]:
class Config:
    """Run-wide settings shared by the cells of this notebook."""

    # Not referenced by any cell shown in this notebook.
    debug: bool = False
    # Identifier of the competition this notebook targets.
    competition: str = "TPS_202201"
    # Random seed handed to the KFold splitter.
    seed: int = 42
    # Number of cross-validation folds.
    NFOLDS: int = 5
    # Not referenced by any cell shown in this notebook.
    EPOCHS: int = 10

In [None]:
# Folder holding train.csv / test.csv / sample_submission.csv.
# Change for every project.
data_dir = Path('..') / 'input' / 'tabular-playground-series-jan-2022'

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Load Train/Test Data</h1>
</div>

In [None]:
%%time
# Read the competition files; the `date` column is parsed to datetime on load.
train_df = pd.read_csv(
    data_dir / "train.csv",
    parse_dates=['date'],
    # nrows=100000,  # uncomment to iterate on a subset of the rows
)
test_df = pd.read_csv(
    data_dir / "test.csv",
    parse_dates=['date'],
)
sample_submission = pd.read_csv(data_dir / "sample_submission.csv")

# Quick sanity check of what was loaded.
print(f"train data: Rows={train_df.shape[0]}, Columns={train_df.shape[1]}")
print(f"test data : Rows={test_df.shape[0]}, Columns={test_df.shape[1]}")

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Explore the Data</h1>
</div>

In [None]:
# Summary of the training frame: column dtypes and non-null counts.
train_df.info()

In [None]:
# Peek at the first rows of the training data.
train_df.head()

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Visualizations</h1>
</div>

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Feature Engineering</h1>
</div>

## Holidays

Source: https://www.kaggle.com/zhangcheche/tps-2022-jan-xgboost-baseline

In [None]:
# Country List:['Finland' 'Norway' 'Sweden']
HOLIDAY_YEARS = [2015, 2016, 2017, 2018, 2019]

# `holidays.CountryHoliday` is the older spelling of `holidays.country_holidays`;
# use the newer function when the installed package provides it.
_build_calendar = getattr(holidays, 'country_holidays', None) or holidays.CountryHoliday

holiday_FI = _build_calendar('FI', years=HOLIDAY_YEARS)
holiday_NO = _build_calendar('NO', years=HOLIDAY_YEARS)
holiday_SE = _build_calendar('SE', years=HOLIDAY_YEARS)

# Calendar to consult for each value of the `country` column.
HOLIDAYS_BY_COUNTRY = {
    'Finland': holiday_FI,
    'Norway': holiday_NO,
    'Sweden': holiday_SE,
}


def add_holiday_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add `holiday_name` and `is_holiday` columns based on each row's own country.

    Every row is looked up in the calendar of the country it belongs to.
    Merging all calendars into a single dictionary would flag, for example,
    Finnish rows on a Norwegian-only holiday.

    The frame is modified in place and also returned.

    Args
        df: Data with a `date` column (convertible to datetime) and a
            `country` column holding 'Finland', 'Norway' or 'Sweden'.
    Returns
        df: The same DataFrame with the two holiday columns added. Rows whose
            country is not in HOLIDAYS_BY_COUNTRY are treated as non-holidays.
    """
    df['date'] = pd.to_datetime(df['date'])  # Convert the date to datetime.

    holiday_name = pd.Series(None, index=df.index, dtype='object')
    for country, calendar in HOLIDAYS_BY_COUNTRY.items():
        in_country = df['country'] == country
        # calendar.get returns the holiday name, or None on a regular day.
        holiday_name[in_country] = df.loc[in_country, 'date'].map(calendar.get)

    df['holiday_name'] = holiday_name
    df['is_holiday'] = df['holiday_name'].notnull().astype(int)
    df['holiday_name'] = df['holiday_name'].fillna('Not Holiday')
    return df


train_df = add_holiday_features(train_df)
test_df = add_holiday_features(test_df)

## Time Features

In [None]:
def create_time_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Create features based on the date variable; the idea is to extract as much
    information as possible from the date components.

    The frame is modified in place and also returned.

    Args
        df: Input data; must contain a `date` column convertible to datetime.
    Returns
        df: The same DataFrame with the new time-based features added.
    """
    df['date'] = pd.to_datetime(df['date'])  # Convert the date to datetime.
    date = df['date'].dt

    # Start the feature creation process.
    df['year'] = date.year
    df['quarter'] = date.quarter
    df['month'] = date.month
    df['day'] = date.day
    df['dayofweek'] = date.dayofweek
    # NOTE: despite its name, this column holds the number of days in the
    # month (28-31); the day of the month itself is in `day`.
    df['dayofmonth'] = date.days_in_month
    df['dayofyear'] = date.dayofyear
    # `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0.
    # isocalendar().week is the same ISO week number; cast its nullable
    # UInt32 result to a plain integer column.
    df['weekofyear'] = date.isocalendar().week.astype('int64')
    df['weekday'] = date.weekday
    # Saturday is 5 and Sunday is 6.
    df['is_weekend'] = np.where(df['weekday'].isin([5, 6]), 1, 0)

    return df

# Add the calendar features to both frames (each frame is also modified in place).
train_df = create_time_features(train_df)
test_df = create_time_features(test_df)


# Label Encode Categorical Features

Source: https://www.kaggle.com/zhangcheche/tps-2022-jan-xgboost-baseline

In [None]:
# Columns that get label encoded before being passed to the model.
CATEGORICAL = ['country', 'store', 'product', 'holiday_name']

In [None]:
def encode_categorical_features(df, categorical_colums = CATEGORICAL):
    """
    Replace each listed column with the integer codes produced by a
    scikit-learn LabelEncoder.

    A fresh encoder is fitted on the values of the frame that is passed in, so
    two frames encoded by separate calls only share codes when they contain
    the same set of values. The frame is modified in place and also returned.

    Args
        df: DataFrame containing the columns to encode.
        categorical_colums: Names of the columns to encode
            (defaults to CATEGORICAL).
    Returns
        df: The same DataFrame with the listed columns replaced by codes.
    """
    for column_name in categorical_colums:
        encoder = LabelEncoder()
        df[column_name] = encoder.fit_transform(df[column_name])
    return df


# Transform Target

In [None]:
def transform_target(df: pd.DataFrame, target: str) -> pd.DataFrame:
    """
    Replace the target column with its natural logarithm, which is meant to
    make optimization easier during training.

    The frame is modified in place and also returned.

    Args
        df: DataFrame holding the target column.
        target: Name of the column to transform.
    Returns
        df: The same DataFrame with `target` on a log scale.
    """
    log_values = np.log(df[target])
    df[target] = log_values
    return df

# Put `num_sold` on a log scale. The column is overwritten in place, so
# running this cell a second time would apply the log twice.
train_df = transform_target(train_df, 'num_sold')

# Extract Target and Drop Unused Columns

In [None]:
# Target (already log-transformed) and model inputs for training: the
# identifier, the raw date and the target itself are not used as features.
y = train_df['num_sold']
x_data = train_df.drop(columns=['row_id', 'date', 'num_sold'])

# The test frame has no target column to remove.
x_test = test_df.drop(columns=['row_id', 'date'])


In [None]:
# Encode train and test together so that every category receives the same
# integer code in both frames. Encoding each frame with its own call fits
# separate encoders, and their codes only agree when both frames happen to
# contain exactly the same set of values.
_combined = encode_categorical_features(
    pd.concat([x_data, x_test], keys=['train', 'test'])
)
X = _combined.loc['train'].copy()
X_test = _combined.loc['test'].copy()

# A later cell still refers to the lowercase name, so keep both lowercase
# names pointing at the encoded frames.
x_data, x_test = X, X_test

In [None]:
# Random rows of the encoded training features, as a visual check.
X.sample(5)

In [None]:
# Columns that must never be used as model inputs.
avoid = ['row_id', 'date', 'num_sold']
FEATURES = [column for column in X.columns if column not in avoid]

# Print a list of all the features created...
print(FEATURES)

In [None]:
# Random rows restricted to the selected feature columns.
X[FEATURES].sample(5)

In [None]:
# Compare the shape of X with the shape of its FEATURES subset.
X.shape, X[FEATURES].shape

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">SMAPE Error Function</h1>
</div>

In [None]:
def SMAPE(y_true, y_pred):
    """
    Symmetric mean absolute percentage error on a 0-200 scale.

    Each element contributes |y_true - y_pred| / ((|y_true| + |y_pred|) / 200).
    Pairs where both values are zero contribute 0.

    Args
        y_true: Actual values (array-like: list, numpy array or pandas Series).
        y_pred: Predicted values, same length as y_true.
    Returns
        The mean of the per-element errors.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Use absolute values on both terms so that negative inputs cannot cancel
    # out in the denominator.
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    abs_error = np.abs(y_true - y_pred)

    # Divide only where the denominator is non-zero; the remaining entries
    # keep the 0.0 they were initialised with (both values are zero there).
    diff = np.divide(
        abs_error,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return np.mean(diff)

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">XGBoost Model</h1>
</div>


In [None]:
# smape val: 10.24285605422652

# First hyper-parameter set.
# NOTE: the following cell assigns `xgb_params` again, so when the notebook is
# run top to bottom these values are not the ones used for training.
xgb_params = dict(
    n_estimators=4492,
    learning_rate=0.01,
    subsample=1.0,
    colsample_bytree=0.2,
    max_depth=15,
    gamma=1.0328829988080024,
    reg_alpha=100,
    reg_lambda=93,
)

# xgb_params['tree_method'] = 'gpu_hist'
# xgb_params['predictor'] = 'gpu_predictor'

In [None]:
# https://www.kaggle.com/zhangcheche/tps-2022-jan-xgboost-baseline
# smape val: 4.585028665637415

# Hyper-parameters passed to XGBRegressor in the cross-validation loop.
xgb_params = dict(
    n_estimators=500,  # 3082,
    learning_rate=0.01,
    subsample=0.7,
    colsample_bytree=0.8,
    max_depth=15,
    gamma=4.475257278569414,
    reg_alpha=95,
    reg_lambda=100,
)


<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Train with Cross Validation</h1>
</div>

In [None]:
final_test_predictions = []   # one array of log-scale test predictions per fold
final_valid_predictions = {}  # row position -> out-of-fold log-scale prediction
scores = []                   # per-fold SMAPE on the original num_sold scale

feat_import = np.zeros(len(FEATURES))

kf = KFold(n_splits=Config.NFOLDS, shuffle=True, random_state=Config.seed)

for fold, (train_idx, valid_idx) in enumerate(kf.split(X = X, y = y)):

    print(10*"=", f"Fold={fold}", 10*"=")
    start_time = time.time()

    # KFold yields positional indices, so select rows by position. This stays
    # correct even if X / y do not carry a default 0..n-1 index.
    x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    model = XGBRegressor(**xgb_params)

    # NOTE(review): newer xgboost releases expect `early_stopping_rounds` and
    # `eval_metric` on the constructor rather than on fit() - confirm against
    # the installed version.
    model.fit(x_train, y_train,
          early_stopping_rounds=200,
          eval_set=[(x_valid, y_valid)],
          eval_metric='rmse',
          verbose=0)

    preds_valid = model.predict(x_valid)
    final_valid_predictions.update(dict(zip(valid_idx, preds_valid)))

    # The target was log-transformed before training, so map both the truth
    # and the predictions back with exp before computing SMAPE. Scoring the
    # log values would not reflect the error in units sold.
    smape = SMAPE(np.exp(y_valid), np.exp(preds_valid))
    scores.append(smape)

    test_preds = model.predict(X_test)
    final_test_predictions.append(test_preds)

    feat_import += model.feature_importances_

    run_time = time.time() - start_time
    print(f"Fold={fold}, SMAPE: {smape:.8f}, Run Time: {run_time:.2f}")


<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Scores</h1>
</div>

smape val: 10.24285605422652


In [None]:
# Aggregate the per-fold scores collected during cross validation.
mean_score = np.mean(scores)
std_score = np.std(scores)

print(f"Scores -> mean: {mean_score:.8f}, std: {std_score:.8f}")

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Inference on Test Data</h1>
</div>

In [None]:
# Average the test predictions of all cross-validation folds. Predicting with
# `model` alone would use only the model fitted in the last fold and discard
# the per-fold predictions already stored in `final_test_predictions`.
# The values are still on the log scale at this point.
y_test = np.mean(final_test_predictions, axis=0)

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Submission</h1>
</div>


In [None]:
# Inspect the raw model output before it is mapped back with exp.
y_test

In [None]:
# Undo the log transform applied to the target so predictions are in units sold.
pred = np.exp(y_test)

In [None]:
# Assign through item access: attribute-style assignment only updates a column
# that already exists and would otherwise just set a Python attribute on the
# DataFrame object, leaving the written file without the predictions.
sample_submission['num_sold'] = pred
sample_submission.to_csv('submission.csv', index=False)
sample_submission