# XGBoost

Creating several models that I hope to blend near the end of the competition.

## Models

- [👽 TPS Feb 22: XGBoost+Optuna+CV+OOF](https://www.kaggle.com/mmellinger66/tps-feb-22-xgboost-cv-oof/)
- [👽 TPS Feb 22: ExtraTreeClassifier + CV + OOF](https://www.kaggle.com/mmellinger66/tps-feb-22-extratreeclassifier-cv-oof/)
- [👽TPS Feb 22: EDA+LGBM+Optuna+CV+OOF](https://www.kaggle.com/mmellinger66/tps-feb-22-eda-lgbm-cv-oof)

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Import Libraries</h1>
</div>

In [None]:
import os
from pathlib import Path
import time
import gc

import pandas as pd
import numpy as np
import datatable as dt  # Fast table loading

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold

# from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from scipy.stats import mode

from xgboost import XGBClassifier

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

# Visualization Libraries
import matplotlib.pylab as plt
import seaborn as sns

import warnings

warnings.filterwarnings("ignore")

plt.style.use("fivethirtyeight")  # ggplot fivethirtyeight bmh
pd.options.display.max_columns = 500

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Configuration</h1>
</div>

In [None]:
# Black formatter https://black.readthedocs.io/en/stable/

! pip install nb-black > /dev/null

%load_ext lab_black

In [None]:
MODEL = "xgb1"

In [None]:
class Config:
    """Notebook-wide run settings, read as class attributes."""

    # Run identification and reproducibility
    competition = "TPS_202202"
    seed = 42

    # Workflow switches
    debug = False
    optimize = False  # True runs the Optuna search cells

    # Training size knobs (alternatives tried are listed after each value)
    N_FOLDS = 5  # 5,10,15
    N_ESTIMATORS = 300  # 100, 5000
    SEED_LENGTH = 1  # 5,10

In [None]:
TARGET = "target"
TARGET_ENC = "target_num"

In [None]:
# Change for every competition
data_dir = Path("../input/tabular-playground-series-feb-2022")

In [None]:
# Output directory for OOF / test prediction files.
# exist_ok avoids the separate existence check and makes cell re-runs harmless.
os.makedirs("results", exist_ok=True)

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Load Train/Test Data</h1>
</div>

Using datatable as described in [Tutorial on reading datasets](https://www.kaggle.com/hiro5299834/tutorial-on-reading-datasets)

In [None]:
%%time
# https://www.kaggle.com/hiro5299834/tutorial-on-reading-datasets

# train_df = pd.read_csv(data_dir / "train.csv")
# test_df = pd.read_csv(data_dir / "test.csv")
train_df = dt.fread(data_dir / "train.csv").to_pandas()
test_df = dt.fread(data_dir / "test.csv").to_pandas()

submission_df = pd.read_csv(data_dir / "sample_submission.csv")

print(f"train data: Rows={train_df.shape[0]}, Columns={train_df.shape[1]}")
print(f"test data : Rows={test_df.shape[0]}, Columns={test_df.shape[1]}")

## Create kfolds

In [None]:
def create_folds(df, n_folds=5, seed=42):
    """Assign every row of ``df`` to one of ``n_folds`` validation folds.

    The assignment is written to a ``fold`` column of ``df`` in place and the
    same frame is returned.

    Args:
        df: frame to split; one fold id is produced per row.
        n_folds: number of folds passed to ``KFold``.
        seed: random state of the shuffled ``KFold`` split.

    Returns:
        ``df`` with an integer ``fold`` column holding values ``0..n_folds-1``.
    """
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
    # kf = GroupKFold(n_splits=Config.N_FOLDS)

    # KFold.split yields *positional* indices. Filling a positional array
    # (instead of label-based ``df.loc``) keeps the assignment correct even
    # when ``df`` does not carry a default 0..n-1 index.
    fold_ids = np.full(len(df), -1, dtype="int64")
    for fold, (_, valid_idx) in enumerate(kf.split(df)):
        fold_ids[valid_idx] = fold

    df["fold"] = fold_ids
    return df

In [None]:
train_df = create_folds(train_df)

## Identify Categorical and Continuous Features

Note: some categorical features can look continuous. As a rule of thumb, verify that a feature has more than 20 unique values before treating it as continuous.

In [None]:
# Every training column except the row identifier, the label and the fold id
continuous_features = [
    column
    for column in train_df.columns
    if column not in {"row_id", "target", "fold"}
]

In [None]:
# Overlay train and test histograms for the first nine features on a 3x3 grid.
# plt.subplots already creates the figure, so no separate plt.figure() call is
# made (it would leave an empty extra figure in the output).
fig, axes = plt.subplots(3, 3, figsize=(20, 22))

# Draw on the axes returned by subplots; zip stops early if there are fewer
# than nine features.
for ax, feature in zip(axes.ravel(), continuous_features[:9]):
    sns.histplot(
        train_df[feature],
        color="blue",
        kde=True,
        bins=5,
        label="train_" + feature,
        ax=ax,
    )
    sns.histplot(
        test_df[feature],
        color="olive",
        kde=True,
        bins=5,
        label="test_" + feature,
        ax=ax,
    )
    ax.set_xlabel(feature, fontsize=9)
    ax.legend()
plt.show()

## Target Analysis

In [None]:
# Class frequencies of the target. plt.subplots creates the figure itself, so
# the extra plt.figure() call (which produced an empty figure) is dropped.
fig, ax = plt.subplots(1, 1, figsize=(15, 4))

# Bind the plot to the axes that is styled below
sns.countplot(x=train_df[TARGET], data=train_df, ax=ax)
plt.setp(ax.get_xticklabels(), rotation=45, horizontalalignment="right")

plt.show()

## Label Encode Target Manually

In [None]:
# Integer code per class label, numbered in order of first appearance
targets = train_df[TARGET].unique()
target_dict = dict(zip(targets, range(len(targets))))
target_dict

In [None]:
# Code -> label lookup, used to turn predictions back into label names
# for the submission file
target_dict_inv = dict(zip(target_dict.values(), target_dict.keys()))
target_dict_inv

In [None]:
train_df[TARGET_ENC] = train_df[TARGET].map(target_dict)

In [None]:
train_df = train_df.drop(["target"], axis=1)

In [None]:
# FEATURES = continuous_features[:20]
FEATURES = continuous_features
# df = train_df.copy()

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Feature Engineering</h1>
</div>

## GCD Clustering Idea

Idea from [@ambrosm](https://www.kaggle.com/ambrosm)

- [TPSFEB22-03 Clustering Improves the Predictions](https://www.kaggle.com/ambrosm/tpsfeb22-03-clustering-improves-the-predictions)

In [None]:
from math import factorial

# Columns that take part in the GCD computation: everything that is not an
# id, label, fold or previously derived column.
elements = [
    name
    for name in train_df.columns
    if name not in ("row_id", "target", "target_num", "fold", "gcd", "isTrain")
]


def bias(w, x, y, z):
    """Return ``10! / (w! * x! * y! * z! * 4**10)``.

    When ``w + x + y + z == 10`` this is the multinomial probability of
    drawing those four counts in 10 draws from 4 equally likely outcomes.
    """
    return factorial(10) / (
        factorial(w) * factorial(x) * factorial(y) * factorial(z) * 4 ** 10
    )


def bias_of(s):
    """Return :func:`bias` for the four counts encoded in a column name.

    Args:
        s: name of the form ``<letter><w>T<x>G<y>C<z>``, e.g. ``"A2T3G1C4"``.
            The first character is skipped; presumably it is always ``"A"``.

    Raises:
        ValueError: if ``T``, ``G`` or ``C`` is missing or a count is not an
            integer.
    """
    # Locate each separator once, then slice the numbers between them.
    t_pos = s.index("T")
    g_pos = s.index("G")
    c_pos = s.index("C")
    # Delegate to bias() so the formula lives in a single place.
    return bias(
        int(s[1:t_pos]),
        int(s[t_pos + 1 : g_pos]),
        int(s[g_pos + 1 : c_pos]),
        int(s[c_pos + 1 :]),
    )


def _scaled_int_frame(frame):
    """Add each column's bias, scale by 1,000,000 and round to integers."""
    scaled = {}
    for col in elements:
        shifted = frame[col] + bias_of(col)
        scaled[col] = (shifted * 1000000).round().astype(int)
    return pd.DataFrame(scaled)


# Integer versions of the element columns, needed for the GCD computation
train_i = _scaled_int_frame(train_df)
test_i = _scaled_int_frame(test_df)


def gcd_of_all(df_i, columns=None):
    """Return the row-wise greatest common divisor across integer columns.

    Args:
        df_i: frame of integer-valued columns.
        columns: names of the columns to combine. Defaults to the
            module-level ``elements`` list when omitted.

    Returns:
        The element-wise ``np.gcd`` folded over the selected columns, one
        value per row of ``df_i``.

    Raises:
        IndexError: if the column list is empty.
    """
    cols = elements if columns is None else list(columns)
    gcd = df_i[cols[0]]
    for col in cols[1:]:
        gcd = np.gcd(gcd, df_i[col])
    return gcd


train_df["gcd"] = gcd_of_all(train_i)
test_df["gcd"] = gcd_of_all(test_i)

In [None]:
# Model inputs: every column (including the new "gcd") that is not an id,
# fold marker or label.
FEATURES = [
    name
    for name in train_df.columns
    if name not in ("row_id", "fold", "target", "target_num", "isTrain")
]

In [None]:
def objective(trial, X_train, X_valid, y_train, y_valid):
    """Optuna objective: fit one XGBoost candidate and score it.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_train: features the candidate model is fitted on.
        X_valid: hold-out features used for early stopping and scoring.
        y_train: labels for ``X_train``.
        y_valid: labels for ``X_valid``.

    Returns:
        Accuracy of the fitted model on the validation split.
    """
    xgb_params = {
        # Fixed settings are sampled from single-choice lists so that they are
        # recorded in ``trial.params`` together with the tuned values.
        "objective": trial.suggest_categorical("objective", ["multi:softmax"]),
        "eval_metric": "mlogloss",
        "use_label_encoder": trial.suggest_categorical("use_label_encoder", [False]),
        # ``step`` is passed by keyword: it is keyword-only in current Optuna.
        "n_estimators": trial.suggest_int("n_estimators", 1000, 5000, step=100),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        "learning_rate": trial.suggest_float("learning_rate", 1e-2, 0.25, log=True),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "max_depth": trial.suggest_int("max_depth", 1, 10),
        "gamma": trial.suggest_float("gamma", 0, 100, step=0.1),
        "booster": trial.suggest_categorical("booster", ["gbtree"]),
        "tree_method": trial.suggest_categorical(
            "tree_method", ["gpu_hist"]
        ),  # hist, gpu_hist
        "predictor": "gpu_predictor",
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 100, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 100, log=True),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        "min_child_weight": trial.suggest_float(
            "min_child_weight", 1e-1, 1e3, log=True
        ),
    }

    # Model loading and training
    model = XGBClassifier(**xgb_params)
    model.fit(
        X_train,
        y_train,
        # Both splits are evaluated each round.
        # NOTE(review): early stopping presumably follows the last entry
        # (the validation split) - confirm for the installed XGBoost version.
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        early_stopping_rounds=500,
        verbose=False,
    )

    print(f"Number of boosting rounds: {model.best_iteration}")
    # Hard class predictions, scored with plain accuracy
    oof = model.predict(X_valid)

    return accuracy_score(y_valid, oof)

In [None]:
# Optuna reports every trial by default. Uncomment the next line to limit
# its output to warning messages.
# optuna.logging.set_verbosity(optuna.logging.WARNING)

y = train_df["target_num"]
X = train_df[FEATURES].copy()

print(X.shape)
print(y.shape)

# Single 80/20 hold-out split handed to the Optuna objective
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [None]:
# Three hours in seconds; only relevant if a timeout is passed to optimize()
time_limit = 3 * 3600

if Config.optimize:

    def _run_trial(trial):
        """Bind the hold-out split to the objective for Optuna."""
        return objective(trial, X_train, X_valid, y_train, y_valid)

    study = optuna.create_study(direction="maximize")
    # Stop after a fixed number of trials (alternatives tried: 2, 10, 50, 100);
    # timeout=time_limit could be used instead of n_trials.
    study.optimize(_run_trial, n_trials=10)

## Show Optimization Results

In [None]:
# Summarize the search; `study` only exists when the optimization cell ran,
# hence the same Config.optimize guard.
if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)

In [None]:
# Objective value per trial
if Config.optimize:
    # Inside an `if` block the returned figure is not the cell's final
    # expression, so it has to be shown explicitly.
    plot_optimization_history(study).show()

In [None]:
# Hyperparameter importance
if Config.optimize:
    # Inside an `if` block the returned figure is not the cell's final
    # expression, so it has to be shown explicitly.
    plot_param_importances(study).show()

- [XGBoost GPU Support](https://xgboost.readthedocs.io/en/stable/gpu/index.html)

In [None]:
# Baseline XGBClassifier keyword arguments. These are used for training unless
# Config.optimize is set, in which case the Optuna result takes over below.
# NOTE(review): the long decimals (e.g. 0.9500000000000001) look like values
# copied from an earlier Optuna run - confirm their origin.
xgb_params = {
    #     "objective": "binary:logistic",
    "use_label_encoder": False,
    "n_estimators": Config.N_ESTIMATORS,
    "eval_metric": "mlogloss",
    "learning_rate": 0.1,
    #     "learning_rate": 0.15525187869673937,
    "subsample": 0.66,
    "colsample_bytree": 0.9500000000000001,
    "max_depth": 4,
    #     "booster": "gbtree",
    "gamma": 1.7000000000000002,
    # CPU histogram method; switch to "gpu_hist" when a GPU is available
    "tree_method": "hist",  # "gpu_hist", "hist"
    "reg_lambda": 0.9541035898656812,
    "reg_alpha": 2.3445012085324084,
    "random_state": 42,
    "n_jobs": 4,
    "min_child_weight": 256,
}

## Use Best Hyperparameters

In [None]:
if Config.optimize:
    # Overlay the tuned values on the defaults instead of replacing them, so
    # settings that no trial sampled (e.g. "eval_metric") are not lost.
    xgb_params = {**xgb_params, **study.best_trial.params}

In [None]:
def train_model(df, test, FEATURES, TARGET, params, n_folds=5, seed=42):
    """Train one XGBoost classifier per fold and collect its predictions.

    Args:
        df: training frame containing ``row_id``, ``fold``, the ``FEATURES``
            columns and the ``TARGET`` column.
        test: test frame containing the ``FEATURES`` columns.
        FEATURES: list of feature column names fed to the model.
        TARGET: name of the label column in ``df``.
        params: keyword arguments for ``XGBClassifier``; ``n_estimators`` must
            be present because it is printed in the run header.
        n_folds: number of folds; fold ``k`` validates on ``df.fold == k``.
        seed: value shown in the run header.
            NOTE(review): it is not applied to the model - the random state
            comes from ``params``. Confirm whether that is intended.

    Returns:
        Tuple of
        ``model`` (classifier fitted on the last fold, ``None`` if
        ``n_folds`` is 0),
        ``feature_importance_lst`` (one single-column frame per fold),
        ``fold_scores`` (validation accuracy per fold),
        ``final_valid_predictions`` (``row_id`` -> out-of-fold prediction),
        ``final_test_predictions`` (one test prediction array per fold).
    """
    final_test_predictions = []
    final_valid_predictions = {}
    fold_scores = []  # Scores of Validation Set
    feature_importance_lst = []
    model = None

    # The test features are only read, so one copy serves every fold.
    xtest = test[FEATURES].copy()

    print(f"\n===== Estimators: {params['n_estimators']}, Random State: {seed} =====")

    for fold in range(n_folds):
        print(10 * "=", f"Fold {fold+1}/{n_folds}", 10 * "=")

        start_time = time.time()

        # Everything outside the current fold trains; the fold itself validates
        train_part = df[df.fold != fold].reset_index(drop=True)
        valid_part = df[df.fold == fold].reset_index(drop=True)

        # Ids of the validation rows, used as keys of the OOF dictionary
        valid_ids = valid_part.row_id.values.tolist()

        ytrain = train_part[TARGET]
        yvalid = valid_part[TARGET]

        xtrain = train_part[FEATURES]
        xvalid = valid_part[FEATURES]

        # Build the model from the ``params`` argument rather than from the
        # module-level ``xgb_params`` so the caller's settings are honoured.
        model = XGBClassifier(**params)

        model.fit(
            xtrain,
            ytrain,
            eval_set=[(xvalid, yvalid)],
            verbose=False,
            # NOTE(review): with the notebook's 300 estimators a patience of
            # 3000 rounds can never trigger - confirm this is intended.
            early_stopping_rounds=3000,
        )

        # Hard class predictions for the held-out fold and the test set
        preds_valid = model.predict(xvalid)
        test_preds = model.predict(xtest)

        final_test_predictions.append(test_preds)
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

        fold_score = accuracy_score(yvalid, preds_valid)  # Validation Set Score
        fold_scores.append(fold_score)

        # Feature importance, one column per fold named "<fold>_importance"
        fi = pd.DataFrame(
            index=FEATURES,
            data=model.feature_importances_,
            columns=[f"{fold}_importance"],
        )
        feature_importance_lst.append(fi)

        run_time = time.time() - start_time

        print(f"fold: {fold+1}, Accuracy: {fold_score}, Run Time: {run_time:.2f}")

    return (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    )

In [None]:
(
    model,
    feature_importance_lst,
    fold_scores,
    final_valid_predictions,
    final_test_predictions,
) = train_model(
    train_df, test_df, FEATURES, TARGET_ENC, xgb_params, Config.N_FOLDS, Config.seed
)

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Scores</h1>
</div>

In [None]:
cv_score = np.mean(fold_scores)  # Used in filename
print(f"scores -> mean: {cv_score:0.6f}, std: {np.std(fold_scores):0.6f}")

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Model Feature Importance</h1>
</div>

- [Feature Importance and Feature Selection With XGBoost in Python](https://machinelearningmastery.com/feature-importance-and-feature-selection-with-xgboost-in-python/)

In [None]:
# One importance column per fold, indexed by feature name
fis_df = pd.concat(feature_importance_lst, axis=1)
# Rank by the last fold's column. Its name is read from the frame so the cell
# also works when the number of folds is not 5.
fis_df.sort_values(fis_df.columns[-1], ascending=False).head(10)

In [None]:
# Name of the last fold's importance column, read from the frame so the cell
# does not depend on the number of folds being 5.
last_fold_col = fis_df.columns[-1]
# NOTE(review): ascending sort + head(20) plots the 20 *lowest* importances of
# a single fold, which does not match the title - confirm the intent.
fis_df[[last_fold_col]].sort_values(last_fold_col, ascending=True).head(20).plot(
    kind="barh", figsize=(12, 12), title="Feature Importance Across Folds"
)
plt.show()

In [None]:
# Least important features according to the last fold. The column name is
# read from the frame so the cell works for any number of folds.
fis_df.sort_values(fis_df.columns[-1], ascending=True).head()

In [None]:
from xgboost import plot_importance

fig, ax = plt.subplots(figsize=(10, 6))
plot_importance(model, max_num_features=20, ax=ax, grid=False)
plt.show()

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Save OOF Predictions</h1>
</div>
Save the out-of-fold (OOF) predictions: the dictionary of predictions made for each training row while its fold was held out for validation.

In [None]:
# One row per training sample: its row id and the out-of-fold prediction.
# The prediction column is named after MODEL so the files of different models
# can be told apart when blending.
pred_col = f"pred_{MODEL}"

final_valid_predictions_df = pd.DataFrame.from_dict(
    final_valid_predictions, orient="index"
).reset_index()
final_valid_predictions_df.columns = ["id", pred_col]
# Translate the integer class codes back to the original label names
final_valid_predictions_df[pred_col] = final_valid_predictions_df[pred_col].map(
    target_dict_inv
)
final_valid_predictions_df.to_csv(
    f"results/oof_preds_cv{cv_score:0.6f}_s{Config.seed}_k{Config.N_FOLDS}_{MODEL}.csv",
    # Without this the CSV gets an extra, meaningless index column
    index=False,
)


final_valid_predictions_df.head()

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Submission</h1>
</div>

In [None]:
# One row per fold, one column per test sample
stacked_test_predictions = np.vstack(final_test_predictions)

# Majority vote across folds. Flattening makes the result independent of the
# SciPy version: older releases return shape (1, n_samples), newer ones
# return (n_samples,), where indexing with [0] would yield a single scalar.
mode_result = np.asarray(mode(stacked_test_predictions, axis=0).mode).reshape(-1)

# Truncated mean of the class codes, kept only as a rough cross-check
mean_result = stacked_test_predictions.mean(axis=0).astype("int")

### Non-zero means results are different
r3 = mean_result - mode_result
# Count the differing rows; summing the differences could cancel out to zero
np.count_nonzero(r3)

In [None]:
# Short description of the run, handy as the Kaggle submission message
message = f"{MODEL}_cv{cv_score:0.6f}_seed{Config.seed}_k{Config.N_FOLDS}_n{Config.N_ESTIMATORS}"
print(f"\nSubmit Message: {message}\n")

# Majority-vote class codes, translated back to the original label names
submission_df["target"] = mode_result
submission_df["target"] = submission_df["target"].map(target_dict_inv)
# Archive copy. The name follows the OOF file pattern and has no ":"
# characters, which are not valid in file names on every platform.
submission_df.to_csv(
    f"results/test_preds_cv{cv_score:0.6f}_s{Config.seed}_k{Config.N_FOLDS}_{MODEL}.csv",
    index=False,
)
submission_df.to_csv("submission.csv", index=False)
submission_df