# ExtraTreeClassifier

Creating several models that I hope to blend near the end of the competition.

## Models

- [👽 TPS Feb 22: XGBoost+Optuna+CV+OOF](https://www.kaggle.com/mmellinger66/tps-feb-22-xgboost-cv-oof/)
- [👽 TPS Feb 22: ExtraTreeClassifier + CV + OOF](https://www.kaggle.com/mmellinger66/tps-feb-22-extratreeclassifier-cv-oof/)
- [👽TPS Feb 22: EDA+LGBM+Optuna+CV+OOF](https://www.kaggle.com/mmellinger66/tps-feb-22-eda-lgbm-cv-oof)

# References

- [TPS Feb 2022 ExtraTreeClassifier](https://www.kaggle.com/hiro5299834/tps-feb-2022-extratreeclassifier)

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Import Libraries</h1>
</div>

In [None]:
import os
from pathlib import Path
import time
import gc

import pandas as pd
import numpy as np
import datatable as dt  # Fast table loading

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold

# from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.ensemble import ExtraTreesClassifier
from scipy.stats import mode

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

# Visualization Libraries
import matplotlib.pylab as plt
import seaborn as sns

# Matplotlib style sheet applied globally; alternatives are listed in the
# trailing comment.
plt.style.use("fivethirtyeight")  # ggplot fivethirtyeight bmh
# Let pandas display up to 500 columns before truncating.
pd.options.display.max_columns = 500

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Configuration</h1>
</div>

In [None]:
# Black formatter https://black.readthedocs.io/en/stable/

# Shell escape: install nb-black, discarding pip's stdout.
! pip install nb-black > /dev/null

# Load the lab_black extension (presumably auto-formats cells with Black).
%load_ext lab_black

In [None]:
class Config:
    """Notebook-wide settings read by the cells below."""

    # Run identification
    competition = "TPS_202202"
    seed = 42

    # Behaviour switches
    debug = False
    optimize = False

    # Model and cross-validation sizing (alternatives noted per line)
    N_ESTIMATORS = 1300  # 1300, 5000
    N_FOLDS = 5  # 5,10,15
    SEED_LENGTH = 1  # 5,10

In [None]:
# Raw label column name and the name used for its integer-encoded copy.
TARGET, TARGET_ENC = "target", "target_num"

In [None]:
# Short model tag embedded in the output file names and the submit message.
MODEL = "etrees1"

In [None]:
# Change for every competition
# Directory holding train.csv, test.csv and sample_submission.csv for this
# competition; change it for every competition.
data_dir = Path("../input/tabular-playground-series-feb-2022")

In [None]:
# Output directory for the prediction CSVs written later in the notebook.
# exist_ok=True makes the call idempotent and avoids the check-then-create race.
os.makedirs("results", exist_ok=True)

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Load Train/Test Data</h1>
</div>

Using datatable as described in [Tutorial on reading datasets](https://www.kaggle.com/hiro5299834/tutorial-on-reading-datasets)

In [None]:
%%time
# https://www.kaggle.com/hiro5299834/tutorial-on-reading-datasets

# Read the CSVs with datatable and convert them to pandas; the pandas-only
# equivalent is kept commented out for reference.
# train_df = pd.read_csv(data_dir / "train.csv")
# test_df = pd.read_csv(data_dir / "test.csv")
train_df = dt.fread(data_dir / "train.csv").to_pandas()
test_df = dt.fread(data_dir / "test.csv").to_pandas()

# Submission template; its "target" column is overwritten before saving.
submission_df = pd.read_csv(data_dir / "sample_submission.csv")

# Report the loaded shapes.
print(f"train data: Rows={train_df.shape[0]}, Columns={train_df.shape[1]}")
print(f"test data : Rows={test_df.shape[0]}, Columns={test_df.shape[1]}")

## Create kfolds

In [None]:
def create_folds(df, n_folds=5, seed=42):
    """Assign every row of ``df`` to one of ``n_folds`` shuffled K-fold splits.

    A ``fold`` column is written onto ``df`` in place: each row receives the
    number (``0 .. n_folds - 1``) of the split in which it is part of the
    validation set.

    Args:
        df: DataFrame to annotate. Any index is accepted.
        n_folds: Number of splits passed to ``KFold``.
        seed: ``random_state`` used for shuffling.

    Returns:
        The same ``df`` object, now carrying the ``fold`` column.
    """
    # KFold.split() yields *positional* indices. Collect the assignments in a
    # positional array instead of writing through label-based ``df.loc``,
    # which would hit the wrong rows whenever the index is not 0..n-1.
    fold_ids = np.full(len(df), -1, dtype=np.int64)

    kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
    for fold, (_, valid_idx) in enumerate(kf.split(df)):
        fold_ids[valid_idx] = fold

    df["fold"] = fold_ids
    return df

In [None]:
# Build the fold column from the configured values rather than the function
# defaults: training is later run with Config.N_FOLDS folds and Config.seed,
# so the two must stay in sync when the config changes.
train_df = create_folds(train_df, n_folds=Config.N_FOLDS, seed=Config.seed)

In [None]:
# Every column except the row id, the raw label and the fold marker.
continuous_features = [
    name for name in train_df.columns if name not in {"row_id", "target", "fold"}
]

## Label Encode Target Manually

In [None]:
# Distinct class labels, in order of first appearance in the training data.
targets = train_df[TARGET].unique()

# label -> integer code
target_dict = dict(zip(targets, range(len(targets))))

# integer code -> label; needed to decode predictions for the submission file
target_dict_inv = dict(enumerate(targets))

# Store the encoded label and remove the raw one.
train_df["target_num"] = train_df[TARGET].map(target_dict)
train_df = train_df.drop(columns=["target"])

# From here on TARGET refers to the encoded column.
TARGET = "target_num"

In [None]:
# Keyword arguments forwarded to ExtraTreesClassifier by train_model().
xparams = {
    "n_estimators": Config.N_ESTIMATORS,
    # Fit/predict the trees on all available cores. For a fixed random_state
    # this only affects wall-clock time, not the fitted forest.
    "n_jobs": -1,
    # Single source of truth for the seed (train_model() also sets
    # random_state from its own ``seed`` argument).
    "random_state": Config.seed,
}

In [None]:
# Use every candidate column as a model input; the commented-out variant
# keeps only the first 20.
# FEATURES = continuous_features[:20]
FEATURES = continuous_features

In [None]:
# Display the number of selected feature columns.
len(FEATURES)

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Feature Engineering</h1>
</div>

## GCD Clustering Idea

Idea from [@ambrosm](https://www.kaggle.com/ambrosm)

- [TPSFEB22-03 Clustering Improves the Predictions](https://www.kaggle.com/ambrosm/tpsfeb22-03-clustering-improves-the-predictions)

In [None]:
from math import factorial

# Columns fed into the GCD computation: everything except ids, labels, the
# fold marker and the helper columns "gcd" / "isTrain" (if present).
elements = [
    column
    for column in train_df.columns
    if column not in {"row_id", TARGET, TARGET_ENC, "fold", "gcd", "isTrain"}
]


def bias(w, x, y, z):
    """Return ``10! / (w! * x! * y! * z! * 4**10)``.

    When the four counts sum to 10 this is the multinomial probability of
    that combination under four equally likely outcomes.

    Args:
        w, x, y, z: Non-negative integer counts.

    Returns:
        The value as a float.

    Raises:
        ValueError: If any count is negative (raised by ``factorial``).
    """
    return factorial(10) / (
        factorial(w) * factorial(x) * factorial(y) * factorial(z) * 4 ** 10
    )


def bias_of(s):
    """Return :func:`bias` for a column name shaped like ``A<w>T<x>G<y>C<z>``.

    The first character is skipped and the integers found before ``T``,
    between ``T`` and ``G``, between ``G`` and ``C`` and after ``C`` are used
    as the four counts.

    Args:
        s: Column name, e.g. ``"A2T3G1C4"``.

    Returns:
        ``bias(w, x, y, z)`` for the parsed counts.

    Raises:
        ValueError: If a marker letter is missing or a count is not an integer.
    """
    # Locate each marker once instead of re-scanning the string per slice.
    t_pos, g_pos, c_pos = s.index("T"), s.index("G"), s.index("C")
    # Delegate to bias() so the formula lives in exactly one place.
    return bias(
        int(s[1:t_pos]),
        int(s[t_pos + 1 : g_pos]),
        int(s[g_pos + 1 : c_pos]),
        int(s[c_pos + 1 :]),
    )


def _scaled_integer_columns(frame):
    """Return the ``elements`` columns of ``frame`` as scaled integers.

    Each column has its ``bias_of(column)`` added, is multiplied by 1,000,000,
    rounded and cast to ``int``.
    """
    return pd.DataFrame(
        {
            col: ((frame[col] + bias_of(col)) * 1000000).round().astype(int)
            for col in elements
        }
    )


# Same conversion for both frames, defined once above.
train_i = _scaled_integer_columns(train_df)
test_i = _scaled_integer_columns(test_df)


def gcd_of_all(df_i, columns=None):
    """Return the row-wise greatest common divisor across integer columns.

    Args:
        df_i: DataFrame holding integer-valued columns.
        columns: Names of the columns to combine. Defaults to the
            module-level ``elements`` list, which is the original behaviour.

    Returns:
        For each row, the GCD of that row's values in the chosen columns
        (the first column itself when only one column is given).

    Raises:
        IndexError: If the column list is empty.
    """
    cols = elements if columns is None else list(columns)
    gcd = df_i[cols[0]]
    # Fold the remaining columns in one at a time; np.gcd works element-wise.
    for col in cols[1:]:
        gcd = np.gcd(gcd, df_i[col])
    return gcd


# Attach the row-wise GCD of the scaled integer columns to both frames; the
# feature list built afterwards does not exclude "gcd", so it becomes a
# model input.
train_df["gcd"] = gcd_of_all(train_i)
test_df["gcd"] = gcd_of_all(test_i)

In [None]:
# Final model inputs: all current training columns (including the new "gcd")
# except ids, the fold marker, the labels and "isTrain".
FEATURES = [
    name
    for name in train_df.columns
    if name not in {"row_id", "fold", TARGET, TARGET_ENC, "isTrain"}
]

In [None]:
def train_model(df, test, FEATURES, TARGET, params, n_folds=5, seed=42):
    """Train one ExtraTreesClassifier per fold and collect its predictions.

    Args:
        df: Training frame containing ``row_id``, ``fold``, the ``FEATURES``
            columns and the ``TARGET`` column.
        test: Test frame containing at least the ``FEATURES`` columns.
        FEATURES: List of column names used as model inputs.
        TARGET: Name of the label column in ``df``.
        params: Keyword arguments for ``ExtraTreesClassifier``; must contain
            ``n_estimators``. The dict is not modified: ``random_state`` is
            overridden with ``seed`` on a private copy.
        n_folds: Number of folds to train; fold ids ``0 .. n_folds - 1`` are
            looked up in ``df["fold"]``.
        seed: ``random_state`` given to every fold's model.

    Returns:
        Tuple ``(model, fold_scores, final_valid_predictions,
        final_test_predictions)``: the model fitted on the last fold (``None``
        when ``n_folds`` is 0), the per-fold validation accuracies, a dict
        mapping ``row_id`` to the out-of-fold predicted label, and a list
        holding one array of test-set predicted labels per fold.
    """
    final_test_predictions = []
    final_valid_predictions = {}
    fold_scores = []  # Scores of Validation Set
    model = None

    # Copy before overriding the seed so the caller's dict is left untouched.
    params = {**params, "random_state": seed}

    # Column selection already yields a new frame and predict() only reads
    # it, so one selection serves every fold (no per-fold copy needed).
    xtest = test[FEATURES]

    print(f"\n===== Estimators: {params['n_estimators']}, Random State: {seed} =====")

    # Iterate over the requested number of folds (the argument, not the
    # global Config), matching the count shown in the progress banner.
    for fold in range(n_folds):
        print(10 * "=", f"Fold {fold+1}/{n_folds}", 10 * "=")

        start_time = time.time()

        is_valid = df["fold"] == fold
        train_part = df[~is_valid]  # Everything not in validation fold
        valid_part = df[is_valid]

        # Id's of everything in validation fold
        valid_ids = valid_part["row_id"].tolist()

        xtrain, ytrain = train_part[FEATURES], train_part[TARGET]
        xvalid, yvalid = valid_part[FEATURES], valid_part[TARGET]

        model = ExtraTreesClassifier(**params)
        model.fit(xtrain, ytrain)

        # Hard class predictions (labels, not probabilities).
        preds_valid = model.predict(xvalid)
        test_preds = model.predict(xtest)

        final_test_predictions.append(test_preds)
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

        fold_score = accuracy_score(yvalid, preds_valid)  # Validation Set Score
        fold_scores.append(fold_score)

        run_time = time.time() - start_time

        print(f"fold: {fold+1}, Accuracy: {fold_score}, Run Time: {run_time:.2f}")

    return model, fold_scores, final_valid_predictions, final_test_predictions

In [None]:
# Run the cross-validated training. TARGET_ENC names the integer-encoded
# label column; `model` is whatever train_model() returns as its first item.
(model, fold_scores, final_valid_predictions, final_test_predictions) = train_model(
    train_df, test_df, FEATURES, TARGET_ENC, xparams, Config.N_FOLDS, Config.seed
)

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Scores</h1>
</div>

In [None]:
# Mean and standard deviation of the per-fold validation accuracies.
cv_score = np.mean(fold_scores)  # Used in filename
print(f"scores -> mean: {cv_score:0.6f}, std: {np.std(fold_scores):0.6f}")

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Model Feature Importance</h1>
</div>

- [Feature Importance with ExtraTreesClassifier](https://www.kaggle.com/marc000/feature-importance-with-extratreesclassifier)
- https://www.geeksforgeeks.org/ml-extra-tree-classifier-for-feature-selection/

In [None]:
# Importances reported by the model returned from train_model().
feature_importance = model.feature_importances_

# Per-feature standard deviation of the importances across the individual
# trees. NOTE(review): despite the variable name this is a spread measure,
# not a normalization.
feature_importance_normalized = np.std(
    [tree.feature_importances_ for tree in model.estimators_], axis=0
)

In [None]:
# One row per feature, indexed by feature name and sorted ascending by
# importance. FEATURES already holds the column names in training order, so
# use it directly instead of copying train_df[FEATURES] just to read them.
feature_importance_df = pd.DataFrame(
    {"importance": model.feature_importances_},
    index=pd.Index(FEATURES, name="feature"),
).sort_values(by="importance")
feature_importance_df.head()

In [None]:
# fig, ax = plt.subplots(figsize=(15, 6))

# feature_importance_df is sorted ascending, so the most important features
# are the LAST rows: tail(15) plots the top 15 (largest bar on top in a
# horizontal bar chart), whereas [:15] showed the 15 least important ones.
feature_importance_df.tail(15).plot(
    kind="barh", title="Feature Importance", figsize=(10, 6)
)
plt.xlabel("Feature Importances Score")
plt.show()

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Save OOF Predictions</h1>
</div>

Save the dictionary of out-of-fold predictions: each training row's prediction was made while its fold served as the validation set.

In [None]:
# Turn the {row_id: predicted code} dict into a two-column frame.
final_valid_predictions_df = pd.DataFrame.from_dict(
    final_valid_predictions, orient="index"
).reset_index()
final_valid_predictions_df.columns = ["id", "pred_xtree1"]

# Decode the integer predictions back to the original class labels.
final_valid_predictions_df["pred_tree1"] = final_valid_predictions_df[
    "pred_xtree1"
].map(target_dict_inv)

# index=False keeps the meaningless positional index out of the CSV so the
# file contains only the id and prediction columns.
final_valid_predictions_df.to_csv(
    f"results/oof_preds_cv{cv_score:0.6f}_s{Config.seed}_k{Config.N_FOLDS}_{MODEL}.csv",
    index=False,
)


final_valid_predictions_df.head()

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Submission</h1>
</div>

In [None]:
# Display the per-fold test predictions side by side (one column per fold).
np.column_stack(final_test_predictions)

In [None]:
# Row-wise mean of the per-fold predicted class codes, truncated to int.
# NOTE(review): the codes are arbitrary label ids, so this average is not a
# vote; it is only compared against the mode below, not submitted.
mean_result = np.mean(np.column_stack(final_test_predictions), axis=1).astype("int")

In [None]:
# Majority vote across folds for every test row. Stack the per-fold arrays
# into an explicit (n_folds, n_test) matrix and vote along axis 0. ravel()
# makes the result 1-D whether this SciPy version keeps the reduced axis
# (older releases) or drops it (newer releases, keepdims=False), where the
# former `.mode[0]` would have selected a single scalar.
mode_result = np.ravel(mode(np.vstack(final_test_predictions), axis=0).mode)

### Non-zero means results are different

In [None]:
# Element-wise difference between the two aggregations.
r3 = mean_result - mode_result
# Count the rows that differ instead of summing: positive and negative
# differences can cancel to 0 even when the results disagree.
np.count_nonzero(r3)

In [None]:
# Descriptive tag to paste into the submission message.
message = f"{MODEL}_cv{cv_score:0.6f}_seed{Config.seed}_k{Config.N_FOLDS}_n{Config.N_ESTIMATORS}"
print(f"\nSubmit Message: {message}\n")

# Majority-vote class codes, decoded back to the original labels.
submission_df["target"] = mode_result
submission_df["target"] = submission_df["target"].map(target_dict_inv)

# Archive copy named after the run. index=False matches submission.csv below
# and keeps a stray positional-index column out of the file.
submission_df.to_csv(
    f"results/test_preds_cv{cv_score:0.6f}_s{Config.seed}_k{Config.N_FOLDS}_{MODEL}.csv",
    index=False,
)
submission_df.to_csv("submission.csv", index=False)  # Submit button compatible file
submission_df