In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("forestCover.csv", na_values=['?'])

In [3]:
df.drop(columns=['Water_Level', 'Observation_ID',"Inclination", "Facet"], inplace=True)

In [4]:
df['Soil_Type1'] = df['Soil_Type1'].map({'positive': 0, 'negative': 1})

In [5]:
#df.to_csv("df_tree_clean.csv", index=False)

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

target_col = "Cover_Type"

y = df[target_col]
X = df.drop(columns=[target_col])

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    stratify=y,
    random_state=0
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape


((464809, 54), (116203, 54), (464809,), (116203,))

# Impute missing values 

In [14]:
Compute median from training set only
slope_median = X_train["Slope"].median()

 Impute in training and test set using the same median
X_train["Slope"] = X_train["Slope"].fillna(slope_median)
X_test["Slope"]  = X_test["Slope"].fillna(slope_median)

# SKEWNESS : CLASS WEIGHT = BALANCED + strat folds

In [17]:
import time, json
import numpy as np
import pandas as pd
import joblib

from scipy.stats import randint, uniform, loguniform

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import (
    StratifiedKFold,
    HalvingRandomSearchCV,
    cross_validate,
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import make_scorer, f1_score, matthews_corrcoef

tree = DecisionTreeClassifier(random_state=0,
                             class_weight="balanced")


param_distributions = {
    "criterion": ["gini", "entropy", "log_loss"],  
    "splitter": ["best", "random"],
    "max_depth": randint(4, 60),                    
    "min_samples_split": randint(2, 100),
    "min_samples_leaf": randint(1, 50),
    "max_features": [None, "sqrt", "log2"],
    "ccp_alpha": uniform(0.0, 0.02),               
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

search = HalvingRandomSearchCV(
    estimator=tree,
    param_distributions=param_distributions,
    scoring="balanced_accuracy",
    refit=True,                     # refit best on FULL training split
    cv=cv,
    factor=3,
    resource="n_samples",
    min_resources="smallest",
    aggressive_elimination=True,
    n_candidates="exhaust",
    random_state=0,
    n_jobs=-1,
    verbose=2,
    error_score=np.nan,
    return_train_score=False,
)

print(">>> Starting Tree Hyperband (balanced_accuracy)…")
t0 = time.time()
search.fit(X_train, y_train)
t1 = time.time()
print("\n=== Hyperband DONE ===")
print(f"Elapsed: {t1 - t0:.1f}s")
print("Best params:", search.best_params_)
print(f"Best CV balanced_accuracy: {search.best_score_:.4f}")

>>> Starting Tree Hyperband (balanced_accuracy)…
n_iterations: 9
n_required_iterations: 9
n_possible_iterations: 9
min_resources_: 70
max_resources_: 464809
aggressive_elimination: True
factor: 3
----------
iter: 0
n_candidates: 6640
n_resources: 70
Fitting 5 folds for each of 6640 candidates, totalling 33200 fits


PicklingError: Could not pickle the task to send it to the workers.

In [10]:
from sklearn.metrics import make_scorer, f1_score, matthews_corrcoef

multi_scoring = {
    "balanced_accuracy": "balanced_accuracy",
    "macro_f1": make_scorer(f1_score, average="macro", zero_division=0),
    "weighted_f1": make_scorer(f1_score, average="weighted", zero_division=0),
    "mcc": make_scorer(matthews_corrcoef),
}

# Helper for nice output
def mean_std_str(values):
    return f"{np.mean(values):.3f} ± {np.std(values, ddof=1):.3f}"

# Consistent CV
from sklearn.model_selection import StratifiedKFold
cv_post = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)


# Halving CV

In [None]:
-
def acc_key(row):
    p = row["params"]
    return (
        p.get("criterion"),
        p.get("splitter"),
        p.get("max_depth"),
        p.get("min_samples_split"),
        p.get("min_samples_leaf"),
        p.get("max_features"),
        p.get("ccp_alpha"),
        p.get("class_weight", "balanced"),
    )

cv_df["acc_key"] = cv_df.apply(acc_key, axis=1)
cv_df_unique = (cv_df.sort_values("rank_test_score")
                  .drop_duplicates(subset="acc_key", keep="first")
                  .reset_index(drop=True))
top5 = cv_df_unique.head(5).reset_index(drop=True)

splits = list(cv.split(X_train, y_train))   # cv is the same StratifiedKFold used in search
cv_post = splits

rows = []
for _, row in top5.iterrows():
    params = dict(row["params"])
    params.pop("random_state", None)  # ensure no conflict
    # IMPORTANT: preserve class_weight used in the search
    cand = DecisionTreeClassifier(random_state=0, class_weight="balanced", **params)

    res = cross_validate(
        cand, X_train, y_train,
        scoring=multi_scoring,
        cv=cv_post,               # same folds as search
        n_jobs=-1,
        return_train_score=False,
    )
    rows.append({
        "rank": int(row["rank_test_score"]),
        "params": params,
        "balanced_accuracy": mean_std_str(res["test_balanced_accuracy"]),
        "macro_f1":          mean_std_str(res["test_macro_f1"]),
        "weighted_f1":       mean_std_str(res["test_weighted_f1"]),
        "mcc":               mean_std_str(res["test_mcc"]),
    })

top5_multi = pd.DataFrame(rows).sort_values("rank").reset_index(drop=True)
with pd.option_context("display.max_colwidth", None):
    print("\n=== Decision Tree | Top-5 (by balanced_accuracy rank) | Multi-metric CV (mean ± std) ===")
    print(top5_multi.to_string(index=False))


In [15]:
top5_multi

Unnamed: 0,rank,params,balanced_accuracy,macro_f1,weighted_f1,mcc
0,1,"{'ccp_alpha': 6.8172494472173995e-06, 'criteri...",0.818 ± 0.003,0.827 ± 0.004,0.885 ± 0.001,0.815 ± 0.002
1,2,"{'ccp_alpha': 5.5419132513638835e-05, 'criteri...",0.810 ± 0.006,0.819 ± 0.005,0.878 ± 0.001,0.803 ± 0.002
2,5,"{'ccp_alpha': 0.00014236695335442028, 'criteri...",0.674 ± 0.008,0.705 ± 0.005,0.797 ± 0.003,0.675 ± 0.004
3,6,"{'ccp_alpha': 0.00032337351230194767, 'criteri...",0.587 ± 0.035,0.617 ± 0.029,0.757 ± 0.004,0.611 ± 0.005
4,11,"{'ccp_alpha': 0.000506289213397042, 'criterion...",0.545 ± 0.019,0.574 ± 0.016,0.734 ± 0.006,0.578 ± 0.010


# not good enough when compared to KNN also fast try more in detail search. 

# BO 

In [17]:
from skopt import BayesSearchCV
from skopt.space import Integer, Categorical, Real
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
import numpy as np, pandas as pd, joblib, json, time, os

In [19]:

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

search_spaces = {
    "criterion": Categorical(["gini", "log_loss"]),
    "splitter":  Categorical(["best", "random"]),
    "max_depth": Integer(8, 80),
    "min_samples_split": Integer(2, 200),
    "min_samples_leaf":  Integer(1, 80),
    "max_features": Categorical([None, "sqrt", "log2"]),
    # log-scaled ccp_alpha: explore small pruning values carefully
    "ccp_alpha": Real(1e-6, 2e-2, prior="log-uniform"),
}

tree_base = DecisionTreeClassifier(
    random_state=0,
    class_weight="balanced"
)

bayes = BayesSearchCV(
    estimator=tree_base,
    search_spaces=search_spaces,
    n_iter=50,                      
    scoring="balanced_accuracy",
    cv=cv,
    n_jobs=-1,                   
    refit=True,                    
    random_state=0,
    verbose=2
)

t0 = time.time()
bayes.fit(X_train, y_train)
t1 = time.time()
print(f"\n[BO] Done in {t1 - t0:.1f}s")
print("Best params:", bayes.best_params_)
print(f"Best CV balanced_accuracy: {bayes.best_score_:.4f}")


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

# Save

In [None]:
# Save fitted best model + params
os.makedirs("tree_artifacts", exist_ok=True)
joblib.dump(bayes.best_estimator_, "tree_artifacts/tree_best_model.pkl", compress=3)
with open("tree_artifacts/tree_best_params.json", "w") as f:
    json.dump(bayes.best_params_, f, indent=2)

In [21]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import make_scorer, f1_score, matthews_corrcoef

cv_df = pd.DataFrame(bayes.cv_results_).copy()

def acc_key(row):
    p = row["params"]
    return (
        p.get("criterion"),
        p.get("splitter"),
        p.get("max_depth"),
        p.get("min_samples_split"),
        p.get("min_samples_leaf"),
        p.get("max_features"),
        p.get("ccp_alpha"),
    )

cv_df["acc_key"] = cv_df.apply(acc_key, axis=1)

cv_df_unique = (cv_df.sort_values("rank_test_score")
                  .drop_duplicates(subset="acc_key", keep="first")
                  .reset_index(drop=True))

top5 = cv_df_unique.head(5).reset_index(drop=True)

rows = []
for _, row in top5.iterrows():
    params = row["params"].copy()

    params["class_weight"] = "balanced"

    cand = DecisionTreeClassifier(random_state=0, **params)

    res = cross_validate(
        cand, X_train, y_train,
        scoring=multi_scoring,
        cv=cv_post,
        n_jobs=-1,
        return_train_score=False,
    )

    rows.append({
        "rank": int(row["rank_test_score"]),
        "params": params,
        "balanced_accuracy": mean_std_str(res["test_balanced_accuracy"]),
        "macro_f1":          mean_std_str(res["test_macro_f1"]),
        "weighted_f1":       mean_std_str(res["test_weighted_f1"]),
        "mcc":               mean_std_str(res["test_mcc"]),
    })

top5_multi = pd.DataFrame(rows).sort_values("rank").reset_index(drop=True)

with pd.option_context("display.max_colwidth", None):
    print("\n=== Decision Tree (Bayes) | Top-5 UNIQUE by balanced_acc rank | Multi-metric CV (mean ± std) ===")
    print(top5_multi.to_string(index=False))



=== Decision Tree (Bayes) | Top-5 UNIQUE by balanced_acc rank | Multi-metric CV (mean ± std) ===
 rank                                                                                                                                                                                               params balanced_accuracy      macro_f1   weighted_f1           mcc
    1 {'ccp_alpha': 5.141449890033588e-06, 'criterion': 'log_loss', 'max_depth': 80, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 21, 'splitter': 'best', 'class_weight': 'balanced'}     0.900 ± 0.003 0.854 ± 0.003 0.913 ± 0.001 0.860 ± 0.002
    2 {'ccp_alpha': 4.803358932542255e-06, 'criterion': 'log_loss', 'max_depth': 80, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 26, 'splitter': 'best', 'class_weight': 'balanced'}     0.900 ± 0.002 0.844 ± 0.003 0.907 ± 0.001 0.852 ± 0.002
    3 {'ccp_alpha': 5.052652197728508e-06, 'criterion': 'log_loss', 'max_depth': 80, 'max_features': None, 'min_s

In [22]:
top5_multi

Unnamed: 0,rank,params,balanced_accuracy,macro_f1,weighted_f1,mcc
0,1,"{'ccp_alpha': 5.141449890033588e-06, 'criterio...",0.900 ± 0.003,0.854 ± 0.003,0.913 ± 0.001,0.860 ± 0.002
1,2,"{'ccp_alpha': 4.803358932542255e-06, 'criterio...",0.900 ± 0.002,0.844 ± 0.003,0.907 ± 0.001,0.852 ± 0.002
2,3,"{'ccp_alpha': 5.052652197728508e-06, 'criterio...",0.899 ± 0.002,0.843 ± 0.002,0.906 ± 0.001,0.850 ± 0.002
3,4,"{'ccp_alpha': 4.718810292020584e-06, 'criterio...",0.899 ± 0.002,0.846 ± 0.002,0.909 ± 0.001,0.854 ± 0.002
4,5,"{'ccp_alpha': 4.959218335617341e-06, 'criterio...",0.898 ± 0.004,0.843 ± 0.004,0.903 ± 0.001,0.845 ± 0.002


# Save

In [23]:
import joblib
from pathlib import Path
import json

# Directory for artifacts
out_dir = Path("tree_artifacts")
out_dir.mkdir(exist_ok=True)

best_tree = bayes.best_estimator_
joblib.dump(best_tree, out_dir / "tree_best_model.pkl", compress=3)

best_params = bayes.best_params_
with open(out_dir / "tree_best_params.json", "w") as f:
    json.dump(best_params, f, indent=2)

print("Tree model and parameters saved to:", out_dir)


Tree model and parameters saved to: tree_artifacts


In [None]:
out_dir = "tree_artifacts"
import os; os.makedirs(out_dir, exist_ok=True)

best_tree = search.best_estimator_           # already fitted on full training data
joblib.dump(best_tree, f"{out_dir}/tree_best_model.pkl", compress=3)

with open(f"{out_dir}/tree_best_params.json", "w") as f:
    json.dump(search.best_params_, f, indent=2)