## Loading, Splitting, and Balancing the Preprocessed Data

In [None]:
import pandas as pd
from scripts.data_split.stratifiedSplit import stratified_split
from scripts.TreatImbalance.BalancingTrainingData import hybrid_balance
from scripts.Training.TrainEvaluate import train_and_evaluate 



In [None]:
# --- Experiment configuration -------------------------------------------
# Input CSV and the location handed to the balancing step as its save path.
data_path = "../data/preprocessed/preprocessed_reduced_data.csv"
balanced_train_savepath = "../data/learningTestData/balanced_train.csv"

# Name of the label column, and the hold-out fraction passed to the split.
target_col = "fire"
test_size = 0.2

# user-chosen: 0.30 means 30% minority in balanced training set
desired_minority_prop = 0.30

# --- Load the dataset and make sure the label column is present ---------
data_df = pd.read_csv(data_path)
print("Loaded dataset with shape:", data_df.shape)

if target_col not in data_df.columns:
    # Fail early with the list of available columns instead of a KeyError later.
    raise ValueError(
        f"Target column '{target_col}' not found in CSV columns: {data_df.columns.tolist()}"
    )




In [None]:
# Split the loaded frame into train and test parts with the project's
# stratified_split helper, using the configured label column and test fraction.
# random_state is pinned to 42 — presumably so the split is reproducible
# (confirm against the helper's implementation).
train_df, test_df = stratified_split(
        data_df, target_col=target_col, test_size=test_size, random_state=42)




In [None]:
# Rebalance the training split only (test_df is not passed in) with the
# project's hybrid_balance helper. Label value 1 is given as the minority class
# and desired_minority_prop as its target share. save_path receives the
# balanced-train CSV location — presumably the helper writes the result there
# (confirm in hybrid_balance).
balanced_train_df = hybrid_balance(
        train_df,
        target_col=target_col,
        minority_target=1,
        desired_minority_prop=desired_minority_prop,
        random_state=42,
        save_path=balanced_train_savepath,
        verbose=True,
    )


## Building a KNN Classifier From Scratch

In [None]:
import numpy as np

# Try GPU ----------------------------------------------------------
try:
    import cupy as cp
    GPU_AVAILABLE = True
except ImportError:
    GPU_AVAILABLE = False

from joblib import Parallel, delayed


class MyKNNClassifier:
    """Brute-force k-nearest-neighbours classifier with an sklearn-like API.

    Distances are Euclidean and the prediction is the majority class among the
    ``n_neighbors`` closest training rows; ties go to the class that sorts
    first in ``classes_``.  When CuPy could be imported at module load
    (``GPU_AVAILABLE``) the distance work runs on the GPU, otherwise it runs
    on NumPy and can be fanned out over ``n_jobs`` joblib workers.

    Labels are encoded to ``0..n_classes-1`` internally, so any label type
    that ``np.unique`` can sort (ints, negative ints, floats, strings) works
    and ``predict_proba`` columns always line up with ``classes_``.

    Attributes set by ``fit``:
        classes_: sorted unique labels, always a NumPy array.
        X_train: training features (CuPy float32 on GPU, NumPy float on CPU).
        y_train: training labels as given, flattened, as a NumPy array.
    """

    def __init__(self, n_neighbors=5, n_jobs=1):
        self.n_neighbors = n_neighbors
        self.n_jobs = n_jobs
        self.gpu = GPU_AVAILABLE    # auto-detected when the module was loaded

    # --- SKLEARN API ---
    def get_params(self, deep=True):
        """Return the constructor hyper-parameters as a dict."""
        return {"n_neighbors": self.n_neighbors, "n_jobs": self.n_jobs}

    def set_params(self, **params):
        """Set each given keyword as an attribute and return ``self``."""
        for name, value in params.items():
            setattr(self, name, value)
        return self

    # -----------------------------------------------------------
    # Distance (GPU or CPU)
    # -----------------------------------------------------------
    def _euclidean_gpu(self, X_train, x):
        """Euclidean distance from ``x`` to every row of ``X_train`` (CuPy)."""
        return cp.sqrt(cp.sum((X_train - x) ** 2, axis=1))

    def _euclidean_cpu(self, X_train, x):
        """Euclidean distance from ``x`` to every row of ``X_train`` (NumPy)."""
        return np.sqrt(np.sum((X_train - x) ** 2, axis=1))

    # -----------------------------------------------------------
    # Fit
    # -----------------------------------------------------------
    def fit(self, X, y):
        """Memorise the training set.

        Args:
            X: 2-D array-like of numeric features, one row per sample.
            y: array-like of labels, one per row of ``X``.

        Returns:
            ``self``.

        Raises:
            ValueError: if ``X`` and ``y`` have different numbers of samples.
        """
        if self.gpu and isinstance(y, cp.ndarray):
            # np.asarray refuses device arrays; copy the labels to the host.
            y = cp.asnumpy(y)
        labels = np.asarray(y).ravel()
        # Encode labels as 0..n_classes-1 so bincount is valid for any label
        # type and its output is aligned with classes_.
        classes, codes = np.unique(labels, return_inverse=True)

        if self.gpu:
            self.X_train = cp.asarray(X, dtype=cp.float32)
            self._y_codes = cp.asarray(codes)
        else:
            self.X_train = np.asarray(X, dtype=float)
            self._y_codes = codes

        if self.X_train.shape[0] != labels.shape[0]:
            raise ValueError(
                f"X has {self.X_train.shape[0]} samples but y has {labels.shape[0]}"
            )

        self.y_train = labels
        self.classes_ = classes
        return self

    # -----------------------------------------------------------
    # Neighbour voting helpers
    # -----------------------------------------------------------
    def _checked_k(self):
        """Return ``n_neighbors`` as an int after validating it against the data."""
        k = int(self.n_neighbors)
        n_train = self.X_train.shape[0]
        if not 1 <= k <= n_train:
            raise ValueError(
                "n_neighbors must be between 1 and the number of training "
                f"samples ({n_train}); got {self.n_neighbors}"
            )
        return k

    def _vote_counts_cpu(self, x):
        """Per-class counts among the k nearest neighbours of ``x`` (NumPy)."""
        k = self._checked_k()
        dists = self._euclidean_cpu(self.X_train, x)
        # kth = k - 1 puts the k smallest distances first and stays in bounds
        # even when k equals the number of training samples.
        nearest = np.argpartition(dists, k - 1)[:k]
        return np.bincount(self._y_codes[nearest], minlength=len(self.classes_))

    def _vote_counts_gpu(self, x):
        """Per-class counts among the k nearest neighbours of ``x`` (CuPy)."""
        k = self._checked_k()
        dists = self._euclidean_gpu(self.X_train, x)
        nearest = cp.argpartition(dists, k - 1)[:k]
        return cp.bincount(self._y_codes[nearest], minlength=len(self.classes_))

    # -----------------------------------------------------------
    # Predict ONE sample (CPU mode)
    # -----------------------------------------------------------
    def _predict_one_cpu(self, x):
        """Return the index into ``classes_`` of the majority class for ``x``."""
        return int(self._vote_counts_cpu(x).argmax())

    # -----------------------------------------------------------
    # Predict ONE sample (GPU mode)
    # -----------------------------------------------------------
    def _predict_one_gpu(self, x):
        """Return the index into ``classes_`` of the majority class for ``x``."""
        return int(self._vote_counts_gpu(x).argmax().get())

    # -----------------------------------------------------------
    # Predict
    # -----------------------------------------------------------
    def predict(self, X):
        """Predict a class label for every row of ``X``.

        Returns:
            1-D NumPy array of labels taken from ``classes_``.

        Raises:
            ValueError: if ``n_neighbors`` is < 1 or exceeds the training size.
        """
        if self.gpu:
            X = cp.asarray(X, dtype=cp.float32)
            codes = [self._predict_one_gpu(X[i]) for i in range(X.shape[0])]
        else:
            X = np.asarray(X, dtype=float)
            if self.n_jobs == 1:
                codes = [self._predict_one_cpu(x) for x in X]
            else:
                codes = Parallel(n_jobs=self.n_jobs)(
                    delayed(self._predict_one_cpu)(x) for x in X
                )

        # Explicit integer dtype keeps the lookup valid for an empty X.
        return self.classes_[np.asarray(codes, dtype=np.intp)]

    # -----------------------------------------------------------
    # Predict proba
    # -----------------------------------------------------------
    def predict_proba(self, X):
        """Return neighbour-vote fractions, shape ``(n_samples, n_classes)``.

        Column ``j`` corresponds to ``classes_[j]``.

        Raises:
            ValueError: if ``n_neighbors`` is < 1 or exceeds the training size.
        """
        k = self._checked_k()
        n_classes = len(self.classes_)

        if self.gpu:
            X = cp.asarray(X, dtype=cp.float32)
            rows = [
                (self._vote_counts_gpu(X[i]) / k).get()
                for i in range(X.shape[0])
            ]
            return np.array(rows, dtype=float).reshape(-1, n_classes)

        # CPU mode
        X = np.asarray(X, dtype=float)
        proba = np.zeros((X.shape[0], n_classes))
        for i, x in enumerate(X):
            proba[i] = self._vote_counts_cpu(x) / k
        return proba

    # -----------------------------------------------------------
    # Decision function
    # -----------------------------------------------------------
    def decision_function(self, X):
        """Return a 1-D score per sample: the vote fraction of ``classes_[1]``.

        With a single known class the only probability column is returned.
        """
        proba = self.predict_proba(X)
        if proba.shape[1] == 1:
            return proba[:, 0]
        return proba[:, 1]



In [None]:
# Hyper-parameters handed to train_and_evaluate alongside a default-constructed
# MyKNNClassifier; presumably the helper applies them through set_params
# (confirm in scripts.Training.TrainEvaluate).
params = dict(n_neighbors=5, n_jobs=-1)

# NOTE(review): the result is bound to `results_dt` although this run is the
# KNN model, and later cells rebind the same name.
results_dt = train_and_evaluate(
    balanced_train_df,
    test_df,
    estimator=MyKNNClassifier(),
    algo_name="MyKNnClassifier",
    params=params,
)

## Building a Decision Tree From Scratch

In [None]:
import numpy as np

class MyDecisionTreeClassifier:
    """Binary decision-tree classifier grown by entropy information gain.

    Every internal node tests ``x[feature] <= threshold``; thresholds are
    midpoints between consecutive distinct sorted feature values.  Labels are
    encoded to ``0..n_classes-1`` internally, so any label type that
    ``np.unique`` can sort works and ``predict`` returns the original labels.

    Args:
        max_depth: maximum depth of the tree, or ``None`` for no limit.
        min_samples_split: a node with fewer samples than this becomes a leaf.
        min_samples_leaf: a split is rejected if either side would hold fewer
            samples than this.

    Attributes set by ``fit``:
        classes_: sorted unique labels.
        n_classes_: number of distinct labels.
        tree_: root ``Node`` of the fitted tree.
    """

    # A split must improve entropy by more than this; it filters out "gains"
    # that are only floating-point rounding noise.
    _MIN_GAIN = 1e-12

    class Node:
        """Tree node; leaves have ``feature is None`` and carry ``value``."""

        def __init__(self, feature=None, threshold=None, left=None, right=None, value=None, counts=None):
            self.feature = feature
            self.threshold = threshold
            self.left = left
            self.right = right
            self.value = value      # predicted class label (leaves only)
            self.counts = counts    # per-class sample counts reaching the node

    def __init__(self, max_depth=None, min_samples_split=2, min_samples_leaf=1):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf

    # ============================================================
    # Sklearn compatibility
    # ============================================================
    def get_params(self, deep=True):
        """Return the constructor hyper-parameters as a dict."""
        return {
            "max_depth": self.max_depth,
            "min_samples_split": self.min_samples_split,
            "min_samples_leaf": self.min_samples_leaf,
        }

    def set_params(self, **params):
        """Set each given keyword as an attribute and return ``self``.

        Unknown keys (for example ``criterion``) are stored but never read,
        so parameter dicts shared with sklearn's tree keep working.
        """
        for name, value in params.items():
            setattr(self, name, value)
        return self

    # ============================================================
    # Internal helpers
    # ============================================================
    def entropy(self, counts):
        """Return the Shannon entropy (bits) of a vector of class counts.

        Empty classes are skipped instead of adding an epsilon inside the
        log, so a pure node scores exactly zero.
        """
        counts = np.asarray(counts)
        total = counts.sum()
        if total == 0:
            return 0.0
        p = counts[counts > 0] / total
        return float(-(p * np.log2(p)).sum())

    def fit(self, X, y):
        """Grow the tree on ``X`` (2-D features) and ``y`` (labels).

        Returns:
            ``self``.

        Raises:
            ValueError: if the training set is empty or ``X`` and ``y``
                have different numbers of samples.
        """
        self.X = np.array(X)
        self.y = np.array(y).ravel()
        if len(self.y) == 0:
            raise ValueError("Cannot fit a decision tree on an empty training set")
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X has {len(self.X)} samples but y has {len(self.y)}"
            )

        # Encode labels so bincount is valid for any label type.
        self.classes_, self._y_codes = np.unique(self.y, return_inverse=True)
        self.n_classes_ = len(self.classes_)
        self.tree_ = self._build(np.arange(len(self.y)), depth=0)
        return self

    def _leaf(self, counts):
        """Create a leaf that predicts the most frequent class in ``counts``."""
        return self.Node(value=self.classes_[np.argmax(counts)], counts=counts)

    def _build(self, idx, depth):
        """Recursively build the subtree for the training rows in ``idx``."""
        counts = np.bincount(self._y_codes[idx], minlength=self.n_classes_)
        n_node = len(idx)

        # Stopping conditions: too few samples, depth limit, or already pure.
        if (
            n_node < self.min_samples_split
            or (self.max_depth is not None and depth >= self.max_depth)
            or np.count_nonzero(counts) <= 1
        ):
            return self._leaf(counts)

        parent_entropy = self.entropy(counts)
        best_gain = self._MIN_GAIN
        best_feat = None
        best_thresh = None
        best_left = None
        best_right = None

        for f in range(self.X.shape[1]):
            col = self.X[idx, f]
            order = np.argsort(col)
            sorted_idx = idx[order]
            sorted_y = self._y_codes[sorted_idx]
            sorted_col = col[order]

            # Sweep the split point left to right, moving one sample at a
            # time from the right-hand counts to the left-hand counts.
            left_counts = np.zeros(self.n_classes_, dtype=np.int64)
            right_counts = counts.copy()

            for i in range(n_node - 1):
                c = sorted_y[i]
                left_counts[c] += 1
                right_counts[c] -= 1

                # Equal neighbouring values cannot be separated by a threshold.
                if sorted_col[i] == sorted_col[i + 1]:
                    continue

                left_n = i + 1
                right_n = n_node - left_n
                if left_n < self.min_samples_leaf or right_n < self.min_samples_leaf:
                    continue

                gain = parent_entropy - (
                    (left_n / n_node) * self.entropy(left_counts)
                    + (right_n / n_node) * self.entropy(right_counts)
                )

                if gain > best_gain:
                    best_gain = gain
                    best_feat = f
                    best_thresh = (sorted_col[i] + sorted_col[i + 1]) / 2
                    best_left = sorted_idx[:left_n]
                    best_right = sorted_idx[left_n:]

        if best_feat is None:
            # No admissible split improves entropy.
            return self._leaf(counts)

        left_node = self._build(best_left, depth + 1)
        right_node = self._build(best_right, depth + 1)
        return self.Node(feature=best_feat, threshold=best_thresh,
                         left=left_node, right=right_node, counts=counts)

    def _predict_node(self, x, node):
        """Walk from ``node`` down to the leaf that sample ``x`` falls into."""
        # Iterative descent: deep trees cannot hit the recursion limit.
        while node.feature is not None:
            if x[node.feature] <= node.threshold:
                node = node.left
            else:
                node = node.right
        return node

    # ============================================================
    # Required sklearn prediction API
    # ============================================================
    def predict(self, X):
        """Return the predicted class label for every row of ``X``."""
        X = np.array(X)
        return np.array([self._predict_node(x, self.tree_).value for x in X])

    def predict_proba(self, X):
        """
        Return class probabilities using the leaf distribution.

        The result has shape ``(n_samples, n_classes)`` and column ``j``
        corresponds to ``classes_[j]``.
        """
        X = np.array(X)
        rows = []
        for x in X:
            leaf_counts = self._predict_node(x, self.tree_).counts
            rows.append(leaf_counts / leaf_counts.sum())
        return np.array(rows, dtype=float).reshape(-1, self.n_classes_)

    def decision_function(self, X):
        """
        Return raw scores (class counts before normalization).

        One row of per-class leaf counts per sample.
        NOTE(review): this is 2-D even for binary problems, unlike the
        1-D score sklearn uses for binary classifiers — confirm callers
        expect this shape.
        """
        X = np.array(X)
        return np.array([self._predict_node(x, self.tree_).counts for x in X])


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seeded estimator so repeated runs of this cell use the same random state.
dt = DecisionTreeClassifier(random_state=42)

# Hyper-parameters for the sklearn baseline; presumably applied by
# train_and_evaluate through set_params (confirm in the helper).
params = {
    "max_depth": 20,
    "min_samples_split": 5,
    "min_samples_leaf": 5,
    "criterion": "entropy",
}

# Pass the seeded `dt` instance: the cell previously built it but then handed
# a fresh, unseeded DecisionTreeClassifier() to the helper, leaving `dt` unused.
results_dt = train_and_evaluate(
    balanced_train_df,
    test_df,
    estimator=dt,
    algo_name="DecisionTreeClassifier_skLearn",
    params=params,
)


In [None]:
# Same hyper-parameters as the sklearn baseline. MyDecisionTreeClassifier has no
# `criterion` option; its set_params stores unknown keys as plain attributes,
# so the extra entry is accepted.
params = dict(
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=5,
    criterion="entropy",
)

# Train and evaluate the from-scratch tree on the balanced training split
# against the untouched test split.
results_dt = train_and_evaluate(
    balanced_train_df,
    test_df,
    estimator=MyDecisionTreeClassifier(),
    algo_name="MyDecisionTreeClassifier",
    params=params,
)
