## Displaying Features in `merged.csv`

In [None]:
import pandas as pd


DATA_PATH = '../data/preprocessed/preprocessed_reduced_data.csv'
# Load the merged cleaned dataset
df = pd.read_csv(DATA_PATH)

# Display feature names
print("Features/Columns in preprocessed_reduced_data.csv:")
print(df.columns.tolist())


In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_validate
from sklearn.metrics import (
    make_scorer, f1_score, recall_score, precision_score, roc_auc_score
)
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from imblearn.combine import SMOTETomek


# ============================================================
# AVAILABLE SAMPLERS
# ============================================================
def apply_resampling(method, X, y):
    if method == "none":
        return X, y
    if method == "undersample":
        rus = RandomUnderSampler(random_state=42)
        return rus.fit_resample(X, y)
    if method == "nearmiss":
        nm = NearMiss(version=1)
        return nm.fit_resample(X, y)
    if method == "smote":
        sm = SMOTE(k_neighbors=3, random_state=42)
        return sm.fit_resample(X, y)
    if method == "hybrid":
        smt = SMOTETomek(random_state=42)
        return smt.fit_resample(X, y)

    raise ValueError("Unknown resampling method")


# ============================================================
# EVALUATE RESAMPLING WITH ALL METRICS
# ============================================================
def evaluate_resampling(method, X, y, cv):

    X_res, y_res = apply_resampling(method, X, y)

    model = DecisionTreeClassifier(
        max_depth=20,
        min_samples_leaf=3,
        random_state=42
    )

    scoring = {
        "recall": make_scorer(recall_score),
        "precision": make_scorer(precision_score),
        "f1": make_scorer(f1_score),
        "roc_auc": "roc_auc"
    }

    scores = cross_validate(
        model, X_res, y_res,
        cv=cv, scoring=scoring, n_jobs=-1
    )

    # Mean of each metric
    res = {metric: scores[f"test_{metric}"].mean() for metric in scoring}

    # Detection of degenerate model
    if len(set(y_res)) == 1:
        res["degenerate"] = True
    else:
        res["degenerate"] = False

    return res


# ============================================================
# MASTER FUNCTION â€“ SELECT BEST RESAMPLING METHOD
# ============================================================
def choose_best_resampling(
        csv_path,
        target_col="fire",
        metric="recall",
        k_folds=3,
        output_csv="balanced_data.csv"
):

    df = pd.read_csv(csv_path)

    train_df, test_df = train_test_split(
        df, test_size=0.2, stratify=df[target_col], random_state=42
    )

    print("âœ” Using stratified split â†’ class ratios preserved.")

    X_train = train_df.drop(columns=[target_col])
    y_train = train_df[target_col]

    cv = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

    methods = ["none", "undersample", "nearmiss", "smote", "hybrid"]
    results = {}

    print("\n=== Testing Resampling Methods ===")
    for m in methods:
        metrics = evaluate_resampling(m, X_train, y_train, cv)
        results[m] = metrics

        print(f"\n{m.upper()}:")
        for k, v in metrics.items():
            if k != "degenerate":
                print(f"  {k:10s} = {v:.4f}")
        if metrics["degenerate"]:
            print("  âš  Model predicts only one class (degenerate)")

    # Select based on recall
    best_method = max(results, key=lambda m: results[m]["recall"])
    print("\nðŸ”¥ BEST METHOD (by recall):", best_method.upper())

    # Apply to full training set
    X_res, y_res = apply_resampling(best_method, X_train, y_train)
    balanced_df = pd.concat(
        [pd.DataFrame(X_res), pd.DataFrame(y_res, columns=[target_col])],
        axis=1
    )

    balanced_df.to_csv(output_csv, index=False)
    print(f"ðŸ’¾ Saved balanced train set to: {output_csv}")

    return best_method, results, balanced_df, test_df


choose_best_resampling( csv_path="../data/preprocessed/preprocessed_reduced_data.csv", 
                       target_col="fire", 
                       metric="recall", 
                       k_folds=3, 
                       output_csv="../data/preprocessed/preprocessed_reduced_balanced_data.csv" )

âœ” Using stratified split â†’ class ratios preserved.

=== Testing Resampling Methods ===

NONE:
  recall     = 0.9232
  precision  = 0.9964
  f1         = 0.9584
  roc_auc    = 0.9798

UNDERSAMPLE:
  recall     = 0.9573
  precision  = 0.9793
  f1         = 0.9682
  roc_auc    = 0.9809

NEARMISS:
  recall     = 0.9472
  precision  = 0.9959
  f1         = 0.9709
  roc_auc    = 0.9828

SMOTE:
  recall     = 0.9685
  precision  = 0.9951
  f1         = 0.9817
  roc_auc    = 0.9926

HYBRID:
  recall     = 0.9699
  precision  = 0.9972
  f1         = 0.9833
  roc_auc    = 0.9924

ðŸ”¥ BEST METHOD (by recall): HYBRID
ðŸ’¾ Saved balanced train set to: ../data/preprocessed/preprocessed_reduced_balanced_data.csv


('hybrid',
 {'none': {'recall': np.float64(0.92320266153764),
   'precision': np.float64(0.9963862816567582),
   'f1': np.float64(0.9583963528286937),
   'roc_auc': np.float64(0.979806545307965),
   'degenerate': False},
  'undersample': {'recall': np.float64(0.9573236563604871),
   'precision': np.float64(0.9792576059812331),
   'f1': np.float64(0.9681647196072879),
   'roc_auc': np.float64(0.9808534924721138),
   'degenerate': False},
  'nearmiss': {'recall': np.float64(0.9471943220810474),
   'precision': np.float64(0.9958639232348063),
   'f1': np.float64(0.9709177486479322),
   'roc_auc': np.float64(0.9828378762983299),
   'degenerate': False},
  'smote': {'recall': np.float64(0.9685372113869573),
   'precision': np.float64(0.9951397882313442),
   'f1': np.float64(0.9816578622878412),
   'roc_auc': np.float64(0.9926088540822965),
   'degenerate': False},
  'hybrid': {'recall': np.float64(0.969863500556407),
   'precision': np.float64(0.9971950428511193),
   'f1': np.float64(0.9833

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

def balanced_realistic_cv(df, target_col="fire", n_splits=5):
    X = df.drop(columns=[target_col])
    y = df[target_col]

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    metrics = {"accuracy": [], "precision": [], "recall": [], "f1": []}

    for train_idx, test_idx in skf.split(X, y):

        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # ===================================================
        # 1. Light undersampling â†’ keep 70% majority
        # ===================================================
        rus = RandomUnderSampler(sampling_strategy=0.7, random_state=42)
        X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)

        # ===================================================
        # 2. Light SMOTE â†’ increase minority but not to 50/50
        # ===================================================
        sm = SMOTE(sampling_strategy=0.4, random_state=42)
        X_train_bal, y_train_bal = sm.fit_resample(X_train_rus, y_train_rus)

        # ===================================================
        # 3. Decision Tree with class_weight for realism
        # ===================================================
        model = DecisionTreeClassifier(
            class_weight="balanced",
            random_state=42
        )
        model.fit(X_train_bal, y_train_bal)

        # ===================================================
        # 4. Test only on real data
        # ===================================================
        y_pred = model.predict(X_test)

        # Save metrics
        metrics["accuracy"].append(accuracy_score(y_test, y_pred))
        metrics["precision"].append(precision_score(y_test, y_pred, zero_division=0))
        metrics["recall"].append(recall_score(y_test, y_pred, zero_division=0))
        metrics["f1"].append(f1_score(y_test, y_pred, zero_division=0))

    return {m: np.mean(vals) for m, vals in metrics.items()}


df = pd.read_csv("../data/preprocessed/preprocessed_reduced_data.csv")

results = balanced_realistic_cv(df, target_col="fire")

print(results)


ValueError: The specified ratio required to remove samples from the minority class while trying to generate new samples. Please increase the ratio.

## Building K-Nearest Neighbors (KNN) From Scratch

In [None]:
import numpy as np
from collections import Counter

def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

class MyKNNClassifier:
    def __init__(self, k=5):
        self.k = k
    def fit(self, X, y):
        self.X_train = np.array(X)
        self.y_train = np.array(y)
    def predict(self, X):
        X = np.array(X)
        y_pred = []
        for x in X:
            distances = [euclidean_distance(x, x_train) for x_train in self.X_train]
            k_indices = np.argsort(distances)[:self.k]
            k_neighbor_labels = self.y_train[k_indices]
            most_common = Counter(k_neighbor_labels).most_common(1)[0][0]
            y_pred.append(most_common)
        return np.array(y_pred)


In [None]:
knn = MyKNNClassifier(k=5)
knn.fit(X_train, y_train)
knn_preds = knn.predict(X_test)
