## Displaying Features in `reduced_North_data.csv`

In [2]:
import pandas as pd
import numpy as np

# Load the merged, cleaned dataset (reduced_North_data.csv) into `df`.
df = pd.read_csv('../data/Merged/reduced_North_data.csv')

# Print every column name as a plain Python list.
# NOTE(review): the label below says "merge.csv" although the file read above
# is reduced_North_data.csv — confirm which name is intended.
print("Features/Columns in merge.csv:")
print(df.columns.tolist())


Features/Columns in merge.csv:
['elevation', 'GRIDCODE', 'spring_prec', 'summer_prec', 'autumn_prec', 'winter_prec', 'summer_tmax', 'autumn_tmax', 'spring_tmax', 'winter_tmax', 'winter_tmin', 'summer_tmin', 'ORG_CARBON', 'CEC_CLAY', 'GYPSUM', 'BSAT', 'PH_WATER', 'SAND', 'SILT', 'BULK', 'TCARBON_EQ', 'TOTAL_N', 'COARSE', 'CEC_SOIL', 'CN_RATIO', 'ESP', 'ELEC_COND', 'TEX_5', 'TEX_9', 'ALUM_SAT', 'fire']


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the dataset again so this cell does not depend on `df` from another cell.
# NOTE(review): the first cell reads the same file; one load would be enough.
df = pd.read_csv('../data/Merged/reduced_North_data.csv')

# Features (all columns except 'fire'), target ('fire')
X = df.drop('fire', axis=1)
y = df['fire']

# Hold out 20% of the rows for testing; random_state=42 fixes the shuffle so
# the split is reproducible.
# NOTE(review): the split is not stratified. The recorded reports show only 49
# positive rows among the 24821 test rows, so consider passing stratify=y to
# keep the class ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Confirm shapes
print(f"Training samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}")


Training samples: 99283, Test samples: 24821


## Building K-Nearest Neighbors (KNN) From Scratch

In [6]:
import numpy as np
from collections import Counter

def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two points.

    Args:
        x1: First point; must support element-wise subtraction with ``x2``.
        x2: Second point.

    Returns:
        The square root of the sum of the squared element-wise differences.
    """
    delta = x1 - x2
    squared = delta ** 2
    return np.sqrt(np.sum(squared))

class MyKNNClassifier:
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    The training data is only memorised by ``fit``; all the work happens in
    ``predict``, which compares every query row with every training row.
    Distances are computed on the feature values exactly as given (no scaling
    is applied here).

    Args:
        k: Number of nearest neighbours that vote on each prediction.
    """

    def __init__(self, k=5):
        self.k = k

    def fit(self, X, y):
        """Store the training samples and labels.

        Args:
            X: Array-like of training samples, one row per sample.
            y: Array-like of labels, one per training row.
        """
        self.X_train = np.array(X)
        self.y_train = np.array(y)

    def _distances_to(self, x):
        """Return the Euclidean distance from ``x`` to every training sample."""
        squared = (self.X_train - x) ** 2
        # Reduce over every axis except the sample axis, so both 2-D
        # (n_samples, n_features) and 1-D (n_samples,) training data work.
        feature_axes = tuple(range(1, squared.ndim))
        return np.sqrt(squared.sum(axis=feature_axes))

    def predict(self, X):
        """Predict a label for every row of ``X`` by majority vote.

        Args:
            X: Array-like of query samples, one row per sample.

        Returns:
            NumPy array with one predicted label per query row.
        """
        X = np.array(X)
        y_pred = []
        for x in X:
            # One vectorised pass over the training set per query row instead
            # of one Python-level function call per training row.
            distances = self._distances_to(x)
            # A stable sort keeps equally distant neighbours in training order,
            # which makes the selection deterministic.
            k_indices = np.argsort(distances, kind="stable")[:self.k]
            k_neighbor_labels = self.y_train[k_indices]
            # On a tied vote, most_common returns the label seen first, i.e.
            # the one belonging to the nearer neighbour.
            most_common = Counter(k_neighbor_labels).most_common(1)[0][0]
            y_pred.append(most_common)
        return np.array(y_pred)


In [7]:
# Fit the from-scratch KNN (k=5) on the training split and predict the test split.
# fit() only stores the data; predict() compares each test row with every
# training row, so this cell is the expensive one.
knn = MyKNNClassifier(k=5)
knn.fit(X_train, y_train)
knn_preds = knn.predict(X_test)


In [9]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# MyKNNClassifier.predict returns a 1-D NumPy array; copy its elements into a list.
y_pred = [p for p in knn_preds]  # NOTE(review): knn_preds could be passed to the metrics directly

# Score the predictions against the held-out labels.
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)

# Per-class precision/recall and the raw confusion counts.
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))



Accuracy: 0.9981064421256194

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     24772
           1       0.75      0.06      0.11        49

    accuracy                           1.00     24821
   macro avg       0.87      0.53      0.56     24821
weighted avg       1.00      1.00      1.00     24821


Confusion Matrix:
 [[24771     1]
 [   46     3]]


In [5]:
import cupy as cp
import numpy as np
from collections import Counter

def euclidean_distances(X1, X2):
    """Return the matrix of pairwise Euclidean distances (vectorised, CuPy).

    Args:
        X1: Array of shape (batch_size, n_features).
        X2: Array of shape (n_train, n_features).

    Returns:
        Array of shape (batch_size, n_train) holding the distance between
        every row of ``X1`` and every row of ``X2``.
    """
    # Broadcasting builds a (batch_size, n_train, n_features) intermediate,
    # which is then reduced over the feature axis.
    diff = X1[:, cp.newaxis, :] - X2[cp.newaxis, :, :]
    squared = diff ** 2
    return cp.sqrt(cp.sum(squared, axis=2))

class MyKNNClassifierGPU:
    """Brute-force k-nearest-neighbours classifier with distances computed via CuPy.

    Query rows are processed in batches; for each batch the full
    (batch, n_train) distance matrix is built with ``euclidean_distances``,
    the k nearest training rows are selected on the device, and the majority
    vote is taken on the host.

    Args:
        k: Number of nearest neighbours that vote on each prediction.
    """

    def __init__(self, k=5):
        self.k = k

    def fit(self, X, y):
        """Copy the training data to the device as float32 features / int32 labels.

        Args:
            X: Training samples, shape (n_train, n_features).
            y: Integer class labels, one per training row.
        """
        # Converting through cp.array(..., dtype=...) accepts any input CuPy
        # can build an array from, not only objects with an ``astype`` method.
        self.X_train = cp.array(X, dtype=cp.float32)
        self.y_train = cp.array(y, dtype=cp.int32)

    def predict(self, X, batch_size=100):
        """Predict a label for every row of ``X``.

        Args:
            X: Query samples, shape (n_samples, n_features).
            batch_size: Number of query rows compared with the training set at
                once; it bounds the size of the intermediate distance arrays.

        Returns:
            NumPy array (host memory) with one predicted label per query row,
            ready to be passed to scikit-learn metrics.
        """
        X = cp.asarray(X, dtype=cp.float32)
        N = X.shape[0]
        n_train = self.X_train.shape[0]
        # Partitioning around position k-1 already places the k smallest
        # distances in the first k slots, and stays in bounds when k equals
        # (or exceeds) the number of training rows.
        kth = min(self.k, n_train) - 1
        y_pred = []
        for start in range(0, N, batch_size):
            X_batch = X[start:start + batch_size]
            distances = euclidean_distances(X_batch, self.X_train)  # (batch, n_train)
            neighbors_idx = cp.argpartition(distances, kth, axis=1)[:, :self.k]
            # Gather the neighbour labels for the whole batch on the device and
            # copy them to the host once, instead of once per query row.
            batch_labels = cp.asnumpy(self.y_train[neighbors_idx])
            # Majority vote for each row of the batch.
            for k_labels in batch_labels:
                y_pred.append(Counter(k_labels).most_common(1)[0][0])
        return np.array(y_pred)

# --- Data preparation ---
import pandas as pd
from sklearn.model_selection import train_test_split

# Reload the dataset and convert it to plain NumPy arrays: float32 features
# and int32 labels.
df = pd.read_csv('../data/Merged/reduced_North_data.csv')
X = df.drop('fire', axis=1).values.astype(np.float32)
y = df['fire'].astype(np.int32).values

# Same 80/20 split and seed as the earlier split cell. This rebinds
# X_train/X_test/y_train/y_test as NumPy arrays; the decision-tree cells below
# index with X[:, idx] and therefore rely on this cell having been run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Training and prediction ---
knn_gpu = MyKNNClassifierGPU(k=5)
knn_gpu.fit(X_train, y_train)
y_pred = knn_gpu.predict(X_test, batch_size=100)   # tune batch_size to the available GPU memory

# --- Performance evaluation ---
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9981064421256194

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     24772
           1       0.75      0.06      0.11        49

    accuracy                           1.00     24821
   macro avg       0.87      0.53      0.56     24821
weighted avg       1.00      1.00      1.00     24821


Confusion Matrix:
 [[24771     1]
 [   46     3]]


## Building Decision Tree (DT) From Scratch

In [6]:
import numpy as np
import cupy as cp

class DecisionTreeNode:
    """One node of a binary decision tree.

    Attributes set at construction describe the training samples that reached
    the node; ``feature_index``, ``threshold``, ``left`` and ``right`` stay
    ``None`` for a leaf and are filled in when the node is split.
    """

    def __init__(self, gini, num_samples, num_samples_per_class, predicted_class):
        self.gini = gini                                    # Gini impurity of the node's labels
        self.num_samples = num_samples                      # number of samples at the node
        self.num_samples_per_class = num_samples_per_class  # per-class sample counts
        self.predicted_class = predicted_class              # majority class at the node
        self.feature_index = None                           # column used by the split
        self.threshold = None                               # rows with value < threshold go left
        self.left = None
        self.right = None


class SimpleDecisionTreeClassifierCuPy:
    """Binary decision-tree classifier grown greedily on Gini impurity.

    Labels must be non-negative integers, because they are used directly as
    indices into the per-class counters. Despite the class name, every
    computation in this implementation runs on the CPU with NumPy: the work
    per candidate split is a handful of arithmetic operations on class counts.

    Args:
        max_depth: Optional maximum depth of the tree (the root is depth 0).
            ``None`` (the default) grows until every leaf is pure or cannot be
            split any further.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        """Grow the tree from training data.

        Args:
            X: Array-like of shape (n_samples, n_features).
            y: Array-like of non-negative integer labels, one per row of ``X``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        # Size the class counters by the largest label, not by the number of
        # distinct labels: a sample holding only label 1 still needs a
        # counter at index 1.
        self.n_classes_ = int(np.max(y)) + 1
        self.n_features_ = X.shape[1]
        self.tree_ = self._grow_tree(X, y)

    def _gini(self, y):
        """Return the Gini impurity ``1 - sum(p_c ** 2)`` of the labels ``y``."""
        m = len(y)
        if m == 0:
            return 1.0
        _, counts = np.unique(y, return_counts=True)
        return float(1.0 - np.sum((counts / m) ** 2))

    def _grow_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples ``(X, y)``.

        Returns:
            The ``DecisionTreeNode`` at the root of the subtree.
        """
        num_samples_per_class = [np.sum(y == i) for i in range(self.n_classes_)]
        predicted_class = np.argmax(num_samples_per_class)
        node = DecisionTreeNode(
            gini=self._gini(y),
            num_samples=len(y),
            num_samples_per_class=num_samples_per_class,
            predicted_class=predicted_class,
        )

        if node.gini == 0:  # pure node
            return node
        if self.max_depth is not None and depth >= self.max_depth:
            return node

        idx, thr = self._best_split(X, y)
        if idx is None:
            return node

        indices_left = X[:, idx] < thr
        # A split that leaves one side empty (e.g. a NaN threshold) would
        # recurse forever on identical data, so keep the node as a leaf.
        if not indices_left.any() or indices_left.all():
            return node

        X_left, y_left = X[indices_left], y[indices_left]
        X_right, y_right = X[~indices_left], y[~indices_left]
        node.feature_index = idx
        node.threshold = thr
        node.left = self._grow_tree(X_left, y_left, depth + 1)
        node.right = self._grow_tree(X_right, y_right, depth + 1)
        return node

    def _best_split(self, X, y):
        """Find the (feature, threshold) pair with the lowest weighted Gini.

        For each feature the samples are sorted by value and every boundary
        between two distinct consecutive values is a candidate split. Class
        counts on both sides of all candidates are obtained at once from a
        cumulative sum.

        Returns:
            ``(feature_index, threshold)``, or ``(None, None)`` when there are
            fewer than two samples or no feature has two distinct values.
        """
        m, n = X.shape
        if m <= 1:
            return None, None

        class_ids = np.arange(self.n_classes_)
        left_sizes = np.arange(1, m)   # candidate i sends the first i sorted samples left
        right_sizes = m - left_sizes
        best_gini = 1.0
        best_idx, best_thr = None, None
        for idx in range(n):
            order = np.argsort(X[:, idx], kind="stable")
            values = X[order, idx]
            # Only boundaries between different values can separate samples.
            splittable = values[1:] != values[:-1]
            if not splittable.any():
                continue

            # running[j, c] = number of class-c samples among the first j+1 sorted rows.
            running = np.cumsum(y[order][:, np.newaxis] == class_ids, axis=0)
            left_counts = running[:-1]
            right_counts = running[-1] - left_counts
            gini_left = 1.0 - np.sum((left_counts / left_sizes[:, np.newaxis]) ** 2, axis=1)
            gini_right = 1.0 - np.sum((right_counts / right_sizes[:, np.newaxis]) ** 2, axis=1)
            weighted = (left_sizes * gini_left + right_sizes * gini_right) / m
            weighted[~splittable] = np.inf

            # argmin returns the first minimum and the comparison below is
            # strict, so ties keep the earliest candidate and earliest feature.
            pos = int(np.argmin(weighted))
            if weighted[pos] < best_gini:
                best_gini = weighted[pos]
                best_idx = idx
                lower, upper = values[pos], values[pos + 1]
                midpoint = (lower + upper) / 2
                # For adjacent floating-point values the midpoint can round
                # down to ``lower``; "value < lower" would then send nothing
                # left, so fall back to ``upper`` as the threshold.
                best_thr = midpoint if lower < midpoint else upper
        return best_idx, best_thr

    def predict(self, X):
        """Return a list with the predicted class of every row of ``X``."""
        return [self._predict(inputs) for inputs in np.asarray(X)]

    def _predict(self, inputs):
        """Walk from the root to a leaf for one sample and return its class."""
        node = self.tree_
        while node.left is not None:
            if inputs[node.feature_index] < node.threshold:
                node = node.left
            else:
                node = node.right
        return node.predicted_class

# --- Usage ---
# X and y must be passed in as NumPy arrays (not CuPy arrays),
# because the tree logic runs on the CPU.

# Fit on the training split, then predict the held-out rows.
tree = SimpleDecisionTreeClassifierCuPy()
tree.fit(X_train, y_train)
y_pred_tree = tree.predict(X_test)

# Score the tree with the same metrics used for the KNN models.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
print("Decision Tree Accuracy (hybride CuPy):", accuracy_score(y_test, y_pred_tree))
print("\nClassification Report:\n", classification_report(y_test, y_pred_tree))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_tree))


Decision Tree Accuracy (hybride CuPy): 0.998751057572217

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     24772
           1       0.76      0.53      0.63        49

    accuracy                           1.00     24821
   macro avg       0.88      0.77      0.81     24821
weighted avg       1.00      1.00      1.00     24821


Confusion Matrix:
 [[24764     8]
 [   23    26]]


## Building Random Forest (RF) From Scratch

In [7]:
import numpy as np

class SimpleRandomForestClassifierCuPy:
    """Bagging ensemble of ``SimpleDecisionTreeClassifierCuPy`` trees.

    Each tree is trained on a bootstrap sample (rows drawn with replacement)
    and predictions are combined by majority vote. Every tree sees all the
    features; only the rows are resampled.

    Args:
        n_estimators: Number of trees to train.
        max_samples: Fraction of the training rows drawn for each tree.
        random_state: Optional seed for the bootstrap draws. ``None`` (the
            default) draws from NumPy's global random state.
    """

    def __init__(self, n_estimators=10, max_samples=0.7, random_state=None):
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.random_state = random_state
        self.trees = []

    def fit(self, X, y):
        """Train ``n_estimators`` trees, each on its own bootstrap sample.

        Args:
            X: Array-like of shape (n_samples, n_features).
            y: Array-like of integer labels, one per row of ``X``.
        """
        # Plain arrays guarantee that X[indices] / y[indices] select rows by position.
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = X.shape[0]
        n_sub = int(self.max_samples * n_samples)
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        # Build into a fresh list so that refitting replaces the previous
        # forest instead of appending to it.
        trees = []
        for _ in range(self.n_estimators):
            indices = rng.choice(n_samples, n_sub, replace=True)
            tree = SimpleDecisionTreeClassifierCuPy()
            tree.fit(X[indices], y[indices])
            trees.append(tree)
        self.trees = trees

    def predict(self, X):
        """Return the majority-vote class for every row of ``X``.

        Returns:
            NumPy array with one predicted label per row.
        """
        X = np.asarray(X)
        # Row t holds the predictions of tree t; the vote is taken per column.
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        return np.array([np.bincount(tree_preds[:, i]).argmax() for i in range(X.shape[0])])

# Usage on the notebook's data: train 10 trees, each on a bootstrap sample of
# 70% of the training rows, then predict the held-out rows.
# NOTE(review): the recorded run of this cell ended with KeyboardInterrupt, so
# no forest metrics were produced.
forest = SimpleRandomForestClassifierCuPy(n_estimators=10, max_samples=0.7)
forest.fit(X_train, y_train)
y_pred_forest = forest.predict(X_test)

# Score the forest with the same metrics used for the other models.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
print("Random Forest Accuracy (Cupy optimisé):", accuracy_score(y_test, y_pred_forest))
print("\nClassification Report:\n", classification_report(y_test, y_pred_forest))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_forest))


KeyboardInterrupt: 