This notebook is the baseline for the project. It trains and evaluates several classifier models on the 29,950 labeled instances (15,000 with FIRE = 0 and 14,950 with FIRE = 1).

In [None]:
import pandas as pd
import os
import csv
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Directory that holds the input data and where output files are saved.
# Change this to match your machine.
PROJECT_DIR = 'E:/School/Spring25/CSC522/Project/'

# Only switch directories when the configured path exists; otherwise stay in
# the current working directory instead of aborting with FileNotFoundError.
if os.path.isdir(PROJECT_DIR):
    os.chdir(PROJECT_DIR)
else:
    print(f"Warning: '{PROJECT_DIR}' not found; staying in '{os.getcwd()}'")

# File containing all data (resolved relative to the working directory)
data = pd.read_csv("combined_data.csv")

# Features: every column except the response ('FIRE') and 'DATE'.
# drop() returns a new frame, so `data` keeps all of its columns and
# data['FIRE'] below is guaranteed to still exist.
X = data.drop(columns=["FIRE", "DATE"])

# Response: the 'FIRE' column
y = data['FIRE']

# Hold out 20% of the rows; the fixed seed makes the split reproducible.
# `data` is split alongside X and y so the full rows stay aligned with them.
X_train, X_test, y_train, y_test, data_train, data_test = train_test_split(
    X, y, data, test_size=0.2, random_state=1234
)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

FileNotFoundError: [Errno 2] No such file or directory: 'E:/School/Spring25/CSC522/Project/'

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (accuracy_score, precision_score,
                            recall_score, f1_score, classification_report,
                            confusion_matrix, ConfusionMatrixDisplay)

# Evaluates pipelines passed
def evaluate_pipelines(pipelines, X_train, y_train, X_evaluate, y_evaluate):
    """Fit each pipeline and display its training and evaluation metrics.

    Every pipeline is fit in place on the training data, then scored on both
    the training data and the evaluation data with accuracy, recall,
    precision and F1. The results are shown as one table with one row per
    pipeline.

    Args:
        pipelines: Iterable of ``(name, pipeline)`` pairs; each pipeline must
            provide ``fit`` and ``predict``.
        X_train: Features used to fit the pipelines.
        y_train: Labels matching ``X_train``.
        X_evaluate: Features of the held-out evaluation set.
        y_evaluate: Labels matching ``X_evaluate``.

    Returns:
        None. The table is rendered with ``display`` (supplied by the
        notebook environment; it is not imported in this file).
    """
    # (column label, scoring function) pairs; their order fixes the column
    # order of the resulting table.
    metrics = (
        ('Accuracy', accuracy_score),
        ('Recall', recall_score),
        ('Precision', precision_score),
        ('F1', f1_score),
    )

    rows = []
    for name, pipeline in pipelines:
        pipeline.fit(X_train, y_train)

        train_preds = pipeline.predict(X_train)
        # Score on the evaluation data passed in, not on the notebook-level
        # X_test / y_test globals.
        eval_preds = pipeline.predict(X_evaluate)

        row = {'name': name}
        for label, score in metrics:
            row[f'Training {label}'] = score(y_train, train_preds)
            row[f'Eval {label}'] = score(y_evaluate, eval_preds)
        rows.append(row)

    display(pd.DataFrame(rows))

# Plots confusion matrix
def plot_confusion_matrix(y_true, y_pred, clf_name):
    """Show a heatmap of the confusion matrix for one classifier.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the classifier.
        clf_name: Classifier name used in the plot title.
    """
    # Start a fresh figure so each classifier gets its own plot
    plt.figure()

    # Draw the matrix as an annotated heatmap; seaborn returns the Axes it
    # drew on, which is then labelled directly
    matrix = confusion_matrix(y_true, y_pred)
    axes = sns.heatmap(matrix, annot = True, fmt = 'g', cmap='viridis')

    # Labels/title
    axes.set_xlabel('Predicted Label')
    axes.set_ylabel('True Label')
    axes.set_title(f'{clf_name} Confusion Matrix')

    # Display plot
    plt.show()

# Baseline models, each wrapped in a (single-step) pipeline
random_forest = make_pipeline(RandomForestClassifier())
decision_tree = make_pipeline(DecisionTreeClassifier())
logistic_regression = make_pipeline(LogisticRegression())
multilayer_perceptron = make_pipeline(MLPClassifier(max_iter = 10000))

# One list of (name, pipeline) pairs shared by the metrics table and the
# confusion-matrix plots
clfs = [('Random Forest', random_forest),
        ('Decision Tree', decision_tree),
        ('Logistic Regression', logistic_regression),
        ('Multilayer Perceptron', multilayer_perceptron)]

# Fits every pipeline in place and displays the metrics table
evaluate_pipelines(clfs, X_train, y_train, X_test, y_test)

# The pipelines were already fit by evaluate_pipelines. Reusing them (instead
# of refitting) keeps each confusion matrix consistent with the metrics table,
# since the estimators are unseeded, and avoids training every model twice.
for name, clf in clfs:
    print(f"\nPlotting confusion matrix for {name}...")
    y_pred = clf.predict(X_test)
    plot_confusion_matrix(y_test, y_pred, name)
