# Federal University of Pampa <www.unipampa.edu.br>
# Course: Deep Learning
# Author: Sandro Camargo <sandrocamargo@unipampa.edu.br>
# Logistic Regression Example
# Dataset: https://archive.ics.uci.edu/dataset/547/algerian+forest+fires+dataset

To open this code in your Google Colab environment, [click here](https://colab.research.google.com/github/Sandrocamargo/deep-learning/blob/master/dl_class03_LogisticRegression.ipynb).

A Python library is a collection of related functions. A library contains bundles of encapsulated code that can be reused in different programs.

In [None]:
# Import Libraries
import pandas as pd # Data Manipulation library
import numpy as np # Fast Numeric Computing library
import tensorflow as tf # Optimizers
import matplotlib.pyplot as plt # Plot library
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression

In [None]:
# Load the Algerian forest fires dataset directly from the UCI repository.
# About the parameters:
#   header=1: the column names (day, month, year, ...) are on line 1 of this CSV file.
#   skiprows=[124, 125, 126, 170]: these lines do not contain valid data, so they
#   are not imported. If this parameter is missing, all lines are imported.
dataset_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00547/Algerian_forest_fires_dataset_UPDATE.csv'
invalid_rows = [124, 125, 126, 170]
data = pd.read_csv(dataset_url, header=1, skiprows=invalid_rows)

# Inspect the columns and data types of the "data" dataframe
data.info()

In [None]:
# Separate the dataframe into the target (y) and the inputs (X).
# The target is the column at position 13; every remaining column is an input.
target_column = data.columns[13]
y = data[target_column]
X = data.drop(columns=target_column)

In [None]:
# In some samples the target column carries stray whitespace, so the same
# class shows up under several spellings. Print the counts before and after
# cleaning to make the effect visible.
print(y.value_counts())
y = pd.Series(y).str.strip()  # Remove leading and trailing whitespace
print(y.value_counts())

In [None]:
# Holdout split: 25% of the samples are reserved for testing.
# stratify=y keeps the class proportions equal in both sets and
# random_state=1 makes the split reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
    stratify=y,
)

The dataset must be randomly split into two parts: a training set and a testing set. The main approaches to splitting are holdout and n-fold cross-validation.
*   Training set is used for building (training) the model.
*   Testing set is used for testing the generalization ability of the model built.

Moreover, the inputs ($x$) and outputs ($y$) must be separated in each set.




In [None]:
# Report the dimensions of each of the four splits
for message, split in (
    ('The training dataset (inputs) dimensions are: ', train_x),
    ('The training dataset (outputs) dimensions are: ', train_y),
    ('The testing dataset (inputs) dimensions are: ', test_x),
    ('The testing dataset (outputs) dimensions are: ', test_y),
):
    print(message, split.shape)

In [None]:
# Build a logistic regression classifier with scikit-learn's default settings
# and train it on the training split only.
model = LogisticRegression()
model.fit(X=train_x, y=train_y)

In [None]:
# Predicted class labels for the test set
y_pred = model.predict(test_x)

# Predicted probability of the class stored in model.classes_[1].
# NOTE(review): scikit-learn orders string classes alphabetically, so with the
# labels 'fire' / 'not fire' column 1 would be P('not fire') - confirm with
# print(model.classes_) before interpreting y_prob.
class_probabilities = model.predict_proba(test_x)
y_prob = class_probabilities[:, 1]

print(y_pred)
# print(y_prob)  # uncomment to inspect the probabilities

In [None]:
# Show the fitted parameters (intercept and one coefficient per input feature)
print(f"Intercepto: {model.intercept_}")
print(f"Coeficientes: {model.coef_}")

# Fraction of test samples whose predicted label matches the true label
test_accuracy = accuracy_score(test_y, y_pred)
print(f"Acurácia: {test_accuracy:.4f}")

In [None]:
# Pair every input column name with its fitted coefficient
feature_names = X.columns  # the real column names of the inputs
coefs = model.coef_[0]     # first (and, for binary classification, only) row

# Table of (feature, coefficient), ordered from most negative to most positive
coef_df = pd.DataFrame({"Feature": feature_names, "Coefficient": coefs})
coef_df = coef_df.sort_values(by="Coefficient")

# Horizontal bar chart; the dashed line at zero separates negative from
# positive coefficients
fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(coef_df["Feature"], coef_df["Coefficient"], color="steelblue")
ax.axvline(0, color="black", linestyle="--")
ax.set_xlabel("Coefficient Value")
ax.set_ylabel("Feature")
ax.set_title("Logistic Regression Coefficients")
fig.tight_layout()
plt.show()

In [None]:

# Mapping values to binary: hot encoding
binary_mapping = {'fire': 1, 'not fire': 0}

train_y_bin = train_y.map(binary_mapping)
test_y_bin = test_y.map(binary_mapping)


In [None]:
# === 5) Confusion matrix ===
# Count true vs. predicted labels on the test set and draw the matrix,
# labelling the axes with the classes known to the model.
cm = confusion_matrix(y_true=test_y, y_pred=y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=model.classes_)
disp.plot(cmap="Blues")
plt.title("Matriz de Confusão - Logistic Regression")
plt.show()

In [None]:
# Per-class precision, recall and F1 on the test set.
# classification_report applies target_names to the labels in sorted order,
# and 'fire' sorts before 'not fire'; a hard-coded ["Not Fire", "Fire"] would
# therefore swap the row names. Deriving both the label order and the display
# names from model.classes_ keeps every row attached to the right class.
class_labels = list(model.classes_)
print(classification_report(
    test_y,
    y_pred,
    labels=class_labels,
    target_names=[label.title() for label in class_labels],
))

In [None]:
# Plot the decision boundary in the (FFMC, FWI) plane

# Positional indices, within the input columns, of the two features on the axes
ffmc_idx, fwi_idx = 7, 12

# Create grid for decision surface, spanning the test-set range of both features
xx, yy = np.meshgrid(
    np.linspace(test_x.iloc[:, ffmc_idx].min(), test_x.iloc[:, ffmc_idx].max(), 200),
    np.linspace(test_x.iloc[:, fwi_idx].min(), test_x.iloc[:, fwi_idx].max(), 200)
)
grid = np.c_[xx.ravel(), yy.ravel()]

# Create a base array with every feature fixed at its training-set mean
base = np.tile(train_x.mean().values, (xx.ravel().shape[0], 1))

# Replace FFMC and FWI with grid values
base[:, ffmc_idx] = xx.ravel()
base[:, fwi_idx] = yy.ravel()

# The model was fitted on a DataFrame, so give the grid the same column names;
# passing the bare NumPy array triggers scikit-learn's feature-names warning.
base_df = pd.DataFrame(base, columns=train_x.columns)

# Column 1 is the probability of model.classes_[1].
# NOTE(review): with the labels 'fire' / 'not fire' that is P('not fire'), so
# the region below 0.5 (first fill colour) is where the model predicts 'fire'
# - confirm with print(model.classes_).
probs = model.predict_proba(base_df)[:, 1].reshape(xx.shape)

# Shade both sides of the 0.5 threshold, overlay the test samples coloured by
# their true class, and draw the boundary itself as a black line
plt.figure(figsize=(6, 5))
plt.contourf(xx, yy, probs, levels=[0, 0.5, 1], colors=["#FFB6C1", "#ADD8E6"], alpha=0.3)
plt.scatter(test_x.iloc[:, ffmc_idx], test_x.iloc[:, fwi_idx], c=test_y_bin, cmap="coolwarm", edgecolors="k", alpha=0.7)
plt.contour(xx, yy, probs, levels=[0.5], colors="black")
plt.xlabel("FFMC")
plt.ylabel("FWI")
plt.title("Logistic Regression - Fronteira de Decisão (fixing other features at mean)")
plt.show()