In [None]:
# ===== Pacotes / Packages =====
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

# Fix the NumPy and TensorFlow seeds for reproducibility
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# ===== Load data from an Excel workbook (.xlsx) =====
# "NAME.xlsx" looks like a placeholder file name; point it at the real workbook.
xlsx_path = "NAME.xlsx"
df = pd.read_excel(xlsx_path)
print(df.head())

In [None]:
# ===== Load data from a CSV file (.csv) =====
# "NAME.csv" looks like a placeholder file name; point it at the real file.
# This rebinds `df`, replacing whatever an earlier cell stored under that name.
csv_path = "NAME.csv"
df = pd.read_csv(csv_path, sep=',')  # the separator is given explicitly
print(df.head())

In [None]:
# ===== Separate features/target, encode labels, split and scale =====
# Features (X): every column except the first. Target (y): the first column only.
X = df.iloc[:, 1:].values
y = df.iloc[:, 0].values

# Always map the labels to contiguous integer codes 0..K-1, whatever their dtype.
# Passing raw numeric labels (e.g. 1..K or 10/20/30) straight to to_categorical
# would create unused one-hot columns, and the column index would no longer match
# the position of the class in np.unique(y), which the evaluation cells rely on.
# LabelEncoder orders classes the same way np.unique does (sorted), so
# column k of y_cat corresponds to np.unique(y)[k].
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# One-hot encode the integer codes; one column per distinct class.
y_cat = to_categorical(y_encoded, num_classes=len(label_encoder.classes_))

# Stratified train/test split (stratify needs the 1-D integer codes, not the one-hot matrix).
X_train, X_test, y_train, y_test = train_test_split(
    X, y_cat, test_size=0.2, random_state=42, stratify=y_encoded
)

# Standardise the features: fit the scaler on the training split only and
# reuse those statistics for the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Helper that builds, compiles and trains one 1D-CNN configuration

def build_and_train_cnn(
    num_conv_layers,
    num_filters,
    X_train_data,
    y_train_data,
    X_test_data,
    y_test_data,
    num_classes,
    dense_units=128,
    epochs=200,
    batch_size=16,
    patience=5,
    verbose=0,
):
    """Build, compile and train a 1D CNN and return its training history.

    The network is a stack of ``num_conv_layers`` blocks, each a
    ``Conv1D(kernel_size=1, relu)`` followed by ``MaxPooling1D(2)`` while the
    sequence is still at least 2 steps long, then ``Flatten``, a hidden
    ``Dense`` layer and a softmax output layer.

    Args:
        num_conv_layers: Number of convolutional blocks to stack.
        num_filters: Number of filters in every Conv1D layer.
        X_train_data: 2-D array-like (samples, features) used for fitting.
        y_train_data: One-hot targets for ``X_train_data``.
        X_test_data: 2-D array-like (samples, features) passed to Keras as
            validation data. EarlyStopping monitors the loss on this set, so
            it should not be the data used for the final performance estimate.
        y_test_data: One-hot targets for ``X_test_data``.
        num_classes: Number of units in the softmax output layer.
        dense_units: Units in the hidden Dense layer (default 128).
        epochs: Maximum number of epochs (default 200).
        batch_size: Mini-batch size (default 16).
        patience: Epochs without ``val_loss`` improvement before stopping (default 5).
        verbose: Verbosity passed to ``model.fit`` (default 0, silent).

    Returns:
        The ``History`` object returned by ``model.fit``.
    """
    # Conv1D expects (samples, steps, channels): add a single-channel axis.
    X_train_arr = np.asarray(X_train_data)
    X_test_arr = np.asarray(X_test_data)
    X_train_reshaped = X_train_arr.reshape(X_train_arr.shape[0], X_train_arr.shape[1], 1)
    X_test_reshaped = X_test_arr.reshape(X_test_arr.shape[0], X_test_arr.shape[1], 1)

    model = Sequential()

    # Sequence length seen by the next layer; kernel_size=1 leaves it unchanged,
    # each pooling layer halves it (floor division).
    current_sequence_length = X_train_reshaped.shape[1]

    for i in range(num_conv_layers):
        if i == 0:
            # Only the first layer declares the input shape.
            model.add(Conv1D(filters=num_filters, kernel_size=1, activation='relu',
                             input_shape=(current_sequence_length, 1)))
        else:
            model.add(Conv1D(filters=num_filters, kernel_size=1, activation='relu'))

        # Pool only while at least 2 steps remain; deeper blocks on short
        # sequences keep the convolution but skip the pooling.
        if current_sequence_length >= 2:
            model.add(MaxPooling1D(pool_size=2))
            current_sequence_length = current_sequence_length // 2

    model.add(Flatten())
    model.add(Dense(dense_units, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

    # Stop once val_loss stops improving and roll back to the best weights.
    early_stop = EarlyStopping(
        monitor="val_loss",
        patience=patience,
        restore_best_weights=True
    )

    history = model.fit(
        X_train_reshaped,
        y_train_data,
        validation_data=(X_test_reshaped, y_test_data),
        epochs=epochs,
        batch_size=batch_size,
        verbose=verbose,
        callbacks=[early_stop]
    )

    return history

print("The 'build_and_train_cnn' function has been defined.")

In [None]:
# ===== Sweep: number of convolutional layers (filters held fixed) =====

num_conv_layers_range = range(1, 10)  # 1 through 9 convolutional layers
fixed_filters = 64  # number of filters held constant during this sweep

# Model selection uses a hold-out carved from the training data, so the test
# set stays untouched until the final evaluation of the chosen architecture.
X_tune_train, X_tune_val, y_tune_train, y_tune_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

train_losses_conv = []
val_losses_conv = []

print("Evaluating model performance by varying the number of convolutional layers...")
for num_layers in num_conv_layers_range:
    print(f"Training model with {num_layers} convolutional layers and {fixed_filters} filters...")
    # Kept under its own name so the sweep does not overwrite the `history`
    # of the final model used by the plotting cells.
    sweep_history = build_and_train_cnn(
        num_conv_layers=num_layers,
        num_filters=fixed_filters,
        X_train_data=X_tune_train,
        y_train_data=y_tune_train,
        X_test_data=X_tune_val,
        y_test_data=y_tune_val,
        num_classes=y_cat.shape[1]
    )

    # Record the losses at the epoch with the lowest validation loss.
    best_epoch_idx = np.argmin(sweep_history.history['val_loss'])
    train_loss = sweep_history.history['loss'][best_epoch_idx]
    val_loss = sweep_history.history['val_loss'][best_epoch_idx]

    train_losses_conv.append(train_loss)
    val_losses_conv.append(val_loss)
    print(f"  -> Best Val Loss: {val_loss:.4f}, Train Loss: {train_loss:.4f}\n")

print("Evaluation complete for varying convolutional layers.")

# Plot the sweep as a self-contained figure: a figure left open for a later
# cell to finish is not reliably shared between notebook cells.
layer_counts = list(num_conv_layers_range)
fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(layer_counts, train_losses_conv, marker='o', label='Training Loss')
ax.plot(layer_counts, val_losses_conv, marker='o', label='Validation Loss')
ax.set_title(f'Loss vs. Number of Convolutional Layers (Filters Fixed at {fixed_filters})')
ax.set_xlabel('Number of Convolutional Layers')
ax.set_ylabel('Loss')
ax.set_xticks(layer_counts)
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# ===== Sweep: number of filters (convolutional layers held fixed) =====

num_filters_range = [16, 32, 64, 128]  # filter counts to compare
fixed_conv_layers = 8  # number of convolutional layers held constant during this sweep

# Model selection uses a hold-out carved from the training data, so the test
# set stays untouched until the final evaluation of the chosen architecture.
X_tune_train, X_tune_val, y_tune_train, y_tune_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

train_losses_filters = []
val_losses_filters = []

print("Evaluating model performance by varying the number of filters...")
for num_filters in num_filters_range:
    print(f"Training model with {fixed_conv_layers} convolutional layers and {num_filters} filters...")
    # Kept under its own name so the sweep does not overwrite the `history`
    # of the final model used by the plotting cells.
    sweep_history = build_and_train_cnn(
        num_conv_layers=fixed_conv_layers,
        num_filters=num_filters,
        X_train_data=X_tune_train,
        y_train_data=y_tune_train,
        X_test_data=X_tune_val,
        y_test_data=y_tune_val,
        num_classes=y_cat.shape[1]
    )

    # Record the losses at the epoch with the lowest validation loss.
    best_epoch_idx = np.argmin(sweep_history.history['val_loss'])
    train_loss = sweep_history.history['loss'][best_epoch_idx]
    val_loss = sweep_history.history['val_loss'][best_epoch_idx]

    train_losses_filters.append(train_loss)
    val_losses_filters.append(val_loss)
    print(f"  -> Best Val Loss: {val_loss:.4f}, Train Loss: {train_loss:.4f}\n")

print("Evaluation complete for varying filters.")

# Plot on a figure created in this cell. Selecting subplot (1, 2, 2) of a
# figure opened in a previous cell does not work reliably in a notebook; the
# panel ends up alone on a fresh, half-empty figure.
fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(num_filters_range, train_losses_filters, marker='o', label='Training Loss')
ax.plot(num_filters_range, val_losses_filters, marker='o', label='Validation Loss')
ax.set_title(f'Loss vs. Number of Filters (Layers Fixed at {fixed_conv_layers})')
ax.set_xlabel('Number of Filters')
ax.set_ylabel('Loss')
ax.set_xticks(num_filters_range)
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# ===== Define the 1D CNN model =====

# X_train / X_test are 2-D (samples, features) after scaling; Conv1D needs a
# third, single-channel axis: (samples, features, 1).
X_train_reshaped = np.expand_dims(X_train, axis=-1)
X_test_reshaped = np.expand_dims(X_test, axis=-1)

model = Sequential()

# Three Conv1D + MaxPooling1D blocks; only the first layer declares input_shape.
model.add(Conv1D(filters=64, kernel_size=1, activation='relu',
                 input_shape=(X_train_reshaped.shape[1], 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=64, kernel_size=1, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=64, kernel_size=1, activation='relu'))
model.add(MaxPooling1D(pool_size=2))

# Classifier head: flatten the convolutional output, one hidden dense layer,
# then a softmax with one unit per one-hot column of y_cat.
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(y_cat.shape[1], activation="softmax"))

In [None]:
# ===== Compile, train and evaluate the model =====
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# EarlyStopping callback
early_stop = EarlyStopping(
    monitor="val_loss",       # watch the validation loss
    patience=5,               # epochs without improvement before stopping
    restore_best_weights=True # roll back to the weights of the best epoch
)

# Train on the (samples, features, 1) arrays that match the input_shape the
# model was declared with; the last 20% of the training rows are held out by
# Keras as the validation set.
history = model.fit(
    X_train_reshaped, y_train,
    validation_split=0.2,
    epochs=200,
    batch_size=16,
    verbose=1,
    callbacks=[early_stop]
)

# Evaluate on the held-out test set (same 3-D layout as the training input).
loss, acc = model.evaluate(X_test_reshaped, y_test, verbose=0)
print(f"Test accuracy: {acc:.4f}")

# Best epoch = lowest validation loss (reported 1-based).
best_epoch_idx = int(np.argmin(history.history["val_loss"]))
melhor_ep = best_epoch_idx + 1
melhor_val = history.history["val_loss"][best_epoch_idx]
print(f"Best epoch chosen by EarlyStopping: {melhor_ep} (val_loss = {melhor_val:.4f})")


In [None]:
# ===== Plot accuracy and loss curves =====

# One row, two panels: accuracy on the left, loss on the right.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 6))

# Training vs. validation accuracy per epoch
ax_acc.plot(history.history['accuracy'], label='Train')
ax_acc.plot(history.history['val_accuracy'], label='Validation')
ax_acc.set_title('Model Accuracy')
ax_acc.set_ylabel('Accuracy')
ax_acc.set_xlabel('Epoch')
ax_acc.legend(loc='upper left')

# Training vs. validation loss per epoch
ax_loss.plot(history.history['loss'], label='Train')
ax_loss.plot(history.history['val_loss'], label='Validation')
ax_loss.set_title('Model Loss')
ax_loss.set_ylabel('Loss')
ax_loss.set_xlabel('Epoch')
ax_loss.legend(loc='upper left')

fig.tight_layout()
plt.show()

In [None]:
# ===== Confusion matrix =====
# Predict on the (samples, features, 1) test array that matches the model's
# declared input shape, then collapse probabilities / one-hot rows to class ids.
y_pred = model.predict(X_test_reshaped)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true = np.argmax(y_test, axis=1)

# Fix the label set to every one-hot column so the matrix and the report keep
# one row per class even when a class is missing from the test split;
# otherwise the tick labels and target_names would be misaligned.
class_ids = np.arange(y_test.shape[1])
class_names = np.unique(y)
if len(class_names) != len(class_ids):
    # The original labels do not map one-to-one onto the one-hot columns;
    # fall back to the column indices rather than showing wrong names.
    class_names = class_ids

cm = confusion_matrix(y_true, y_pred_classes, labels=class_ids)

plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=class_names,
            yticklabels=class_names)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion matrix")
plt.show()

# ===== Classification report =====
print("Classification report:\n")
print(classification_report(
    y_true,
    y_pred_classes,
    labels=class_ids,
    target_names=[f"Class {c}" for c in class_names],
))

In [None]:
# Per-class (one-vs-rest) ROC curves: use these to inspect how the model
# performs on each class separately.

# Encoder fitted on the distinct original labels; it maps a label to its column index.
label_encoder = LabelEncoder()
label_encoder.fit(np.unique(y))

# Original labels are kept for the legend text.
unique_classes = np.unique(y)
n_classes = len(unique_classes)

# Per-class false-positive rates, true-positive rates and AUC, keyed by position.
fpr, tpr, roc_auc = {}, {}, {}

for i, class_value in enumerate(unique_classes):
    # Column of y_test / y_pred that belongs to this class.
    class_index = label_encoder.transform([class_value])[0]
    positives_in_test = np.sum(y_test[:, class_index])

    if positives_in_test > 0:
        fpr[i], tpr[i], _ = roc_curve(y_test[:, class_index], y_pred[:, class_index])
        roc_auc[i] = auc(fpr[i], tpr[i])
        continue

    # No positive sample for this class in the test set: mark the AUC as NaN
    # so the plotting loop skips it.
    roc_auc[i] = np.nan
    print(f"Warning: Class {class_value} has no positive samples in the test set. Skipping ROC curve for this class.")


# Draw one curve per class that has a computed AUC.
plt.figure(figsize=(8, 6))
colors = ['blue', 'red', 'green', 'orange', 'purple', 'brown', 'pink', 'gray', 'olive', 'cyan']
for i in range(n_classes):
    if np.isnan(roc_auc[i]):
        continue
    # Cycle through the palette when there are more classes than colours.
    color = colors[i % len(colors)]
    curve_label = f'ROC curve of class {unique_classes[i]} (area = {roc_auc[i]:0.2f})'
    plt.plot(fpr[i], tpr[i], color=color, lw=2, label=curve_label)

# Chance diagonal for reference.
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multiclass ROC Curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Macro-average ROC: use it for an overall view of the model across all
# classes, with every class weighted equally regardless of class imbalance.

# Classes whose ROC curve was actually computed (AUC is not NaN).
scored_classes = [i for i in range(n_classes) if not np.isnan(roc_auc[i])]

# Common grid: every false-positive rate that occurs in any per-class curve.
all_fpr = np.unique(np.concatenate([fpr[i] for i in scored_classes]))

# Interpolate each per-class curve onto the grid and accumulate.
mean_tpr = np.zeros_like(all_fpr)
for i in scored_classes:
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

# Average over the classes that were present in the test set.
mean_tpr /= len(scored_classes)

macro_roc_auc = auc(all_fpr, mean_tpr)

# Plot the macro-average curve against the chance diagonal.
plt.figure(figsize=(8, 6))
macro_label = f'Macro-average ROC curve (area = {macro_roc_auc:0.2f})'
plt.plot(all_fpr, mean_tpr, color='red', linestyle='-', linewidth=2, label=macro_label)

plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multiclass ROC Curve (Macro-average)')
plt.legend(loc="lower right")
plt.show()