In [None]:
# ===== Pacotes / Packages =====
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, classification_report, r2_score, roc_curve, auc
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC

In [None]:
# ===== 1. Read the .XLSX data =====
# Loads the spreadsheet into `dados`. The .CSV cell that follows assigns the same
# name, so whichever loading cell runs last decides the data used downstream.
dados = pd.read_excel(io="NAME.xlsx")
print(dados.head(n=5))

In [None]:
# === Read the .CSV data ===
# The file is semicolon-separated. This assignment replaces any `dados` that an
# earlier loading cell produced.
dados = pd.read_csv(filepath_or_buffer='NAME.csv', sep=';')
print(dados.head(n=5))

In [None]:
# ===== 2. Drop missing values and make sure predictors are numeric =====
# Rows containing NaN have to be removed BEFORE X and y are built: the scaler keeps
# NaN values in its output and SVC refuses to fit on them. Previously the cleaned
# frame was created last and never used.
dados_cleaned = dados.dropna()

# The class label lives in the first column; every other column is a predictor.
X = dados_cleaned.drop(dados_cleaned.columns[0], axis=1).apply(pd.to_numeric)
y = dados_cleaned[dados_cleaned.columns[0]].values.ravel()

# ===== 3. Standardize data =====
# NOTE(review): the scaler is fit on every row, including rows that later land in
# the test split, so the hold-out estimate is slightly optimistic.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# ===== 4. Train/test split =====
# 80/20 hold-out split, stratified on the class label and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    stratify=y,
    random_state=1234,
    test_size=0.2,
)

# Report the shapes of the feature matrix and label vector of each partition.
print("Train:", *(part.shape for part in (X_train, y_train)))
print("Test:", *(part.shape for part in (X_test, y_test)))

In [None]:
# ===== 5. Train SVM =====

# Multiclass note: scikit-learn's SVC always trains one-vs-one (pairwise) classifiers
# internally. `decision_function_shape` only selects the shape returned by
# decision_function(): 'ovo' (used here) keeps the pairwise scores, while 'ovr'
# (the library default) aggregates them into one score per class.
# `probability=True` enables predict_proba() on the fitted model.
# Other kernels worth trying: linear, poly, rbf and sigmoid.
modelo_svm = SVC(
    kernel="linear",
    decision_function_shape='ovo',
    probability=True,
    random_state=1234,
)
modelo_svm.fit(X_train, y_train)

In [None]:
# ===== 6. Prediction =====
# Predict both partitions with the fitted model; later cells reuse these arrays.
y_pred_train, y_pred_test = (modelo_svm.predict(part) for part in (X_train, X_test))

# Accuracy on the training data next to accuracy on the held-out data.
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_test)
print("Training Accuracy:", acc_train)
print("Test Accuracy:", acc_test)

In [None]:
# Number of support vectors kept by the fitted model (one row per support vector).
print("Number of support vectors:", len(modelo_svm.support_vectors_))

In [None]:
# ===== 7. Metrics =====
# Every class present in the full label vector, in sorted order. The same list
# drives both the confusion matrix and the classification report so that rows,
# columns and report lines always refer to the same classes.
classes = sorted(np.unique(y))

# Confusion matrix (rows = true class, columns = predicted class).
conf_mat = confusion_matrix(y_test, y_pred_test, labels=classes)
cm_df = pd.DataFrame(conf_mat, index=classes, columns=classes)

# Heatmap of the confusion matrix.
plt.figure(figsize=(6,5))
sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues')
plt.xlabel("Predict")
plt.ylabel("True")
plt.title("Confusion Matrix - SVM")
plt.show()

# Classification report. `labels=classes` is required alongside `target_names`:
# without it the report infers the labels from y_test/y_pred_test only, and the
# names no longer line up when a class is missing from the test split.
print("\nClassification Report Teste:\n")
print(classification_report(
    y_test,
    y_pred_test,
    labels=classes,
    target_names=[str(c) for c in classes],
))

In [None]:
# ===== 8. K-Fold Cross-Validation =====

# Validate the SVM with 5-fold cross-validation.
# Scaling is done inside a pipeline so that, for every fold, StandardScaler is fit
# on that fold's training part only. Cross-validating on the pre-scaled matrix
# would let statistics of the held-out rows leak into preprocessing.
# cross_val_score clones the estimator it receives, so the already fitted
# `modelo_svm` is left untouched.
cv_pipeline = make_pipeline(StandardScaler(), modelo_svm)
cv_scores = cross_val_score(cv_pipeline, X, y, cv=5)

print("Accuracies per Fold:", cv_scores)
print("Average Accuracy (K-Fold):", np.mean(cv_scores))
print("Standard Deviation of Accuracy (K-Fold):", np.std(cv_scores))

In [None]:
# ===== 9. Feature importance (linear kernel only) =====

# Number of features shown in the bar chart.
TOP_N = 20

# `coef_` is only available for kernel="linear". In a multiclass problem it holds
# one row per pairwise (one-vs-one) classifier, so a single row describes just one
# pair of classes. Averaging the absolute weights over all rows gives one score per
# feature across every class pair; with two classes there is a single row and the
# result equals the absolute value of that row.
coeficientes = np.abs(modelo_svm.coef_).mean(axis=0)

# Feature names, in the same column order the model was trained on.
nomes_features = X.columns

# One row per feature, sorted from most to least influential.
feature_importance = pd.DataFrame({
    'feature': nomes_features,
    'importance': coeficientes
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

# Plot the TOP_N strongest features; the title reports how many are actually shown
# (fewer than TOP_N when the data set has fewer features).
top_features = feature_importance.head(TOP_N)
plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=top_features, palette='viridis')
plt.title(f'Top {len(top_features)} Features by Importance (Linear SVM Coefficients)')
plt.xlabel('Importance (Mean Absolute Coefficient Value)')
plt.ylabel('Feature')
plt.show()

In [None]:
# ===== 10. Plot classification boundaries =====

def plot_decision_boundary_multiclass(model, X, y, title="SVM Decision Boundary (PCA 2D)"):
    """Plot decision regions of an SVM in the plane of the first two principal components.

    The data is projected to 2-D with PCA and a fresh auxiliary SVC, configured with
    the hyper-parameters read from ``model``, is fit on that projection. The figure
    therefore illustrates how an equivalent SVM separates the classes in 2-D; it is
    not the decision surface of ``model`` itself.

    Parameters
    ----------
    model : estimator
        Source of the hyper-parameters. ``kernel``, ``C`` and ``gamma`` are read
        directly; ``degree`` and ``coef0`` are copied when present.
    X : array-like
        Feature matrix passed to ``PCA(n_components=2).fit_transform``.
    y : array-like
        Class labels, one per row of ``X``. Any label type accepted by
        ``pd.factorize`` works; the original labels are shown in the legend.
    title : str, optional
        Title of the figure.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Reduce to 2-D with PCA so the regions can be drawn.
    pca = PCA(n_components=2)
    X_2d = pca.fit_transform(X)

    # Integer codes 0..k-1 for plotting; `classes[i]` is the original label of code i.
    y_numeric, classes = pd.factorize(y)
    n_classes = len(classes)

    # Auxiliary model used only for the picture. `degree` and `coef0` are mirrored as
    # well so that poly/sigmoid kernels are drawn with the same settings as `model`.
    viz_model = SVC(
        kernel=model.kernel,
        C=model.C,
        gamma=model.gamma,
        degree=getattr(model, "degree", 3),
        coef0=getattr(model, "coef0", 0.0),
        decision_function_shape='ovr',
    )
    viz_model.fit(X_2d, y_numeric)

    fig, ax = plt.subplots(figsize=(6, 5))

    # Share one colour scale between the background mesh and the points. Without
    # fixed limits the mesh is normalised to the classes that actually win a region,
    # and its colours drift away from the point colours when a class wins none.
    color_kwargs = dict(cmap="coolwarm", vmin=0, vmax=n_classes - 1)

    # Decision regions from hard predictions.
    DecisionBoundaryDisplay.from_estimator(
        viz_model,
        X_2d,
        response_method="predict",
        plot_method="pcolormesh",
        alpha=0.3,
        ax=ax,
        **color_kwargs
    )

    # Data points coloured by class code.
    scatter = ax.scatter(
        X_2d[:, 0], X_2d[:, 1], c=y_numeric, s=30, edgecolors="k", **color_kwargs
    )

    # Support vectors of the auxiliary model, drawn as hollow circles.
    ax.scatter(
        viz_model.support_vectors_[:, 0],
        viz_model.support_vectors_[:, 1],
        s=150,
        facecolors="none",
        edgecolors="k"
    )

    # Legend with the original labels. `num=None` forces one handle per distinct
    # code; the default ("auto") switches to a tick locator above 9 classes and
    # would no longer line up with `classes`.
    handles, _ = scatter.legend_elements(num=None)
    ax.legend(handles, classes, loc="upper right", title="Classes")

    ax.set_title(title)
    plt.show()

# Draw the decision regions for the trained model's settings. The standardized
# features are reduced to two principal components for the plot only.
plot_decision_boundary_multiclass(
    model=modelo_svm,
    X=X_scaled,
    y=y,
    title="SVM Decision Boundary (Multiclass, PCA 2D)",
)

In [None]:
# Use individual (per-class, one-vs-rest) ROC curves when you want to analyse the
# model's performance on each class separately.

# Fit the encoder on the classes the model was trained on, so every label in the
# test split maps to the matching column of predict_proba().
label_encoder = LabelEncoder()
label_encoder.fit(modelo_svm.classes_)
y_test_encoded = label_encoder.transform(y_test)

# A ROC curve needs a continuous score per sample. Hard 0/1 predictions yield a
# single operating point per class and an uninformative "curve". Column i of
# predict_proba() is the estimated probability of modelo_svm.classes_[i].
y_score_test = modelo_svm.predict_proba(X_test)

# Per-class ROC curve and area under it, keyed by the encoded class index.
fpr = dict()
tpr = dict()
roc_auc = dict()

unique_classes = label_encoder.classes_
n_classes = len(unique_classes)

for i in range(n_classes):
    class_value = unique_classes[i]

    # One-vs-rest ground truth for the current class.
    y_test_binary = (y_test_encoded == i).astype(int)

    # The curve is only defined when the class occurs in the test set.
    if np.sum(y_test_binary) > 0:
        fpr[i], tpr[i], _ = roc_curve(y_test_binary, y_score_test[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    else:
        # Mark the class as skipped; the plotting loops check for NaN.
        roc_auc[i] = np.nan
        print(f"Warning: Class {class_value} has no positive samples in the test set. Skipping ROC curve for this class.")


# Plot the ROC curves.
plt.figure(figsize=(8, 6))
colors = ['blue', 'red', 'green', 'orange', 'purple', 'brown', 'pink', 'gray', 'olive', 'cyan']
for i in range(n_classes):
    # Only plot classes whose AUC was computed (class present in the test set).
    if not np.isnan(roc_auc[i]):
        # Cycle through the palette when there are more classes than colours.
        color = colors[i % len(colors)]
        plt.plot(fpr[i], tpr[i], color=color, lw=2,
                 label='ROC curve of class {0} (area = {1:0.2f})'.format(unique_classes[i], roc_auc[i]))

# Chance-level diagonal for reference.
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multiclass ROC Curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Use the macro-average ROC when you want a single overall view of the model's
# performance across all classes, giving every class the same weight regardless of
# how many samples it has.

# Classes for which a ROC curve was computed (AUC is NaN for skipped classes).
valid_classes = [k for k in range(n_classes) if not np.isnan(roc_auc[k])]

# Common grid: every distinct false-positive rate seen in any per-class curve.
all_fpr = np.unique(np.concatenate([fpr[k] for k in valid_classes]))

# Interpolate each per-class curve onto the grid and accumulate the TPR values.
mean_tpr = np.zeros_like(all_fpr)
for k in valid_classes:
    mean_tpr += np.interp(all_fpr, fpr[k], tpr[k])

# Average over the classes that contributed, then integrate.
mean_tpr = mean_tpr / len(valid_classes)
macro_roc_auc = auc(all_fpr, mean_tpr)

# Plot the macro-average curve against the chance-level diagonal.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(
    all_fpr,
    mean_tpr,
    color='red',
    linestyle='-',
    linewidth=2,
    label='Macro-average ROC curve (area = {0:0.2f})'.format(macro_roc_auc),
)
ax.plot([0, 1], [0, 1], 'k--', lw=2)
ax.set_xlim([-0.05, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Multiclass ROC Curve (Macro-average)')
ax.legend(loc="lower right")
plt.show()