## Imports

In [None]:
from sklearn.model_selection import train_test_split, KFold, cross_val_predict, RepeatedStratifiedKFold
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from sklearn.base import clone
import numpy as np
import pandas as pd
import matplotlib as plt

## Parameters

# Default `random_state` of the split helpers in this file, so their splits are reproducible.
SEED = 14
# NOTE(review): not referenced in the visible code; presumably the number of input
# features of the dataset (Iris, imported above, has 4) — confirm against callers.
num_features = 4

## Ex 1

In [None]:
#EX1
#EX1.1 ##################################
#EX1.1.1
#Usar critério 70/30 
def create_split_train_test(X, y, test_size=0.30, random_state=SEED):
    """Split ``X`` and ``y`` into stratified train and test partitions.

    Args:
        X: Feature data, forwarded to ``train_test_split``.
        y: Target labels; also used as the stratification key.
        test_size: Size of the test partition (default 0.30, i.e. a 70/30 split).
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    split_options = {
        "test_size": test_size,
        "random_state": random_state,
        "stratify": y,  # keep the class proportions of y in both partitions
    }
    partitions = train_test_split(X, y, **split_options)
    return tuple(partitions)

#Usar critério 40/30/30
def create_split_tvt(X, y, val_size=0.30, test_size=0.30, random_state=SEED, *, stratify=False):
    """Split ``X`` and ``y`` into train, validation and test partitions.

    The test partition is carved out first; the validation partition is then
    taken from the remainder. With the defaults this yields a 40/30/30 split.

    Args:
        X: Feature data, forwarded to ``train_test_split``.
        y: Target labels.
        val_size: Fraction of the *whole* dataset used for validation.
        test_size: Fraction of the *whole* dataset used for testing.
        random_state: Seed forwarded to both ``train_test_split`` calls.
        stratify: When true, both splits are stratified by the labels, as
            ``create_split_train_test`` does. Defaults to ``False``, which
            keeps the original unstratified behaviour.

    Returns:
        The tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)``.

    Raises:
        ValueError: If ``val_size`` or ``test_size`` is not positive, or if
            together they leave no data for training.
    """
    # Validate up front: test_size == 1 would otherwise divide by zero below,
    # and val_size + test_size >= 1 would only fail later inside scikit-learn.
    if not (val_size > 0 and test_size > 0 and val_size + test_size < 1):
        raise ValueError(
            "val_size and test_size must be positive fractions with "
            f"val_size + test_size < 1 (got val_size={val_size}, test_size={test_size})"
        )

    # First, set the test partition aside.
    X_temp, X_test, y_temp, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )

    # Express the validation fraction relative to what is left (train + validation).
    val_relative_size = val_size / (1 - test_size)

    # Then split the remainder into train and validation.
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp,
        y_temp,
        test_size=val_relative_size,
        random_state=random_state,
        stratify=y_temp if stratify else None,
    )

    return X_train, X_val, X_test, y_train, y_val, y_test

#EX1.1.2
def create_split_kfold(X, y, n_splits=5, random_state=None, shuffle=False):
    """Build the train/test partitions of a K-fold split.

    Args:
        X: Feature data; a NumPy array, a pandas DataFrame/Series, or a
            list/tuple of rows.
        y: Target labels, in the same kinds of container as ``X``.
        n_splits: Number of folds, forwarded to ``KFold``.
        random_state: Seed forwarded to ``KFold`` (only meaningful together
            with ``shuffle=True``).
        shuffle: Whether ``KFold`` shuffles the samples before splitting.

    Returns:
        A list with one tuple ``(fold, X_train, X_test, y_train, y_test)`` per
        fold, where ``fold`` is the 1-based fold number.
    """

    def _prepare(data):
        # Plain lists/tuples do not support index-array selection; convert once.
        if isinstance(data, (list, tuple)):
            return np.asarray(data)
        return data

    def _take(data, index):
        # pandas objects need positional .iloc: data[index] would be treated
        # as a label/column lookup instead of a row selection.
        if hasattr(data, "iloc"):
            return data.iloc[index]
        return data[index]

    X_rows, y_rows = _prepare(X), _prepare(y)
    kf = KFold(n_splits=n_splits, random_state=random_state, shuffle=shuffle)

    folds = []
    for fold, (train_index, test_index) in enumerate(kf.split(X_rows), start=1):
        folds.append(
            (
                fold,
                _take(X_rows, train_index),
                _take(X_rows, test_index),
                _take(y_rows, train_index),
                _take(y_rows, test_index),
            )
        )

    return folds

#EX 1.2 ##################################
#EX 1.2.1
def calcular_matriz_confusao(y_true, y_pred, labels=None):
    """Return the confusion matrix of ``y_pred`` against ``y_true``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        labels: Optional explicit list of class labels, forwarded to
            ``confusion_matrix``. Passing the same labels for every call gives
            matrices of identical shape even when some class is absent from a
            particular subset (e.g. one K-fold split). ``None`` (the default)
            keeps the original behaviour of using the labels that appear in
            the inputs.

    Returns:
        The matrix produced by ``sklearn.metrics.confusion_matrix``.
    """
    return confusion_matrix(y_true, y_pred, labels=labels)

def mean_std_confusion_matrix(list_of_cms):
    """Return the element-wise mean and standard deviation of several matrices.

    Args:
        list_of_cms: Sequence of equally-shaped confusion matrices, one per fold.

    Returns:
        The tuple ``(mean_cm, std_cm)``; the standard deviation is the
        population one (``ddof=0``).
    """
    # Stack along a new leading axis: (n_folds, n_classes, n_classes).
    stacked = np.stack(list_of_cms, axis=0)
    return stacked.mean(axis=0), stacked.std(axis=0, ddof=0)

#EX1.2.2
def recall(y_true, y_pred, average='macro'):
    """Compute the recall of ``y_pred`` against ``y_true``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        average: Averaging mode forwarded to ``recall_score``, e.g.
            ``'binary'``, ``'micro'``, ``'macro'`` or ``'weighted'``.

    Returns:
        The value returned by ``recall_score`` (called with ``zero_division=0``).
    """
    score_options = {"average": average, "zero_division": 0}
    score = recall_score(y_true, y_pred, **score_options)
    return score

#EX1.2.3
def precision(y_true, y_pred, average='macro'):
    """Compute the precision of ``y_pred`` against ``y_true``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        average: Averaging mode forwarded to ``precision_score``.

    Returns:
        The value returned by ``precision_score`` (called with ``zero_division=0``).
    """
    score_options = {"average": average, "zero_division": 0}
    score = precision_score(y_true, y_pred, **score_options)
    return score

#EX1.2.4
def f1(y_true, y_pred, average='macro'):
    """Compute the F1-score of ``y_pred`` against ``y_true``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        average: Averaging mode forwarded to ``f1_score``.

    Returns:
        The value returned by ``f1_score`` (called with ``zero_division=0``).
    """
    score_options = {"average": average, "zero_division": 0}
    score = f1_score(y_true, y_pred, **score_options)
    return score

def print_metrics(y_true, y_predict, label, printing=True):
    """Compute, and optionally print, classification metrics.

    Two input shapes are accepted:
      - a single pair of label arrays ``y_true`` / ``y_predict``;
      - two lists (or tuples) holding one label array per K-fold split.

    Args:
        y_true: Ground-truth labels, or a list/tuple of per-fold label arrays.
        y_predict: Predicted labels, in the same layout as ``y_true``.
        label: Title printed in the report header.
        printing: When false, nothing is printed and only the dictionary is
            returned.

    Returns:
        A dictionary of metrics. Single case: ``confusion_matrix``, ``recall``,
        ``precision``, ``f1-score``. K-fold case: the mean/std of the confusion
        matrices and of each score, plus the per-fold matrices under
        ``confusion_matrices``.

    Raises:
        ValueError: In the K-fold case, if no folds are given or if ``y_true``
            and ``y_predict`` hold a different number of folds.
    """
    # K-fold input is a list/tuple whose items are themselves label arrays.
    # A flat list of labels is a single prediction, not a set of folds.
    is_kfold = isinstance(y_true, (list, tuple)) and (
        len(y_true) == 0 or np.ndim(y_true[0]) > 0
    )

    if not is_kfold:
        # Single prediction: compute every metric once.
        metrics = {
            "confusion_matrix": calcular_matriz_confusao(y_true, y_predict),
            "recall": recall(y_true, y_predict),
            "precision": precision(y_true, y_predict),
            "f1-score": f1(y_true, y_predict)
        }

    else:
        # K-fold case.
        y_predict = list(y_predict)
        if len(y_true) == 0:
            raise ValueError("print_metrics: no folds were provided.")
        if len(y_true) != len(y_predict):
            # zip() would silently drop the unmatched folds.
            raise ValueError(
                "print_metrics: y_true and y_predict must hold the same number "
                f"of folds (got {len(y_true)} and {len(y_predict)})."
            )

        # Use the labels seen across *all* folds so that every per-fold matrix
        # has the same shape, even when a fold lacks some class; otherwise the
        # matrices cannot be stacked to compute their mean and std.
        all_labels = np.unique(
            np.concatenate([np.ravel(part) for part in (*y_true, *y_predict)])
        )

        fold_pairs = list(zip(y_true, y_predict))
        confusion_matrices = [
            confusion_matrix(yt, yp, labels=all_labels) for yt, yp in fold_pairs
        ]
        recalls = [recall(yt, yp) for yt, yp in fold_pairs]
        precisions = [precision(yt, yp) for yt, yp in fold_pairs]
        f1s = [f1(yt, yp) for yt, yp in fold_pairs]

        cms_mean, cms_std = mean_std_confusion_matrix(confusion_matrices)

        metrics = {
            "confusion_matrices_mean": cms_mean,
            "confusion_matrices_std": cms_std,
            "precision_mean": np.mean(precisions),
            "precision_std": np.std(precisions),
            "recall_mean": np.mean(recalls),
            "recall_std": np.std(recalls),
            "f1_mean": np.mean(f1s),
            "f1_std": np.std(f1s),
            # Per-fold matrices, as announced by the printed report below.
            "confusion_matrices": confusion_matrices,
        }

    if printing:
        print(f"\n===== {label} =====")

        if not is_kfold:
            print("Confusion Matrix:")
            print(metrics["confusion_matrix"])
            print()
            print(f"Recall:          {metrics['recall']:.4f}")
            print(f"Precision:       {metrics['precision']:.4f}")
            print(f"F1-Score:        {metrics['f1-score']:.4f}")

        else:
            print("K-Fold results (means ± std):\n")
            print(f"Precision:       {metrics['precision_mean']:.4f} ± {metrics['precision_std']:.4f}")
            print(f"Recall:          {metrics['recall_mean']:.4f} ± {metrics['recall_std']:.4f}")
            print(f"F1-Score:        {metrics['f1_mean']:.4f} ± {metrics['f1_std']:.4f}")
            print("\n(Confusion matrices individuais guardadas no dicionário.)")

        print("=============================\n")

    return metrics