<a href="https://colab.research.google.com/github/SirSirocco/May_25_Ipynb_ExperimentPlan/blob/main/ENG4502_experiment_plan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DEPENDÊNCIAS GERAIS

In [182]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from google.colab import drive
from itertools import product
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# FUNÇÕES

## Exibição

In [183]:
def df_show_domain(df: pd.DataFrame, columns: list[str]) -> None:
    """
    Print the dtype and the distinct values of each requested column.

    Columns are reported in the order in which they appear in ``columns``.

    :param df: DataFrame whose column domains are printed.
    :param columns: Names of the columns to report.
    :return: None
    """
    for name in columns:
        series = df[name]
        # Three lines per column; the last one ends with an extra newline so
        # that consecutive reports are separated by a blank line.
        print(f"Coluna:   {name}")
        print(f"dtype:    {series.dtype}")
        print(f"Domínio:  {series.unique()}\n")

def df_show_head(df: pd.DataFrame, n: int = 5) -> None:
    """
    Show the first ``n`` rows of ``df`` followed by its shape.

    NOTE(review): ``display`` is not imported in this file; presumably it is
    the helper injected by the notebook environment.

    :param df: Object to preview (its ``head`` and ``shape`` are used).
    :param n: Number of leading rows to show.
    :return: None
    """
    preview = df.head(n)
    display(preview)
    dimensions = df.shape
    print(f"Shape: {dimensions}")

def df_show_null(df: pd.DataFrame) -> None:
    """
    Show how many null entries each column of ``df`` contains.

    NOTE(review): ``display`` is not imported in this file; presumably it is
    the helper injected by the notebook environment.

    :param df: DataFrame to inspect.
    :return: None
    """
    null_counts = df.isnull().sum()
    display(null_counts)

## Métricas

In [184]:
def efficiency_score(y_true: list[float], y_pred: list[float], average: bool = True) -> float:
    """
    Compute the "efficiency" metric: the arithmetic mean of precision and recall.

    Precision and recall are obtained per class (``average=None``) and then
    combined class by class.

    :param y_true: Ground-truth labels.
    :param y_pred: Predicted labels.
    :param average: When truthy, return the mean efficiency over all classes;
                    when falsy (``None`` included), return the list of
                    per-class efficiencies instead.
    :return: The averaged efficiency, or a list with one value per class.
    :raises ValueError: If no per-class score is produced or if precision and
                        recall have different lengths.
    """
    per_class_precision = precision_score(y_true, y_pred, average=None)
    per_class_recall = recall_score(y_true, y_pred, average=None)

    n_classes = len(per_class_precision)
    if n_classes == 0 or n_classes != len(per_class_recall):
        raise ValueError("precision_score and recall_score must have the same length, which has to be greater than zero")

    # Lengths were validated above, so pairing the two arrays is lossless
    efficiencies = [(p + r) / 2.0 for p, r in zip(per_class_precision, per_class_recall)]

    if average:
        return np.mean(efficiencies)
    return efficiencies

def get_metrics_classification(y_true: pd.Series, y_pred: pd.Series, classes: list[str], view: bool = False) -> dict:
    """
    Compute classification metrics and return them in a flat dictionary.

    Global metrics are stored under their own name. Per-class metrics are
    stored under "<metric>_<class>", where spaces in the class name are
    replaced by underscores. Per-class scores are matched to ``classes`` by
    position, so ``classes`` must follow the order in which the scorers
    report the labels.

    :param y_true: Ground-truth labels.
    :param y_pred: Predicted labels.
    :param classes: Class names, one per position of the per-class scores.
    :param view: When True, also print every metric.
    :return: Dictionary mapping metric keys to their values.
    """
    # Scorers that produce a single value for the whole prediction
    global_scorers = {
        "Acurácia": accuracy_score,
    }
    # Scorers called with average=None, i.e. one value per class
    per_class_scorers = {
        "Precisão": precision_score,
        "Sensibilidade": recall_score,
        "F1-Score": f1_score,
        "Eficiência": efficiency_score,
    }

    labels = [name.replace(" ", "_") for name in classes]

    overall = {}
    by_class = {}
    flat = {}

    # Global metrics
    for title, scorer in global_scorers.items():
        score = scorer(y_true, y_pred)
        flat[title] = score
        overall[title] = score

    # Per-class metrics, flattened as "<metric>_<class>"
    for title, scorer in per_class_scorers.items():
        scores = scorer(y_true, y_pred, average=None)
        by_class[title] = scores
        for position, label in enumerate(labels):
            flat[f"{title}_{label}"] = scores[position]

    if not view:
        return flat

    for title, score in overall.items():
        print(f"{title}: %.4f" % score)

    for position, label in enumerate(labels):
        print(f"{label}:")

        for title, scores in by_class.items():
            print(f"\t{title}: %.4f" % scores[position])

    # Blank line as visual breathing room
    print("")

    return flat

In [185]:
def evaluate_knn_cases(
    cases_dict: dict,
    distance_funcs: dict[str, object],
    k_list: list[int],
    classes: list[str],
    test_size: float = 0.2,
    random_state: int = 42
) -> dict:
    """
    Evaluate every (preprocessing case, distance, k) combination with KNN.

    Each case is split once into train/test (stratified on the target); a
    KNeighborsClassifier is then fitted on the train part and scored on the
    test part for every distance and every k. One output entry is produced
    per combination.

    Parameters:
        - cases_dict: output of get_preprocessing_cases; each value must hold
          the keys "X", "y" and "params".
        - distance_funcs: dict {"distance_name": metric}; metric is forwarded
          as-is to KNeighborsClassifier(metric=...), so it may be a string or
          a callable.
        - k_list: values of k to try.
        - classes: class names (forwarded to get_metrics_classification).
        - test_size: fraction of the data held out for testing.
        - random_state: seed of the train/test split.

    Returns:
        dict:
            {
                "<base-key>_KNN_MET{distance-name}_K{k}": {
                    "X": X_test,
                    "y": y_test,
                    "params": {
                        ...preprocessing params...,
                        "distance": "distance-name",
                        "k": k,
                        ...metrics as keys: "Acurácia", "Precisão_<class>", etc.
                    }
                }
            }
    """
    output_dict = {}

    for base_key, case_info in cases_dict.items():
        X_full = case_info["X"]
        y_full = case_info["y"]
        params_orig = case_info["params"]

        # The split depends only on the case, so it is computed once and
        # shared by every (distance, k) pair evaluated below.
        X_train, X_test, y_train, y_test = train_test_split(
            X_full, y_full,
            test_size=test_size,
            random_state=random_state,
            stratify=y_full
        )

        for dist_name, dist_value in distance_funcs.items():
            for k in k_list:
                # Fit on the train part, predict the held-out part
                knn = KNeighborsClassifier(n_neighbors=k, metric=dist_value)
                knn.fit(X_train, y_train)
                y_pred = knn.predict(X_test)

                metrics = get_metrics_classification(y_test, y_pred, classes, view=False)

                new_key = f"{base_key}_KNN_MET{dist_name}_K{k}"

                # Preprocessing params, KNN params and metrics share a single
                # flat dictionary; later keys win on a name clash.
                all_params = {
                    **params_orig,
                    "distance": dist_name,
                    "k": k,
                    **metrics
                }

                output_dict[new_key] = {
                    "X": X_test,
                    "y": y_test,
                    "params": all_params
                }

    return output_dict

## Validação

In [186]:
def is_valid_series(s: pd.Series) -> bool:
    """
    Tell whether ``s`` can be used as a numeric feature column.

    A value is valid only when it is a pandas Series that is non-empty, has
    no null entries and has a numeric dtype. Anything else, ``None``
    included, is reported as invalid instead of raising.

    :param s: Candidate Series.
    :return: Plain ``bool``: True when ``s`` is valid, False otherwise.
    """
    if not isinstance(s, pd.Series) or s.empty:
        return False
    # bool(...) turns the NumPy boolean produced by .all() into a Python bool
    return bool(s.notnull().all()) and bool(pd.api.types.is_numeric_dtype(s))

## Pré-Processamento

In [187]:
def preprocessing_map(df: pd.DataFrame, features: list[str], maps: list[dict]) -> pd.DataFrame:
    """
    Apply a custom value mapping to each of several DataFrame columns.

    ``features`` and ``maps`` are paired by position; pairing stops at the
    shorter of the two lists. The input DataFrame is not modified.

    :param df: Original DataFrame holding the data to transform.
    :param features: Names of the columns to map.
    :param maps: One mapping dictionary per column, in the same order as
                 ``features``.

    :return: New DataFrame with the mappings applied to the given columns.
    """
    result = df.copy()

    for column, mapping in zip(features, maps):
        # Always read from the untouched input, write into the copy
        source = df[column]
        result[column] = source.map(mapping)

    return result

def subs_na_mean(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
    """
    Fill the missing values of the given columns with each column's mean.

    The input DataFrame is not modified.

    :param df: DataFrame with the data to impute.
    :param columns: Names of the columns whose missing values are filled.
    :return: New DataFrame with the listed columns imputed.
    """
    result = df.copy()
    for name in columns:
        source = df[name]
        result[name] = source.fillna(source.mean())
    return result

def subs_outliers(df: pd.DataFrame, columns: list[str], view:bool = False) -> pd.DataFrame:
    """
    Replace the outliers of the given columns by the mean of the other values.

    Outliers are detected with the IQR (InterQuartile Range) rule: a value is
    an outlier when it lies in (-inf, Q1 - 1.5 * IQR) or (Q3 + 1.5 * IQR, +inf).
    Outliers are first turned into missing values, so the mean used as the
    replacement is computed from the non-outlier values only (missing values
    already present in the column are filled with that same mean).

    The input DataFrame is left untouched; the result is a modified copy.

    :param df: DataFrame with the data to clean.
    :param columns: Names of the columns to process.
    :param view: When True, show two box plots per column: before and after
                 the substitution.
    :return: New DataFrame with the outliers of ``columns`` replaced.
    """
    # Work on a copy so the caller's DataFrame is never mutated
    result = df.copy()

    for column in columns:
        values = result[column]

        # Quartiles 1 and 3 and the interquartile range
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1

        is_outlier = (values < q1 - 1.5 * iqr) | (values > q3 + 1.5 * iqr)

        # mask() blanks the outliers in a new Series instead of assigning
        # missing values into the existing column
        kept = values.mask(is_outlier)
        result[column] = kept.fillna(kept.mean())

    if view:
        for column in columns:
            # Initial state
            plt.boxplot(df[column])
            plt.title(f"ANTES: {column}")
            plt.show()

            # State after the substitution
            plt.boxplot(result[column])
            plt.title(f"DEPOIS: {column}")
            plt.show()

    return result

# def get_preprocessing_cases(df: pd.DataFrame,
#                             target: str,
#                             random_seed: int,
#                             feat_eng_list: list[pd.Series],
#                             outliers: list[bool],
#                             normalization: list[bool],
#                             pca: list[int]) -> dict:
#     SEPARATOR = "_"

#     PREFIXES = {
#         "feat_eng": "FE",
#         "outliers": "OUT",
#         "normalization": "NORM",
#         "pca": "PCA",
#     }

#     RADIXES = {
#         "absent": "na",
#         "True": "1",
#         "False": "0",
#     }

#     scaler = StandardScaler()
#     dict_cases = dict()
#     df_original = df.copy()

#     # RANDOM SEED
#     np.random.seed(random_seed) # Fixa semente aleatória

#     for feat_eng, outlier, norm, pca_n in product(feat_eng_list, outliers, normalization, pca):
#         dict_params = dict()
#         key_parts = list()
#         df = df_original.copy()

#         # FEATURE ENGINEERING
#         key_partial = PREFIXES["feat_eng"]

#         if is_valid_series(feat_eng):
#             df = pd.concat([df, feat_eng], axis=1)
#             key_partial += f"_{feat_eng.name}"
#         else:
#             key_partial += RADIXES["absent"]

#         key_parts.append(key_partial)
#         dict_params["feat_eng"] = feat_eng.name

#         # OUTLIERS
#         key_partial = PREFIXES["outliers"]

#         if outlier:
#             df = subs_outliers(df, df.columns, view=False)
#             key_partial += RADIXES["True"]
#         else:
#             key_partial += RADIXES["False"]

#         X = df.drop(columns=[target])
#         y = df[target]

#         key_parts.append(key_partial)
#         dict_params["outliers"] = outliers

#         # NORMALIZAÇÃO
#         key_partial = PREFIXES["normalization"]

#         if norm:
#             X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
#             key_partial += RADIXES["True"]
#         else:
#             key_partial += RADIXES["False"]

#         key_parts.append(key_partial)
#         dict_params["normalization"] = norm

#         # PCA
#         key_partial = PREFIXES["pca"]

#         pca_obj = PCA(n_components=pca_n)
#         X = pca_obj.fit_transform(X)

#         key_partial += f"{pca_n}"

#         key_parts.append(key_partial)

#         dict_params["pca"] = pca_n

#         key = SEPARATOR.join(key_parts)
#         dict_cases[key] = {
#             "X": X,
#             "y": y,
#             "params": dict_params,
#         }
#     return dict_cases

In [206]:
def apply_feature_engineering(df, X, y, feat_eng, prefix, radixes):
    """
    Pipeline stage: optionally append an engineered feature column.

    When ``feat_eng`` passes ``is_valid_series`` it is concatenated to ``df``
    as a new column, and ``X``/``y`` are rebuilt from the extended ``df`` so
    the three returned objects stay consistent with each other. Otherwise
    nothing is changed and the stage is recorded as absent.

    :param df: Current working DataFrame (features plus target).
    :param X: Current feature matrix.
    :param y: Current target Series; its ``name`` identifies the target
              column inside ``df``.
    :param feat_eng: Candidate feature Series.
    :param prefix: Key prefix of this stage.
    :param radixes: Suffix table; only the "absent" entry is read here.
    :return: Tuple (df, X, y, key fragment, {"feat_eng": feature name or None}).
    """
    if is_valid_series(feat_eng):
        df = pd.concat([df, feat_eng], axis=1)
        # Rebuild X and y from the extended frame (same recipe as
        # apply_outliers) so X does not lag behind df by one column.
        X = df.drop(columns=[y.name])
        y = df[y.name]
        feat_name = feat_eng.name
        key = f"{prefix}_{feat_name}"
    else:
        key = prefix + radixes["absent"]
        feat_name = None
    return df, X, y, key, {"feat_eng": feat_name}

def apply_outliers(df, X, y, out, prefix, radixes):
    """
    Pipeline stage: optionally replace outliers, then rebuild X and y from df.

    When ``out`` is truthy, ``subs_outliers`` is run on every column of ``df``
    except the target, so class labels are never overwritten by a mean value.
    ``X`` and ``y`` are always re-derived from ``df`` afterwards.

    NOTE(review): binary / one-hot feature columns are still subject to the
    IQR rule here; confirm that this is intended.

    :param df: Current working DataFrame (features plus target).
    :param X: Current feature matrix (replaced by the one rebuilt from ``df``).
    :param y: Current target Series; its ``name`` identifies the target
              column inside ``df``.
    :param out: Whether outlier substitution is applied.
    :param prefix: Key prefix of this stage.
    :param radixes: Suffix table; the "True" / "False" entries are read here.
    :return: Tuple (df, X, y, key fragment, {"outliers": out}).
    """
    if out:
        # The target column holds labels, not measurements: leave it alone
        feature_columns = [col for col in df.columns if col != y.name]
        df = subs_outliers(df, feature_columns, view=False)
        key = prefix + radixes["True"]
    else:
        key = prefix + radixes["False"]
    X = df.drop(columns=[y.name])
    y = df[y.name]
    return df, X, y, key, {"outliers": out}

# Scaler instance shared by every call of apply_normalization; fit_transform
# is invoked on it each time the stage is active.
apply_normalization_scaler = StandardScaler()
def apply_normalization(df, X, y, norm, prefix, radixes):
    """
    Pipeline stage: optionally standardize the feature matrix.

    When ``norm`` is truthy, ``X`` is replaced by a DataFrame holding the
    output of the module-level scaler's ``fit_transform``, keeping the
    original column names and index. ``df`` and ``y`` pass through unchanged.

    :param df: Current working DataFrame (returned as received).
    :param X: Current feature matrix.
    :param y: Current target (returned as received).
    :param norm: Whether standardization is applied.
    :param prefix: Key prefix of this stage.
    :param radixes: Suffix table; the "True" / "False" entries are read here.
    :return: Tuple (df, X, y, key fragment, {"normalization": norm}).
    """
    if not norm:
        return df, X, y, prefix + radixes["False"], {"normalization": norm}

    scaled = apply_normalization_scaler.fit_transform(X)
    X = pd.DataFrame(scaled, columns=X.columns, index=X.index)
    return df, X, y, prefix + radixes["True"], {"normalization": norm}

def apply_pca(df, X, y, pca_n, prefix, radixes):
    """
    Pipeline stage: project the feature matrix with PCA.

    A fresh ``PCA(n_components=pca_n)`` is fitted on ``X`` and the
    transformed matrix takes the place of ``X`` in the returned tuple.

    :param df: Current working DataFrame (returned as received).
    :param X: Current feature matrix.
    :param y: Current target (returned as received).
    :param pca_n: Value forwarded as ``n_components``.
    :param prefix: Key prefix of this stage.
    :param radixes: Suffix table; not used by this stage.
    :return: Tuple (df, transformed X, y, key fragment, {"pca": pca_n}).
    """
    reducer = PCA(n_components=pca_n)
    projected = reducer.fit_transform(X)
    return df, projected, y, f"{prefix}{pca_n}", {"pca": pca_n}

def build_key(key_parts, separator="_"):
    """
    Join the key fragments into a single key.

    :param key_parts: Iterable of string fragments, in order.
    :param separator: String placed between consecutive fragments.
    :return: The joined key.
    """
    joined = separator.join(key_parts)
    return joined

# --- Main Function ---
def get_preprocessing_cases(df: pd.DataFrame,
                            target: str,
                            random_seed: int,
                            feat_eng_list: list[pd.Series],
                            outliers: list[bool],
                            normalization: list[bool],
                            pca: list[int]) -> dict:
    """
    Build one preprocessed dataset per combination of pipeline options.

    For every element of the Cartesian product of the four option lists, a
    fresh copy of ``df`` goes through four stages in order: feature
    engineering, outlier substitution, normalization and PCA. Each stage
    contributes a key fragment and a parameter entry.

    NumPy's global random seed is set to ``random_seed`` before the cases
    are generated.

    :param df: Source DataFrame (features plus target); not modified.
    :param target: Name of the target column.
    :param random_seed: Value passed to ``np.random.seed``.
    :param feat_eng_list: Candidate engineered features, one per case variant.
    :param outliers: Outlier-substitution flags to try.
    :param normalization: Normalization flags to try.
    :param pca: Numbers of PCA components to try.
    :return: Dict mapping each case key (stage fragments joined by
             ``build_key``) to {"X": ..., "y": ..., "params": ...}.
    """
    # Suffix tokens the stages use when composing their key fragment
    suffixes = {"absent": "na", "True": "1", "False": "0"}
    # Stages in execution order, each paired with its key prefix
    stages = (
        (apply_feature_engineering, "FE"),
        (apply_outliers, "OUT"),
        (apply_normalization, "NORM"),
        (apply_pca, "PCA"),
    )

    pristine = df.copy()
    np.random.seed(random_seed)

    cases = {}
    for settings in product(feat_eng_list, outliers, normalization, pca):
        # Every combination starts again from the untouched data
        frame = pristine.copy()
        X = frame.drop(columns=[target])
        y = frame[target]

        fragments = []
        params = {}
        for (stage, prefix), setting in zip(stages, settings):
            frame, X, y, fragment, info = stage(frame, X, y, setting, prefix, suffixes)
            fragments.append(fragment)
            params.update(info)

        cases[build_key(fragments)] = {"X": X, "y": y, "params": params}

    return cases

In [215]:
def run_knn_tests(cases_dict: dict, metrics: list[str], k_values: list[int], classes: list[str]) -> dict:
    """
    Run KNeighborsClassifier on every case for each (metric, k) pair.

    For each case the classifier is fitted on the case's full X, y and its
    predictions are computed on that same X; no train/test split is made
    here.

    Args:
        cases_dict: dict produced by get_preprocessing_cases; each value must
            hold the keys "X", "y" and "params".
        metrics: metric names, among 'minkowski', 'euclidean' and 'manhattan'.
        k_values: integer values of k to try.
        classes: class names of the problem.

    Returns:
        dict keyed by "<base-key>_KNN_MET<abbr>_K<k>"; each value holds "X",
        "y", "params" (case params plus "knn_metric" and "knn_k") and
        "metrics".

    Raises:
        ValueError: when a metric outside the supported set is reached.
    """
    # Short tags used in the result keys
    abbreviations = {
        "minkowski": "mnk",
        "euclidean": "ecd",
        "manhattan": "mnh"
    }

    results = {}

    for base_key, data in cases_dict.items():
        X, y = data["X"], data["y"]

        for metric, k in product(metrics, k_values):
            metric_root = abbreviations.get(metric)
            if metric_root is None:
                raise ValueError(f"Métrica desconhecida: {metric}")

            model = KNeighborsClassifier(n_neighbors=k, metric=metric)
            model.fit(X, y)
            # Predictions are re-indexed like y before scoring
            predictions = pd.Series(model.predict(X), index=y.index)

            scores = get_metrics_classification(y, predictions, classes, view=False)

            entry_params = {**data["params"], "knn_metric": metric, "knn_k": k}
            results[f"{base_key}_KNN_MET{metric_root}_K{k}"] = {
                "X": X,
                "y": y,
                "params": entry_params,
                "metrics": scores
            }

    return results

def flatten_nested_fields(input_dict: dict, fields_to_flatten: list[str]) -> dict:
    """
    Promote the contents of selected nested dictionaries to the entry level.

    For every entry of ``input_dict``, each field listed in
    ``fields_to_flatten`` whose value is a dict is replaced by that dict's
    own items; every other field is kept as it is.

    Parameters:
        input_dict: mapping from identifier to an entry dictionary.
        fields_to_flatten: names of the fields to expand into each entry.

    Returns:
        A new dictionary with the same identifiers and flattened entries.
    """
    def _flatten(record: dict) -> dict:
        merged = {}
        for name, payload in record.items():
            if name not in fields_to_flatten or not isinstance(payload, dict):
                merged[name] = payload  # Kept untouched
                continue
            merged.update(payload)  # Expanded in place
        return merged

    return {key: _flatten(data) for key, data in input_dict.items()}

# def flatten_single_entry(entry: dict, fields_to_flatten: list[str]) -> dict:
#     """
#     Versão para uma única entrada de dicionário.
#     """
#     flat_entry = {}

#     for key, value in entry.items():
#         if key in fields_to_flatten and isinstance(value, dict):
#             flat_entry.update(value)
#         else:
#             flat_entry[key] = value

#     return flat_entry

def flatten_single_entry(entry: dict, fields_to_flatten: list[str]) -> dict:
    """
    Merge the nested dictionaries of a single entry into one flat dictionary.

    Only the fields listed in ``fields_to_flatten`` are considered, in that
    order; fields that are missing or whose value is not a dict are skipped.
    Nothing else from ``entry`` is carried over.
    """
    merged = {}
    candidates = (entry.get(name) for name in fields_to_flatten)

    for payload in candidates:
        if isinstance(payload, dict):
            merged.update(payload)

    return merged

# SETUP DO AMBIENTE

In [190]:
# Mount Google Drive at /content/drive (the dataset path used by this notebook lives under it)
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# OBTENÇÃO DO DATASET

In [191]:
# Location of the dataset CSV inside the mounted Drive
PATH = "/content/drive/MyDrive/07_per_shared/datasets/dataset_obesity-2024_2.csv"
df = pd.read_csv(PATH)
# Preview the first rows and the shape of the raw data
df_show_head(df)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad,TW1
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal Weight,42.0
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal Weight,42.0
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal Weight,46.0
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Normal Weight,54.0
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Normal Weight,44.0


Shape: (2111, 18)


# EXIBIÇÃO DO DOMÍNIO

In [192]:
# Print dtype and distinct values for every column of the raw dataset
df_show_domain(df, df.columns)

Coluna:   Gender
dtype:    object
Domínio:  ['Female' 'Male']

Coluna:   Age
dtype:    float64
Domínio:  [21.       23.       27.       ... 22.524036 24.361936 23.664709]

Coluna:   Height
dtype:    float64
Domínio:  [1.62     1.52     1.8      ... 1.752206 1.73945  1.738836]

Coluna:   Weight
dtype:    float64
Domínio:  [ 64.        56.        77.       ... 133.689352 133.346641 133.472641]

Coluna:   family_history_with_overweight
dtype:    object
Domínio:  ['yes' 'no']

Coluna:   FAVC
dtype:    object
Domínio:  ['no' 'yes']

Coluna:   FCVC
dtype:    float64
Domínio:  [2.       3.       1.       2.450218 2.880161 2.00876  2.596579 2.591439
 2.392665 1.123939 2.027574 2.658112 2.88626  2.714447 2.750715 1.4925
 2.205439 2.059138 2.310423 2.823179 2.052932 2.596364 2.767731 2.815157
 2.737762 2.568063 2.524428 2.971574 1.0816   1.270448 1.344854 2.959658
 2.725282 2.844607 2.44004  2.432302 2.592247 2.449267 2.929889 2.015258
 1.031149 1.592183 1.21498  1.522001 2.703436 2.362918 2.140

# DADOS FALTANTES

In [193]:
# Null counts per column before the imputation
print("------ ANTES DA IMPUTAÇÃO ------")
df_show_null(df)

# The bare string below is a no-op note; it says: missing data is replaced by
# imputing the mean, described as a simple and scalable method.
"""
Substituamos os dados faltantes ao imputarmos a média, método
simples e escalável.
"""
NAS = df.columns[df.isnull().any()] # Columns with at least one null value
df_naless = subs_na_mean(df, NAS)

# Null counts per column after the imputation
print("\n------ DEPOIS DA IMPUTAÇÃO ------")
df_show_null(df_naless)

------ ANTES DA IMPUTAÇÃO ------


Unnamed: 0,0
Gender,0
Age,0
Height,100
Weight,0
family_history_with_overweight,0
FAVC,0
FCVC,0
NCP,0
CAEC,0
SMOKE,0



------ DEPOIS DA IMPUTAÇÃO ------


Unnamed: 0,0
Gender,0
Age,0
Height,0
Weight,0
family_history_with_overweight,0
FAVC,0
FCVC,0
NCP,0
CAEC,0
SMOKE,0


# CODIFICAÇÃO

In [194]:
# Ordinal encoding: ordered categories mapped to integers (one map per column, same order as ORDINALS)
ORDINALS = ["CAEC", "CALC"]
ORDINAL_MAPS = [
    {"no": 0, "Sometimes": 1, "Frequently": 2, "Always": 3},
    {"no": 0, "Sometimes": 1, "Frequently": 2, "Always": 3},
]
df_ord = preprocessing_map(df_naless, ORDINALS, ORDINAL_MAPS)

# Binary encoding (one map per column, same order as BINARIES); the last
# entry encodes the column "NObeyesdad" as 0 / 1
BINARIES = ["Gender", "family_history_with_overweight", "FAVC", "SMOKE", "SCC", "NObeyesdad"]
BINARY_MAPS = [
    {"Male": 0, "Female": 1},
    {"no": 0, "yes": 1},
    {"no": 0, "yes": 1},
    {"no": 0, "yes": 1},
    {"no": 0, "yes": 1},
    {"Normal Weight": 0, "Obesity": 1}
]
df_bin = preprocessing_map(df_ord, BINARIES, BINARY_MAPS)

# One-hot encoding of the nominal column(s)
NOMINALS = ["MTRANS"]
df_nom = pd.get_dummies(df_bin, columns=NOMINALS, dtype="Int32")

# Preview the fully encoded data
df_show_head(df_nom)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,...,FAF,TUE,CALC,NObeyesdad,TW1,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,1,21.0,1.62,64.0,1,0,2.0,3.0,1,0,...,0.0,1.0,0,0,42.0,0,0,0,1,0
1,1,21.0,1.52,56.0,1,0,3.0,3.0,1,1,...,3.0,0.0,1,0,42.0,0,0,0,1,0
2,0,23.0,1.8,77.0,1,0,2.0,3.0,1,0,...,2.0,1.0,2,0,46.0,0,0,0,1,0
3,0,27.0,1.8,87.0,0,0,3.0,3.0,1,0,...,2.0,0.0,2,0,54.0,0,0,0,0,1
4,0,22.0,1.78,89.8,0,0,2.0,1.0,1,0,...,0.0,0.0,1,0,44.0,0,0,0,1,0


Shape: (2111, 22)


# ENGENHARIA DE CARACTERÍSTICAS

In [195]:
# BUILD THE BMI (Body Mass Index, "IMC") SERIES

# The bare string below is a no-op note; it says: assumes height in metres
# and weight in kilograms.
"""
Supõe altura em metros e peso em quilogramas.
"""
# BMI = weight / height ** 2, computed from the imputed (null-free) data
se_imc = df_naless["Weight"] / (df_naless["Height"]) ** 2.0
se_imc.name = "imc"
display(se_imc)

Unnamed: 0,imc
0,24.386526
1,24.238227
2,23.765432
3,26.851852
4,28.342381
...,...
2106,44.901475
2107,43.741923
2108,43.543817
2109,44.071535


# CASOS DE PRÉ-PROCESSAMENTO

In [198]:
# Generate every preprocessing case: 2 feature options x 2 outlier flags x
# 2 normalization flags x 6 PCA sizes. The empty pd.Series() stands for
# "no engineered feature".
dict_cases = get_preprocessing_cases(df_nom,
                        target="NObeyesdad",
                        random_seed=42,
                        feat_eng_list=[pd.Series(), se_imc],
                        outliers=[False, True],
                        normalization=[False, True],
                        pca=[2, 4, 6, 8, 10, 12]
                        )
# One row per case, with the columns X, y and params
df_cases = pd.DataFrame(data=dict_cases).T
df_show_head(df_cases)

Unnamed: 0,X,y,params
FEna_OUT0_NORM0_PCA2,"[[-23.4447875636599, -3.935936743582154], [-31...",0 0 1 0 2 0 3 0 4 ...,"{'feat_eng': None, 'outliers': False, 'normali..."
FEna_OUT0_NORM0_PCA4,"[[-23.444787563659887, -3.935936743582147, -0....",0 0 1 0 2 0 3 0 4 ...,"{'feat_eng': None, 'outliers': False, 'normali..."
FEna_OUT0_NORM0_PCA6,"[[-23.444787563659887, -3.935936743582147, -0....",0 0 1 0 2 0 3 0 4 ...,"{'feat_eng': None, 'outliers': False, 'normali..."
FEna_OUT0_NORM0_PCA8,"[[-23.444787563659887, -3.935936743582147, -0....",0 0 1 0 2 0 3 0 4 ...,"{'feat_eng': None, 'outliers': False, 'normali..."
FEna_OUT0_NORM0_PCA10,"[[-23.444787563659887, -3.935936743582147, -0....",0 0 1 0 2 0 3 0 4 ...,"{'feat_eng': None, 'outliers': False, 'normali..."


Shape: (48, 3)


In [202]:
# Evaluate KNN on every preprocessing case, for two distances and five values
# of k, holding out 30% of each case for testing. The dict keys are the names
# used in the result keys; the dict values are what is passed to
# KNeighborsClassifier(metric=...).
dict_cases_knn = evaluate_knn_cases(dict_cases,
                                    {
                                        "euclidean": "minkowski",
                                        "manhattan":  "manhattan",

                                    },
                                    [1, 3, 5, 7, 9],
                                    ["Normal Weight", "Obesity"],
                                    test_size = 0.30,
                                    random_state = 42
 )


# One row per (case, distance, k) combination
df_cases_knn = pd.DataFrame(data=dict_cases_knn).T
df_show_head(df_cases_knn)

# Inspect the params (which also hold the metrics) of a single combination
df_example = pd.Series(data=dict_cases_knn["FEna_OUT0_NORM0_PCA2_KNN_METeuclidean_K1"]["params"])
df_show_head(df_example)

Unnamed: 0,X,y,params
FEna_OUT0_NORM0_PCA2_KNN_METeuclidean_K1,"[[3.1473778595124884, 51.501824837814134], [1....",1286 1 772 0 1659 1 1416 1 750 ...,"{'feat_eng': None, 'outliers': False, 'normali..."
FEna_OUT0_NORM0_PCA2_KNN_METeuclidean_K3,"[[3.1473778595124884, 51.501824837814134], [1....",1286 1 772 0 1659 1 1416 1 750 ...,"{'feat_eng': None, 'outliers': False, 'normali..."
FEna_OUT0_NORM0_PCA2_KNN_METeuclidean_K5,"[[3.1473778595124884, 51.501824837814134], [1....",1286 1 772 0 1659 1 1416 1 750 ...,"{'feat_eng': None, 'outliers': False, 'normali..."
FEna_OUT0_NORM0_PCA2_KNN_METeuclidean_K7,"[[3.1473778595124884, 51.501824837814134], [1....",1286 1 772 0 1659 1 1416 1 750 ...,"{'feat_eng': None, 'outliers': False, 'normali..."
FEna_OUT0_NORM0_PCA2_KNN_METeuclidean_K9,"[[3.1473778595124884, 51.501824837814134], [1....",1286 1 772 0 1659 1 1416 1 750 ...,"{'feat_eng': None, 'outliers': False, 'normali..."


Shape: (480, 3)


Unnamed: 0,0
feat_eng,
outliers,False
normalization,False
pca,2
distance,euclidean


Shape: (15,)


In [216]:
# Flatten one result entry. Entries built by evaluate_knn_cases hold only the
# fields "X", "y" and "params", so the "metrics" name requested here finds
# nothing and everything shown comes from "params".
dict_final = flatten_single_entry(dict_cases_knn["FEna_OUT0_NORM0_PCA2_KNN_METeuclidean_K1"], ["params", "metrics"])

display(dict_final)

{'feat_eng': None,
 'outliers': False,
 'normalization': False,
 'pca': 2,
 'distance': 'euclidean',
 'k': 1,
 'Acurácia': 0.9558359621451105,
 'Precisão_Normal_Weight': np.float64(0.9563953488372093),
 'Precisão_Obesity': np.float64(0.9551724137931035),
 'Sensibilidade_Normal_Weight': np.float64(0.9619883040935673),
 'Sensibilidade_Obesity': np.float64(0.9486301369863014),
 'F1-Score_Normal_Weight': np.float64(0.9591836734693877),
 'F1-Score_Obesity': np.float64(0.9518900343642611),
 'Eficiência_Normal_Weight': np.float64(0.9591918264653883),
 'Eficiência_Obesity': np.float64(0.9519012753897025)}