# Importação de bibliotecas

In [1]:
!pip install --ignore-installed mlflow --quiet


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
!pip install pyngrok --quiet


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Standard library
import os
import subprocess
import sys

# Utilitárias para dados
import pandas as pd
import numpy as np

# Visualização
import matplotlib.pyplot as plt
import seaborn as sns

# Model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Pré-processamento e métricas
from sklearn.preprocessing import label_binarize
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, fbeta_score
from sklearn.metrics import roc_auc_score, roc_curve, auc

#  Algoritmos
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# MLFlow e Ngrok
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
from pyngrok import ngrok


In [4]:
# Read the pre-processed training CSV and print its schema summary
# (column names, non-null counts and dtypes).
df = pd.read_csv(filepath_or_buffer='CTG_train_pre_processado.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3483 entries, 0 to 3482
Data columns (total 22 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   LB        3483 non-null   float64
 1   AC        3483 non-null   float64
 2   FM        3483 non-null   float64
 3   UC        3483 non-null   float64
 4   DL        3483 non-null   float64
 5   DS        3483 non-null   float64
 6   DP        3483 non-null   float64
 7   ASTV      3483 non-null   float64
 8   MSTV      3483 non-null   float64
 9   ALTV      3483 non-null   float64
 10  MLTV      3483 non-null   float64
 11  Width     3483 non-null   float64
 12  Min       3483 non-null   float64
 13  Max       3483 non-null   float64
 14  Nmax      3483 non-null   float64
 15  Nzeros    3483 non-null   float64
 16  Mode      3483 non-null   float64
 17  Mean      3483 non-null   float64
 18  Median    3483 non-null   float64
 19  Variance  3483 non-null   float64
 20  Tendency  3483 non-null   floa

In [5]:
# Preview the first five rows, transposed so each feature is shown as a row.
df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
LB,0.433962,0.679245,0.245283,0.377358,0.679245
AC,0.222222,0.166667,0.444444,0.444444,0.055556
FM,0.0,0.10395,0.0,0.08316,0.0
UC,0.066667,0.133333,0.333333,0.333333,0.333333
DL,0.0,0.4,0.066667,0.0,0.8
DS,0.0,0.0,0.0,0.0,0.0
DP,0.0,0.2,0.0,0.0,0.0
ASTV,0.202703,0.72973,0.243243,0.405405,0.148649
MSTV,0.163934,0.508197,0.262295,0.213115,0.557377
ALTV,0.010989,0.0,0.0,0.0,0.0


# Separação dos conjuntos

In [6]:
# Load the pre-processed train/test splits.
# The training CSV has no index column: the `df.info()` output of the earlier
# load lists `LB` as its first column. Reading it with `index_col=0` would turn
# the `LB` feature into the index and silently remove it from the model inputs.
# NOTE(review): the test CSV is assumed to share this layout; the assertion
# below fails loudly if the two files disagree.
df_train = pd.read_csv('CTG_train_pre_processado.csv')
df_test = pd.read_csv('CTG_test_pre_processado.csv')

# Separate the features from the target column (`NSP`) and cast to float.
X_train = df_train.drop(columns=['NSP']).astype(float)
y_train = df_train['NSP'].astype(float)
X_test = df_test.drop(columns=['NSP']).astype(float)
y_test = df_test['NSP'].astype(float)

# Both splits must expose the same features in the same order.
assert list(X_train.columns) == list(X_test.columns), \
    "train and test feature columns differ"

# Definição de funções e parâmetros

In [7]:
def evaluate_best_model(grid_search, X_test, y_test):
    """Evaluate the best estimator of a fitted grid search on the test set.

    Computes the test metrics, prints them, and plots the confusion matrix and
    one ROC curve per class. Intended for multiclass problems with three or
    more classes.

    Parameters
    ----------
    grid_search :
        Fitted search object exposing ``best_estimator_``. The estimator must
        provide ``predict``, ``predict_proba`` and ``classes_``.
    X_test :
        Test features passed to ``predict`` / ``predict_proba``.
    y_test :
        True labels of the test set.

    Returns
    -------
    dict
        Metric values keyed by ``ACURACIA``, ``PRECISAO``, ``REVOCACAO``,
        ``F1``, ``Fbeta_2`` and ``ROC_AUC``.
    """
    best_model = grid_search.best_estimator_

    # The columns of predict_proba follow `classes_`, so every per-class
    # computation below uses that same ordering instead of the labels that
    # happen to be present in y_test.
    classes = best_model.classes_
    class_names = [
        f"{c:g}" if isinstance(c, (float, np.floating)) else str(c)
        for c in classes
    ]

    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    fbeta = fbeta_score(y_test, y_pred, average='weighted', beta=2)

    # Pass the raw labels (not a binarized matrix): with an indicator matrix
    # `multi_class` is ignored, and the value would not be comparable with the
    # 'roc_auc_ovo_weighted' scorer used during cross-validation.
    roc_auc = roc_auc_score(
        y_test, y_proba, multi_class='ovo', average='weighted', labels=classes
    )

    test_metrics = {
        'ACURACIA': accuracy,
        'PRECISAO': precision,
        'REVOCACAO': recall,
        'F1': f1,
        'Fbeta_2': fbeta,
        'ROC_AUC': roc_auc,
    }

    print(f"ACURÁCIA no conjunto de teste: {accuracy}")
    print(f"PRECISÃO no conjunto de teste: {precision}")
    print(f"REVOCAÇÃO no conjunto de teste: {recall}")
    print(f"F1 no conjunto de teste: {f1}")
    print(f"Fbeta no conjunto de teste (com beta = 2): {fbeta}")
    print(f"Área sob a curva ROC no conjunto de teste: {roc_auc}")

    print('Matriz de confusão:')
    conf_matrix = confusion_matrix(y_test, y_pred, labels=classes)

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt="d", xticklabels=class_names, yticklabels=class_names)
    plt.title('Matriz de Confusão')
    plt.tight_layout()
    plt.show()

    print('Area sob a curva ROC por classe:')

    # One-vs-rest indicator matrix, one column per class in `classes_` order,
    # used only to draw the per-class ROC curves.
    y_test_bin = label_binarize(y_test, classes=classes)

    plt.figure(figsize=(8, 6))
    for i in range(y_test_bin.shape[1]):
        fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_proba[:, i])
        plt.plot(fpr, tpr, marker='.', label=f'NSP {class_names[i]}')

    plt.xlabel('Falso positivo')
    plt.ylabel('Positivos')
    plt.title('Curva ROC para cada NSP')
    plt.legend(loc='best')
    plt.tight_layout()
    plt.show()

    return test_metrics

In [8]:
# Scorer wrapping fbeta_score with beta=2 and weighted averaging, so it can be
# passed as a `scoring` value to the grid search.
fbeta_scorer = make_scorer(
    fbeta_score,
    beta=2,
    average='weighted',
)

In [9]:
# Metric name -> scoring specification handed to GridSearchCV. Every entry is
# a scikit-learn scorer string except Fbeta_2, which uses the custom scorer.
scores = dict(
    ACURACIA='accuracy',
    PRECISAO='precision_weighted',
    REVOCACAO='recall_weighted',
    F1='f1_weighted',
    Fbeta_2=fbeta_scorer,
    ROC_AUC='roc_auc_ovo_weighted',
)

In [10]:
# Classifier key (as used in `classifiers`) -> MLflow experiment display name.
experiment_mapping = dict(
    KNN='KNN',
    DecisionTree='Decision Tree',
    MLP='MLP',
    NaiveBayes='Naive Bayes',
    SVM='SVM',
    RandomForest='Random Forest',
    LogisticRegression='Logistic Regression',
)

In [11]:
# Fixed seed so the stochastic estimators (tree, MLP, SVC probability
# calibration, forest, logistic regression) are reproducible across runs.
RANDOM_STATE = 42

# Candidate values shared by the LogisticRegression sub-grids below.
_lr_c_values = [0.01, 0.1, 1, 10, 100]
_lr_max_iters = [100, 200, 300]

# Classifier key -> estimator to tune and the hyper-parameter grid to search.
classifiers = {
    'KNN': {
        'model': KNeighborsClassifier(),
        'param_grid': {
            'n_neighbors': [3, 5, 8, 10, 12, 15, 18, 20, 21, 22, 25],
            'weights': ['uniform', 'distance']
        }
    },
    'DecisionTree': {
        'model': DecisionTreeClassifier(random_state=RANDOM_STATE),
        'param_grid': {
            'criterion': ['gini', 'entropy'],
            'max_depth': [None, 10, 20, 30, 40, 50]
        }
    },
    'MLP': {
        'model': MLPClassifier(max_iter=1000, random_state=RANDOM_STATE),
        'param_grid': {
            'hidden_layer_sizes': [(50,50), (100,)],
            'activation': ['tanh', 'relu'],
            'solver': ['sgd', 'adam'],
            'alpha': [0.0001, 0.05],
            'learning_rate': ['constant','adaptive'],
        }
    },
    'NaiveBayes': {
        'model': GaussianNB(),
        'param_grid': {
            'var_smoothing': [1e-09, 1e-08, 1e-07]
        }
    },
    'SVM': {
        # probability=True is required because the evaluation calls predict_proba.
        'model': SVC(probability=True, random_state=RANDOM_STATE),
        'param_grid': {
            'C': [0.1, 1, 10, 100],
            'gamma': [1, 0.1, 0.01, 0.001],
            'kernel': ['linear', 'rbf', 'poly', 'sigmoid']
        }
    },
    'RandomForest': {
        'model': RandomForestClassifier(random_state=RANDOM_STATE),
        'param_grid': {
            'n_estimators': [10, 50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'criterion': ['gini', 'entropy']
        }
    },
    'LogisticRegression': {
        'model': LogisticRegression(random_state=RANDOM_STATE),
        # A single cross-product of every solver with every penalty contains
        # mostly unsupported pairs, and each of those is a failed fit. The
        # grid is therefore split into sub-grids of compatible combinations.
        'param_grid': [
            {
                'solver': ['liblinear'],
                'penalty': ['l1', 'l2'],
                'C': _lr_c_values,
                'max_iter': _lr_max_iters
            },
            {
                'solver': ['lbfgs', 'newton-cg', 'sag'],
                'penalty': ['l2'],
                'C': _lr_c_values,
                'max_iter': _lr_max_iters
            },
            {
                'solver': ['saga'],
                'penalty': ['l1', 'l2'],
                'C': _lr_c_values,
                'max_iter': _lr_max_iters
            },
            {
                # elasticnet only fits when an l1_ratio is supplied.
                'solver': ['saga'],
                'penalty': ['elasticnet'],
                'l1_ratio': [0.5],
                'C': _lr_c_values,
                'max_iter': _lr_max_iters
            },
            {
                # No regularisation: C has no effect here, so it is not searched.
                # NOTE(review): `None` is the spelling used by recent
                # scikit-learn releases (the string 'none' was deprecated);
                # confirm against the installed version.
                'solver': ['lbfgs', 'newton-cg', 'sag', 'saga'],
                'penalty': [None],
                'max_iter': _lr_max_iters
            }
        ]
    }
}

# Modelos

Para cada modelo, vamos buscar o melhor conjunto de hiperparâmetros considerando cada métrica.

In [12]:
# Start the MLflow UI on port 5000 as a background child process.
# `subprocess.Popen` returns immediately on every platform, whereas a shell
# command ending in `&` only runs in the background on POSIX shells.
# Running through the kernel's interpreter (`-m mlflow`) uses the MLflow
# installed in this environment.
mlflow_ui_process = subprocess.Popen(
    [sys.executable, "-m", "mlflow", "ui", "--port", "5000"],
    stdout=subprocess.DEVNULL,
    stderr=subprocess.DEVNULL,
)

for clf_name, clf_info in classifiers.items():
    # One MLflow experiment per classifier.
    experiment_name = experiment_mapping[clf_name]
    mlflow.set_experiment(experiment_name)

    print(f"=== Treinando {clf_name} ===")

    # One grid search (and one MLflow run) per optimisation metric.
    for score_name, score_metric in scores.items():
        with mlflow.start_run(run_name=f"{clf_name}_{score_name}"):
            grid_search = GridSearchCV(
                estimator=clf_info['model'],
                param_grid=clf_info['param_grid'],
                cv=10,
                n_jobs=-1,
                verbose=1,
                scoring=score_metric
            )

            grid_search.fit(X_train, y_train)

            # Log the winning hyper-parameters.
            mlflow.log_params(grid_search.best_params_)

            # Log the best cross-validated score obtained on the training set.
            mlflow.log_metric(f"best_train_{score_name}", grid_search.best_score_)
            print(f"Melhor {score_name} no conjunto de treinamento: {grid_search.best_score_}")

            # Evaluate on the held-out test set and log every test metric.
            test_metrics = evaluate_best_model(grid_search, X_test, y_test)
            for metric_name, metric_value in test_metrics.items():
                mlflow.log_metric(f"test_{metric_name}", metric_value)

            # Save this run's metrics as a one-row CSV.
            metrics_data = {
                "clf_name": clf_name,
                "score_name": score_name,
                "best_params": grid_search.best_params_,
                "best_train_score": grid_search.best_score_,
                **test_metrics
            }
            metrics_csv_path = f"./metrics_{clf_name}_{score_name}.csv"
            pd.DataFrame([metrics_data]).to_csv(metrics_csv_path, index=False)

            # Log the refitted best estimator together with its input/output signature.
            signature = infer_signature(X_train, grid_search.predict(X_train))
            mlflow.sklearn.log_model(grid_search.best_estimator_, f"{clf_name}_model", signature=signature)

            # The `with` block ends the run on exit, so no explicit
            # mlflow.end_run() call is needed here.

    experiment_id = mlflow.get_experiment_by_name(experiment_name).experiment_id

    # Fetch every run of this experiment; `experiment_ids` takes a list of ids.
    runs_df = mlflow.search_runs(experiment_ids=[experiment_id])

    # Save all runs of the experiment to a CSV file.
    runs_csv_path = f"./runs_{clf_name}.csv"
    runs_df.to_csv(runs_csv_path, index=False)


### Recuperando dados para análise

In [None]:
# Expose the local MLflow UI (port 5000) through a public ngrok tunnel.
ngrok.kill()  # stop any ngrok process left over from a previous run of this cell

# Credentials must not be written into the notebook: the token is read from
# the NGROK_AUTH_TOKEN environment variable. When the variable is unset, the
# token is left untouched instead of being overwritten with an empty string.
NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN", "")
if NGROK_AUTH_TOKEN:
    ngrok.set_auth_token(NGROK_AUTH_TOKEN)

ngrok_tunnel = ngrok.connect(addr="5000", proto="http", bind_tls=True)
print("MLflow Tracking UI:", ngrok_tunnel.public_url)

MLflow Tracking UI: https://ce9e-35-194-31-197.ngrok-free.app
