## Exercício 1
### Verifique a documentação dos modelos RandomForestClassifier, LogisticRegression, KNeighborsClassifier e GradientBoostingClassifier, altere ou inclua algum parâmetro dos modelos e compare os resultados com o baseline executado nesse notebook.

### Import libs

In [None]:
%pip install pandas
%pip install sklearn
%pip install mlflow

# Manipulação e visualização de dados
import pandas as pd
import time

# Bibliotecas para aprendizado de máquina
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score


# MLflow para gerenciamento de experimentos
import mlflow

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides warnings raised by the libraries
# used below; consider narrowing it to specific categories.
import warnings
warnings.filterwarnings("ignore")

### Constants

In [None]:
# Directory (relative path) from which the train/test CSV files are read.
DATA_PATH = "../data"

### Carrega dados

In [None]:
# Read the four CSV splits (features and targets, train and test) in one pass;
# the tuple order matches the unpacking order on the left-hand side.
x_train, x_test, y_train, y_test = map(
    pd.read_csv,
    (
        f"{DATA_PATH}/x_train.csv",
        f"{DATA_PATH}/x_test.csv",
        f"{DATA_PATH}/y_train.csv",
        f"{DATA_PATH}/y_test.csv",
    ),
)

#### Utils

In [None]:
def avalia_modelo(models: dict) -> pd.Series:
    """Train, score and log every candidate model, then return the best one.

    Each model is fitted on the notebook-level ``x_train``/``y_train``, scored
    on ``x_test``/``y_test`` and recorded as its own MLflow run.

    Args:
        models: Mapping of run name to a dict with two keys: ``"model"`` (an
            unfitted estimator exposing ``fit``/``predict``) and ``"params"``
            (dict of hyperparameters to log with the run).

    Returns:
        The row of the best candidate, as selected by ``check_best_df``.

    Raises:
        ValueError: If ``models`` is empty.
    """
    if not models:
        # Fail early with a clear message instead of an obscure KeyError
        # raised later when sorting an empty results table.
        raise ValueError("Nenhum modelo informado para avaliação.")

    results = []

    for name, model_value in models.items():
        model = model_value["model"]
        params = model_value["params"]

        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by system clock adjustments (unlike time.time()).
        # NOTE(review): y_train comes from pd.read_csv, i.e. it is a DataFrame;
        # confirm it holds a single target column and consider passing a 1-D
        # array to fit().
        inicio = time.perf_counter()
        model.fit(x_train, y_train)  # Training
        tempo_treino = time.perf_counter() - inicio

        # Predictions on the held-out split
        y_pred = model.predict(x_test)

        # Metrics
        acuracia = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average="weighted")

        # Record the run in MLflow: name, hyperparameters, metrics and model
        with mlflow.start_run(run_name=name):
            mlflow.log_param("Modelo", name)

            for param_key, param_value in params.items():
                mlflow.log_param(param_key, param_value)

            mlflow.log_metric("Acurácia", acuracia)
            mlflow.log_metric("F1-Score", f1)
            mlflow.log_metric("Tempo de Treinamento", tempo_treino)
            mlflow.sklearn.log_model(model, "modelo")

        # Keep the fitted model and its params next to the scores so the
        # winning row carries everything needed to log it again later.
        results.append({
            "Modelo": name,
            "Acurácia": acuracia,
            "F1-Score": f1,
            "Tempo de Treinamento (s)": tempo_treino,
            "model": model,
            "params": params
        })
        print(f"Modelo {name} treinado e registrado no MLflow.")

    df_results = pd.DataFrame(results)
    return check_best_df(df_results)

def check_best_df(df: pd.DataFrame) -> pd.Series:
    """Rank the results table and return its best row.

    Rows are ordered by highest ``"Acurácia"`` first; ties are broken by the
    lowest ``"Tempo de Treinamento (s)"``. The ranked table and the winner's
    name are printed. The input frame is not modified.

    Args:
        df: One row per evaluated model, with at least the columns
            ``"Modelo"``, ``"Acurácia"`` and ``"Tempo de Treinamento (s)"``.

    Returns:
        The top-ranked row.

    Raises:
        ValueError: If ``df`` has no rows.
    """
    if df.empty:
        raise ValueError("Nenhum resultado para comparar.")

    # Sort into a new frame so the caller's DataFrame keeps its original order.
    ranked = df.sort_values(
        by=["Acurácia", "Tempo de Treinamento (s)"],
        ascending=[False, True],
    )
    print("Resultado da comparação:")
    print(ranked)
    df_best = ranked.iloc[0]
    print(f"Melhor Modelo: {df_best['Modelo']}")
    return df_best

#### Random Forest

In [None]:
randon_forest_baseline_key = "Random Forest"

# Each candidate is described once by its hyperparameters; the same dict builds
# the estimator and is stored as the params to be logged for the run.
randon_forest_models = {
    run_name: {
        "model": RandomForestClassifier(**hyperparams),
        "params": hyperparams,
    }
    for run_name, hyperparams in {
        "Random Forest": {"random_state": 42},
        "Random Forest with 500 estimators": {
            "random_state": 42,
            "n_estimators": 500,
        },
        "Random Forest with 10 max depth": {
            "random_state": 42,
            "max_depth": 10,
        },
        "Random Forest with 500 estimators and 10 max depth": {
            "random_state": 42,
            "n_estimators": 500,
            "max_depth": 10,
        },
    }.items()
}

# Train, score and log every variant; keep the best-ranked row.
best_radon_forest = avalia_modelo(randon_forest_models)

#### Logistic Regression

In [None]:
logistic_regression_baseline_key = "Logistic Regression"

# Each candidate is described once by its hyperparameters; the same dict builds
# the estimator and is stored as the params to be logged for the run.
# NOTE(review): scikit-learn documents intercept_scaling as useful only with
# solver="liblinear" and fit_intercept=True; no solver is set here, so confirm
# the intercept_scaling variants actually differ from the baseline.
logistic_regression_models = {
    run_name: {
        "model": LogisticRegression(**hyperparams),
        "params": hyperparams,
    }
    for run_name, hyperparams in {
        "Logistic Regression": {
            "max_iter": 1000,
            "random_state": 42,
        },
        "Logistic Regression with 10 intercept scaling": {
            "max_iter": 1000,
            "random_state": 42,
            "intercept_scaling": 10,
        },
        "Logistic Regression with False fit intercept": {
            "max_iter": 1000,
            "random_state": 42,
            "fit_intercept": False,
        },
        "Logistic Regression with 10 intercept scaling and False fit intercept": {
            "max_iter": 1000,
            "random_state": 42,
            "intercept_scaling": 10,
            "fit_intercept": False,
        },
    }.items()
}

# Train, score and log every variant; keep the best-ranked row.
best_logistic_regression = avalia_modelo(logistic_regression_models)

#### K Neighbors Classifier

In [None]:
# NOTE(review): unlike the other model families, this baseline key does not
# match the baseline entry's key below ("K Neighbors Classifier"); confirm
# which name is intended before using it for lookups.
k_neighbors_classifier_baseline_key = "K-Nearest Neighbors"

# Each candidate is described once by its hyperparameters and the estimator is
# built from that same dict, so the logged params always match the model.
# (The last entry previously logged n_neighbors=50 while the estimator was
# built with n_neighbors=10.)
k_neighbors_classifier_models = {
    run_name: {
        "model": KNeighborsClassifier(**hyperparams),
        "params": hyperparams,
    }
    for run_name, hyperparams in {
        "K Neighbors Classifier": {},
        "K Neighbors Classifier with 10 in neighbors": {
            "n_neighbors": 10,
        },
        "K Neighbors Classifier with 50 leaf size": {
            "leaf_size": 50,
        },
        "K Neighbors Classifier with 10 in neighbors and  50 leaf size": {
            "n_neighbors": 10,
            "leaf_size": 50,
        },
    }.items()
}

# Train, score and log every variant; keep the best-ranked row.
best_k_neighbors_classifier = avalia_modelo(k_neighbors_classifier_models)

#### Gradient Boosting

In [None]:
gradient_boosting_classifier_baseline_key = "Gradient Boosting"

# Each candidate is described once by its hyperparameters and the estimator is
# built from that same dict, so the logged params always match the model.
# (The baseline entry previously logged its seed under the misspelled key
# "randon_state".)
gradient_boosting_classifier_models = {
    run_name: {
        "model": GradientBoostingClassifier(**hyperparams),
        "params": hyperparams,
    }
    for run_name, hyperparams in {
        "Gradient Boosting": {
            "random_state": 42,
        },
        "Gradient Boosting with 200 n estimators": {
            "random_state": 42,
            "n_estimators": 200,
        },
        "Gradient Boosting with 10 max depth": {
            "random_state": 42,
            "max_depth": 10,
        },
        "Gradient Boosting with 200 n estimators and 10 max depth": {
            "random_state": 42,
            "n_estimators": 200,
            "max_depth": 10,
        },
    }.items()
}

# Train, score and log every variant; keep the best-ranked row.
best_gradient_boosting_classifier = avalia_modelo(gradient_boosting_classifier_models)

#### Best of best

In [None]:
# Stack the winning row of every model family into one comparison table; each
# Series is turned into a single-row frame before concatenation.
best_results = pd.concat(
    [
        family_winner.to_frame().T
        for family_winner in (
            best_radon_forest,
            best_logistic_regression,
            best_k_neighbors_classifier,
            best_gradient_boosting_classifier,
        )
    ],
    ignore_index=True,
    sort=False,
)

best_results.head()
# Rank the family winners against each other and keep the overall best row.
best_result = check_best_df(best_results)

#### Salvando o melhor modelo com hiperparâmetros

In [None]:
best_model_name = best_result["Modelo"]

# Record the overall winner as a dedicated MLflow run.
with mlflow.start_run(run_name="Melhor Modelo com hiperparâmetros"):
    mlflow.log_param("Modelo", best_model_name)

    # The run is named "com hiperparâmetros", so log the winner's
    # hyperparameters as well (they travel in the "params" column of the row).
    for param_key, param_value in best_result["params"].items():
        mlflow.log_param(param_key, param_value)

    mlflow.log_metric("Acurácia", best_result["Acurácia"])
    mlflow.log_metric("F1-Score", best_result["F1-Score"])
    mlflow.log_metric("Tempo de Treinamento", best_result["Tempo de Treinamento (s)"])
    mlflow.sklearn.log_model(best_result["model"], "modelo")

print(f"Melhor modelo ({best_model_name}) armazenado com sucesso no MLflow.")