In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import PowerTransformer, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score


import numpy as np
import pandas as pd
import os
import sys

sys.path.append('../src/')
from dataloader import *
from logging_utils import *


import mlflow
import optuna

# start mlflow server from terminal: $mlflow server mlflow server --host 127.0.0.1 --port 8080
mlflow.set_tracking_uri("http://127.0.0.1:8080")
optuna.logging.set_verbosity(optuna.logging.ERROR)

%reload_ext autoreload

## Load Data

In [2]:
PATH = '../data/'

df = load_train_df(
    PATH,
    decode_dummies=True,
    add_geo_features=True
    )

In [3]:
y = df['Cover_Type']
X = df.drop(['Cover_Type'], axis=1)

In [4]:
from sklearn.model_selection import cross_val_predict

def objective(trial, experiment_id):
    with mlflow.start_run(experiment_id=experiment_id, nested=True):
        # Define hyperparameters
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 100),
            "max_depth": trial.suggest_int("max_depth", 2, 8),
            "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
        }

        model = RandomForestClassifier(**params)

        # TODO THE FOLLOWING IS NOT VERY SOUND: CV SCORE AND CMATRIX ON DIFF SPLITS
        acc = cross_val_score(model, X, y, cv=3, scoring='accuracy').mean()
        f1 = cross_val_score(model, X, y, cv=3, scoring='f1_macro').mean()
        
        metrics = {
            "accuracy": acc,
            "f1_macro": f1,

        }

        y_pred = cross_val_predict(model, X, y)
        
        # fig = plot_confusion_matrix(y, y_pred)
        # mlflow.log_figure(fig, "confusion_matrix.png")

        mlflow.log_params(params)
        mlflow.log_metrics(metrics)
        # NOTE model is not logged to mlflow: eval if worth and then save it

        return acc


In [5]:
# NOTE: create a new experiment whenever the data changes
# create mlflow experiment
confirm = input("Launch new mlflow experiment? (y/n)")
if confirm == "y":
    experiment_id = get_or_create_experiment("First Optuna Experiment")
    mlflow.set_experiment(experiment_id)

    run_name = input("Enter run name: ")
    with mlflow.start_run(
        experiment_id=experiment_id,
        run_name=run_name
        ):

        study = optuna.create_study(direction="maximize")

        study.optimize(
            lambda trial: objective(trial, experiment_id),
            n_trials=10,
            callbacks=[champion_callback]
            )

        mlflow.log_params(study.best_params)
        mlflow.log_metric("best_mse", study.best_value)
        mlflow.log_metric("best_rmse", np.sqrt(study.best_value))

        best_model = RandomForestClassifier(**study.best_params)
        best_model.fit(X, y)
        y_pred = best_model.predict(X)

        metrics = {
            "accuracy": accuracy_score(y, y_pred),
            "f1_macro": f1_score(y, y_pred, average='macro'),
        }

        cm = plot_confusion_matrix(y, y_pred, style="tableau-colorblind10", plot_size=(8, 8))

        mlflow.log_figure(cm, "confusion_matrix.png")
        mlflow.log_metrics(metrics)
        mlflow.sklearn.log_model(best_model, "best_model")
        mlflow.set_tag("mlflow.note.content", "This is a test run")

else:
    print("No new experiment created")
    sys.exit(0)


2024/02/17 14:38:29 INFO mlflow.tracking.fluent: Experiment with name '878289205600432043' does not exist. Creating a new experiment.


Trial 0 value: 0.7469576719576719
Trial 4 value: 0.7672619047619048 with  2.6463% improvement
Trial 9 value: 0.7685846560846561 with  0.1721% improvement
