In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import PowerTransformer, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score
# from catboost import CatBoostRegressor


import numpy as np
import pandas as pd
import os
import sys

# Make the project's source directory importable. The membership check keeps
# the cell idempotent: re-running it does not pile up duplicate sys.path entries.
# The path is relative, so it resolves against the kernel's working directory.
SRC_DIR = '../../src/'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)
from dataloader import *
from logging_utils import *
from estimators import *


import mlflow
import optuna

# start mlflow server from terminal: $mlflow server mlflow server --host 127.0.0.1 --port 8080
mlflow.set_tracking_uri("http://127.0.0.1:8080")
optuna.logging.set_verbosity(optuna.logging.ERROR)

%reload_ext autoreload

## Load Data

In [2]:
# Directory holding the raw data, relative to the kernel's working directory.
PATH = '../../data/'

# Options forwarded to the project loader.
# NOTE(review): presumably these decode the one-hot dummy columns and add
# derived geographic features - confirm against load_train_df.
load_options = {
    'decode_dummies': True,
    'add_geo_features': True,
}

df = load_train_df(PATH, **load_options)

In [3]:
# Separate the target column from the predictors, then hold out 20% of the
# rows for validation. The fixed random_state keeps the split repeatable.
TARGET = 'Cover_Type'

X = df.drop(columns=[TARGET])
y = df[TARGET]

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# NOTE: create a new experiment whenever the data changes

# Number of Optuna trials to run in the study.
N_TRIALS = 3
# Seed for the Optuna sampler so repeated runs propose the same sequence of
# hyperparameters (same value as the random_state used for the data split).
SAMPLER_SEED = 42

# Interactive guard: nothing is logged unless the user explicitly confirms.
# The answer is normalised so "Y" or " y " are accepted as well as "y".
confirm = input("Launch new mlflow experiment? (y/n)").strip().lower()
if confirm == "y":
    experiment_id = get_or_create_experiment(input("Enter experiment name: "))
    mlflow.set_experiment(experiment_id=experiment_id)

    run_name = input("Enter run name: ")
    # Everything logged inside this context is attached to the run named above.
    with mlflow.start_run(
        experiment_id=experiment_id,
        run_name=run_name
        ):

        # TPESampler is Optuna's default single-objective sampler; passing it
        # explicitly only adds the seed that makes the search reproducible.
        study = optuna.create_study(
            direction="maximize",
            sampler=optuna.samplers.TPESampler(seed=SAMPLER_SEED),
            )

        # `objective` and `champion_callback` are not defined in this notebook;
        # presumably they come from the star-imported project modules.
        study.optimize(
            lambda trial: objective(
                trial,
                experiment_id,
                X_train,
                y_train),
            n_trials=N_TRIALS,
            callbacks=[champion_callback]
            )

        # Record the best hyperparameters found by the study on the MLflow run.
        mlflow.log_params(study.best_params)
        # TODO assign parameters to best model properly
        # best_model = get_estimator('passthrough')
        # best_model.set_params(**study.best_params)
        # best_model.fit(X_train, y_train)
        # y_pred = best_model.predict(X_val)

        # metrics = {
        #     "accuracy": accuracy_score(y_val, y_pred),
        #     "f1_macro": f1_score(y_val, y_pred, average='macro'),
        # }

        # cm = plot_confusion_matrix(y_val, y_pred)

        # mlflow.log_figure(cm, "confusion_matrix.png")
        # mlflow.log_metrics(metrics)
        # mlflow.sklearn.log_model(best_model, "best_model")
        # mlflow.set_tag("mlflow.note.content", "This is a test run")

else:
    print("No new experiment created")
    # Raises SystemExit, ending the cell here.
    # NOTE(review): presumably meant to stop a "Run All" from continuing into
    # later cells that need `study` - confirm this is still wanted.
    sys.exit(0)




Trial 0 value: 0.8111655657552076




Trial 1 value: 0.8238107199971468 with  1.5350% improvement


