# 1. Settings

## 1.1 User variables

In [1]:
# --- Run control ------------------------------------------------------------
# NOTE(review): NB_RUNS is not read by any cell visible in this section.
NB_RUNS = 10

# --- Data selection ---------------------------------------------------------
# When true, the data-loading cell reads INPUT_DATA_DISCRETE instead of INPUT_DATA.
USE_DISCRETE = True

# TARGET is not assigned in this cell; an earlier value is kept for reference.
#TARGET = "charcoal_change"

# --- Train / validation protocol (forwarded to setup()) ---------------------
TRAIN_SIZE = 0.8
FOLD_STRATEGY = "kfold"
N_FOLD = 5

# --- Preprocessing switches -------------------------------------------------
PREPROCESS = True
NORMALIZE = True
NORMALIZE_METHOD = "zscore"  # alternative tried: maxabs
# NOTE(review): TRANSFORMATION / TRANSFORMATION_METHOD are not forwarded by the
# setup() call visible in this notebook -- confirm whether that is intended.
TRANSFORMATION = True
TRANSFORMATION_METHOD = "yeo-johnson"
# Good for decision tree.
# NOTE(review): the matching fix_imbalance argument is commented out in setup().
FIX_IMBALANCE = True
REMOVE_MULTICOLLINEARITY = False  # Good for ada boost sometimes
FEATURE_SELECTION = False
N_FEATURES_TO_SELECT = 0.2
PCA = False
POLYNOMIAL_FEATURES = False

# --- Model comparison -------------------------------------------------------
# Number of models compare_models() is asked to return (n_select).
NB_TOP_MODELS = 10

# Metric used to rank models in compare_models(); "MAE" was used previously.
#SORT_METRIC = "MAE"
#OPTIMIZE_METRIC = "MAE"
SORT_METRIC = "AUC"
# NOTE(review): OPTIMIZE_METRIC is not read by any cell visible in this section.
OPTIMIZE_METRIC = "AUC"

# Model IDs passed as `exclude` to compare_models().
# Earlier exclusion lists, kept for reference:
#   ["catboost", "nb", "dummy", "lr", "qda", "knn", "lda", "svm", "ridge"]
#   ["catboost", "Naive Bayes", "Dummy Classifier", "Logistic Regression",
#    "Quadratic Discriminant Analysis", "K Neighbors Classifier"]
# NOTE(review): "lar" looks like a regression-only model ID -- confirm it is
# meaningful with pycaret.classification.
EXCLUDED_MODELS = ["lar"]

## 1.2 Dev variables

In [2]:
# Seed reused for Python's and NumPy's RNGs and as PyCaret's session_id.
RANDOM_SEED = 0

# Input CSVs; the data-loading cell picks one of them based on USE_DISCRETE.
INPUT_DATA_DISCRETE = "tmp/data_discrete.csv"
INPUT_DATA = "tmp/data_processed.csv"

# Forwarded as `verbose` to compare_models().
VERBOSE = True

# NOTE(review): OUTPUT_SCORES_PATH is not written to by any cell visible in
# this section -- confirm it is still needed.
OUTPUT_SCORES_PATH = "tmp/hap_pycaret_scores.csv"

## 1.3 Imports

In [3]:
import pandas as pd
import numpy as np
import random
#from pycaret.regression import *
from pycaret.classification import *

from config import *

# Seed NumPy's legacy global RNG and Python's built-in RNG with the same value
# so that code relying on either generator is repeatable across runs.
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

## 1.4 Constants

In [4]:
# Constants
# Debug switch.
# NOTE(review): not read by any cell visible in this section -- confirm whether
# it is meant to guard the debug cell further down.
DEBUG = False

# 2. Data Loading

In [5]:
# Load the discretised dataset when USE_DISCRETE is set, otherwise the
# processed (non-discrete) one.
df = pd.read_csv(INPUT_DATA_DISCRETE if USE_DISCRETE else INPUT_DATA)

# Disabled row filter, kept for reference:
#df = df[df[TARGET] >= MIN_CHARCOAL]

# Rich display of the loaded frame.
display(df)

Unnamed: 0,M15,Juniperus,Larix,Picea,Pinus,Alnus,Betula,Fraxinus,Populus,Quercus,...,Ambrosia,Artemesia,Asteraceae,Chenopodiaceae,Cyperaceae,Myrica,Poaceae,Typha,Aquatics,Ambrosia_prev
0,-1,-1,1,1,-1,1,-1,1,-1,1,...,-1,1,1,1,-1,1,1,1,0,1
1,1,-1,-1,-1,1,1,1,-1,1,-1,...,1,-1,1,-1,1,-1,-1,-1,0,1
2,-1,1,-1,1,1,-1,-1,1,1,-1,...,1,-1,1,-1,1,-1,-1,1,0,1
3,-1,1,1,-1,-1,1,1,-1,0,1,...,-1,1,0,1,1,1,-1,0,0,0
4,1,0,1,1,-1,-1,-1,1,-1,1,...,-1,-1,-1,-1,-1,-1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,1,-1,-1,-1,1,0,-1,-1,-1,-1,...,-1,-1,1,1,1,1,-1,-1,0,1
57,-1,-1,1,1,-1,1,1,1,1,1,...,1,1,0,-1,1,-1,-1,1,-1,1
58,1,-1,1,1,-1,-1,1,0,1,-1,...,-1,-1,0,-1,-1,1,1,1,1,1
59,-1,1,-1,-1,1,1,1,-1,-1,-1,...,1,1,0,1,-1,1,1,-1,-1,2


# 3. PyCaret AutoML

In [6]:
# Placeholder for a scores table; nothing in this cell fills it.
df_scores = None

# Initialise the PyCaret experiment from the settings cells.
# TARGET is not assigned in the visible settings cells (its assignment there is
# commented out) -- presumably it is provided by `from config import *`; confirm.
# NOTE(review): TRANSFORMATION / TRANSFORMATION_METHOD are defined in the user
# variables but are not forwarded here, and fix_imbalance is commented out, so
# FIX_IMBALANCE currently has no effect.
setup(
    df,
    target=TARGET,
    session_id=RANDOM_SEED,
    index=False,
    n_jobs=-1,
    # Train / validation protocol
    train_size=TRAIN_SIZE,
    fold_strategy=FOLD_STRATEGY,
    fold=N_FOLD,
    # Preprocessing
    preprocess=PREPROCESS,
    #fix_imbalance=FIX_IMBALANCE,
    normalize=NORMALIZE,
    normalize_method=NORMALIZE_METHOD,
    remove_multicollinearity=REMOVE_MULTICOLLINEARITY,
    # Feature engineering / selection
    feature_selection=FEATURE_SELECTION,
    n_features_to_select=N_FEATURES_TO_SELECT,
    pca=PCA,
    polynomial_features=POLYNOMIAL_FEATURES,
)

# Cross-validate the candidate models, rank them by SORT_METRIC and keep the
# NB_TOP_MODELS best ones; errors="raise" surfaces a failing model instead of
# skipping it.
print("Val Scores")
top = compare_models(
    exclude=EXCLUDED_MODELS,
    sort=SORT_METRIC,
    n_select=NB_TOP_MODELS,
    verbose=VERBOSE,
    errors="raise",
)

# Grab the score grid that compare_models() just displayed and show it as the
# cell's output.
df_results = pull()
df_results

Unnamed: 0,Description,Value
0,Session id,0
1,Target,Ambrosia
2,Target type,Multiclass
3,Target mapping,"-1: 0, 0: 1, 1: 2"
4,Original data shape,"(61, 22)"
5,Transformed data shape,"(61, 22)"
6,Transformed train set shape,"(48, 22)"
7,Transformed test set shape,"(13, 22)"
8,Numeric features,21
9,Preprocess,True


Val Scores


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.5822,0.3891,0.5,0.614,0.528,0.2404,0.2696,0.4
catboost,CatBoost Classifier,0.4978,0.3844,0.4533,0.5722,0.4444,0.1244,0.1572,0.292
dt,Decision Tree Classifier,0.3911,0.3552,0.3467,0.4087,0.3426,-0.0109,-0.0117,0.392
et,Extra Trees Classifier,0.4556,0.3541,0.3533,0.5011,0.361,0.0357,0.0596,0.036
lr,Logistic Regression,0.4378,0.321,0.3867,0.4373,0.3934,-0.0328,-0.0398,0.48
rf,Random Forest Classifier,0.5,0.3191,0.3933,0.5622,0.4098,0.1186,0.1605,0.396
lda,Linear Discriminant Analysis,0.4822,0.3151,0.4467,0.5178,0.4574,0.0976,0.0651,0.006
ada,Ada Boost Classifier,0.42,0.3146,0.3067,0.4802,0.3289,-0.0166,-0.0152,0.012
nb,Naive Bayes,0.4578,0.3143,0.4133,0.4791,0.3967,0.0226,0.0443,0.386
xgboost,Extreme Gradient Boosting,0.4333,0.3003,0.3133,0.3456,0.3189,0.0004,-0.0049,0.02


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.5822,0.3891,0.5,0.614,0.528,0.2404,0.2696,0.4
catboost,CatBoost Classifier,0.4978,0.3844,0.4533,0.5722,0.4444,0.1244,0.1572,0.292
dt,Decision Tree Classifier,0.3911,0.3552,0.3467,0.4087,0.3426,-0.0109,-0.0117,0.392
et,Extra Trees Classifier,0.4556,0.3541,0.3533,0.5011,0.361,0.0357,0.0596,0.036
lr,Logistic Regression,0.4378,0.321,0.3867,0.4373,0.3934,-0.0328,-0.0398,0.48
rf,Random Forest Classifier,0.5,0.3191,0.3933,0.5622,0.4098,0.1186,0.1605,0.396
lda,Linear Discriminant Analysis,0.4822,0.3151,0.4467,0.5178,0.4574,0.0976,0.0651,0.006
ada,Ada Boost Classifier,0.42,0.3146,0.3067,0.4802,0.3289,-0.0166,-0.0152,0.012
nb,Naive Bayes,0.4578,0.3143,0.4133,0.4791,0.3967,0.0226,0.0443,0.386
xgboost,Extreme Gradient Boosting,0.4333,0.3003,0.3133,0.3456,0.3189,0.0004,-0.0049,0.02


# 4. Debug

In [7]:
# compare_models() was called with n_select=NB_TOP_MODELS, so `top` is a list of
# trained models whenever more than one model was requested, whereas
# evaluate_model() expects a single trained estimator. Inspect the best-ranked
# model (first element); a bare estimator (n_select == 1) is passed unchanged.
evaluate_model(top[0] if isinstance(top, list) else top)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

# TODO