# 1. Settings

## 1.1 User variables

In [1]:
NB_RUNS = 10

USE_DISCRETE = True

#TARGET = "charcoal_change"

NB_TOP_MODELS = 10
TRAIN_SIZE = 0.8
FOLD_STRATEGY = "kfold"
N_FOLD = 5
FIX_IMBALANCE = True # Good for decision tree
FEATURE_SELECTION = False
N_FEATURES_TO_SELECT = 0.2
REMOVE_MULTICOLLINEARITY = False # Good for ada boost sometime
PREPROCESS = True
TRANSFORMATION = True
TRANSFORMATION_METHOD = 'yeo-johnson'
NORMALIZE = True
NORMALIZE_METHOD = "zscore" #maxabs

PCA = False
POLYNOMIAL_FEATURES = False

#SORT_METRIC = "MAE"
#OPTIMIZE_METRIC = "MAE"
SORT_METRIC = "AUC"
OPTIMIZE_METRIC = "AUC"

EXCLUDED_MODELS = ["lar"]#["catboost", "nb", "dummy", "lr", "qda", "knn", "lda", "svm", "ridge"] # ["catboost", "Naive Bayes", "Dummy Classifier", "Logistic Regression", "Quadratic Discriminant Analysis", "K Neighbors Classifier"]

# 1.2 Dev variables

In [2]:
RANDOM_SEED = 0

INPUT_DATA = "tmp/data_processed.csv"
INPUT_DATA_DISCRETE = "tmp/data_discrete.csv"

OUTPUT_SCORES_PATH = "tmp/hap_pycaret_scores.csv"
VERBOSE = True

## 1.3 Imports

In [3]:
import pandas as pd
import numpy as np
import random
#from pycaret.regression import *
from pycaret.classification import *

from config import *

random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

## 1.4 Constants

In [4]:
# Constants
DEBUG = False

# 2. Data Loading

In [5]:
if USE_DISCRETE:
    df = pd.read_csv(INPUT_DATA_DISCRETE)
else:
    df = pd.read_csv(INPUT_DATA)

#df = df[df[TARGET] >= MIN_CHARCOAL]
display(df)

Unnamed: 0,M15,Juniperus,Larix,Picea,Pinus,Alnus,Betula,Fraxinus,Populus,Quercus,...,Ambrosia,Artemesia,Asteraceae,Chenopodiaceae,Cyperaceae,Myrica,Poaceae,Typha,Aquatics,Ambrosia_prev
0,-1,-1,1,1,-1,1,-1,1,-1,1,...,-1,1,1,1,-1,1,1,1,0,1
1,1,-1,-1,-1,1,1,1,-1,1,-1,...,1,-1,1,-1,1,-1,-1,-1,0,1
2,-1,1,-1,1,1,-1,-1,1,1,-1,...,1,-1,1,-1,1,-1,-1,1,0,1
3,-1,1,1,-1,-1,1,1,-1,0,1,...,-1,1,0,1,1,1,-1,0,0,0
4,1,0,1,1,-1,-1,-1,1,-1,1,...,-1,-1,-1,-1,-1,-1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,1,-1,-1,-1,1,0,-1,-1,-1,-1,...,-1,-1,1,1,1,1,-1,-1,0,1
57,-1,-1,1,1,-1,1,1,1,1,1,...,1,1,0,-1,1,-1,-1,1,-1,1
58,1,-1,1,1,-1,-1,1,0,1,-1,...,-1,-1,0,-1,-1,1,1,1,1,1
59,-1,1,-1,-1,1,1,1,-1,-1,-1,...,1,1,0,1,-1,1,1,-1,-1,2


# Pycaret AutoML

In [6]:
df_scores = None

setup(df, target = TARGET, session_id = RANDOM_SEED,
    train_size=TRAIN_SIZE,
    fold_strategy=FOLD_STRATEGY,
    fold=N_FOLD,
    #fix_imbalance=FIX_IMBALANCE,
    preprocess=PREPROCESS,
    feature_selection=FEATURE_SELECTION,
    n_features_to_select=N_FEATURES_TO_SELECT,
    remove_multicollinearity=REMOVE_MULTICOLLINEARITY,
    normalize=NORMALIZE,
    normalize_method=NORMALIZE_METHOD,
    pca=PCA,
    polynomial_features=POLYNOMIAL_FEATURES,
    index=False,
    n_jobs=-1
    )

print("Val Scores")
top = compare_models(n_select = NB_TOP_MODELS,
                    sort=SORT_METRIC,
                    exclude=EXCLUDED_MODELS,
                    verbose=VERBOSE,
                    errors="raise")
df_results = pull()
df_results

Unnamed: 0,Description,Value
0,Session id,0
1,Target,Poaceae
2,Target type,Binary
3,Target mapping,"-1: 0, 1: 1"
4,Original data shape,"(61, 22)"
5,Transformed data shape,"(61, 22)"
6,Transformed train set shape,"(48, 22)"
7,Transformed test set shape,"(13, 22)"
8,Numeric features,21
9,Preprocess,True


Val Scores


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.6089,0.6983,0.6033,0.6233,0.6048,0.2144,0.22,0.344
dt,Decision Tree Classifier,0.6711,0.69,0.7467,0.65,0.6767,0.3514,0.3796,0.38
gbc,Gradient Boosting Classifier,0.6133,0.6444,0.6067,0.5967,0.598,0.2094,0.2117,0.01
xgboost,Extreme Gradient Boosting,0.6311,0.6002,0.5967,0.66,0.6215,0.2562,0.26,0.028
nb,Naive Bayes,0.5222,0.5883,0.5967,0.519,0.5122,0.0976,0.1201,0.384
rf,Random Forest Classifier,0.5644,0.551,0.4967,0.6133,0.5205,0.1297,0.1534,0.378
knn,K Neighbors Classifier,0.5244,0.5104,0.61,0.5422,0.5466,0.0649,0.046,0.394
lightgbm,Light Gradient Boosting Machine,0.4156,0.5,0.4,0.1556,0.2231,0.0,0.0,0.01
dummy,Dummy Classifier,0.4156,0.5,0.4,0.1556,0.2231,0.0,0.0,0.008
lda,Linear Discriminant Analysis,0.5,0.4931,0.47,0.4957,0.4693,-0.0302,-0.0317,0.008


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.6089,0.6983,0.6033,0.6233,0.6048,0.2144,0.22,0.344
dt,Decision Tree Classifier,0.6711,0.69,0.7467,0.65,0.6767,0.3514,0.3796,0.38
gbc,Gradient Boosting Classifier,0.6133,0.6444,0.6067,0.5967,0.598,0.2094,0.2117,0.01
xgboost,Extreme Gradient Boosting,0.6311,0.6002,0.5967,0.66,0.6215,0.2562,0.26,0.028
nb,Naive Bayes,0.5222,0.5883,0.5967,0.519,0.5122,0.0976,0.1201,0.384
rf,Random Forest Classifier,0.5644,0.551,0.4967,0.6133,0.5205,0.1297,0.1534,0.378
knn,K Neighbors Classifier,0.5244,0.5104,0.61,0.5422,0.5466,0.0649,0.046,0.394
lightgbm,Light Gradient Boosting Machine,0.4156,0.5,0.4,0.1556,0.2231,0.0,0.0,0.01
dummy,Dummy Classifier,0.4156,0.5,0.4,0.1556,0.2231,0.0,0.0,0.008
lda,Linear Discriminant Analysis,0.5,0.4931,0.47,0.4957,0.4693,-0.0302,-0.0317,0.008


# DEBUG

In [7]:
evaluate_model(top)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

# TODO