# 1. Settings

## 1.1 User variables

In [1]:
NB_RUNS = 10

USE_DISCRETE = True

#TARGET = "charcoal_change"

NB_TOP_MODELS = 10
TRAIN_SIZE = 0.8
FOLD_STRATEGY = "kfold"
N_FOLD = 5
FIX_IMBALANCE = True # Good for decision tree
FEATURE_SELECTION = False
N_FEATURES_TO_SELECT = 0.2
REMOVE_MULTICOLLINEARITY = False # Good for ada boost sometime
PREPROCESS = True
TRANSFORMATION = True
TRANSFORMATION_METHOD = 'yeo-johnson'
NORMALIZE = True
NORMALIZE_METHOD = "zscore" #maxabs

PCA = False
POLYNOMIAL_FEATURES = False

#SORT_METRIC = "MAE"
#OPTIMIZE_METRIC = "MAE"
SORT_METRIC = "AUC"
OPTIMIZE_METRIC = "AUC"

EXCLUDED_MODELS = ["lar"]#["catboost", "nb", "dummy", "lr", "qda", "knn", "lda", "svm", "ridge"] # ["catboost", "Naive Bayes", "Dummy Classifier", "Logistic Regression", "Quadratic Discriminant Analysis", "K Neighbors Classifier"]

# 1.2 Dev variables

In [2]:
RANDOM_SEED = 0

INPUT_DATA = "tmp/data_processed.csv"
INPUT_DATA_DISCRETE = "tmp/data_discrete.csv"

OUTPUT_SCORES_PATH = "tmp/hap_pycaret_scores.csv"
VERBOSE = True

## 1.3 Imports

In [3]:
import pandas as pd
import numpy as np
import random
#from pycaret.regression import *
from pycaret.classification import *

from config import *

random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

## 1.4 Constants

In [4]:
# Constants
DEBUG = False

# 2. Data Loading

In [5]:
if USE_DISCRETE:
    df = pd.read_csv(INPUT_DATA_DISCRETE)
else:
    df = pd.read_csv(INPUT_DATA)

#df = df[df[TARGET] >= MIN_CHARCOAL]
display(df)

Unnamed: 0,M15,Juniperus,Larix,Picea,Pinus,Alnus,Betula,Fraxinus,Populus,Quercus,...,Ambrosia,Artemesia,Asteraceae,Chenopodiaceae,Cyperaceae,Myrica,Poaceae,Typha,Aquatics,Poaceae_prev
0,-1,-1,1,1,-1,1,-1,1,-1,1,...,-1,1,1,1,-1,1,1,1,0,0
1,1,-1,-1,-1,1,1,1,-1,1,-1,...,1,-1,1,-1,1,-1,-1,-1,0,0
2,-1,1,-1,1,1,-1,-1,1,1,-1,...,1,-1,1,-1,1,-1,-1,1,0,0
3,-1,1,1,-1,-1,1,1,-1,0,1,...,-1,1,0,1,1,1,-1,0,0,0
4,1,0,1,1,-1,-1,-1,1,-1,1,...,-1,-1,-1,-1,-1,-1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,1,-1,-1,-1,1,0,-1,-1,-1,-1,...,-1,-1,1,1,1,1,-1,-1,0,0
57,-1,-1,1,1,-1,1,1,1,1,1,...,1,1,0,-1,1,-1,-1,1,-1,1
58,1,-1,1,1,-1,-1,1,0,1,-1,...,-1,-1,0,-1,-1,1,1,1,1,1
59,-1,1,-1,-1,1,1,1,-1,-1,-1,...,1,1,0,1,-1,1,1,-1,-1,1


# Pycaret AutoML

In [6]:
df_scores = None

setup(df, target = TARGET, session_id = RANDOM_SEED,
    train_size=TRAIN_SIZE,
    fold_strategy=FOLD_STRATEGY,
    fold=N_FOLD,
    #fix_imbalance=FIX_IMBALANCE,
    preprocess=PREPROCESS,
    feature_selection=FEATURE_SELECTION,
    n_features_to_select=N_FEATURES_TO_SELECT,
    remove_multicollinearity=REMOVE_MULTICOLLINEARITY,
    normalize=NORMALIZE,
    normalize_method=NORMALIZE_METHOD,
    pca=PCA,
    polynomial_features=POLYNOMIAL_FEATURES,
    index=False,
    n_jobs=-1
    )

print("Val Scores")
top = compare_models(n_select = NB_TOP_MODELS,
                    sort=SORT_METRIC,
                    exclude=EXCLUDED_MODELS,
                    verbose=VERBOSE,
                    errors="raise")
df_results = pull()
df_results

Unnamed: 0,Description,Value
0,Session id,0
1,Target,Poaceae
2,Target type,Binary
3,Target mapping,"-1: 0, 1: 1"
4,Original data shape,"(61, 22)"
5,Transformed data shape,"(61, 22)"
6,Transformed train set shape,"(48, 22)"
7,Transformed test set shape,"(13, 22)"
8,Numeric features,21
9,Preprocess,True


Val Scores


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.6511,0.704,0.7467,0.6162,0.6618,0.2981,0.3243,0.292
rf,Random Forest Classifier,0.6267,0.649,0.5967,0.62,0.5921,0.2613,0.2677,0.378
nb,Naive Bayes,0.6289,0.6477,0.63,0.6433,0.6147,0.2772,0.2892,0.38
gbc,Gradient Boosting Classifier,0.6533,0.6456,0.6133,0.7,0.6311,0.3079,0.3288,0.012
et,Extra Trees Classifier,0.6111,0.6198,0.5633,0.6167,0.5856,0.2126,0.21,0.036
qda,Quadratic Discriminant Analysis,0.6467,0.595,0.57,0.7,0.5857,0.2069,0.2344,0.008
knn,K Neighbors Classifier,0.5244,0.584,0.71,0.5133,0.5825,0.0619,0.0833,0.382
dt,Decision Tree Classifier,0.5244,0.525,0.6467,0.5133,0.561,0.0412,0.0698,0.37
xgboost,Extreme Gradient Boosting,0.5644,0.5171,0.5733,0.5367,0.5335,0.1094,0.1361,0.02
lr,Logistic Regression,0.5222,0.5026,0.4967,0.5067,0.4969,0.0208,0.0233,0.442


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.6511,0.704,0.7467,0.6162,0.6618,0.2981,0.3243,0.292
rf,Random Forest Classifier,0.6267,0.649,0.5967,0.62,0.5921,0.2613,0.2677,0.378
nb,Naive Bayes,0.6289,0.6477,0.63,0.6433,0.6147,0.2772,0.2892,0.38
gbc,Gradient Boosting Classifier,0.6533,0.6456,0.6133,0.7,0.6311,0.3079,0.3288,0.012
et,Extra Trees Classifier,0.6111,0.6198,0.5633,0.6167,0.5856,0.2126,0.21,0.036
qda,Quadratic Discriminant Analysis,0.6467,0.595,0.57,0.7,0.5857,0.2069,0.2344,0.008
knn,K Neighbors Classifier,0.5244,0.584,0.71,0.5133,0.5825,0.0619,0.0833,0.382
dt,Decision Tree Classifier,0.5244,0.525,0.6467,0.5133,0.561,0.0412,0.0698,0.37
xgboost,Extreme Gradient Boosting,0.5644,0.5171,0.5733,0.5367,0.5335,0.1094,0.1361,0.02
lr,Logistic Regression,0.5222,0.5026,0.4967,0.5067,0.4969,0.0208,0.0233,0.442


# DEBUG

In [7]:
evaluate_model(top)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

# TODO