# 1. Settings

## 1.1 User variables

In [1]:
NB_RUNS = 10

USE_DISCRETE = True

#TARGET = "charcoal_change"

NB_TOP_MODELS = 10
TRAIN_SIZE = 0.8
FOLD_STRATEGY = "kfold"
N_FOLD = 5
FIX_IMBALANCE = True # Good for decision tree
FEATURE_SELECTION = False
N_FEATURES_TO_SELECT = 0.2
REMOVE_MULTICOLLINEARITY = False # Good for ada boost sometime
PREPROCESS = True
TRANSFORMATION = True
TRANSFORMATION_METHOD = 'yeo-johnson'
NORMALIZE = True
NORMALIZE_METHOD = "zscore" #maxabs

PCA = False
POLYNOMIAL_FEATURES = False

#SORT_METRIC = "MAE"
#OPTIMIZE_METRIC = "MAE"
SORT_METRIC = "AUC"
OPTIMIZE_METRIC = "AUC"

EXCLUDED_MODELS = ["lar"]#["catboost", "nb", "dummy", "lr", "qda", "knn", "lda", "svm", "ridge"] # ["catboost", "Naive Bayes", "Dummy Classifier", "Logistic Regression", "Quadratic Discriminant Analysis", "K Neighbors Classifier"]

# 1.2 Dev variables

In [2]:
RANDOM_SEED = 0

INPUT_DATA = "tmp/data_processed.csv"
INPUT_DATA_DISCRETE = "tmp/data_discrete.csv"

OUTPUT_SCORES_PATH = "tmp/hap_pycaret_scores.csv"
VERBOSE = True

## 1.3 Imports

In [3]:
import pandas as pd
import numpy as np
import random
#from pycaret.regression import *
from pycaret.classification import *

from config import *

random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

## 1.4 Constants

In [4]:
# Constants
DEBUG = False

# 2. Data Loading

In [5]:
if USE_DISCRETE:
    df = pd.read_csv(INPUT_DATA_DISCRETE)
else:
    df = pd.read_csv(INPUT_DATA)

#df = df[df[TARGET] >= MIN_CHARCOAL]
display(df)

Unnamed: 0,Picea,Pinus,Alnus,Betula,Quercus,Ambrosia,Chenopodiaceae,Synth_0,Synth_1,Synth_2,Synth_3,Synth_4,Synth_5,Synth_6,Synth_7,Synth_8,Synth_9,Synth_0_prev
0,1,-1,1,-1,1,-1,1,1,1,1,1,1,1,1,-1,-1,-1,2
1,-1,1,1,1,-1,1,-1,1,1,1,1,1,1,-1,-1,-1,1,2
2,1,1,-1,-1,-1,1,-1,1,-1,-1,1,-1,1,1,1,1,-1,1
3,-1,-1,1,1,1,-1,1,1,1,1,1,1,-1,-1,-1,-1,1,1
4,1,-1,-1,-1,1,-1,-1,1,-1,-1,1,-1,1,1,-1,1,-1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,-1,1,1,-1,-1,-1,1,-1,-1,1,1,1,-1,-1,1,1,-1,0
57,1,-1,1,1,1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,1,1,0
58,1,-1,-1,1,-1,-1,-1,-1,1,-1,-1,-1,1,1,1,-1,-1,1
59,-1,1,1,1,-1,1,1,1,1,-1,-1,-1,1,1,-1,-1,1,1


# Pycaret AutoML

In [6]:
df_scores = None

setup(df, target = TARGET, session_id = RANDOM_SEED,
    train_size=TRAIN_SIZE,
    fold_strategy=FOLD_STRATEGY,
    fold=N_FOLD,
    #fix_imbalance=FIX_IMBALANCE,
    preprocess=PREPROCESS,
    feature_selection=FEATURE_SELECTION,
    n_features_to_select=N_FEATURES_TO_SELECT,
    remove_multicollinearity=REMOVE_MULTICOLLINEARITY,
    normalize=NORMALIZE,
    normalize_method=NORMALIZE_METHOD,
    pca=PCA,
    polynomial_features=POLYNOMIAL_FEATURES,
    index=False,
    n_jobs=-1
    )

print("Val Scores")
top = compare_models(n_select = NB_TOP_MODELS,
                    sort=SORT_METRIC,
                    exclude=EXCLUDED_MODELS,
                    verbose=VERBOSE,
                    errors="raise")
df_results = pull()
df_results

Unnamed: 0,Description,Value
0,Session id,0
1,Target,Synth_0
2,Target type,Binary
3,Target mapping,"-1: 0, 1: 1"
4,Original data shape,"(61, 18)"
5,Transformed data shape,"(61, 18)"
6,Transformed train set shape,"(48, 18)"
7,Transformed test set shape,"(13, 18)"
8,Numeric features,17
9,Preprocess,True


Val Scores


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.7933,0.8301,0.8148,0.7857,0.7913,0.5551,0.5669,0.4
knn,K Neighbors Classifier,0.7089,0.7658,0.6795,0.691,0.6772,0.3726,0.3794,0.406
catboost,CatBoost Classifier,0.7133,0.7644,0.7576,0.71,0.7216,0.4105,0.4184,0.392
rf,Random Forest Classifier,0.6911,0.7544,0.6624,0.681,0.6542,0.3617,0.374,0.39
lr,Logistic Regression,0.6911,0.7461,0.639,0.7576,0.6691,0.3739,0.3939,0.476
ada,Ada Boost Classifier,0.6489,0.7037,0.6276,0.7057,0.641,0.2806,0.2957,0.014
xgboost,Extreme Gradient Boosting,0.6511,0.6973,0.649,0.699,0.6648,0.2865,0.2831,0.028
gbc,Gradient Boosting Classifier,0.6289,0.6881,0.5724,0.6433,0.5865,0.2307,0.2369,0.012
lda,Linear Discriminant Analysis,0.6511,0.6628,0.6005,0.7183,0.5999,0.2879,0.3204,0.004
et,Extra Trees Classifier,0.6689,0.6389,0.6224,0.6743,0.6324,0.3178,0.3259,0.036


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.7933,0.8301,0.8148,0.7857,0.7913,0.5551,0.5669,0.4
knn,K Neighbors Classifier,0.7089,0.7658,0.6795,0.691,0.6772,0.3726,0.3794,0.406
catboost,CatBoost Classifier,0.7133,0.7644,0.7576,0.71,0.7216,0.4105,0.4184,0.392
rf,Random Forest Classifier,0.6911,0.7544,0.6624,0.681,0.6542,0.3617,0.374,0.39
lr,Logistic Regression,0.6911,0.7461,0.639,0.7576,0.6691,0.3739,0.3939,0.476
ada,Ada Boost Classifier,0.6489,0.7037,0.6276,0.7057,0.641,0.2806,0.2957,0.014
xgboost,Extreme Gradient Boosting,0.6511,0.6973,0.649,0.699,0.6648,0.2865,0.2831,0.028
gbc,Gradient Boosting Classifier,0.6289,0.6881,0.5724,0.6433,0.5865,0.2307,0.2369,0.012
lda,Linear Discriminant Analysis,0.6511,0.6628,0.6005,0.7183,0.5999,0.2879,0.3204,0.004
et,Extra Trees Classifier,0.6689,0.6389,0.6224,0.6743,0.6324,0.3178,0.3259,0.036


# DEBUG

In [7]:
evaluate_model(top)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

# TODO