In [1]:
from utils import DataUtils, MetricUtils, PlotUtils
from HUGIMLClassifier import HUGIMLClassifier
from HUGIMLClassifierBNB import HUGIMLClassifierBNB
from HUGIMLClassifierL1 import HUGIMLClassifierL1
from hmeasure import h_score  
import pandas as pd, numpy as np, copy, optuna
import optuna.visualization as vis
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, StratifiedKFold, KFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV
from tqdm.notebook import tqdm_notebook
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,log_loss
%matplotlib inline

#### parameter selection

#### grid search CV

In [3]:
paramsByDs = {
    'pimaIndianDiabetes': {'dsName': 'pimaIndianDiabetes', 'B': 7, 'L': 1, 'G': 5e-3}, 
    'Heloc': {'dsName': 'Heloc', 'B': 8, 'L': 1, 'G': 1e-3}, 
    'BankMarketingUCI': {'dsName': 'BankMarketingUCI', 'B': 11, 'L': 2, 'G': 3e-3}
}

# Select first three datasets
selected_datasets = ['pimaIndianDiabetes', 'Heloc', 'BankMarketingUCI']

results = []

for dsName in selected_datasets:
    params = {'dsName': dsName} #parameters B, L, T computed from training data
    
    X, y, yNewToOriginal, procdata  = DataUtils().get_dataset_df(params)
    params = {**params, **procdata}
    
    from sklearn.model_selection import GridSearchCV
    gsParams = [{'B': [10,12], 'L':[1], 'G': [1e-3]}]
    h = HUGIMLClassifier(**params)
    
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
    clf = GridSearchCV(h, param_grid=gsParams, cv=skf, scoring=MetricUtils().logLoss, verbose=3)
    clf.fit(X, y)
    
    results = dict(zip(['bestScore', 'bestIndex', 'bestParams'], [clf.best_score_, clf.best_index_, clf.best_params_]))
    results

dataset: pimaIndianDiabetes (768, 8)  featureSize: (6, 2, 0) classSize: [(0, 500), (1, 268)]
all cols: ['numPregnancies', 'glucose', 'bp', 'skinThickness', 'insulin', 'age', 'bmi', 'diabetesPedigre']
i/f cols: ['numPregnancies', 'glucose', 'bp', 'skinThickness', 'insulin', 'age'] ['bmi', 'diabetesPedigreeFunction']
cat cols: []
params  : []
Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV 1/3] END ...............B=10, G=0.001, L=1;, score=-0.492 total time=   1.7s
[CV 2/3] END ...............B=10, G=0.001, L=1;, score=-0.455 total time=   1.7s
[CV 3/3] END ...............B=10, G=0.001, L=1;, score=-0.505 total time=   1.6s
[CV 1/3] END ...............B=12, G=0.001, L=1;, score=-0.529 total time=   1.6s
[CV 2/3] END ...............B=12, G=0.001, L=1;, score=-0.480 total time=   1.5s
[CV 3/3] END ...............B=12, G=0.001, L=1;, score=-0.507 total time=   1.6s
dataset: Heloc (9861, 23)  featureSize: (0, 23, 0) classSize: [(0, 5128), (1, 4733)]
all cols: ['MSinceMostRecen

#### optuna search

In [6]:
def pickParamValues(params, trial):
    paramsOptuna = {
                  'dsName': params['dsName'], 
                  'B': trial.suggest_categorical('B', params['B']), 
                  'L': trial.suggest_categorical('L', params['L']),
                  'G': trial.suggest_categorical('G', params['G'])
                 }
    return paramsOptuna

def runOptuna(params):
    summary = []
    keyExplored = []
    
    def objective(trial):
        paramsOptuna = pickParamValues(params, trial)
        optunaParamKey = ' '.join([str(k) for k in list(paramsOptuna.values())])
        cntr = 1
        while optunaParamKey in keyExplored:
            if cntr>=10: break #attempt 10times for a different key 
            print('key already explored, getting next possible key ', cntr)
            paramsOptuna = pickParamValues(params, trial)
            optunaParamKey = ' '.join([str(k) for k in list(paramsOptuna.values())])
            cntr += 1
        keyExplored.append(optunaParamKey)
        
        X, y, yNewToOriginal, procdata  = DataUtils().get_dataset_df(paramsOptuna)

        paramsOptuna = {**paramsOptuna, **procdata}
        #HUGIML classifier initialization
        clf = HUGIMLClassifier(**paramsOptuna)
        
        #train, test validation
        x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)
        x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=0, stratify=y_train)#80/20 split for validation and parameter selection
        
        #transform x by generating HUIs and fit a model
        clf.fit(x_train, y_train)
        
        #predict probability on test instances
        y_pred_proba = clf.predict_proba(x_valid)
        y_pred = np.argmax(y_pred_proba, axis=1)
        
        #compute metrics
        finalRes = MetricUtils().get_metrics(y_valid, y_pred, y_pred_proba) #['accuracy', 'f1', 'auc', 'hmeasure', 'logLoss', 'precision', 'recall']
        logLoss = finalRes[4]
        summary.append(finalRes[0:5])#acc, f1, auc, hmeasure, logloss
        return logLoss

    study = optuna.create_study(direction="minimize", study_name='cv_search')
    print()
    study.optimize(objective, n_trials=params.get('numTrials', 5))
    print(study.best_trial)
    print("best value ", study.best_value, "best params ", study.best_params)

    #post process to get the output in a pandas dataframe
    newd = {}
    for tr in study.trials:
        for k, v in tr._params.items():
            if newd.get(k, -1)==-1: newd[k] = []
            newd[k].append(v)
    pd.DataFrame(newd)

    newd['logloss'] = [tr._values[0] for tr in study.trials]
    newd['acc'] = [s[0] for s in summary]
    newd['f1'] = [s[1] for s in summary]
    newd['auc'] = [s[2] for s in summary]
    newd['hmeasure'] = [s[3] for s in summary]
    newd['logLoss'] = [s[4] for s in summary]
    
    out = pd.DataFrame(newd)
    out = out.sort_values(by=['logloss'], ascending=True)
    display(out.style.highlight_max(color = 'yellow', axis = 0, subset = pd.IndexSlice[:, ['acc', 'f1', 'auc', 'hmeasure']]))

In [7]:
runOptuna({'dsName': dsName, 'B': [3,7,12], 'L': [1], 'G': [1e-2, 1e-3]})

[I 2025-04-21 09:08:18,002] A new study created in memory with name: cv_search



dataset: BankMarketingUCI (41188, 15)  featureSize: (1, 8, 6) classSize: [(0, 36548), (1, 4640)]
all cols: ['age', 'campaign', 'previous', 'pdays', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'job']
i/f cols: ['age'] ['campaign', 'previous', 'pdays', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
cat cols: ['job', 'marital', 'education', 'default', 'housing', 'loan']
params  : [('B', 3), ('L', 1), ('G', 0.01)]


[I 2025-04-21 09:08:21,970] Trial 0 finished with value: 0.2894 and parameters: {'B': 3, 'L': 1, 'G': 0.01}. Best is trial 0 with value: 0.2894.


dataset: BankMarketingUCI (41188, 15)  featureSize: (1, 8, 6) classSize: [(0, 36548), (1, 4640)]
all cols: ['age', 'campaign', 'previous', 'pdays', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'job']
i/f cols: ['age'] ['campaign', 'previous', 'pdays', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
cat cols: ['job', 'marital', 'education', 'default', 'housing', 'loan']
params  : [('B', 7), ('L', 1), ('G', 0.001)]


[I 2025-04-21 09:08:26,354] Trial 1 finished with value: 0.2747 and parameters: {'B': 7, 'L': 1, 'G': 0.001}. Best is trial 1 with value: 0.2747.


key already explored, getting next possible key  1
key already explored, getting next possible key  2
key already explored, getting next possible key  3
key already explored, getting next possible key  4
key already explored, getting next possible key  5
key already explored, getting next possible key  6
key already explored, getting next possible key  7
key already explored, getting next possible key  8
key already explored, getting next possible key  9
dataset: BankMarketingUCI (41188, 15)  featureSize: (1, 8, 6) classSize: [(0, 36548), (1, 4640)]
all cols: ['age', 'campaign', 'previous', 'pdays', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'job']
i/f cols: ['age'] ['campaign', 'previous', 'pdays', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
cat cols: ['job', 'marital', 'education', 'default', 'housing', 'loan']
params  : [('B', 3), ('L', 1), ('G', 0.01)]


[I 2025-04-21 09:08:30,118] Trial 2 finished with value: 0.2894 and parameters: {'B': 3, 'L': 1, 'G': 0.01}. Best is trial 1 with value: 0.2747.


dataset: BankMarketingUCI (41188, 15)  featureSize: (1, 8, 6) classSize: [(0, 36548), (1, 4640)]
all cols: ['age', 'campaign', 'previous', 'pdays', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'job']
i/f cols: ['age'] ['campaign', 'previous', 'pdays', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
cat cols: ['job', 'marital', 'education', 'default', 'housing', 'loan']
params  : [('B', 3), ('L', 1), ('G', 0.001)]


[I 2025-04-21 09:08:34,429] Trial 3 finished with value: 0.2806 and parameters: {'B': 3, 'L': 1, 'G': 0.001}. Best is trial 1 with value: 0.2747.


key already explored, getting next possible key  1
key already explored, getting next possible key  2
key already explored, getting next possible key  3
key already explored, getting next possible key  4
key already explored, getting next possible key  5
key already explored, getting next possible key  6
key already explored, getting next possible key  7
key already explored, getting next possible key  8
key already explored, getting next possible key  9
dataset: BankMarketingUCI (41188, 15)  featureSize: (1, 8, 6) classSize: [(0, 36548), (1, 4640)]
all cols: ['age', 'campaign', 'previous', 'pdays', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'job']
i/f cols: ['age'] ['campaign', 'previous', 'pdays', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
cat cols: ['job', 'marital', 'education', 'default', 'housing', 'loan']
params  : [('B', 3), ('L', 1), ('G', 0.001)]


[I 2025-04-21 09:08:38,768] Trial 4 finished with value: 0.2806 and parameters: {'B': 3, 'L': 1, 'G': 0.001}. Best is trial 1 with value: 0.2747.


FrozenTrial(number=1, state=TrialState.COMPLETE, values=[0.2747], datetime_start=datetime.datetime(2025, 4, 21, 9, 8, 21, 972188), datetime_complete=datetime.datetime(2025, 4, 21, 9, 8, 26, 354201), params={'B': 7, 'L': 1, 'G': 0.001}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'B': CategoricalDistribution(choices=(3, 7, 12)), 'L': CategoricalDistribution(choices=(1,)), 'G': CategoricalDistribution(choices=(0.01, 0.001))}, trial_id=1, value=None)
best value  0.2747 best params  {'B': 7, 'L': 1, 'G': 0.001}


Unnamed: 0,B,L,G,logloss,acc,f1,auc,hmeasure,logLoss
1,7,1,0.001,0.2747,0.9012,0.3431,0.7916,0.3678,0.2747
3,3,1,0.001,0.2806,0.8985,0.3082,0.7837,0.3443,0.2806
4,3,1,0.001,0.2806,0.8985,0.3082,0.7837,0.3443,0.2806
0,3,1,0.01,0.2894,0.9003,0.3423,0.7614,0.2803,0.2894
2,3,1,0.01,0.2894,0.9003,0.3423,0.7614,0.2803,0.2894
