In [2]:
from utils import DataUtils, MetricUtils, PlotUtils
from HUGIMLClassifierBNB import HUGIMLClassifier

import pandas as pd, numpy as np, copy, optuna
import optuna.visualization as vis
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, StratifiedKFold, KFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV
from tqdm.notebook import tqdm_notebook
%matplotlib inline

In [5]:
# HUI mining / data-transformation parameters (B, L, G) per dataset.
# Each row is (dataset name, B, L, G); the name is used both as the lookup key
# and as the 'dsName' entry of that dataset's parameter dict.
_huiSettingRows = (
    ('pimaIndianDiabetes', 7, 1, 5e-3),
    ('Heloc', 5, 2, 1e-3),
    ('BankMarketingUCI', 6, 1, 5e-4),
    ('Iris', 4, 1, 1e-3),
    ('Titanic', 5, 2, 5e-4),
)
paramsByDs = {
    rowName: {'dsName': rowName, 'B': rowB, 'L': rowL, 'G': rowG}
    for rowName, rowB, rowL, rowG in _huiSettingRows
}

# Dataset names in declaration order (dicts preserve insertion order)
dsNames = list(paramsByDs)
dsName = dsNames[0]  # Run the experiment on the first dataset
params = paramsByDs[dsName]  # B, L, G (and dsName) for the selected dataset

# Fetch the dataset selected by `params`
X, y, yNewToOriginal, procdata = DataUtils().get_dataset_df(params)

# Make sure the labels are a pandas Series; wrap anything else
y = y if isinstance(y, pd.Series) else pd.Series(y)

# Let the classifier's own preparation step process features and labels.
# clf_prep is kept around because its allCols / origColumns attributes are read later.
clf_prep = HUGIMLClassifier()
X_prepared, y_prepared = clf_prep.prepareXy(X, y)

# Plain numpy arrays for fold indexing: use .to_numpy() when the prepared
# object offers it (e.g. a DataFrame/Series), otherwise fall back to np.array
Xs, ys = (
    prepared.to_numpy() if hasattr(prepared, 'to_numpy') else np.array(prepared)
    for prepared in (X_prepared, y_prepared)
)

# Stratified k-fold cross-validation of the HUGIML classifier.
# Reads: Xs, ys (numpy arrays), params, procdata, clf_prep.
# Produces: overallScores (per-metric sums), finalRes (per-metric means rounded
# to 4 decimals) and `out`, a one-row DataFrame labelled 'performance'.
nfolds = 10
skf = StratifiedKFold(n_splits=nfolds, shuffle=True, random_state=0)

# Single source of truth for the metric order used to label the values returned
# by MetricUtils().get_metrics (Acc, F1, Auc, Hmeasure, logLoss, Prec, Recall).
metricNames = ['accuracy', 'f1', 'auc', 'hmeasure', 'logLoss', 'precision', 'recall']
overallScores = [0] * len(metricNames)

# Fold-invariant classifier arguments, merged once outside the loop.
# Kept in its own dict so the dataset-level `params` is never overwritten and
# this cell can be re-run without accumulating state.
baseParams = {
    **params,
    **procdata,
    'allCols': clf_prep.allCols,  # From prepareXy
    'origColumns': clf_prep.origColumns,  # From prepareXy
    'verbose': False  # Set to True for debugging
}

for i, (tridx, tstidx) in enumerate(skf.split(Xs, ys)):
    x_train, y_train = Xs[tridx,:], ys[tridx]
    x_test, y_test = Xs[tstidx,:], ys[tstidx]

    # Per-fold arguments: only the 1-based fold number changes between folds
    foldParams = {**baseParams, 'foldNo': i + 1}

    # Initialize HUGIML classifier with BernoulliNB
    clf = HUGIMLClassifier(**foldParams)

    # Transform x by generating HUIs and fit the model
    clf.fit(x_train, y_train)

    # Predict probability on test instances
    y_pred_proba = clf.predict_proba(x_test)
    # NOTE(review): argmax assumes column k of predict_proba corresponds to
    # label k (labels encoded 0..K-1) — TODO confirm against prepareXy.
    y_pred = np.argmax(y_pred_proba, axis=1)

    # Get metrics
    currScores = list(MetricUtils().get_metrics(y_test, y_pred, y_pred_proba))

    # zip() would silently truncate on a length mismatch and mislabel the
    # averaged columns, so fail loudly at the fold where it happens.
    if len(currScores) != len(metricNames):
        raise ValueError(
            f"get_metrics returned {len(currScores)} values on fold {i + 1}, "
            f"expected {len(metricNames)} ({', '.join(metricNames)})"
        )

    # Aggregate fold scores
    overallScores = [total + score for total, score in zip(overallScores, currScores)]

# Calculate average across folds
finalRes = [round(total / nfolds, 4) for total in overallScores]
out = pd.DataFrame([finalRes], columns=metricNames, index=['performance'])

# Show the averaged cross-validation metrics for the selected dataset
print(f"\nResults for {dsName} dataset using BernoulliNB:")
display(out)

# Optional extra: HUI feature information, reported only when the classifier
# exposes a `procdata_` attribute. `clf` is whatever classifier was assigned
# last, so these numbers describe that single fit rather than an average.
hasHuiInfo = hasattr(clf, 'procdata_')
if hasHuiInfo:
    huiPatternCount = len(clf.get_hug_features())
    print(f"\nNumber of HUI patterns generated: {huiPatternCount}")
    print(f"Transformed feature space shape: {clf.get_transformed_shape()}")

dataset: pimaIndianDiabetes (768, 8)  featureSize: (6, 2, 0) classSize: [(0, 500), (1, 268)]
all cols: ['numPregnancies', 'glucose', 'bp', 'skinThickness', 'insulin', 'age', 'bmi', 'diabetesPedigre']
i/f cols: ['numPregnancies', 'glucose', 'bp', 'skinThickness', 'insulin', 'age'] ['bmi', 'diabetesPedigreeFunction']
cat cols: []
params  : [('B', 7), ('L', 1), ('G', 0.005)]

Results for pimaIndianDiabetes dataset using BernoulliNB:


Unnamed: 0,accuracy,f1,auc,hmeasure,logLoss,precision,recall
performance,0.7578,0.6578,0.821,0.4208,0.5395,0.649,0.6716



Number of HUI patterns generated: 24
Transformed feature space shape: (692, 24)
