In [4]:
import glob
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

sys.path.append("../../../")

from sklearn.utils.class_weight import compute_sample_weight
from xgboost import XGBClassifier, XGBRFClassifier

from helpers.split import tag_label_feature_split, make_train_test_split
from helpers.assess import make_confusion_matrix, make_classification_report

  from pandas import MultiIndex, Int64Index


In [None]:
def rfcross_gradient_boost_experiment(dataset, test_size=0.2, random_state=10):
    """Fit an XGBoost random-forest classifier on one pickled dataset and report on it.

    The pickled DataFrame is split into encoded labels and features by
    ``tag_label_feature_split``, divided into stratified train/test sets by
    ``make_train_test_split`` (requested with ``x_scaler="standard"``), and an
    ``XGBRFClassifier`` is fitted with balanced sample weights. Train and test
    accuracy are printed, classification reports are requested for both sets,
    and a confusion matrix is requested for the test set via the project
    helpers.

    Parameters
    ----------
    dataset : str
        Path to a pickled pandas DataFrame. ``pd.read_pickle`` can execute
        arbitrary code while unpickling, so only pass trusted files.
    test_size : float, default 0.2
        Forwarded unchanged to ``make_train_test_split``.
    random_state : int, default 10
        Forwarded unchanged to ``make_train_test_split``.

    Returns
    -------
    XGBRFClassifier
        The fitted model.
    """
    print(f"\n\nDataset: {dataset}\n")

    # Load the frame and separate the encoded labels (with their encoder)
    # from the features; the helper's first return value is not needed here.
    df = pd.read_pickle(dataset)
    _, (y, le), X = tag_label_feature_split(df, label_format="encoded")

    # Stratify on y so both partitions keep the class proportions.
    X_train_std, X_test_std, y_train, y_test = make_train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
        x_scaler="standard",
    )

    # "balanced" sample weights counteract class imbalance during fitting.
    sample_weights = compute_sample_weight(class_weight="balanced", y=y_train)

    model = XGBRFClassifier(
        use_label_encoder=False,
        objective="multi:softprob",
        eval_metric="mlogloss",
    )
    model.fit(X_train_std, y_train, sample_weight=sample_weights)

    predictions = model.predict(X_test_std)
    train_accuracy = model.score(X_train_std, y_train)
    test_accuracy = model.score(X_test_std, y_test)

    # The "=" f-string specifier echoes the variable name, so the names
    # train_accuracy / test_accuracy are part of the printed output.
    print(f"Train: {train_accuracy = :f}\n")
    make_classification_report(
        y_train, model=model, x=X_train_std, label_encoder=le, print_report=True
    )
    print(f"Test: {test_accuracy = :f}\n")
    make_classification_report(
        y_test, y_pred=predictions, digits=4, label_encoder=le, print_report=True
    )

    # splitext removes only the final extension, so a file name that itself
    # contains dots (e.g. "data_0.5.pickle") keeps its full stem in the title.
    name = os.path.splitext(os.path.basename(dataset))[0]
    make_confusion_matrix(
        y_test,
        y_pred=predictions,
        label_encoder=le,
        title=f"{name} test (row normalized)",
    )

    return model

In [None]:
# Run the experiment once per pickled dataset, visiting the files in sorted path order.
dataset_folder = "../../../datasets/"
datasets = sorted(glob.glob(dataset_folder + "*.pickle"))
for dataset in datasets:
    rfcross_gradient_boost_experiment(dataset)