In [None]:
from typing import Optional, List
import pathlib
import random
import re

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.io import arff
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Project directories, given relative to the notebook's working directory.
# DATA_PATH is not referenced by the cells visible in this part of the
# notebook; OUTPUT_PATH is where the ARFF feature file is read from and where
# the exported decision tree text file is written.
DATA_PATH = pathlib.Path("data")
OUTPUT_PATH = pathlib.Path("output")

In [None]:
# Seed the global generators of both `random` and NumPy.
# scikit-learn calls that are given no explicit `random_state` draw from
# NumPy's global generator, so their results depend on this seed AND on how
# many random draws earlier cells have already consumed.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [None]:
def plot_confusion_matrix(
    y_test: np.ndarray,
    y_pred: np.ndarray,
    labels: List[str],
    title: Optional[str] = None,
) -> None:
    """Plot a row-normalized confusion matrix as an annotated heatmap.

    Each row is normalized over the true class (``normalize="true"``), so the
    diagonal holds the per-class recall.

    Args:
        y_test (np.ndarray): True labels.
        y_pred (np.ndarray): Predicted labels.
        labels (List[str]): Class names, spelled exactly as the values found
            in ``y_test`` / ``y_pred``. Their order defines the order of the
            rows (true class) and columns (predicted class).
        title (str | None): Plot figure title. Defaults to None, in which
            case "Confusion Matrix" is used.
    """
    _, ax = plt.subplots(1, 1, figsize=(6, 5))
    if title is None:
        title = "Confusion Matrix"
    ax.set_title(title)
    # Pass `labels` explicitly: without it scikit-learn orders the matrix by
    # the sorted unique label values, which need not match the order of
    # `labels` used for the tick names below (mislabelled rows/columns), and
    # the matrix shrinks when a class is missing from both arrays.
    cm = confusion_matrix(y_test, y_pred, labels=labels, normalize="true")
    sns.heatmap(
        pd.DataFrame(cm, index=labels, columns=labels),
        ax=ax, annot=True, annot_kws={"size": 12}, fmt=".2f"
    )
    # Rows are true classes, columns are predictions (scikit-learn convention).
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")
    plt.show()


def convert_sklearn_to_weka(clf: DecisionTreeClassifier) -> str:
    """Convert Scikit-learn decision tree classifier to Weka textual format.

    The tree is rendered with ``export_text``, the ``|--- `` branch markers
    are stripped, and every leaf line (``class: <label>``) is folded into the
    condition line preceding it, giving ``<condition>: <label>``. A tree that
    consists of a single leaf is rendered as ``: <label>``.

    Args:
        clf (DecisionTreeClassifier): Fitted Scikit-learn decision tree
            classifier. When it carries ``feature_names_in_`` those names are
            used; otherwise ``export_text`` falls back to its default names.

    Returns:
        str: Decision tree in Weka textual format, terminated by a newline.
    """
    # Hand the names over as a plain list (or None) so the call does not
    # depend on how export_text inspects array-like arguments.
    feature_names = getattr(clf, "feature_names_in_", None)
    if feature_names is not None:
        feature_names = list(feature_names)

    # export_text only prints the first 10 levels by default and replaces the
    # rest with "truncated branch" lines; request the full depth instead.
    report = export_text(
        clf,
        feature_names=feature_names,
        max_depth=clf.get_depth(),
    )

    # A leaf line is optional "|   " indentation followed by "class: <label>".
    # The label is captured up to end of line so names containing spaces,
    # dashes or other non-word characters are kept intact.
    leaf_pattern = re.compile(r"^(?:\|   )*class: (.+)$")

    weka_lines = []
    for line in re.sub(r"\|--- ", r"", report).splitlines():
        leaf = leaf_pattern.match(line)
        if leaf is None:
            # Condition line: keep as is (a leaf may be appended to it next).
            weka_lines.append(line)
        elif weka_lines:
            # Leaf line: attach the class to the condition that leads to it.
            weka_lines[-1] = f"{weka_lines[-1]}: {leaf.group(1)}"
        else:
            # Root is itself a leaf: there is no condition to attach to.
            weka_lines.append(f": {leaf.group(1)}")
    return "\n".join(weka_lines) + "\n"

In [None]:
# Read labels and features from ARFF file
arff_data, arff_meta = arff.loadarff(OUTPUT_PATH / "har_features.arff")
arff_df = pd.DataFrame(arff_data)
X = arff_df.drop("class", axis=1)
# The "class" column holds bytes objects; decode them to str labels
y = np.array([label.decode() for label in arff_df["class"]])
# Class names as listed in the ARFF metadata for the "class" attribute
LABELS = list(arff_meta["class"][1])

# Split dataset into training set and test set (stratified, 70% / 30%).
# random_state pins the shuffle so that re-running this cell alone yields the
# same split instead of depending on how much global RNG state was consumed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=SEED)

print(f"Training set: {len(y_train):>4} samples")
print(f"Testing set:  {len(y_test):>4} samples")

In [None]:
# Train decision tree classifier.
# random_state is fixed because the features are randomly permuted at each
# split, so ties between equally good splits would otherwise be broken by the
# global RNG and the fitted tree could change between runs of this cell.
dt_model = DecisionTreeClassifier(
    max_depth=128,
    criterion="entropy",
    min_impurity_decrease=1e-3,
    ccp_alpha=1e-2,
    class_weight="balanced",
    random_state=SEED,
).fit(X_train, y_train)

# Make predictions on the testing set
y_pred = dt_model.predict(X_test)

# Plot results
print(f"Prediction accuracy: {accuracy_score(y_test, y_pred) * 100:.02f}%")
plot_confusion_matrix(y_test, y_pred, LABELS)

In [None]:
# Export scikit-learn model to Weka format.
# Convert before opening the file: mode "w" truncates on open, so a failing
# conversion would otherwise wipe a previous export and leave an empty file.
weka_str = convert_sklearn_to_weka(dt_model)
with open(OUTPUT_PATH / "har_dectree.txt", "w") as f:
    f.write(weka_str)
print(weka_str, end="")