In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score
import pickle

Classifier shortlist
- LogisticRegression
- RidgeClassifier
- SVC
- KNeighborsClassifier
- GaussianProcessClassifier (too slow)
- GaussianNB
- DecisionTreeClassifier

In [5]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [3]:
SEED = 0

# Data

In [7]:
with open("../data/classical_ml_activity.pkl", "rb") as f:
    data = pickle.load(f)

X_train = data["X_train"].values
X_test = data["X_test"].values
y_train = data["y_train"].values
y_test = data["y_test"].values

# Models

In [14]:
class_weights = data["y_train"].value_counts()
class_weights = class_weights / sum(class_weights)
class_weights

1    0.500
3    0.215
2    0.205
0    0.080
Name: class, dtype: float64

In [15]:
params = {
    "class_weight": "balanced",
    "random_state": SEED,
    "max_iter": 10_000
}

config = {
    LogisticRegression: params,
    RidgeClassifier: params,
    SVC: params,
    LinearSVC: params | {"dual": True, "max_iter": 20_000},
    KNeighborsClassifier: {"n_jobs": -1},
    GaussianNB: {"priors": np.array(list(class_weights.values))},
    DecisionTreeClassifier: {"random_state": SEED, "class_weight": "balanced"}
}

In [16]:
models = [model_class(**param) for model_class, param in config.items()]

# Training

In [17]:
trained_models = []
metrics = {}

for model in models:
    name = model.__class__.__name__
    print(f"Training {name}...")
    metrics.setdefault("model_name", []).append(name)

    model.fit(X_train, y_train)
    test_acc = model.score(X_test, y_test)
    metrics.setdefault("accuracy", []).append(test_acc)
    print(f"Model accuracy: {test_acc*100:.2f}%")

    y_pred = model.predict(X_test)
    for metric_name, metric in zip(
        ["precision", "recall", "f1"],
        [precision_score, recall_score, f1_score]
    ):
        metrics.setdefault(metric_name, []).append(metric(y_test, y_pred, average="weighted"))

    trained_models.append(model)

Training LogisticRegression...
Model accuracy: 87.85%
Training RidgeClassifier...
Model accuracy: 83.00%
Training SVC...
Model accuracy: 89.90%
Training LinearSVC...
Model accuracy: 88.80%
Training KNeighborsClassifier...
Model accuracy: 91.05%
Training GaussianNB...
Model accuracy: 83.90%
Training DecisionTreeClassifier...
Model accuracy: 88.10%


In [18]:
df_metrics = pd.DataFrame(metrics)
df_metrics.sort_values(by="accuracy", ascending=False)

Unnamed: 0,model_name,accuracy,precision,recall,f1
4,KNeighborsClassifier,0.9105,0.912049,0.9105,0.910873
2,SVC,0.899,0.911434,0.899,0.900632
3,LinearSVC,0.888,0.896419,0.888,0.889256
6,DecisionTreeClassifier,0.881,0.880938,0.881,0.880901
0,LogisticRegression,0.8785,0.890806,0.8785,0.879956
5,GaussianNB,0.839,0.862428,0.839,0.842351
1,RidgeClassifier,0.83,0.854471,0.83,0.833506
