In [1]:
import time
import os
import json

import polars as pl
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from IPython.display import display

In [2]:
label_encoder = LabelEncoder()

def load_data(filename, fit=True):
    path = os.path.join("..", "data_workflow_notebooks", "data", "tabular", filename)
    data = pl.read_csv(path).to_numpy()

    # 2nd last column is target, last column is file name
    x = data[:, :-2].astype(float)
    y_raw = data[:, -2]
    if fit:
        y = label_encoder.fit_transform(y_raw)
    else:
        y = label_encoder.transform(y_raw)
    return x, y

X_train, y_train = load_data("train_processed.csv", fit=True)
X_test, y_test = load_data("test_processed.csv")

print(f"Data loaded successfully\nTrain shape: {X_train.shape}, Test shape: {X_test.shape}")

Data loaded successfully
Train shape: (5621, 106), Test shape: (301, 106)


In [3]:
configs = {
    'Logistic Regression': {
        'pipeline': Pipeline([
            ('clf', LogisticRegression(max_iter=2000))
        ]),
        'params': {
            'clf__C': [0.1, 0.5, 1, 5, 10],
            'clf__solver': ['lbfgs', 'newton-cg']
        }
    },

    'Log. Reg. (Non-Linear)': {
        'pipeline': Pipeline([
            ('poly', PolynomialFeatures(degree=2)),
            ('clf', LogisticRegression(max_iter=50000))
        ]),
        'params': {
            'clf__C': [0.01, 0.1, 1]
        }
    },

    'SVM (Linear)': {
        'pipeline': Pipeline([
            ('clf', SVC(kernel='linear', max_iter=500000))
        ]),
        'params': {
            'clf__C': [0.01, 0.1, 1]
        }
    },

    'SVM (Kernel RBF)': {
        'pipeline': Pipeline([
            ('clf', SVC(kernel='rbf', max_iter=10000))
        ]),
        'params': {
            'clf__C': [20, 50, 100, 150, 200],
            'clf__gamma': ['scale', 'auto', 0.0001, 0.001, 0.01]
        }
    },

    'k-NN': {
        'pipeline': Pipeline([
            ('clf', KNeighborsClassifier(weights='distance'))
        ]),
        'params': {
            'clf__n_neighbors': [5, 7, 11, 15],
            'clf__p': [1, 2],
            'clf__leaf_size': [2, 3, 5, 7]
        }
    },

    'Naive Bayes': {
        'pipeline': Pipeline([
            ('clf', GaussianNB())
        ]),
        'params': {
            'clf__var_smoothing': [1e-4, 1e-3, 0.01, 0.1]
        }
    },

    'Random Forest': {
        'pipeline': Pipeline([
            ('clf', RandomForestClassifier(random_state=462))
        ]),
        'params': {
            'clf__n_estimators': [100, 200, 250],
            'clf__max_depth': [None, 10, 20, 25]
        }
    }
}

In [4]:
results = []

for name, config in configs.items():
    print(f"Training {name}")
    grid = GridSearchCV(
            config['pipeline'], 
            config['params'], 
            cv=5, 
            n_jobs=-1,
            scoring='accuracy'
        )

    start_time = time.time()
    grid.fit(X_train, y_train)
    end_time = time.time()
    training_time = end_time - start_time

    best_model = grid.best_estimator_
    best_params = grid.best_params_

    y_pred = best_model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results.append({
        'Classifier': name,
        'Best Hyperparameters': json.dumps(best_params),
        'Training Time (sec)': round(training_time, 4),
        'Test Accuracy': f"{round(acc * 100, 2)} %"
    })

Training Logistic Regression
Training Log. Reg. (Non-Linear)
Training SVM (Linear)
Training SVM (Kernel RBF)
Training k-NN
Training Naive Bayes
Training Random Forest


In [5]:
results_df = pl.DataFrame(results)
pl.Config.set_fmt_str_lengths(1000)
display(results_df.sort(by="Test Accuracy", descending=True))
results_df.write_csv('benchmark_results.csv')

Classifier,Best Hyperparameters,Training Time (sec),Test Accuracy
str,str,f64,str
"""Logistic Regression""","""{""clf__C"": 0.5, ""clf__solver"": ""lbfgs""}""",4.2122,"""92.36 %"""
"""SVM (Linear)""","""{""clf__C"": 0.1}""",1.9075,"""89.37 %"""
"""Naive Bayes""","""{""clf__var_smoothing"": 0.01}""",0.1614,"""87.71 %"""
"""SVM (Kernel RBF)""","""{""clf__C"": 150, ""clf__gamma"": 0.001}""",18.4779,"""85.05 %"""
"""Random Forest""","""{""clf__max_depth"": 20, ""clf__n_estimators"": 200}""",27.6749,"""81.4 %"""
"""Log. Reg. (Non-Linear)""","""{""clf__C"": 0.1}""",95.1838,"""66.11 %"""
"""k-NN""","""{""clf__leaf_size"": 2, ""clf__n_neighbors"": 11, ""clf__p"": 1}""",3.2424,"""33.55 %"""
