In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

Classifier shortlist
- LogisticRegression
- RidgeClassifier
- SVC
- LinearSVC
- KNeighborsClassifier
- GaussianProcessClassifier (excluded: too slow)
- GaussianNB
- DecisionTreeClassifier

In [2]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Seed passed as `random_state` to the train/test split and to the seeded
# estimators configured below, so repeated runs give the same results.
SEED = 0

# Data

In [4]:
# Location of the preprocessed dataset. The default is the path the notebook
# was originally run with; set the BDPP_DATA_PATH environment variable to
# point at the file on another machine instead of editing this cell.
DATA_PATH = os.environ.get(
    "BDPP_DATA_PATH",
    "/Users/trevoryu/Code/data/bdpp_data/preprocessed.csv",
)

df = pd.read_csv(DATA_PATH)

In [5]:
# Show the loaded frame as this cell's output.
df

Unnamed: 0,area,aspectratio,compactness,convexarea,eccentricity,equivalentdiameter,extent,majoraxislength,minoraxislength,perimeter,roundness,solidity,labels
0,-0.239477,-0.057676,0.601668,-0.249104,-0.394052,-0.186192,0.133002,-0.354749,-0.037375,-0.320879,0.519165,0.355837,3
1,0.087938,0.019910,-0.412415,0.073633,0.562506,0.281669,-0.413466,0.345412,0.142566,0.147384,0.122865,0.416040,3
2,0.007383,0.093310,-1.198328,0.000765,1.066853,0.172987,0.037099,0.488756,-0.155616,0.186013,-0.457566,0.162324,3
3,-0.075697,0.002723,-0.219457,-0.086749,0.400774,0.056837,0.029861,0.071411,-0.007877,-0.050680,0.157721,0.355837,3
4,-0.081343,0.009624,-0.313823,-0.086765,0.468872,0.048783,-1.135455,0.089287,-0.034600,-0.034816,0.045575,0.132222,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18991,0.238367,0.021047,-0.470161,0.235498,0.572082,0.475507,-1.050408,0.554215,0.330785,0.378959,-0.157500,0.016115,2
18992,0.233670,-0.032839,-0.381429,0.376010,-0.015259,0.469619,-1.428593,0.519876,0.717506,3.424890,-4.766095,-4.013238,2
18993,0.482107,0.012234,-0.322274,0.468290,0.493344,0.768416,0.370046,0.791625,0.619757,0.635347,-0.164320,0.295633,2
18994,0.422157,-0.014799,0.011528,0.406581,0.211378,0.698530,0.462330,0.614488,0.661514,0.478125,0.220614,0.355837,2


In [6]:
# Split the frame into model inputs and target: every column except "labels"
# is a feature, and "labels" holds the class to predict.
feature_frame = df.drop(columns=["labels"])
X = feature_frame.values
y = df["labels"].values

In [7]:
# Number of rows per class, ordered by class label.
label_counts = df["labels"].value_counts().sort_index()

# Fraction of all rows that belongs to each class (label -> share).
# Despite the name, these are class frequencies, not balancing weights.
total_rows = int(label_counts.sum())
class_weight = {
    label: n_rows / total_rows
    for label, n_rows in label_counts.items()
}
class_weight

{0: 0.044061907770056856,
 1: 0.7165192672141504,
 2: 0.10781217098336492,
 3: 0.13160665403242788}

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable. No `stratify` argument is passed, so the class proportions of
# the two subsets are left to the random draw.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

# Models

In [9]:
# Keyword arguments reused as-is for LogisticRegression, RidgeClassifier and
# SVC, and extended for LinearSVC.
params = dict(class_weight="balanced", random_state=SEED, max_iter=10_000)

# Class frequencies in the iteration order of `class_weight`, passed to
# GaussianNB as its `priors`.
class_priors = np.array([*class_weight.values()])

# Estimator class -> constructor keyword arguments. Iteration follows
# insertion order, which fixes the order the models are built and reported in.
config = {
    LogisticRegression: params,
    RidgeClassifier: params,
    SVC: params,
    # Same shared options, plus dual=True and a higher max_iter.
    LinearSVC: {**params, "dual": True, "max_iter": 20_000},
    KNeighborsClassifier: dict(n_jobs=-1),
    GaussianNB: dict(priors=class_priors),
    DecisionTreeClassifier: dict(random_state=SEED, class_weight="balanced"),
}

In [10]:
# Build one unfitted estimator per `config` entry, in `config` order.
models = [
    estimator_cls(**init_kwargs)
    for estimator_cls, init_kwargs in config.items()
]

# Training

In [11]:
# Fit every candidate on the training split and record its scores on the
# held-out split.
trained_models = []
# Column name -> list of values, one entry per model in `models` order.
metrics = {}

# Score functions reported alongside accuracy, keyed by output column name.
scorers = {
    "precision": precision_score,
    "recall": recall_score,
    "f1": f1_score,
}

for model in models:
    name = model.__class__.__name__
    print(f"Training {name}...")
    metrics.setdefault("model_name", []).append(name)

    model.fit(X_train, y_train)

    # Predict once and reuse the predictions for every metric. Accuracy is
    # computed directly as the fraction of exactly matching labels instead of
    # calling `model.score`, which would run a second prediction pass over
    # X_test (expensive for KNeighborsClassifier and SVC).
    y_pred = model.predict(X_test)
    test_acc = float(np.mean(y_pred == y_test))
    metrics.setdefault("accuracy", []).append(test_acc)
    print(f"Model accuracy: {test_acc*100:.2f}%")

    # Each score is requested with average="weighted".
    for metric_name, metric in scorers.items():
        metrics.setdefault(metric_name, []).append(
            metric(y_test, y_pred, average="weighted")
        )

    trained_models.append(model)

Training LogisticRegression...
Model accuracy: 85.18%
Training RidgeClassifier...
Model accuracy: 81.29%
Training SVC...
Model accuracy: 88.18%
Training LinearSVC...
Model accuracy: 88.00%
Training KNeighborsClassifier...
Model accuracy: 92.32%
Training GaussianNB...
Model accuracy: 81.53%
Training DecisionTreeClassifier...
Model accuracy: 90.18%


In [12]:
# Tabulate the collected scores (one row per model) and show them ranked,
# highest accuracy first.
df_metrics = pd.DataFrame(data=metrics)
df_metrics.sort_values("accuracy", ascending=False)

Unnamed: 0,model_name,accuracy,precision,recall,f1
4,KNeighborsClassifier,0.923158,0.924166,0.923158,0.922983
6,DecisionTreeClassifier,0.901842,0.900632,0.901842,0.90112
2,SVC,0.881842,0.912379,0.881842,0.888875
3,LinearSVC,0.88,0.896065,0.88,0.884544
0,LogisticRegression,0.851842,0.891076,0.851842,0.860645
5,GaussianNB,0.815263,0.876614,0.815263,0.830404
1,RidgeClassifier,0.812895,0.870301,0.812895,0.826555
