In [14]:
import os
import numpy as np
import pandas as pd
import sklearn

from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

from sklearn import svm
from sklearn import preprocessing

In [15]:
core_path = os.getcwd()
asteroids_df = pd.read_pickle(os.path.join(core_path, "data/lvl2/", "asteroids.pkl"))

In [16]:
asteroids_df.loc[:, "Class"] = asteroids_df["Main Group"].apply(lambda x: 1 if x=="X" else 0)

In [17]:
asteroids_X = np.array([k["Reflectance_norm550nm"].tolist() for k in asteroids_df["SpectrumDF"]])
asteroids_y = np.array(asteroids_df["Class"].tolist())

In [18]:
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=1,
                             test_size=0.2)

for train_index, test_index in sss.split(asteroids_X, asteroids_y):
    X_train, X_test = asteroids_X[train_index], asteroids_X[test_index]
    y_train, y_test = asteroids_y[train_index], asteroids_y[test_index]

positive_class_weight = int(1.0 / (sum(y_train) / len(X_train)))


In [27]:
param_grid = {
    "svc__kernel": ["rbf", "linear"],
    "svc__C": [0.1, 1, 10, 100],
    "svc__gamma": ["scale", 0.01, 0.1]
}

pipe = Pipeline([
    ("scaler", preprocessing.StandardScaler()),
    ("svc", svm.SVC(class_weight={1: positive_class_weight}))
])

In [28]:
halving_clf = HalvingGridSearchCV(
    pipe,
    param_grid,
    scoring="f1",        # or "recall" if you want zero FN
    cv=5,
    factor=2,            # halves models each round
    min_resources="exhaust",
    verbose=3,
    n_jobs=-1
)

halving_clf.fit(X_train, y_train)

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 66
max_resources_: 1071
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 24
n_resources: 66
Fitting 5 folds for each of 24 candidates, totalling 120 fits
----------
iter: 1
n_candidates: 12
n_resources: 132
Fitting 5 folds for each of 12 candidates, totalling 60 fits
----------
iter: 2
n_candidates: 6
n_resources: 264
Fitting 5 folds for each of 6 candidates, totalling 30 fits
----------
iter: 3
n_candidates: 3
n_resources: 528
Fitting 5 folds for each of 3 candidates, totalling 15 fits
----------
iter: 4
n_candidates: 2
n_resources: 1056
Fitting 5 folds for each of 2 candidates, totalling 10 fits


0,1,2
,estimator,Pipeline(step...ght={1: 5}))])
,param_grid,"{'svc__C': [0.1, 1, ...], 'svc__gamma': ['scale', 0.01, ...], 'svc__kernel': ['rbf', 'linear']}"
,factor,2
,resource,'n_samples'
,max_resources,'auto'
,min_resources,'exhaust'
,aggressive_elimination,False
,cv,5
,scoring,'f1'
,refit,True

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,C,100
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,{1: 5}


In [30]:
y_test_pred = halving_clf.predict(X_test)

In [31]:
from sklearn.metrics import confusion_matrix
conf_mtx = confusion_matrix(y_test, y_test_pred)

print(conf_mtx)

tn, fp, fn, tp = conf_mtx.ravel()

[[215   6]
 [  2  45]]


In [32]:
recall_score = round(sklearn.metrics.recall_score(y_test, y_test_pred), 3)
print(f"Recall Score: {recall_score}")

precision_score = round(sklearn.metrics.precision_score(y_test, y_test_pred), 3)
print(f"Precision Score: {precision_score}")

f1_score = round(sklearn.metrics.f1_score(y_test, y_test_pred), 3)
print(f"F1 Score: {f1_score}")

Recall Score: 0.957
Precision Score: 0.882
F1 Score: 0.918


In [33]:
asteroids_random_y = asteroids_y.copy()
np.random.shuffle(asteroids_random_y)

In [34]:
f1_score_naive = round(sklearn.metrics.f1_score(asteroids_y, asteroids_random_y), 3)
print(f"F1 Score Naive: {f1_score_naive}")

F1 Score Naive: 0.169
