In [73]:
import numpy as np
import pandas as pd 
import sklearn

from sklearn import svm 
from sklearn import preprocessing 
from sklearn.model_selection import GridSearchCV

In [74]:
# Load the asteroid table and attach a binary target:
# 1 when Main_Group is "X", 0 for every other group.
# NOTE: read_pickle unpickles arbitrary objects, so only point it at trusted files.
asteroids_df = pd.read_pickle("data/lvl2/asteroids.pkl")
asteroids_df.loc[:, "Class"] = asteroids_df["Main_Group"].eq("X").astype(int)

# Feature matrix: one row per asteroid, built by stacking the
# "Reflectance_norm550nm" column of each per-asteroid spectrum frame.
asteroids_X = np.vstack(
    [
        spectrum_df["Reflectance_norm550nm"].to_numpy()
        for spectrum_df in asteroids_df["SpectrumDF"]
    ]
)
# Target vector, in the same row order as asteroids_X
asteroids_Y = asteroids_df["Class"].to_numpy()

In [75]:
# Split the data into training and test samples (80 % / 20 %), stratified on the class label
from sklearn.model_selection import StratifiedShuffleSplit

# A fixed random_state makes the split (and every score derived from it)
# reproducible across kernel restarts.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) index pair
train_index, test_index = next(sss.split(asteroids_X, asteroids_Y))
X_train, X_test = asteroids_X[train_index], asteroids_X[test_index]
Y_train, Y_test = asteroids_Y[train_index], asteroids_Y[test_index]

# Weight for the positive class: training samples per positive sample, rounded down.
# Integer floor division avoids the float round-off that int(1.0 / (a / b)) can
# suffer when the ratio is an exact integer.
# Raises ZeroDivisionError if the training split contains no positive sample.
positive_class_weight = len(Y_train) // int(np.sum(Y_train))
print(positive_class_weight)

5


In [76]:
# Hyperparameter tuning for the class-weighted SVM via cross-validated grid search

# Standardise every feature column using statistics of the training split only.
# Open question carried over from the original notebook: is this needed at all,
# given that every dimension is a reflectance of the same order of magnitude?
# NOTE(review): the scaler sees the whole training split before cross-validation,
# so each validation fold contributes to the scaling statistics. A Pipeline would
# avoid that, but later cells rely on `scaler` and `wclf` being separate objects.
scaler = preprocessing.StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Candidate settings: three values of C for each of two kernels (6 combinations)
params_grid = [
    dict(C=[1, 10, 100], kernel=["linear", "rbf"]),
]

# Only class 1 gets an explicit weight; class 0 keeps the default weight
svc = svm.SVC(class_weight={1: positive_class_weight})

# 5-fold search scored with F1; verbose=3 prints one line per fit
wclf = GridSearchCV(
    estimator=svc,
    param_grid=params_grid,
    scoring="f1",
    cv=5,
    verbose=3,
)
wclf.fit(X_train_scaled, Y_train)


Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END ................C=1, kernel=linear;, score=0.541 total time=   0.0s
[CV 2/5] END ................C=1, kernel=linear;, score=0.496 total time=   0.0s
[CV 3/5] END ................C=1, kernel=linear;, score=0.571 total time=   0.0s
[CV 4/5] END ................C=1, kernel=linear;, score=0.519 total time=   0.0s
[CV 5/5] END ................C=1, kernel=linear;, score=0.526 total time=   0.0s
[CV 1/5] END ...................C=1, kernel=rbf;, score=0.864 total time=   0.0s
[CV 2/5] END ...................C=1, kernel=rbf;, score=0.813 total time=   0.0s
[CV 3/5] END ...................C=1, kernel=rbf;, score=0.854 total time=   0.0s
[CV 4/5] END ...................C=1, kernel=rbf;, score=0.864 total time=   0.0s
[CV 5/5] END ...................C=1, kernel=rbf;, score=0.813 total time=   0.0s
[CV 1/5] END ...............C=10, kernel=linear;, score=0.551 total time=   0.1s
[CV 2/5] END ...............C=10, kernel=linear;,

In [78]:
# Grab the winning model from the grid search. `refit` is left at its default in the
# GridSearchCV call, so best_estimator_ has been retrained on all data passed to fit().
final_clf = wclf.best_estimator_
# Print its full hyperparameter set for the record
print(final_clf.get_params())

{'C': 100, 'break_ties': False, 'cache_size': 200, 'class_weight': {1: 5}, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}


In [53]:
# Test the fit on the test sample
# Reuse the scaler fitted on the training split so that no test-set statistics
# enter the preprocessing.
X_test_scaled = scaler.transform(X_test)
# Predicting through the GridSearchCV object delegates to its refit best estimator
Y_test_pred = wclf.predict(X_test_scaled)

In [57]:
# Evaluate the held-out predictions with a confusion matrix and summary scores
from sklearn.metrics import confusion_matrix

# Confusion matrix: rows are the true classes, columns the predicted classes
conf_mat = confusion_matrix(y_true=Y_test, y_pred=Y_test_pred)
print(conf_mat)

# (precision, recall, F1, support) for the positive class;
# support is reported as None when an average is requested
scores = sklearn.metrics.precision_recall_fscore_support(
    y_true=Y_test,
    y_pred=Y_test_pred,
    average='binary',
)
print(scores)

[[217   4]
 [  2  45]]
(0.9183673469387755, 0.9574468085106383, 0.9375000000000001, None)


In [63]:
# Naive baseline: score a random permutation of the true labels against the true labels.
# A seeded generator keeps the baseline numbers reproducible across kernel restarts;
# permutation() returns a shuffled copy, so asteroids_Y itself is left untouched.
asteroids_random_y = np.random.default_rng(42).permutation(asteroids_Y)

# Rebinds conf_mat to the baseline's confusion matrix (computed on the full data set)
conf_mat = confusion_matrix(asteroids_Y, asteroids_random_y)
print(conf_mat)

random_scores = sklearn.metrics.precision_recall_fscore_support(asteroids_Y, asteroids_random_y, average='binary')
# Precision equals recall here by construction: a permutation keeps the number of
# positives unchanged, so predicted positives == actual positives and therefore
# false positives == false negatives.
print(random_scores)

[[912 190]
 [190  47]]
(0.19831223628691982, 0.19831223628691982, 0.19831223628691982, None)
