In [40]:
import numpy as np
import pandas as pd 
import sklearn

In [41]:
# Build a binary target: 1 for asteroids whose main group is "X", 0 for all others.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling; only
# load this file if it comes from a trusted source.
asteroids_df = pd.read_pickle("data/lvl2/asteroids.pkl")
# Vectorized comparison instead of a per-row Python lambda; the explicit int64
# keeps the label dtype independent of the platform's default integer width.
asteroids_df["Class"] = (asteroids_df["Main_Group"] == "X").astype("int64")

# Stack each asteroid's normalized reflectance column into one row of the
# feature matrix (one row per asteroid), and pull the labels out as an array.
asteroids_X = np.vstack(
    [spectrum["Reflectance_norm550nm"].to_numpy() for spectrum in asteroids_df["SpectrumDF"]]
)
asteroids_Y = asteroids_df["Class"].to_numpy()

In [50]:
# Hold out a stratified 20 % test sample; the fixed random_state makes the split repeatable.
from sklearn.model_selection import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
# n_splits=1 yields exactly one (train, test) index pair, so take it directly.
train_index, test_index = next(sss.split(asteroids_X, asteroids_Y))
X_train, Y_train = asteroids_X[train_index], asteroids_Y[train_index]
X_test, Y_test = asteroids_X[test_index], asteroids_Y[test_index]

# Weight for the positive class: the inverse of its share of the training
# labels (i.e. total / positives), truncated to an integer.
# NOTE(review): int() drops the fractional part of the weight — confirm the
# truncation is intended rather than passing the float through.
positive_fraction = np.sum(Y_train) / len(Y_train)
positive_class_weight = int(1.0 / positive_fraction)
print(positive_class_weight)

5


In [52]:
# Standardize the spectra and fit a class-weighted RBF support vector classifier.
from sklearn import preprocessing, svm

# Fit the scaler on the training spectra only, then scale them.
# Open question from the author: is scaling really needed, considering every
# dimension has the same order of magnitude (reflectance)?
scaler = preprocessing.StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Only class 1 is given an explicit weight; class 0 is left at sklearn's default.
# Open question from the author: is a regularization parameter of C=100 appropriate?
wclf = svm.SVC(C=100, kernel="rbf", class_weight={1: positive_class_weight})
wclf.fit(X_train_scaled, Y_train)

In [53]:
# Evaluate on the held-out sample: reuse the scaler fitted on the training data
# (it is not refitted here) and predict a class for each test spectrum.
X_test_scaled = scaler.transform(X_test)
Y_test_pred = wclf.predict(X_test_scaled)

In [57]:
# Score the test-set predictions.
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

# Confusion matrix of true labels against predicted labels.
conf_mat = confusion_matrix(Y_test, Y_test_pred)
print(conf_mat)

# Tuple of (precision, recall, F1, support) for the positive class; with
# average='binary' the last entry is printed as None.
scores = precision_recall_fscore_support(Y_test, Y_test_pred, average='binary')
print(scores)

[[217   4]
 [  2  45]]
(0.9183673469387755, 0.9574468085106383, 0.9375000000000001, None)


In [63]:
# Naive baseline: score a random permutation of the true labels against the
# labels themselves.
# The generator is seeded so the baseline is reproducible from run to run,
# in line with the fixed random_state used for the train/test split.
baseline_rng = np.random.default_rng(0)
# permutation() returns a shuffled copy; asteroids_Y itself is left untouched.
asteroids_random_y = baseline_rng.permutation(asteroids_Y)

# NOTE: this rebinds conf_mat, replacing the classifier's confusion matrix.
conf_mat = confusion_matrix(asteroids_Y, asteroids_random_y)
print(conf_mat)

random_scores = sklearn.metrics.precision_recall_fscore_support(asteroids_Y, asteroids_random_y, average='binary')
# Precision equals recall here by construction: a permutation keeps the number
# of positive labels unchanged, so the count of predicted positives equals the
# count of actual positives and false positives == false negatives.
print(random_scores)

[[912 190]
 [190  47]]
(0.19831223628691982, 0.19831223628691982, 0.19831223628691982, None)
