# Step 5.: Machine Learning - Parameter Search / Optimization

In [1]:
# Import standard libraries
import os

# Import installed libraries
import numpy as np
import pandas as pd
import sklearn

from sklearn import preprocessing
from sklearn import svm
from sklearn.model_selection import GridSearchCV

In [2]:
# Let's mount the Google Drive, where we store files and models (if applicable, otherwise work
# locally)
try:
    from google.colab import drive
    drive.mount('/gdrive')
    core_path = "/gdrive/MyDrive/Colab/asteroid_taxonomy/"
except ModuleNotFoundError:
    core_path = ""

In [3]:
# Load the level 2 asteroid data
asteroids_df = pd.read_pickle(os.path.join(core_path, "data/lvl2/", "asteroids.pkl"))

In [4]:
# Now we add a binary classification schema, where we distinguish between e.g., X and non-X classes
asteroids_df.loc[:, "Class"] = asteroids_df["Main_Group"].apply(lambda x: 1 if x=="X" else 0)

In [5]:
# Allocate the spectra to one array and the classes to another one
asteroids_X = np.array([k["Reflectance_norm550nm"].tolist() for k in asteroids_df["SpectrumDF"]])
asteroids_y = np.array(asteroids_df["Class"].to_list())

In [6]:
# In this example we create a single test-training split with a ratio of 0.8 / 0.2
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=11, test_size=0.2, random_state=0)



result_df = pd.DataFrame([], columns=[])

for train_index, test_index in sss.split(asteroids_X, asteroids_y):
    
    temp_results = {}
    
    X_train, X_test = asteroids_X[train_index], asteroids_X[test_index]
    y_train, y_test = asteroids_y[train_index], asteroids_y[test_index]
    
    # Compute class weightning
    positive_class_weight = int(1.0 / (sum(y_train) / len(X_train)))

    parameters = {'kernel':('linear', 'rbf', 'poly'),
                  'C':[0.1, 1, 10, 100, 1000]}
    svc = svm.SVC(class_weight={1: positive_class_weight}, verbose=True)
    
    # Import the preprocessing module

    # Instantiate the StandardScaler (mean 0, standard deviation 1) and use the training data to fit
    # the scaler
    scaler = preprocessing.StandardScaler().fit(X_train)

    # Transform now the training data
    X_train_scaled = scaler.transform(X_train)
    print("Here")
    wclf = GridSearchCV(svc, parameters, scoring='recall')
    
    # Perform the training
    wclf.fit(X_train_scaled, y_train)

    break

Here
[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

NameError: name 'stop' is not defined

In [7]:
wclf.cv_results_

{'mean_fit_time': array([2.50409093, 0.00687008]),
 'std_fit_time': array([0.84438061, 0.001063  ]),
 'mean_score_time': array([0.00340405, 0.00364099]),
 'std_score_time': array([0.00025272, 0.00030883]),
 'param_C': masked_array(data=[100, 100],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['linear', 'rbf'],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 100, 'kernel': 'linear'}, {'C': 100, 'kernel': 'rbf'}],
 'split0_test_score': array([0.86842105, 0.94736842]),
 'split1_test_score': array([0.84210526, 0.92105263]),
 'split2_test_score': array([0.94736842, 0.94736842]),
 'split3_test_score': array([0.89473684, 0.94736842]),
 'split4_test_score': array([0.89473684, 0.97368421]),
 'mean_test_score': array([0.88947368, 0.94736842]),
 'std_test_score': array([0.03491184, 0.01664357]),
 'rank_test_score': array([2, 1], dtype=int32)}

In [None]:
# Scale the testing data ...
X_test_scaled = scaler.transform(X_test)

# ... and perform a predicition
y_test_pred = wclf.predict(X_test_scaled)

In [None]:
# Import the confusion matrix and perform the computation
from sklearn.metrics import confusion_matrix
conf_mat = confusion_matrix(y_test, y_test_pred)

print(conf_mat)

# The order of the confusion matrix is:
#     - true negative (top left, tn)
#     - false positive (top right, fp)
#     - false negative (bottom left, fn)
#     - true positive (bottom right, tp)
tn, fp, fn, tp = conf_mat.ravel()

In [None]:
# Recall: ratio of correctly classified X Class spectra, considering the false negatives
# (recall = tp / (tp + fn))
recall_score = round(sklearn.metrics.recall_score(y_test, y_test_pred), 3)
print(f"Recall Score: {recall_score}")

# Precision: ratio of correctly classified X Class spectra, considering the false positives
# (precision = tp / (tp + fp))
precision_score = round(sklearn.metrics.precision_score(y_test, y_test_pred), 3)
print(f"Precision Score: {precision_score}")

# A combined score
f1_score = round(sklearn.metrics.f1_score(y_test, y_test_pred), 3)
print(f"F1 Score: {f1_score}")