In [1]:
# Libraries for managing dataset
import pandas as pd
import numpy as np

# Libraries of the ML models
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.svm import LinearSVC as LSVM
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.naive_bayes import CategoricalNB as NBC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.neural_network import MLPClassifier as MLPC
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression as LR

# libraries for performance evaluation
from sklearn.metrics import zero_one_loss
from sklearn.metrics import jaccard_score # NOTE: not the same as accuracy_score; for binary labels this is TP / (TP + FP + FN) of the positive class
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import hamming_loss
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc
from sklearn.metrics import recall_score
from sklearn.metrics import recall_score, make_scorer

# Libraries for cross-validation
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the extracted octapeptide feature table; the first CSV column becomes the index.
df = pd.read_csv(
    "../datasets/final_combined_extracted_features_dataset/final_combined_extracted_octapeptide_features.csv",
    index_col=0,
)

# Shuffle once with a fixed seed, then cut the rows into 70% / 10% / 20%
# partitions (training / validation / testing).
# Positional slicing replaces np.split(DataFrame, ...), which routes through the
# deprecated DataFrame.swapaxes on recent pandas releases; the resulting
# partitions are the same rows in the same order.
shuffled = df.sample(frac=1, random_state=123)
train_end = int(.7 * len(df))
validation_end = int(.8 * len(df))

training_set = shuffled.iloc[:train_end]
validation_set = shuffled.iloc[train_end:validation_end]
testing_set = shuffled.iloc[validation_end:]


def _split_features_labels(subset):
    """Split one partition into a feature frame and a 0/1 label series.

    Parameters
    ----------
    subset : pandas.DataFrame
        A partition that contains the "cleavage Status" column.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``subset`` without the "cleavage Status"
        column and ``y`` maps "cleaved" -> 1 and "uncleaved" -> 0. Any other
        label value becomes NaN (``Series.map`` behaviour).
    """
    features = subset.drop(["cleavage Status"], axis=1)
    labels = subset["cleavage Status"].copy().map({"cleaved": 1, "uncleaved": 0})
    return features, labels


X_training, y_training = _split_features_labels(training_set)
X_validation, y_validation = _split_features_labels(validation_set)
X_testing, y_testing = _split_features_labels(testing_set)

# Set Up the GridSearchCV Algorithm to Perform the Search For the Best Hyperparameters

**1. Tuning C parameter of LSVM**

In [14]:
# 10-fold stratified cross-validation; shuffling with a fixed seed keeps the
# fold assignment reproducible.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)

# Base linear SVM whose regularisation strength C is being tuned.
svm_estimator = LSVM(
    penalty="l2",
    class_weight="balanced",
    tol=1e-7,
    max_iter=10000,
    random_state=123,
)

# Candidate values for C: the integers 1 through 10.
c_grid = {"C": list(range(1, 11))}

# Exhaustive search over C, scored by balanced accuracy, using all CPU cores.
grid = GridSearchCV(
    svm_estimator,
    param_grid=c_grid,
    scoring="balanced_accuracy",
    cv=cv,
    n_jobs=-1,
)

# The search is fitted on the validation partition.
grid.fit(X_validation, y_validation)



In [4]:
# List the available result fields alphabetically (iterating a dict yields its keys).
sorted(grid.cv_results_)

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_C',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'split5_test_score',
 'split6_test_score',
 'split7_test_score',
 'split8_test_score',
 'split9_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [12]:
# Mean cross-validated score per candidate; the search above was configured
# with scoring="balanced_accuracy", so these are balanced-accuracy values.
grid.cv_results_['mean_test_score']

array([0.80881339, 0.79493545, 0.7767115 , 0.83035676, 0.79577149,
       0.79071003, 0.78466837, 0.82820965, 0.82327381, 0.82791628])

In [6]:
# C value tried for each candidate, in the same order as the score arrays.
# NOTE(review): the output recorded below lists [4, 4.3, 4.7], which does not
# match the param_grid of the search cell (C = 1..10), and the execution counts
# of these cells are out of order — the output looks stale. Re-run with
# Restart Kernel -> Run All before relying on it.
grid.cv_results_['param_C']

masked_array(data=[4, 4.3, 4.7],
             mask=[False, False, False],
       fill_value='?',
            dtype=object)

In [7]:
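# Best parameter combination for LSVM:
# LSVM(max_iter=10000, random_state=0, tol=1e-7, class_weight="balanced", C=4)
# NOTE(review): the grid search cell uses random_state=123, not 0 — confirm which seed the final model should use.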