#### **Classifying the Graph Metrics With SVC**

In [39]:
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, cross_validate, GridSearchCV
import numpy as np

In [2]:
graph_metrics_df = pd.read_csv("../../data/current_graph_metrics.csv")
statuses_df = pd.read_csv("../../data/surface_and_status.csv")

In [3]:
df = graph_metrics_df.merge(statuses_df, on="SpecID").sort_values(by="SpecID").set_index('SpecID')
df

Unnamed: 0_level_0,PageRank,DegreeCentrality,EigenvectorCentrality,ArticleRank,LabelPropagation,Leiden,Louvain,SurID,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
201210-1-00,0.830275,1170.371464,0.014458,0.195683,1,1,2289,201210-1,Normal
201210-1-01,0.811969,1134.613827,0.014017,0.194311,1,1,2289,201210-1,Normal
201210-1-02,0.580317,733.700940,0.009064,0.178658,1,127,988,201210-1,Normal
201210-1-03,0.744123,1017.430766,0.012569,0.189725,1,1,2289,201210-1,Normal
201210-1-04,0.794119,1107.509457,0.013682,0.193235,1,1,2289,201210-1,Normal
...,...,...,...,...,...,...,...,...,...
210526-3-45,1.128517,1689.660278,0.020874,0.215918,1,1,2289,210526-3,Hyperglycemia
210526-3-46,1.110504,1658.692469,0.020491,0.214711,1,1,2289,210526-3,Hyperglycemia
210526-3-47,1.145219,1718.172663,0.021226,0.217025,1,1,2289,210526-3,Hyperglycemia
210526-3-48,1.084825,1613.192575,0.019929,0.212946,1,1,2289,210526-3,Hyperglycemia


In [34]:
def evaluate_svm(df):
    # Set the Surfaces as groups
    groups = df['SurID']
    X = df.drop(['Status', 'SurID'], axis=1)
    y = df['Status']

    # Creating the SVM classifier
    #svm = SVC(random_state=1234)
    svm = SVC(random_state=1234, kernel='linear', C=100000)
    
    # Using GroupKFold for classification tasks
    cv = GroupKFold(n_splits=5)

    scores = cross_validate(svm, X, y, groups=groups, cv=cv, scoring=['accuracy'], n_jobs=-1)
    
    # Displaying the results
    print(f'{svm.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores["test_accuracy"]):.4f} +/- {np.std(scores["test_accuracy"]):.4f}')

In [35]:
#evaluate_svm(df)

>#### Classify based on FastRP Embeddings

In [36]:
fastRP_df = pd.read_csv("../../data/fastRP_embeddings.csv")

In [37]:
df = fastRP_df.merge(statuses_df, on="SpecID").sort_values(by="SpecID").set_index('SpecID')
df

Unnamed: 0_level_0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_504,embedding_505,embedding_506,embedding_507,embedding_508,embedding_509,embedding_510,embedding_511,SurID,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,-0.088389,0.020422,0.147291,0.141107,0.076047,-0.051798,0.008347,-0.016279,-0.020508,0.064854,...,0.020160,0.085649,0.096982,-0.011981,0.028314,-0.156349,-0.055221,0.084925,201210-1,Normal
201210-1-01,-0.083970,0.021639,0.146236,0.138482,0.078607,-0.051480,0.008602,-0.018410,-0.018800,0.069214,...,0.023753,0.083272,0.095447,-0.012564,0.025295,-0.154007,-0.052783,0.087279,201210-1,Normal
201210-1-02,-0.072841,0.024081,0.144375,0.132327,0.084038,-0.048699,0.008751,-0.022922,-0.014418,0.079331,...,0.031896,0.076468,0.092639,-0.014681,0.019510,-0.146837,-0.047957,0.093192,201210-1,Normal
201210-1-03,-0.018323,0.008027,0.127082,0.082085,0.094985,-0.022637,-0.000978,-0.060352,0.024082,0.110962,...,0.083069,-0.001681,0.096954,-0.018749,0.024557,-0.075894,-0.007323,0.143794,201210-1,Normal
201210-1-04,-0.077046,0.023361,0.144474,0.133169,0.081799,-0.051000,0.009235,-0.021873,-0.015591,0.076126,...,0.030767,0.077876,0.092512,-0.012997,0.022328,-0.149724,-0.048062,0.092219,201210-1,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210526-3-45,-0.091401,0.019449,0.147800,0.142272,0.074101,-0.052247,0.008276,-0.015094,-0.021373,0.062217,...,0.018374,0.086775,0.097710,-0.011213,0.030401,-0.157729,-0.056315,0.083805,210526-3,Hyperglycemia
210526-3-46,-0.091623,0.019437,0.147861,0.142346,0.073858,-0.052321,0.008249,-0.014975,-0.021440,0.062008,...,0.018272,0.086762,0.097698,-0.011187,0.030760,-0.157841,-0.056374,0.083749,210526-3,Hyperglycemia
210526-3-47,-0.092270,0.019280,0.148023,0.142648,0.073370,-0.052377,0.008227,-0.014642,-0.021649,0.061360,...,0.017848,0.086950,0.097848,-0.011082,0.031457,-0.158153,-0.056663,0.083493,210526-3,Hyperglycemia
210526-3-48,-0.091947,0.019361,0.147960,0.142524,0.073604,-0.052334,0.008220,-0.014786,-0.021555,0.061668,...,0.018043,0.086861,0.097787,-0.011158,0.031133,-0.157999,-0.056547,0.083600,210526-3,Hyperglycemia


In [38]:
evaluate_svm(df)

SVC Cross-Validation Accuracy: 0.5199 +/- 0.0778


**Perform Grid Search Instead**

In [47]:
def evaluate_svm_with_grid_search(df):
    # Set the Surfaces as groups
    groups = df['SurID']
    X = df.drop(['Status', 'SurID'], axis=1)
    y = df['Status']

    # Define the SVM classifier
    svm = SVC(random_state=1234)

    # The parameter grid to search over
    # param_grid = {
    #     'kernel': ['linear', 'rbf'],
    #     'C': [0.1, 1, 10, 100, 1000, 10000],
    #     'gamma': ['scale', 'auto']
    # }

    # param_grid = {
    # 'C': [1, 10, 100, 1000, 10000, 100000, 1000000],
    # }

    # param_grid = {
    # 'C': [10**4, 10**5, 10**6, 10**7, 10**8, 10**9],
    # }

    param_grid = {
    'C': [10**6],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
    'gamma': ['scale', 'auto']
    }
    
    # Define the GroupKFold split strategy
    cv = GroupKFold(n_splits=5)

    # Initialise the GridSearchCV object
    grid_search = GridSearchCV(svm, param_grid, scoring='accuracy', cv=cv, n_jobs=-1, verbose=1)

    # Fit the model with the grid search
    # Note: GridSearchCV doesn't directly accept 'groups' as a parameter in 'fit', so we include them in 'cv' splits
    grid_search.fit(X, y, groups=groups)

    # Best parameter set found:
    print(f'Best parameters found: {grid_search.best_params_}')
    
    # Best cross-validation score:
    print(f'Best cross-validation accuracy: {grid_search.best_score_:.4f}')

    # Optionally, you can return the best estimator or the full GridSearchCV object for further inspection or predictions
    return grid_search.best_estimator_

In [49]:
evaluate_svm_with_grid_search(df)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
