#### **Classifying the Graph Metrics With SVC**

In [1]:
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, cross_validate, GridSearchCV
import numpy as np

In [2]:
# Load the graph-metric features and the table that supplies the surface
# (SurID) and Status labels; both are joined on SpecID in the next cell.
graph_metrics_df = pd.read_csv("../../data/current_graph_metrics.csv")
statuses_df = pd.read_csv("../../data/surface_and_status.csv")

In [3]:
# Attach the SurID/Status labels to the graph metrics (merge defaults to an
# inner join on SpecID), then order the rows by SpecID and use it as the index.
df = (
    graph_metrics_df
    .merge(statuses_df, on="SpecID")
    .sort_values(by="SpecID")
    .set_index("SpecID")
)
df

Unnamed: 0_level_0,PageRank,DegreeCentrality,EigenvectorCentrality,ArticleRank,LabelPropagation,Leiden,Louvain,SurID,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
201210-1-00,0.830275,1170.371464,0.014458,0.195683,1,1,2289,201210-1,Normal
201210-1-01,0.811969,1134.613827,0.014017,0.194311,1,1,2289,201210-1,Normal
201210-1-02,0.580317,733.700940,0.009064,0.178658,1,127,988,201210-1,Normal
201210-1-03,0.744123,1017.430766,0.012569,0.189725,1,1,2289,201210-1,Normal
201210-1-04,0.794119,1107.509457,0.013682,0.193235,1,1,2289,201210-1,Normal
...,...,...,...,...,...,...,...,...,...
210526-3-45,1.128517,1689.660278,0.020874,0.215918,1,1,2289,210526-3,Hyperglycemia
210526-3-46,1.110504,1658.692469,0.020491,0.214711,1,1,2289,210526-3,Hyperglycemia
210526-3-47,1.145219,1718.172663,0.021226,0.217025,1,1,2289,210526-3,Hyperglycemia
210526-3-48,1.084825,1613.192575,0.019929,0.212946,1,1,2289,210526-3,Hyperglycemia


In [4]:
def evaluate_svm(df, kernel='linear', C=100000, n_splits=5, random_state=1234):
    """Cross-validate an SVC on ``df`` with SurID-grouped folds and print the accuracy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'Status'`` column (the target) and a ``'SurID'``
        column (the group label). Every other column is used as a feature.
    kernel : str, default 'linear'
        Kernel passed to ``SVC``.
    C : float, default 100000
        Regularisation parameter passed to ``SVC``.
    n_splits : int, default 5
        Number of ``GroupKFold`` folds.
    random_state : int, default 1234
        Seed passed to ``SVC``.

    Returns
    -------
    dict
        The dictionary produced by ``cross_validate``; the per-fold
        accuracies are stored under ``'test_accuracy'``.
    """
    # Rows sharing a SurID are used as one group by GroupKFold, so a surface
    # is never split between the training and the test side of a fold.
    groups = df['SurID']
    X = df.drop(columns=['Status', 'SurID'])
    y = df['Status']

    # NOTE(review): SVC is sensitive to feature scale and X is passed through
    # unscaled; consider a StandardScaler pipeline - confirm against the data.
    svm = SVC(random_state=random_state, kernel=kernel, C=C)
    cv = GroupKFold(n_splits=n_splits)

    scores = cross_validate(svm, X, y, groups=groups, cv=cv, scoring=['accuracy'], n_jobs=-1)

    # Report mean and standard deviation of the per-fold accuracy.
    fold_accuracy = scores['test_accuracy']
    print(f'{svm.__class__.__name__} Cross-Validation Accuracy: {np.mean(fold_accuracy):.4f} +/- {np.std(fold_accuracy):.4f}')

    return scores

In [5]:
#evaluate_svm(df)

#### **Classify Based on FastRP Embeddings**

In [6]:
# Load the FastRP embeddings (the embedding_* columns of the merged frame
# displayed below); the table is joined to the labels on SpecID in the next cell.
fastRP_df = pd.read_csv("../../data/fastRP_embeddings.csv")

In [7]:
# Attach the SurID/Status labels to the FastRP embeddings (inner join on
# SpecID), sorted and indexed by SpecID. This rebinds `df`, replacing the
# graph-metrics frame that held the name earlier in the notebook.
df = (
    fastRP_df
    .merge(statuses_df, on="SpecID")
    .sort_values(by="SpecID")
    .set_index("SpecID")
)
df

Unnamed: 0_level_0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_504,embedding_505,embedding_506,embedding_507,embedding_508,embedding_509,embedding_510,embedding_511,SurID,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,-0.071709,0.019812,0.155153,0.125092,0.073686,-0.058367,0.013803,-0.025799,0.000904,0.076615,...,0.044698,0.058593,0.084534,-0.015432,0.035657,-0.147477,-0.044182,0.114158,201210-1,Normal
201210-1-01,-0.070028,0.019581,0.154498,0.123217,0.074390,-0.057920,0.014115,-0.027034,0.001916,0.077447,...,0.046547,0.056789,0.084982,-0.015607,0.035058,-0.145944,-0.043111,0.116166,201210-1,Normal
201210-1-02,-0.066435,0.019104,0.153313,0.119391,0.075831,-0.056594,0.014539,-0.029566,0.004048,0.079145,...,0.050232,0.052881,0.086077,-0.016078,0.033761,-0.142486,-0.041049,0.120104,201210-1,Normal
201210-1-03,-0.047474,0.008049,0.144355,0.092666,0.081228,-0.051265,0.014183,-0.046848,0.018119,0.078884,...,0.072508,0.027534,0.101231,-0.015336,0.032139,-0.118208,-0.028825,0.144707,201210-1,Normal
201210-1-04,-0.067541,0.018991,0.153414,0.120063,0.075290,-0.057354,0.014549,-0.029062,0.003492,0.078399,...,0.049597,0.053764,0.085851,-0.015725,0.034519,-0.143443,-0.041400,0.119543,201210-1,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210526-3-45,-0.072801,0.019800,0.155506,0.126072,0.073219,-0.058749,0.013640,-0.025144,0.000344,0.076015,...,0.043682,0.059618,0.084365,-0.015234,0.036072,-0.148361,-0.044739,0.113078,210526-3,Hyperglycemia
210526-3-46,-0.072872,0.019808,0.155536,0.126133,0.073157,-0.058784,0.013614,-0.025098,0.000306,0.075967,...,0.043633,0.059657,0.084338,-0.015231,0.036147,-0.148420,-0.044767,0.113027,210526-3,Hyperglycemia
210526-3-47,-0.073124,0.019828,0.155632,0.126384,0.073027,-0.058861,0.013562,-0.024929,0.000162,0.075812,...,0.043392,0.059883,0.084283,-0.015200,0.036295,-0.148633,-0.044909,0.112769,210526-3,Hyperglycemia
210526-3-48,-0.072999,0.019822,0.155589,0.126270,0.073086,-0.058819,0.013578,-0.025006,0.000229,0.075889,...,0.043509,0.059773,0.084308,-0.015221,0.036226,-0.148531,-0.044847,0.112889,210526-3,Hyperglycemia


In [8]:
#evaluate_svm(df)

**Perform Grid Search Instead**

In [9]:
def evaluate_svm_with_grid_search(df, param_grid=None, n_splits=5):
    """Grid-search SVC hyper-parameters using SurID-grouped cross-validation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'Status'`` column (the target) and a ``'SurID'``
        column (the group label). Every other column is used as a feature.
    param_grid : dict or list of dict, optional
        Grid handed to ``GridSearchCV``. When omitted, the grid
        ``{'C': [1, 10**4, 10**6], 'gamma': ['scale']}`` is searched.
        Wider searches (other kernels, ``gamma='auto'``, larger ``C``
        ranges) can be run by passing a grid explicitly.
    n_splits : int, default 5
        Number of ``GroupKFold`` folds.

    Returns
    -------
    sklearn.svm.SVC
        ``best_estimator_`` of the fitted search, i.e. the best parameter
        combination refit on all of ``df`` (GridSearchCV's default ``refit``).
    """
    # Rows sharing a SurID are used as one group by GroupKFold, so a surface
    # is never split between the training and the test side of a fold.
    groups = df['SurID']
    X = df.drop(['Status', 'SurID'], axis=1)
    y = df['Status']

    if param_grid is None:
        param_grid = {
            'C': [1, 10**4, 10**6],
            'gamma': ['scale'],
        }

    svm = SVC(random_state=1234)
    cv = GroupKFold(n_splits=n_splits)
    grid_search = GridSearchCV(svm, param_grid, scoring='accuracy', cv=cv, n_jobs=-1, verbose=1)

    # GridSearchCV.fit accepts `groups` and forwards it to the GroupKFold
    # splitter, so every candidate is scored on the same grouped folds.
    grid_search.fit(X, y, groups=groups)

    print(f'Best parameters found: {grid_search.best_params_}')
    print(f'Best cross-validation accuracy: {grid_search.best_score_:.4f}')

    return grid_search.best_estimator_

In [10]:
# `df` is the FastRP-embedding frame at this point: the FastRP merge above
# rebound the name that earlier held the graph-metrics frame.
evaluate_svm_with_grid_search(df)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


Best parameters found: {'C': 1000000, 'gamma': 'scale'}
Best cross-validation accuracy: 0.5259
