In [109]:

import pickle

import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from utils import get_pipeline

# setup + import on pre-processed data:
data = pd.read_csv('../data_processed/hpc_job_scheduling_with_timestamps.csv')
pickle_file = open('../data_processed/hpc_job_scheduling_with_timestamps.pkl', 'rb')
feature_structure = pickle.load(pickle_file)
pickle_file.close()

feature_columns = feature_structure['bin'] + feature_structure['cat'] + feature_structure['cont'] + feature_structure[
    'ord']
X = data[feature_columns]
y = data['Class']

# Split Data
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)
train_X.head()


Unnamed: 0,Protocol,Class,Compounds,InputFields,Iterations,NumPending,Normalized_Timestamp
1178,b'I',b'VF',120.0,12098.0,20.0,0.0,0.634844
877,b'H',b'VF',143.0,344.0,20.0,0.0,0.387071
4233,b'O',b'M',481.0,136.0,20.0,0.0,0.667067
4134,b'O',b'M',359.0,461.0,50.0,417.0,0.393976
3598,b'N',b'VF',460.0,463.0,20.0,1.0,0.381367


In [110]:
def run_classifiers(input_classifiers):
    for clf in input_classifiers:
        clf_name = clf.__class__.__name__
        model_pipeline = get_pipeline(feature_structure, clf=clf)
        model_pipeline.fit(train_X, train_y)
    
        # Predictions and evaluation
        predictions = model_pipeline.predict(test_X)
        print(f"Results for {clf_name}:")
        print("Accuracy:", accuracy_score(test_y, predictions))
        print("Confusion Matrix:\n", confusion_matrix(test_y, predictions))
        print("Classification Report:\n", classification_report(test_y, predictions, zero_division=1))
        print("\n----------------------\n")

In [111]:
classifiersSVC = [
    SVC(kernel='linear', random_state=42),
    SVC(kernel='rbf', random_state=42),
    SVC(kernel='poly', random_state=42),
    SVC(kernel='sigmoid', random_state=42),
]

resultsSVC = evaluate_models(data, feature_structure, classifiersSVC)
resultsSVC

Unnamed: 0,model,accuracy,precision,recall,f1-score,timing
0,"SVC(kernel='linear', random_state=42)_Holdout",1.0,1.0,1.0,1.0,0.026217
1,"SVC(kernel='linear', random_state=42)_CV",1.0,1.0,1.0,1.0,0.184243
2,SVC(random_state=42)_Holdout,0.998847,0.998851,0.998847,0.998845,0.048831
3,SVC(random_state=42)_CV,0.98707,0.987549,0.98707,0.986956,0.300565
4,"SVC(kernel='poly', random_state=42)_Holdout",0.997693,0.997697,0.997693,0.997692,0.047516
5,"SVC(kernel='poly', random_state=42)_CV",0.984066,0.984107,0.984066,0.983726,0.294831
6,"SVC(kernel='sigmoid', random_state=42)_Holdout",0.889273,0.918324,0.889273,0.898133,0.036141
7,"SVC(kernel='sigmoid', random_state=42)_CV",0.868383,0.900144,0.868383,0.878633,0.250679


In [112]:
classifierKneighbors = [ KNeighborsClassifier(n_jobs=-1, n_neighbors=k) for k in range(2, 13)]
resultsKNeighbors = evaluate_models(data, feature_structure, classifierKneighbors)
resultsKNeighbors

Unnamed: 0,model,accuracy,precision,recall,f1-score,timing
0,"KNeighborsClassifier(n_jobs=-1, n_neighbors=2)...",0.994233,0.994338,0.994233,0.99424,0.041029
1,"KNeighborsClassifier(n_jobs=-1, n_neighbors=2)_CV",0.931877,0.94307,0.931877,0.931469,0.286211
2,"KNeighborsClassifier(n_jobs=-1, n_neighbors=3)...",0.99308,0.993164,0.99308,0.993079,0.050034
3,"KNeighborsClassifier(n_jobs=-1, n_neighbors=3)_CV",0.949198,0.952154,0.949198,0.948952,0.303165
4,"KNeighborsClassifier(n_jobs=-1, n_neighbors=4)...",0.99308,0.993105,0.99308,0.993086,0.041386
5,"KNeighborsClassifier(n_jobs=-1, n_neighbors=4)_CV",0.950352,0.953931,0.950352,0.950202,0.285478
6,KNeighborsClassifier(n_jobs=-1)_Holdout,0.989619,0.9897,0.989619,0.989639,0.03701
7,KNeighborsClassifier(n_jobs=-1)_CV,0.954742,0.956415,0.954742,0.954284,0.306278
8,"KNeighborsClassifier(n_jobs=-1, n_neighbors=6)...",0.989619,0.989706,0.989619,0.989649,0.03902
9,"KNeighborsClassifier(n_jobs=-1, n_neighbors=6)_CV",0.953587,0.955999,0.953587,0.953156,0.282012


In [113]:
classifierDecisionTree = [ DecisionTreeClassifier(random_state=42, max_depth=depth) for depth in range(3, 15)]
resultsDecisionTree = evaluate_models(data, feature_structure, classifierDecisionTree)
resultsDecisionTree

Unnamed: 0,model,accuracy,precision,recall,f1-score,timing
0,"DecisionTreeClassifier(max_depth=3, random_sta...",1.0,1.0,1.0,1.0,0.01401
1,"DecisionTreeClassifier(max_depth=3, random_sta...",1.0,1.0,1.0,1.0,0.127241
2,"DecisionTreeClassifier(max_depth=4, random_sta...",1.0,1.0,1.0,1.0,0.012506
3,"DecisionTreeClassifier(max_depth=4, random_sta...",1.0,1.0,1.0,1.0,0.122701
4,"DecisionTreeClassifier(max_depth=5, random_sta...",1.0,1.0,1.0,1.0,0.011505
5,"DecisionTreeClassifier(max_depth=5, random_sta...",1.0,1.0,1.0,1.0,0.129536
6,"DecisionTreeClassifier(max_depth=6, random_sta...",1.0,1.0,1.0,1.0,0.013505
7,"DecisionTreeClassifier(max_depth=6, random_sta...",1.0,1.0,1.0,1.0,0.123861
8,"DecisionTreeClassifier(max_depth=7, random_sta...",1.0,1.0,1.0,1.0,0.01351
9,"DecisionTreeClassifier(max_depth=7, random_sta...",1.0,1.0,1.0,1.0,0.123919


In [114]:
# Latex

print(results_to_latex(resultsKNeighbors, "Results HPC k-NN", "bc_knn"))
print("\n")
print(results_to_latex(resultsDecisionTree, "Results HPC Decision Trees", "bc_dt"))
print("\n")
print(results_to_latex(resultsSVC, "Results HPC SVC", "bc_svc"))

\begin{table}[H]
\centering
\resizebox{0.8\textwidth}{!}{
\begin{tabular}{|l|rr|rr|rr|rr|rr|}
\toprule
 & \multicolumn{2}{c|}{accuracy} & \multicolumn{2}{c|}{precision} & \multicolumn{2}{c|}{recall} & \multicolumn{2}{c|}{f1-score} & \multicolumn{2}{c|}{timing} \\
Parameters & holdout & cv & holdout & cv & holdout & cv & holdout & cv & holdout & cv \\
\hline
k=2 & 0.994 & 0.932 & 0.994 & 0.943 & 0.994 & 0.932 & 0.994 & 0.931 & 0.041 & 0.286 \\
k=3 & 0.993 & 0.949 & 0.993 & 0.952 & 0.993 & 0.949 & 0.993 & 0.949 & 0.050 & 0.303 \\
k=4 & 0.993 & 0.950 & 0.993 & 0.954 & 0.993 & 0.950 & 0.993 & 0.950 & 0.041 & 0.285 \\
k=5 & 0.990 & 0.955 & 0.990 & 0.956 & 0.990 & 0.955 & 0.990 & 0.954 & 0.037 & 0.306 \\
k=6 & 0.990 & 0.954 & 0.990 & 0.956 & 0.990 & 0.954 & 0.990 & 0.953 & 0.039 & 0.282 \\
k=7 & 0.988 & 0.955 & 0.989 & 0.957 & 0.988 & 0.955 & 0.989 & 0.955 & 0.043 & 0.290 \\
k=8 & 0.988 & 0.952 & 0.989 & 0.953 & 0.988 & 0.952 & 0.989 & 0.951 & 0.041 & 0.293 \\
k=9 & 0.985 & 0.956 & 0.985 & 0

## Original dataset

In [115]:
# Evaluate on original dataset:
data = pd.read_csv('../data_processed/hpc_job_scheduling_original.csv')
pickle_file = open('../data_processed/hpc_job_scheduling_original.pkl', 'rb')
feature_structure = pickle.load(pickle_file)
pickle_file.close()

feature_columns = feature_structure['bin'] + feature_structure['cat'] + feature_structure['cont'] + feature_structure[
    'ord']


In [116]:
resultsKNeighbors = evaluate_models(data, feature_structure, classifierKneighbors)
resultsKNeighbors

Unnamed: 0,model,accuracy,precision,recall,f1-score,timing
0,"KNeighborsClassifier(n_jobs=-1, n_neighbors=2)...",0.994233,0.994338,0.994233,0.994232,0.066851
1,"KNeighborsClassifier(n_jobs=-1, n_neighbors=2)_CV",0.922407,0.934104,0.922407,0.922689,0.402462
2,"KNeighborsClassifier(n_jobs=-1, n_neighbors=3)...",0.991926,0.991982,0.991926,0.991923,0.067255
3,"KNeighborsClassifier(n_jobs=-1, n_neighbors=3)_CV",0.934419,0.939916,0.934419,0.934382,0.465576
4,"KNeighborsClassifier(n_jobs=-1, n_neighbors=4)...",0.99308,0.993105,0.99308,0.993071,0.065862
5,"KNeighborsClassifier(n_jobs=-1, n_neighbors=4)_CV",0.933031,0.940097,0.933031,0.933276,0.400607
6,KNeighborsClassifier(n_jobs=-1)_Holdout,0.989619,0.989609,0.989619,0.989599,0.063687
7,KNeighborsClassifier(n_jobs=-1)_CV,0.936267,0.939919,0.936267,0.935961,0.393067
8,"KNeighborsClassifier(n_jobs=-1, n_neighbors=6)...",0.988466,0.988443,0.988466,0.988442,0.065361
9,"KNeighborsClassifier(n_jobs=-1, n_neighbors=6)_CV",0.938114,0.94126,0.938114,0.937808,0.397228


In [117]:
resultsDecisionTree = evaluate_models(data, feature_structure, classifierDecisionTree)
resultsDecisionTree

Unnamed: 0,model,accuracy,precision,recall,f1-score,timing
0,"DecisionTreeClassifier(max_depth=3, random_sta...",1.0,1.0,1.0,1.0,0.016507
1,"DecisionTreeClassifier(max_depth=3, random_sta...",1.0,1.0,1.0,1.0,0.143763
2,"DecisionTreeClassifier(max_depth=4, random_sta...",1.0,1.0,1.0,1.0,0.014756
3,"DecisionTreeClassifier(max_depth=4, random_sta...",1.0,1.0,1.0,1.0,0.142207
4,"DecisionTreeClassifier(max_depth=5, random_sta...",1.0,1.0,1.0,1.0,0.01439
5,"DecisionTreeClassifier(max_depth=5, random_sta...",1.0,1.0,1.0,1.0,0.14148
6,"DecisionTreeClassifier(max_depth=6, random_sta...",1.0,1.0,1.0,1.0,0.014834
7,"DecisionTreeClassifier(max_depth=6, random_sta...",1.0,1.0,1.0,1.0,0.139415
8,"DecisionTreeClassifier(max_depth=7, random_sta...",1.0,1.0,1.0,1.0,0.014554
9,"DecisionTreeClassifier(max_depth=7, random_sta...",1.0,1.0,1.0,1.0,0.135868


In [118]:
resultsSVC = evaluate_models(data, feature_structure, classifiersSVC)
resultsSVC

Unnamed: 0,model,accuracy,precision,recall,f1-score,timing
0,"SVC(kernel='linear', random_state=42)_Holdout",1.0,1.0,1.0,1.0,0.035522
1,"SVC(kernel='linear', random_state=42)_CV",1.0,1.0,1.0,1.0,0.201686
2,SVC(random_state=42)_Holdout,0.997693,0.997697,0.997693,0.997688,0.0606
3,SVC(random_state=42)_CV,0.987532,0.987776,0.987532,0.987378,0.332625
4,"SVC(kernel='poly', random_state=42)_Holdout",0.998847,0.998869,0.998847,0.99885,0.073716
5,"SVC(kernel='poly', random_state=42)_CV",0.987068,0.986868,0.987068,0.986658,0.364929
6,"SVC(kernel='sigmoid', random_state=42)_Holdout",0.893887,0.925873,0.893887,0.903707,0.041533
7,"SVC(kernel='sigmoid', random_state=42)_CV",0.911109,0.928286,0.911109,0.913327,0.249629


In [119]:
# Original dataset

# Latex
print(results_to_latex(resultsKNeighbors, "Results HPC k-NN", "bc_knn"))
print("\n")
print(results_to_latex(resultsDecisionTree, "Results HPC Decision Trees", "bc_dt"))
print("\n")
print(results_to_latex(resultsSVC, "Results HPC SVC", "bc_svc"))

\begin{table}[H]
\centering
\resizebox{0.8\textwidth}{!}{
\begin{tabular}{|l|rr|rr|rr|rr|rr|}
\toprule
 & \multicolumn{2}{c|}{accuracy} & \multicolumn{2}{c|}{precision} & \multicolumn{2}{c|}{recall} & \multicolumn{2}{c|}{f1-score} & \multicolumn{2}{c|}{timing} \\
Parameters & holdout & cv & holdout & cv & holdout & cv & holdout & cv & holdout & cv \\
\hline
k=2 & 0.994 & 0.922 & 0.994 & 0.934 & 0.994 & 0.922 & 0.994 & 0.923 & 0.067 & 0.402 \\
k=3 & 0.992 & 0.934 & 0.992 & 0.940 & 0.992 & 0.934 & 0.992 & 0.934 & 0.067 & 0.466 \\
k=4 & 0.993 & 0.933 & 0.993 & 0.940 & 0.993 & 0.933 & 0.993 & 0.933 & 0.066 & 0.401 \\
k=5 & 0.990 & 0.936 & 0.990 & 0.940 & 0.990 & 0.936 & 0.990 & 0.936 & 0.064 & 0.393 \\
k=6 & 0.988 & 0.938 & 0.988 & 0.941 & 0.988 & 0.938 & 0.988 & 0.938 & 0.065 & 0.397 \\
k=7 & 0.987 & 0.942 & 0.987 & 0.944 & 0.987 & 0.942 & 0.987 & 0.941 & 0.065 & 0.400 \\
k=8 & 0.991 & 0.939 & 0.991 & 0.942 & 0.991 & 0.939 & 0.991 & 0.939 & 0.066 & 0.398 \\
k=9 & 0.987 & 0.941 & 0.987 & 0