In [98]:

import pickle

import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from utils import get_pipeline

# setup + import on pre-processed data:
data = pd.read_csv('../data_processed/hpc_job_scheduling_with_timestamps.csv')
pickle_file = open('../data_processed/hpc_job_scheduling_with_timestamps.pkl', 'rb')
feature_structure = pickle.load(pickle_file)
pickle_file.close()

feature_columns = feature_structure['bin'] + feature_structure['cat'] + feature_structure['cont'] + feature_structure[
    'ord']
X = data[feature_columns]
y = data['Class']

# Split Data
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)
train_X.head()


Unnamed: 0,Protocol,Class,Compounds,InputFields,Iterations,NumPending,Normalized_Timestamp
1178,b'I',b'VF',120.0,12098.0,20.0,0.0,0.634844
877,b'H',b'VF',143.0,344.0,20.0,0.0,0.387071
4233,b'O',b'M',481.0,136.0,20.0,0.0,0.667067
4134,b'O',b'M',359.0,461.0,50.0,417.0,0.393976
3598,b'N',b'VF',460.0,463.0,20.0,1.0,0.381367


In [99]:
def run_classifiers(input_classifiers):
    for clf in input_classifiers:
        clf_name = clf.__class__.__name__
        model_pipeline = get_pipeline(feature_structure, clf=clf)
        model_pipeline.fit(train_X, train_y)
    
        # Predictions and evaluation
        predictions = model_pipeline.predict(test_X)
        print(f"Results for {clf_name}:")
        print("Accuracy:", accuracy_score(test_y, predictions))
        print("Confusion Matrix:\n", confusion_matrix(test_y, predictions))
        print("Classification Report:\n", classification_report(test_y, predictions, zero_division=1))
        print("\n----------------------\n")

In [100]:
classifiersSVC = [
    SVC(kernel='linear', random_state=42),
    SVC(kernel='rbf', random_state=42),
    SVC(kernel='poly', random_state=42),
    SVC(kernel='sigmoid', random_state=42),
]

resultsSVC = evaluate_models(data, feature_structure, classifiersSVC)
resultsSVC

Unnamed: 0,model,accuracy,precision,recall,f1-score,timing
0,"SVC(kernel='linear', random_state=42)_Holdout",1.0,1.0,1.0,1.0,0.028126
1,"SVC(kernel='linear', random_state=42)_CV",1.0,1.0,1.0,1.0,0.185514
2,SVC(random_state=42)_Holdout,0.998847,0.998851,0.998847,0.998845,0.048796
3,SVC(random_state=42)_CV,0.98707,0.987549,0.98707,0.986956,0.301911
4,"SVC(kernel='poly', random_state=42)_Holdout",0.997693,0.997697,0.997693,0.997692,0.049139
5,"SVC(kernel='poly', random_state=42)_CV",0.984066,0.984107,0.984066,0.983726,0.290952
6,"SVC(kernel='sigmoid', random_state=42)_Holdout",0.889273,0.918324,0.889273,0.898133,0.039301
7,"SVC(kernel='sigmoid', random_state=42)_CV",0.868383,0.900144,0.868383,0.878633,0.567053


In [101]:
classifierKneighbors = [ KNeighborsClassifier(n_jobs=-1, n_neighbors=k) for k in range(2, 13)]
resultsKNeighbors = evaluate_models(data, feature_structure, classifierKneighbors)
resultsKNeighbors

Unnamed: 0,model,accuracy,precision,recall,f1-score,timing
0,"KNeighborsClassifier(n_jobs=-1, n_neighbors=2)...",0.994233,0.994338,0.994233,0.99424,0.044661
1,"KNeighborsClassifier(n_jobs=-1, n_neighbors=2)_CV",0.931877,0.94307,0.931877,0.931469,0.31855
2,"KNeighborsClassifier(n_jobs=-1, n_neighbors=3)...",0.99308,0.993164,0.99308,0.993079,0.042856
3,"KNeighborsClassifier(n_jobs=-1, n_neighbors=3)_CV",0.949198,0.952154,0.949198,0.948952,0.300804
4,"KNeighborsClassifier(n_jobs=-1, n_neighbors=4)...",0.99308,0.993105,0.99308,0.993086,0.040119
5,"KNeighborsClassifier(n_jobs=-1, n_neighbors=4)_CV",0.950352,0.953931,0.950352,0.950202,0.652044
6,KNeighborsClassifier(n_jobs=-1)_Holdout,0.989619,0.9897,0.989619,0.989639,0.05865
7,KNeighborsClassifier(n_jobs=-1)_CV,0.954742,0.956415,0.954742,0.954284,0.32789
8,"KNeighborsClassifier(n_jobs=-1, n_neighbors=6)...",0.989619,0.989706,0.989619,0.989649,0.0512
9,"KNeighborsClassifier(n_jobs=-1, n_neighbors=6)_CV",0.953587,0.955999,0.953587,0.953156,0.313532


In [102]:
classifierDecisionTree = [ DecisionTreeClassifier(random_state=42, max_depth=depth) for depth in range(3, 15)]
resultsDecisionTree = evaluate_models(data, feature_structure, classifierDecisionTree)
resultsDecisionTree

Unnamed: 0,model,accuracy,precision,recall,f1-score,timing
0,"DecisionTreeClassifier(max_depth=3, random_sta...",1.0,1.0,1.0,1.0,0.012017
1,"DecisionTreeClassifier(max_depth=3, random_sta...",1.0,1.0,1.0,1.0,0.129646
2,"DecisionTreeClassifier(max_depth=4, random_sta...",1.0,1.0,1.0,1.0,0.012016
3,"DecisionTreeClassifier(max_depth=4, random_sta...",1.0,1.0,1.0,1.0,0.128753
4,"DecisionTreeClassifier(max_depth=5, random_sta...",1.0,1.0,1.0,1.0,0.012691
5,"DecisionTreeClassifier(max_depth=5, random_sta...",1.0,1.0,1.0,1.0,0.130512
6,"DecisionTreeClassifier(max_depth=6, random_sta...",1.0,1.0,1.0,1.0,0.01203
7,"DecisionTreeClassifier(max_depth=6, random_sta...",1.0,1.0,1.0,1.0,0.215868
8,"DecisionTreeClassifier(max_depth=7, random_sta...",1.0,1.0,1.0,1.0,0.059441
9,"DecisionTreeClassifier(max_depth=7, random_sta...",1.0,1.0,1.0,1.0,0.24981


In [103]:
# Latex

print(results_to_latex(resultsKNeighbors, "Results Breast Cancer k-NN", "bc_knn"))
print("\n")
print(results_to_latex(resultsDecisionTree, "Results Breast Cancer Decision Trees", "bc_dt"))
print("\n")
print(results_to_latex(resultsSVC, "Results Breast Cancer SVC", "bc_svc"))

\begin{table}[H]
\centering
\resizebox{0.8\textwidth}{!}{
\begin{tabular}{|l|rr|rr|rr|rr|rr|}
\toprule
 & \multicolumn{2}{c|}{accuracy} & \multicolumn{2}{c|}{precision} & \multicolumn{2}{c|}{recall} & \multicolumn{2}{c|}{f1-score} & \multicolumn{2}{c|}{timing} \\
Parameters & holdout & cv & holdout & cv & holdout & cv & holdout & cv & holdout & cv \\
\hline
k=2 & 0.994 & 0.932 & 0.994 & 0.943 & 0.994 & 0.932 & 0.994 & 0.931 & 0.045 & 0.319 \\
k=3 & 0.993 & 0.949 & 0.993 & 0.952 & 0.993 & 0.949 & 0.993 & 0.949 & 0.043 & 0.301 \\
k=4 & 0.993 & 0.950 & 0.993 & 0.954 & 0.993 & 0.950 & 0.993 & 0.950 & 0.040 & 0.652 \\
k=5 & 0.990 & 0.955 & 0.990 & 0.956 & 0.990 & 0.955 & 0.990 & 0.954 & 0.059 & 0.328 \\
k=6 & 0.990 & 0.954 & 0.990 & 0.956 & 0.990 & 0.954 & 0.990 & 0.953 & 0.051 & 0.314 \\
k=7 & 0.988 & 0.955 & 0.989 & 0.957 & 0.988 & 0.955 & 0.989 & 0.955 & 0.041 & 0.278 \\
k=8 & 0.988 & 0.952 & 0.989 & 0.953 & 0.988 & 0.952 & 0.989 & 0.951 & 0.076 & 0.567 \\
k=9 & 0.985 & 0.956 & 0.985 & 0

## Original dataset

In [104]:
# Evaluate on original dataset:
data = pd.read_csv('../data_processed/hpc_job_scheduling_original.csv')
pickle_file = open('../data_processed/hpc_job_scheduling_original.pkl', 'rb')
feature_structure = pickle.load(pickle_file)
pickle_file.close()

feature_columns = feature_structure['bin'] + feature_structure['cat'] + feature_structure['cont'] + feature_structure[
    'ord']


In [105]:
resultsKNeighbors = evaluate_models(data, feature_structure, classifierKneighbors)
resultsKNeighbors

Unnamed: 0,model,accuracy,precision,recall,f1-score,timing
0,"KNeighborsClassifier(n_jobs=-1, n_neighbors=2)...",0.994233,0.994338,0.994233,0.994232,0.081326
1,"KNeighborsClassifier(n_jobs=-1, n_neighbors=2)_CV",0.922407,0.934104,0.922407,0.922689,0.428162
2,"KNeighborsClassifier(n_jobs=-1, n_neighbors=3)...",0.991926,0.991982,0.991926,0.991923,0.065042
3,"KNeighborsClassifier(n_jobs=-1, n_neighbors=3)_CV",0.934419,0.939916,0.934419,0.934382,0.417789
4,"KNeighborsClassifier(n_jobs=-1, n_neighbors=4)...",0.99308,0.993105,0.99308,0.993071,0.287487
5,"KNeighborsClassifier(n_jobs=-1, n_neighbors=4)_CV",0.933031,0.940097,0.933031,0.933276,0.495885
6,KNeighborsClassifier(n_jobs=-1)_Holdout,0.989619,0.989609,0.989619,0.989599,0.074126
7,KNeighborsClassifier(n_jobs=-1)_CV,0.936267,0.939919,0.936267,0.935961,0.391815
8,"KNeighborsClassifier(n_jobs=-1, n_neighbors=6)...",0.988466,0.988443,0.988466,0.988442,0.065301
9,"KNeighborsClassifier(n_jobs=-1, n_neighbors=6)_CV",0.938114,0.94126,0.938114,0.937808,0.730608


In [106]:
resultsDecisionTree = evaluate_models(data, feature_structure, classifierDecisionTree)
resultsDecisionTree

Unnamed: 0,model,accuracy,precision,recall,f1-score,timing
0,"DecisionTreeClassifier(max_depth=3, random_sta...",1.0,1.0,1.0,1.0,0.017115
1,"DecisionTreeClassifier(max_depth=3, random_sta...",1.0,1.0,1.0,1.0,0.144355
2,"DecisionTreeClassifier(max_depth=4, random_sta...",1.0,1.0,1.0,1.0,0.015532
3,"DecisionTreeClassifier(max_depth=4, random_sta...",1.0,1.0,1.0,1.0,0.139881
4,"DecisionTreeClassifier(max_depth=5, random_sta...",1.0,1.0,1.0,1.0,0.013571
5,"DecisionTreeClassifier(max_depth=5, random_sta...",1.0,1.0,1.0,1.0,0.135237
6,"DecisionTreeClassifier(max_depth=6, random_sta...",1.0,1.0,1.0,1.0,0.025099
7,"DecisionTreeClassifier(max_depth=6, random_sta...",1.0,1.0,1.0,1.0,0.391198
8,"DecisionTreeClassifier(max_depth=7, random_sta...",1.0,1.0,1.0,1.0,0.031338
9,"DecisionTreeClassifier(max_depth=7, random_sta...",1.0,1.0,1.0,1.0,0.179806


In [107]:
resultsSVC = evaluate_models(data, feature_structure, classifiersSVC)
resultsSVC

Unnamed: 0,model,accuracy,precision,recall,f1-score,timing
0,"SVC(kernel='linear', random_state=42)_Holdout",1.0,1.0,1.0,1.0,0.037091
1,"SVC(kernel='linear', random_state=42)_CV",1.0,1.0,1.0,1.0,0.211994
2,SVC(random_state=42)_Holdout,0.997693,0.997697,0.997693,0.997688,0.066283
3,SVC(random_state=42)_CV,0.987532,0.987776,0.987532,0.987378,0.349434
4,"SVC(kernel='poly', random_state=42)_Holdout",0.998847,0.998869,0.998847,0.99885,0.076843
5,"SVC(kernel='poly', random_state=42)_CV",0.987068,0.986868,0.987068,0.986658,0.60053
6,"SVC(kernel='sigmoid', random_state=42)_Holdout",0.893887,0.925873,0.893887,0.903707,0.091955
7,"SVC(kernel='sigmoid', random_state=42)_CV",0.911109,0.928286,0.911109,0.913327,0.30605


In [108]:
# Original dataset

# Latex
print(results_to_latex(resultsKNeighbors, "Results Breast Cancer k-NN", "bc_knn"))
print("\n")
print(results_to_latex(resultsDecisionTree, "Results Breast Cancer Decision Trees", "bc_dt"))
print("\n")
print(results_to_latex(resultsSVC, "Results Breast Cancer SVC", "bc_svc"))

\begin{table}[H]
\centering
\resizebox{0.8\textwidth}{!}{
\begin{tabular}{|l|rr|rr|rr|rr|rr|}
\toprule
 & \multicolumn{2}{c|}{accuracy} & \multicolumn{2}{c|}{precision} & \multicolumn{2}{c|}{recall} & \multicolumn{2}{c|}{f1-score} & \multicolumn{2}{c|}{timing} \\
Parameters & holdout & cv & holdout & cv & holdout & cv & holdout & cv & holdout & cv \\
\hline
k=2 & 0.994 & 0.922 & 0.994 & 0.934 & 0.994 & 0.922 & 0.994 & 0.923 & 0.081 & 0.428 \\
k=3 & 0.992 & 0.934 & 0.992 & 0.940 & 0.992 & 0.934 & 0.992 & 0.934 & 0.065 & 0.418 \\
k=4 & 0.993 & 0.933 & 0.993 & 0.940 & 0.993 & 0.933 & 0.993 & 0.933 & 0.287 & 0.496 \\
k=5 & 0.990 & 0.936 & 0.990 & 0.940 & 0.990 & 0.936 & 0.990 & 0.936 & 0.074 & 0.392 \\
k=6 & 0.988 & 0.938 & 0.988 & 0.941 & 0.988 & 0.938 & 0.988 & 0.938 & 0.065 & 0.731 \\
k=7 & 0.987 & 0.942 & 0.987 & 0.944 & 0.987 & 0.942 & 0.987 & 0.941 & 0.120 & 0.453 \\
k=8 & 0.991 & 0.939 & 0.991 & 0.942 & 0.991 & 0.939 & 0.991 & 0.939 & 0.065 & 0.398 \\
k=9 & 0.987 & 0.941 & 0.987 & 0