In [23]:

import pickle

import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from utils import get_pipeline

# setup + import on pre-processed data:
data = pd.read_csv('../data_processed/hpc_job_scheduling_with_timestamps.csv')
pickle_file = open('../data_processed/hpc_job_scheduling_with_timestamps.pkl', 'rb')
feature_structure = pickle.load(pickle_file)
pickle_file.close()

feature_columns = feature_structure['bin'] + feature_structure['cat'] + feature_structure['cont'] + feature_structure[
    'ord']
X = data[feature_columns]
y = data['Class']

# Split Data
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)
train_X.head()


Unnamed: 0,Protocol,Class,Compounds,InputFields,Iterations,NumPending,Normalized_Timestamp
1178,b'I',b'VF',120.0,12098.0,20.0,0.0,0.634844
877,b'H',b'VF',143.0,344.0,20.0,0.0,0.387071
4233,b'O',b'M',481.0,136.0,20.0,0.0,0.667067
4134,b'O',b'M',359.0,461.0,50.0,417.0,0.393976
3598,b'N',b'VF',460.0,463.0,20.0,1.0,0.381367


In [24]:
def run_classifiers(input_classifiers):
    for clf in input_classifiers:
        clf_name = clf.__class__.__name__
        model_pipeline = get_pipeline(feature_structure, clf=clf)
        model_pipeline.fit(train_X, train_y)
    
        # Predictions and evaluation
        predictions = model_pipeline.predict(test_X)
        print(f"Results for {clf_name}:")
        print("Accuracy:", accuracy_score(test_y, predictions))
        print("Confusion Matrix:\n", confusion_matrix(test_y, predictions))
        print("Classification Report:\n", classification_report(test_y, predictions, zero_division=1))
        print("\n----------------------\n")

In [25]:
random_forest_classifiers = [
    RandomForestClassifier(n_estimators=1, random_state=42),
    RandomForestClassifier(n_estimators=3, random_state=42),
    RandomForestClassifier(n_estimators=5, random_state=42),
]

run_classifiers(random_forest_classifiers)

Results for RandomForestClassifier:
Accuracy: 0.9965397923875432
Confusion Matrix:
 [[269   0   0   0]
 [  0  50   0   0]
 [  0   0 104   1]
 [  0   2   0 441]]
Classification Report:
               precision    recall  f1-score   support

        b'F'       1.00      1.00      1.00       269
        b'L'       0.96      1.00      0.98        50
        b'M'       1.00      0.99      1.00       105
       b'VF'       1.00      1.00      1.00       443

    accuracy                           1.00       867
   macro avg       0.99      1.00      0.99       867
weighted avg       1.00      1.00      1.00       867


----------------------

Results for RandomForestClassifier:
Accuracy: 1.0
Confusion Matrix:
 [[269   0   0   0]
 [  0  50   0   0]
 [  0   0 105   0]
 [  0   0   0 443]]
Classification Report:
               precision    recall  f1-score   support

        b'F'       1.00      1.00      1.00       269
        b'L'       1.00      1.00      1.00        50
        b'M'       1.0

In [26]:
gradient_boosting_classifiers = [
    GradientBoostingClassifier(n_estimators=1, random_state=42),
    GradientBoostingClassifier(n_estimators=2, random_state=42),
    GradientBoostingClassifier(n_estimators=3, random_state=42),
]

run_classifiers(gradient_boosting_classifiers)

Results for GradientBoostingClassifier:
Accuracy: 0.510957324106113
Confusion Matrix:
 [[  0   0   0 269]
 [  0   0   0  50]
 [  0   0   0 105]
 [  0   0   0 443]]
Classification Report:
               precision    recall  f1-score   support

        b'F'       1.00      0.00      0.00       269
        b'L'       1.00      0.00      0.00        50
        b'M'       1.00      0.00      0.00       105
       b'VF'       0.51      1.00      0.68       443

    accuracy                           0.51       867
   macro avg       0.88      0.25      0.17       867
weighted avg       0.75      0.51      0.35       867


----------------------

Results for GradientBoostingClassifier:
Accuracy: 0.8212226066897347
Confusion Matrix:
 [[269   0   0   0]
 [  0   0   0  50]
 [  0   0   0 105]
 [  0   0   0 443]]
Classification Report:
               precision    recall  f1-score   support

        b'F'       1.00      1.00      1.00       269
        b'L'       1.00      0.00      0.00        50


In [27]:
k_neighbors_classifiers = [
    KNeighborsClassifier(n_neighbors=1),
    KNeighborsClassifier(n_neighbors=2),
    KNeighborsClassifier(n_neighbors=3),
    KNeighborsClassifier(n_neighbors=5),
    KNeighborsClassifier(n_neighbors=10),
    KNeighborsClassifier(n_neighbors=50),
    KNeighborsClassifier(n_neighbors=100),
]

run_classifiers(k_neighbors_classifiers)

Results for KNeighborsClassifier:
Accuracy: 0.9953863898500577
Confusion Matrix:
 [[267   0   0   2]
 [  0  49   1   0]
 [  0   1 104   0]
 [  0   0   0 443]]
Classification Report:
               precision    recall  f1-score   support

        b'F'       1.00      0.99      1.00       269
        b'L'       0.98      0.98      0.98        50
        b'M'       0.99      0.99      0.99       105
       b'VF'       1.00      1.00      1.00       443

    accuracy                           1.00       867
   macro avg       0.99      0.99      0.99       867
weighted avg       1.00      1.00      1.00       867


----------------------

Results for KNeighborsClassifier:
Accuracy: 0.994232987312572
Confusion Matrix:
 [[269   0   0   0]
 [  1  49   0   0]
 [  2   0 103   0]
 [  2   0   0 441]]
Classification Report:
               precision    recall  f1-score   support

        b'F'       0.98      1.00      0.99       269
        b'L'       1.00      0.98      0.99        50
        b'M'

In [28]:
decision_tree_classifiers = [
    DecisionTreeClassifier(random_state=42)
]

run_classifiers(decision_tree_classifiers)

Results for DecisionTreeClassifier:
Accuracy: 1.0
Confusion Matrix:
 [[269   0   0   0]
 [  0  50   0   0]
 [  0   0 105   0]
 [  0   0   0 443]]
Classification Report:
               precision    recall  f1-score   support

        b'F'       1.00      1.00      1.00       269
        b'L'       1.00      1.00      1.00        50
        b'M'       1.00      1.00      1.00       105
       b'VF'       1.00      1.00      1.00       443

    accuracy                           1.00       867
   macro avg       1.00      1.00      1.00       867
weighted avg       1.00      1.00      1.00       867


----------------------



In [29]:
from ass1.utils import evaluate_models

classifiers = [
    RandomForestClassifier(n_estimators=10, random_state=42),
    GradientBoostingClassifier(n_estimators=10, random_state=42),
    KNeighborsClassifier(n_neighbors=5),
    SVC(random_state=42),
    DecisionTreeClassifier(random_state=42)
]

# Evaluate using the utility functions
evaluate_models(data, feature_structure, classifiers)


Unnamed: 0,model,accuracy,precision,recall,f1-score,timing
0,"RandomForestClassifier(n_estimators=10, random...",1.0,1.0,1.0,1.0,0.017015
1,"RandomForestClassifier(n_estimators=10, random...",1.0,1.0,1.0,1.0,0.111585
2,"GradientBoostingClassifier(n_estimators=10, ra...",1.0,1.0,1.0,1.0,0.094568
3,"GradientBoostingClassifier(n_estimators=10, ra...",1.0,1.0,1.0,1.0,0.494556
4,KNeighborsClassifier()_Holdout,0.989619,0.9897,0.989619,0.989639,0.034521
5,KNeighborsClassifier()_CV,0.954742,0.956415,0.954742,0.954284,0.211466
6,SVC(random_state=42)_Holdout,0.998847,0.998851,0.998847,0.998845,0.049521
7,SVC(random_state=42)_CV,0.98707,0.987549,0.98707,0.986956,0.234985
8,DecisionTreeClassifier(random_state=42)_Holdout,1.0,1.0,1.0,1.0,0.007002
9,DecisionTreeClassifier(random_state=42)_CV,1.0,1.0,1.0,1.0,0.067907


In [30]:
# Evaluate on original dataset:
data = pd.read_csv('../data_processed/hpc_job_scheduling_original.csv')
pickle_file = open('../data_processed/hpc_job_scheduling_original.pkl', 'rb')
feature_structure = pickle.load(pickle_file)
pickle_file.close()

feature_columns = feature_structure['bin'] + feature_structure['cat'] + feature_structure['cont'] + feature_structure[
    'ord']
X = data[feature_columns]
y = data['Class']

# Split Data
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)
train_X.head()

evaluate_models(data, feature_structure, classifiers)


Unnamed: 0,model,accuracy,precision,recall,f1-score,timing
0,"RandomForestClassifier(n_estimators=10, random...",1.0,1.0,1.0,1.0,0.021138
1,"RandomForestClassifier(n_estimators=10, random...",1.0,1.0,1.0,1.0,0.136659
2,"GradientBoostingClassifier(n_estimators=10, ra...",1.0,1.0,1.0,1.0,0.081041
3,"GradientBoostingClassifier(n_estimators=10, ra...",1.0,1.0,1.0,1.0,0.403673
4,KNeighborsClassifier()_Holdout,0.989619,0.989609,0.989619,0.989599,0.07119
5,KNeighborsClassifier()_CV,0.936267,0.939919,0.936267,0.935961,0.328315
6,SVC(random_state=42)_Holdout,0.997693,0.997697,0.997693,0.997688,0.054743
7,SVC(random_state=42)_CV,0.987532,0.987776,0.987532,0.987378,0.260616
8,DecisionTreeClassifier(random_state=42)_Holdout,1.0,1.0,1.0,1.0,0.007945
9,DecisionTreeClassifier(random_state=42)_CV,1.0,1.0,1.0,1.0,0.071561
