In [8]:

import pickle

import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from utils import get_pipeline

# setup + import
data = pd.read_csv('../data_processed/hpc_job_scheduling_with_timestamps.csv')
pickle_file = open('../data_processed/hpc_job_scheduling_with_timestamps.pkl', 'rb')
feature_structure = pickle.load(pickle_file)
pickle_file.close()

feature_columns = feature_structure['bin'] + feature_structure['cat'] + feature_structure['cont'] + feature_structure[
    'ord']
X = data[feature_columns]
y = data['Class']

# Split Data
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)
train_X.head()


     Protocol  Day  Class  Compounds  InputFields  Iterations  NumPending  \
1178     b'I'    4  b'VF'      120.0      12098.0        20.0         0.0   
877      b'H'    2  b'VF'      143.0        344.0        20.0         0.0   
4233     b'O'    4   b'M'      481.0        136.0        20.0         0.0   
4134     b'O'    2   b'M'      359.0        461.0        50.0       417.0   
3598     b'N'    2  b'VF'      460.0        463.0        20.0         1.0   
...       ...  ...    ...        ...          ...         ...         ...   
3444     b'N'    2   b'F'       22.0        559.0       100.0         0.0   
466      b'D'    0  b'VF'       49.0         37.0        20.0         0.0   
3092     b'M'    0   b'F'      461.0        679.0        20.0         0.0   
3772     b'O'    0   b'M'      117.0        314.0        20.0      3870.0   
860      b'H'    4  b'VF'      143.0        227.0        20.0         0.0   

       Timestamp  Normalized_Timestamp       Hour  
1178  105.850000       

In [9]:
def run_classifiers(input_classifiers):
    for clf in input_classifiers:
        clf_name = clf.__class__.__name__
        model_pipeline = get_pipeline(feature_structure, clf=clf)
        model_pipeline.fit(train_X, train_y)
    
        # Predictions and evaluation
        predictions = model_pipeline.predict(test_X)
        print(f"Results for {clf_name}:")
        print("Accuracy:", accuracy_score(test_y, predictions))
        print("Confusion Matrix:\n", confusion_matrix(test_y, predictions))
        print("Classification Report:\n", classification_report(test_y, predictions, zero_division=1))
        print("\n----------------------\n")

In [10]:
random_forest_classifiers = [
    RandomForestClassifier(n_estimators=1, random_state=42),
    RandomForestClassifier(n_estimators=3, random_state=42),
    RandomForestClassifier(n_estimators=5, random_state=42),
]

run_classifiers(random_forest_classifiers)

Results for RandomForestClassifier:
Accuracy: 0.9769319492502884
Confusion Matrix:
 [[262   0   1   6]
 [  1  47   2   0]
 [  3   0 101   1]
 [  5   0   1 437]]
Classification Report:
               precision    recall  f1-score   support

        b'F'       0.97      0.97      0.97       269
        b'L'       1.00      0.94      0.97        50
        b'M'       0.96      0.96      0.96       105
       b'VF'       0.98      0.99      0.99       443

    accuracy                           0.98       867
   macro avg       0.98      0.97      0.97       867
weighted avg       0.98      0.98      0.98       867


----------------------

Results for RandomForestClassifier:
Accuracy: 0.9965397923875432
Confusion Matrix:
 [[268   0   0   1]
 [  0  49   1   0]
 [  0   1 104   0]
 [  0   0   0 443]]
Classification Report:
               precision    recall  f1-score   support

        b'F'       1.00      1.00      1.00       269
        b'L'       0.98      0.98      0.98        50
       

In [11]:
gradient_boosting_classifiers = [
    GradientBoostingClassifier(n_estimators=1, random_state=42),
    GradientBoostingClassifier(n_estimators=2, random_state=42),
    GradientBoostingClassifier(n_estimators=3, random_state=42),
]

run_classifiers(gradient_boosting_classifiers)

Results for GradientBoostingClassifier:
Accuracy: 0.510957324106113
Confusion Matrix:
 [[  0   0   0 269]
 [  0   0   0  50]
 [  0   0   0 105]
 [  0   0   0 443]]
Classification Report:
               precision    recall  f1-score   support

        b'F'       1.00      0.00      0.00       269
        b'L'       1.00      0.00      0.00        50
        b'M'       1.00      0.00      0.00       105
       b'VF'       0.51      1.00      0.68       443

    accuracy                           0.51       867
   macro avg       0.88      0.25      0.17       867
weighted avg       0.75      0.51      0.35       867


----------------------

Results for GradientBoostingClassifier:
Accuracy: 0.8212226066897347
Confusion Matrix:
 [[269   0   0   0]
 [  0   0   0  50]
 [  0   0   0 105]
 [  0   0   0 443]]
Classification Report:
               precision    recall  f1-score   support

        b'F'       1.00      1.00      1.00       269
        b'L'       1.00      0.00      0.00        50


In [12]:
k_neighbors_classifiers = [
    KNeighborsClassifier(n_neighbors=1),
    KNeighborsClassifier(n_neighbors=2),
    KNeighborsClassifier(n_neighbors=3),
    KNeighborsClassifier(n_neighbors=5),
    KNeighborsClassifier(n_neighbors=10),
    KNeighborsClassifier(n_neighbors=50),
    KNeighborsClassifier(n_neighbors=100),
]

run_classifiers(k_neighbors_classifiers)

Results for KNeighborsClassifier:
Accuracy: 0.9919261822376009
Confusion Matrix:
 [[266   0   1   2]
 [  0  48   2   0]
 [  2   0 103   0]
 [  0   0   0 443]]
Classification Report:
               precision    recall  f1-score   support

        b'F'       0.99      0.99      0.99       269
        b'L'       1.00      0.96      0.98        50
        b'M'       0.97      0.98      0.98       105
       b'VF'       1.00      1.00      1.00       443

    accuracy                           0.99       867
   macro avg       0.99      0.98      0.99       867
weighted avg       0.99      0.99      0.99       867


----------------------

Results for KNeighborsClassifier:
Accuracy: 0.9930795847750865
Confusion Matrix:
 [[269   0   0   0]
 [  0  49   1   0]
 [  3   0 102   0]
 [  2   0   0 441]]
Classification Report:
               precision    recall  f1-score   support

        b'F'       0.98      1.00      0.99       269
        b'L'       1.00      0.98      0.99        50
        b'M

In [13]:
decision_tree_classifiers = [
    DecisionTreeClassifier(random_state=42)
]

run_classifiers(decision_tree_classifiers)

Results for DecisionTreeClassifier:
Accuracy: 1.0
Confusion Matrix:
 [[269   0   0   0]
 [  0  50   0   0]
 [  0   0 105   0]
 [  0   0   0 443]]
Classification Report:
               precision    recall  f1-score   support

        b'F'       1.00      1.00      1.00       269
        b'L'       1.00      1.00      1.00        50
        b'M'       1.00      1.00      1.00       105
       b'VF'       1.00      1.00      1.00       443

    accuracy                           1.00       867
   macro avg       1.00      1.00      1.00       867
weighted avg       1.00      1.00      1.00       867


----------------------



In [15]:
from ass1.utils import evaluate_models

classifiers = [
    RandomForestClassifier(n_estimators=10, random_state=42),
    GradientBoostingClassifier(n_estimators=10, random_state=42),
    KNeighborsClassifier(n_neighbors=5),
    SVC(random_state=42),
    DecisionTreeClassifier(random_state=42)
]

# Evaluate using the utility functions
evaluate_models(data, feature_structure, classifiers)


Unnamed: 0,model,accuracy,precision,recall,f1-score,timing
0,"RandomForestClassifier(n_estimators=10, random...",1.0,1.0,1.0,1.0,0.118992
1,"RandomForestClassifier(n_estimators=10, random...",0.988684,0.989619,0.988684,0.988738,0.743663
2,"GradientBoostingClassifier(n_estimators=10, ra...",1.0,1.0,1.0,1.0,0.333415
3,"GradientBoostingClassifier(n_estimators=10, ra...",1.0,1.0,1.0,1.0,0.711439
4,KNeighborsClassifier()_Holdout,0.978085,0.978383,0.978085,0.978151,0.088905
5,KNeighborsClassifier()_CV,0.93442,0.937956,0.93442,0.934008,0.457474
6,SVC(random_state=42)_Holdout,0.99654,0.996562,0.99654,0.996542,0.153567
7,SVC(random_state=42)_CV,0.986838,0.98726,0.986838,0.986757,1.290248
8,DecisionTreeClassifier(random_state=42)_Holdout,1.0,1.0,1.0,1.0,0.050851
9,DecisionTreeClassifier(random_state=42)_CV,1.0,1.0,1.0,1.0,0.536307
