In [1]:
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.ensemble import ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
import re
import pandas as pd
from time import time
from matplotlib import pyplot as plt
import cupy as cp


In [2]:
# Hold-out fraction passed to train_test_split in the timing comparison.
test_size = 0.25

# Full dataset: the first 28 columns are used as model inputs and the
# 'Label' column as the classification target.
data = pd.read_csv('data/BCCC-CIRA-CIC-DoHBrw-2020.csv')

all_inputs = data.iloc[:, 0:28]
all_labels = data['Label']

In [3]:
# Second dataset (presumably a PCA projection of the one above -- confirm
# against whatever produced data_pca.csv): the first 9 columns are the inputs,
# 'Label' is the target.
data_pca = pd.read_csv('data/data_pca.csv')

all_inputs_pca = data_pca.iloc[:, 0:9]
all_labels_pca = data_pca['Label']

In [None]:
# Candidate classifiers compared in this notebook. Every estimator uses its
# library defaults apart from the keyword arguments spelled out here.
# Order matters: `names` and `results` in the cross-validation code below are
# built positionally from this list, and the timing cell iterates it as well.
# The same instances are reused (and therefore re-fitted) by every later cell.
model_set = [
    MLPClassifier(),
    BaggingClassifier(n_jobs=-1),
    AdaBoostClassifier(),
    ExtraTreesClassifier(n_jobs=-1),
    QuadraticDiscriminantAnalysis(),
    RandomForestClassifier(n_jobs=-1),
    GaussianNB(),
    LogisticRegression(n_jobs=-1),
    LinearDiscriminantAnalysis(),
    GradientBoostingClassifier(),
    DecisionTreeClassifier(),
    SGDClassifier(n_jobs=-1),
    KNeighborsClassifier(n_jobs=-1),
    # logging_level='Silent' keeps CatBoost from printing per-iteration output.
    CatBoostClassifier(logging_level='Silent'),
]

def filter_parentheses(content):
    """Return ``content`` with every balanced parenthesised group removed.

    Innermost groups are stripped first and the pass is repeated until nothing
    is left to strip, so nested groups such as ``a(b(c))`` collapse fully.
    Unbalanced parentheses are left untouched.

    Used in this notebook to reduce an estimator's string form, e.g.
    ``BaggingClassifier(n_jobs=-1)``, to the bare class name.
    """
    # Matches one group that contains no further parentheses inside it.
    innermost_group = re.compile(r'\([^()]*\)')
    while True:
        content, removed = innermost_group.subn('', content)
        if removed == 0:
            return content

# 10-fold cross-validated accuracy for every model on both datasets, followed
# by a box plot of the per-fold scores.
#
# `results` and `names` are parallel lists: all models on the full feature set
# first (in `model_set` order), then all models on the PCA dataset.
results = []
names = [filter_parentheses(str(i)) for i in model_set] + [filter_parentheses(str(i)) + '_pca_dataset' for i in model_set]

# NOTE: two `cp.asarray(...)` statements used to sit here. They indexed `data`
# by 'feature1', 'feature2' and 'target' -- not the columns the rest of the
# notebook uses (inputs are taken positionally, the label column is 'Label') --
# and their results were never read, so they could only fail the cell with a
# KeyError and needlessly required a GPU. They have been removed.

# (inputs, labels, suffix appended to the model name when printing progress)
cv_datasets = [
    (all_inputs, all_labels, ''),
    (all_inputs_pca, all_labels_pca, '_pca'),
]

# With an integer random_state the splitter yields the same folds on every
# call, so a single instance can be shared by all models and both datasets.
kfold = KFold(n_splits=10, random_state=42, shuffle=True)

for inputs, labels, print_suffix in cv_datasets:
    for model in model_set:
        print(filter_parentheses(str(model)) + print_suffix)
        cv_results = cross_val_score(model, inputs, labels, cv=kfold, scoring='accuracy', n_jobs=-1)
        print(cv_results)
        results.append(cv_results)

print(results)
plt.figure(figsize=(10,7))
# NOTE(review): newer Matplotlib releases rename `labels` to `tick_labels`;
# switch once the minimum supported Matplotlib version allows it.
plt.boxplot(results, labels=names)
plt.title('Algorithm Comparison')

# Axis captions; tick labels are rotated so the long model names stay legible.
plt.xlabel('Algorithm')
plt.ylabel('Accuracy')
plt.xticks(rotation=90)

plt.show()















MLPClassifier_pca
BaggingClassifier_pca
AdaBoostClassifier_pca


In [None]:
# Wall-clock comparison of fit time and predict time for every model on both
# datasets, using a single train/test split per dataset.
#
# `train_times`, `decision_times` and `model_names` are parallel lists: all
# models on the full feature set first, then all models on the PCA dataset.
train_times = []
decision_times = []
model_names = []

# (inputs, labels, suffix appended to the model name used as the bar label).
# The PCA run needs a distinct label: Matplotlib places equal category strings
# at the same axis position, so without the suffix the PCA bars would be drawn
# on top of the full-feature bars instead of next to them.
timing_datasets = [
    (all_inputs, all_labels, ''),
    (all_inputs_pca, all_labels_pca, '_pca_dataset'),
]

for inputs, labels, name_suffix in timing_datasets:
    X_train, X_test, y_train, y_test = train_test_split(inputs, labels, test_size=test_size, random_state=42)

    for model in model_set:
        # Training time: one full fit on the training split.
        start_time = time()
        model.fit(X_train, y_train)
        train_times.append(time() - start_time)

        # Decision time: one predict call over the whole test split; the
        # predictions themselves are discarded.
        start_time = time()
        model.predict(X_test)
        decision_times.append(time() - start_time)

        model_names.append(filter_parentheses(str(model)) + name_suffix)

fig, ax = plt.subplots(2, 1, figsize=(14, 10))
ax[0].barh(model_names, train_times, color='blue')
ax[0].set_title('Training Time Comparison')
ax[0].set_xlabel('Time in seconds')

ax[1].barh(model_names, decision_times, color='green')
ax[1].set_title('Decision Time Comparison')
ax[1].set_xlabel('Time in seconds')

plt.tight_layout()
plt.show()