This version of the script uses binarization strategies for features

In [1]:
!pip install pyTsetlinMachine pandas scikit-learn pyarrow matplotlib seaborn

import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from pyTsetlinMachine.tm import MultiClassTsetlinMachine
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
import seaborn as sns
import glob


Collecting pyTsetlinMachine
  Downloading pyTsetlinMachine-0.6.6.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyTsetlinMachine
  Building wheel for pyTsetlinMachine (setup.py) ... [?25l[?25hdone
  Created wheel for pyTsetlinMachine: filename=pyTsetlinMachine-0.6.6-cp310-cp310-linux_x86_64.whl size=59504 sha256=0d520e9e53cdd1f0cbb5e258befa8582fc1f6b38b0d06c14a172b02ebd84dc4c
  Stored in directory: /root/.cache/pip/wheels/b0/b0/c5/07c4cb8bb93c5325bdc2c2a070b565f54df717d5d11f0c6802
Successfully built pyTsetlinMachine
Installing collected packages: pyTsetlinMachine
Successfully installed pyTsetlinMachine-0.6.6


Source of training dataset:

https://ieeexplore.ieee.org/abstract/document/8888419

https://www.kaggle.com/datasets/dhoogla/cicddos2019


(for preliminary testing, the test-train split in this dataset has been used)

In [None]:
import os
import glob
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

data_dir = "/kaggle/input/cicddos2019"
training_files = glob.glob(os.path.join(data_dir, "*-training.parquet"))
testing_files = glob.glob(os.path.join(data_dir, "*-testing.parquet"))

if not training_files:
    print(" No training files found!")
if not testing_files:
    print(" No testing files found!")

train_data = pd.concat([pd.read_parquet(f) for f in training_files], ignore_index=True)
test_data = pd.concat([pd.read_parquet(f) for f in testing_files], ignore_index=True)

ddos_types = ["DrDoS_DNS", "DrDoS_LDAP", "DrDoS_MSSQL", "DrDoS_NTP", "DrDoS_NetBIOS", 
              "DrDoS_SNMP", "Syn", "TFTP", "UDP", "UDPLag"]
label_mapping = {attack.upper(): 1 for attack in ddos_types}
label_mapping["BENIGN"] = 0

train_data['Label'] = train_data['Label'].str.upper()
test_data['Label'] = test_data['Label'].str.upper()

train_data = train_data[train_data['Label'].isin(label_mapping)]
test_data = test_data[test_data['Label'].isin(label_mapping)]
train_data['Label'] = train_data['Label'].map(label_mapping)
test_data['Label'] = test_data['Label'].map(label_mapping)

train_data.fillna(train_data.median(numeric_only=True), inplace=True)
test_data.fillna(test_data.median(numeric_only=True), inplace=True)

drop_features = [
    "Fwd Packet Length Max", "Bwd Packet Length Max", "Packet Length Max", "Subflow Fwd Bytes",
    "Subflow Bwd Bytes", "Init Bwd Win Bytes", "Flow Bytes/s", "Fwd IAT Total", "Bwd IAT Total", 
    "Avg Packet Size", "Fwd Packets Length Total", "Bwd Packets Length Total"
]
existing_drop_features = [col for col in drop_features if col in train_data.columns]
train_data.drop(columns=existing_drop_features, inplace=True)
test_data.drop(columns=existing_drop_features, inplace=True)

categorical_features = ['Protocol']
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_train = one_hot_encoder.fit_transform(train_data[categorical_features])
encoded_test = one_hot_encoder.transform(test_data[categorical_features])

encoded_columns = one_hot_encoder.get_feature_names_out(categorical_features)
train_encoded_df = pd.DataFrame(encoded_train, columns=encoded_columns, index=train_data.index)
test_encoded_df = pd.DataFrame(encoded_test, columns=encoded_columns, index=test_data.index)

train_data = pd.concat([train_data.drop(columns=categorical_features), train_encoded_df], axis=1)
test_data = pd.concat([test_data.drop(columns=categorical_features), test_encoded_df], axis=1)

binary_flags = ["Fwd PSH Flags", "Bwd PSH Flags", "Fwd URG Flags", "Bwd URG Flags"]
for flag in binary_flags:
    if flag in train_data.columns:
        train_data[flag] = (train_data[flag] > 0).astype(int)
        test_data[flag] = (test_data[flag] > 0).astype(int)

continuous_features = [
    "Flow Duration", "Fwd Packet Length Mean", "Bwd Packet Length Mean", "Packet Length Mean",
    "Flow IAT Mean", "Flow IAT Std", "Fwd IAT Mean", "Bwd IAT Mean", "Active Mean", "Idle Mean"
]
for feature in continuous_features:
    if feature in train_data.columns:
        q1, q2, q3 = np.percentile(train_data[feature], [25, 50, 75])
        train_data[f"{feature}_bin1"] = (train_data[feature] <= q1).astype(int)
        train_data[f"{feature}_bin2"] = ((train_data[feature] > q1) & (train_data[feature] <= q2)).astype(int)
        train_data[f"{feature}_bin3"] = ((train_data[feature] > q2) & (train_data[feature] <= q3)).astype(int)
        train_data[f"{feature}_bin4"] = (train_data[feature] > q3).astype(int)
        
        test_data[f"{feature}_bin1"] = (test_data[feature] <= q1).astype(int)
        test_data[f"{feature}_bin2"] = ((test_data[feature] > q1) & (test_data[feature] <= q2)).astype(int)
        test_data[f"{feature}_bin3"] = ((test_data[feature] > q2) & (test_data[feature] <= q3)).astype(int)
        test_data[f"{feature}_bin4"] = (test_data[feature] > q3).astype(int)
        
    train_data.drop(columns=[feature], inplace=True)
    test_data.drop(columns=[feature], inplace=True)

X_train = train_data.drop(columns=['Label'])
y_train = train_data['Label']
X_test = test_data.drop(columns=['Label'])
y_test = test_data['Label']

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("Preprocessing complete! Ready for model training.")


Preprocessing complete! Ready for model training.


In [None]:
final_feature_names = train_data.drop(columns=['Label']).columns.tolist()

print(f"Final number of features used for training: {len(final_feature_names)}")
print("Features:")
for feature in final_feature_names:
    print(f"- {feature}")

Final number of features used for training: 97
Features:
- Total Fwd Packets
- Total Backward Packets
- Fwd Packet Length Min
- Fwd Packet Length Std
- Bwd Packet Length Min
- Bwd Packet Length Std
- Flow Packets/s
- Flow IAT Max
- Flow IAT Min
- Fwd IAT Std
- Fwd IAT Max
- Fwd IAT Min
- Bwd IAT Std
- Bwd IAT Max
- Bwd IAT Min
- Fwd PSH Flags
- Bwd PSH Flags
- Fwd URG Flags
- Bwd URG Flags
- Fwd Header Length
- Bwd Header Length
- Fwd Packets/s
- Bwd Packets/s
- Packet Length Min
- Packet Length Std
- Packet Length Variance
- FIN Flag Count
- SYN Flag Count
- RST Flag Count
- PSH Flag Count
- ACK Flag Count
- URG Flag Count
- CWE Flag Count
- ECE Flag Count
- Down/Up Ratio
- Avg Fwd Segment Size
- Avg Bwd Segment Size
- Fwd Avg Bytes/Bulk
- Fwd Avg Packets/Bulk
- Fwd Avg Bulk Rate
- Bwd Avg Bytes/Bulk
- Bwd Avg Packets/Bulk
- Bwd Avg Bulk Rate
- Subflow Fwd Packets
- Subflow Bwd Packets
- Init Fwd Win Bytes
- Fwd Act Data Packets
- Fwd Seg Size Min
- Active Std
- Active Max
- Active Mi

In [None]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd

feature_names = train_data.drop(columns=['Label']).columns  # Get feature names before scaling
X_train = pd.DataFrame(X_train, columns=feature_names)  # Convert back to DataFrame
X_test = pd.DataFrame(X_test, columns=feature_names)

rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
feature_importances = rf.feature_importances_

feature_importance_dict = {feature: importance for feature, importance in zip(X_train.columns, feature_importances)}

# grouping binned features together
bin_feature_groups = {}  
for feature in X_train.columns:
    if "_bin" in feature: 
        base_feature = "_".join(feature.split("_")[:-1])  
        bin_feature_groups.setdefault(base_feature, []).append(feature)

# summing importance values for each base feature
base_feature_importance = {}
for base_feature, bins in bin_feature_groups.items():
    total_importance = sum(feature_importance_dict[bin] for bin in bins)
    base_feature_importance[base_feature] = total_importance
    for bin in bins:
        feature_importance_dict[bin] = total_importance  

# sorting features by importance (bins have been grouped under the same score)
sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

# choosing the top features (to make sure that all bins of a feature are included)
num_features_to_keep = 50
selected_features = set()

for feature, _ in sorted_features:
    if "_bin" in feature:  # If it's a binned feature
        base_feature = "_".join(feature.split("_")[:-1])
        if base_feature not in selected_features:
            selected_features.update(bin_feature_groups[base_feature])  # Add all bins
    else:
        selected_features.add(feature)
    if len(selected_features) >= num_features_to_keep:
        break

# filtering the dataset based on selected features
selected_features = list(selected_features)
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

print(f"Selected {len(selected_features)} features for training.")


✅ Selected 53 features for training.


In [5]:
print("Final selected features (53):")
for i, feature in enumerate(selected_features, 1):
    print(f"{i}. {feature}")


Final selected features (53):
1. Avg Bwd Segment Size
2. Bwd Packet Length Mean_bin4
3. Active Min
4. Flow Duration_bin3
5. Idle Max
6. Total Backward Packets
7. Total Fwd Packets
8. Fwd Packet Length Std
9. Fwd IAT Mean_bin3
10. Fwd Packet Length Mean_bin4
11. Down/Up Ratio
12. Fwd Packet Length Mean_bin2
13. Packet Length Min
14. Fwd IAT Mean_bin1
15. Flow IAT Max
16. Idle Std
17. Fwd Packets/s
18. CWE Flag Count
19. Bwd Packet Length Mean_bin1
20. Fwd Packet Length Min
21. Avg Fwd Segment Size
22. Flow Packets/s
23. Flow Duration_bin4
24. Fwd IAT Max
25. Flow IAT Mean_bin1
26. Subflow Bwd Packets
27. Flow Duration_bin1
28. Packet Length Mean_bin2
29. Packet Length Std
30. Flow IAT Mean_bin2
31. Packet Length Variance
32. ACK Flag Count
33. Fwd IAT Mean_bin4
34. Fwd IAT Std
35. Packet Length Mean_bin1
36. Init Fwd Win Bytes
37. Fwd Act Data Packets
38. Fwd IAT Mean_bin2
39. Bwd Packet Length Min
40. Flow Duration_bin2
41. Flow IAT Mean_bin3
42. RST Flag Count
43. Packet Length Mean_b

In [None]:
bin_feature_groups = {} 
other_features = []  

for feature in X_train.columns:
    if "_bin" in feature:  
        base_feature = "_".join(feature.split("_")[:-1]) 
        bin_feature_groups.setdefault(base_feature, []).append(feature)
    else:
        other_features.append(feature)  

sorted_feature_order = []

for base_feature, bins in bin_feature_groups.items():
    sorted_feature_order.extend(sorted(bins))  

sorted_feature_order.extend(other_features)

X_train = X_train[sorted_feature_order]
X_test = X_test[sorted_feature_order]

print("✅ Features reordered. Final column order in X_train:")
print(X_train.columns.tolist())


✅ Features reordered. Final column order in X_train:
['Flow Duration_bin1', 'Flow Duration_bin2', 'Flow Duration_bin3', 'Flow Duration_bin4', 'Fwd Packet Length Mean_bin1', 'Fwd Packet Length Mean_bin2', 'Fwd Packet Length Mean_bin3', 'Fwd Packet Length Mean_bin4', 'Bwd Packet Length Mean_bin1', 'Bwd Packet Length Mean_bin2', 'Bwd Packet Length Mean_bin3', 'Bwd Packet Length Mean_bin4', 'Packet Length Mean_bin1', 'Packet Length Mean_bin2', 'Packet Length Mean_bin3', 'Packet Length Mean_bin4', 'Flow IAT Mean_bin1', 'Flow IAT Mean_bin2', 'Flow IAT Mean_bin3', 'Flow IAT Mean_bin4', 'Flow IAT Std_bin1', 'Flow IAT Std_bin2', 'Flow IAT Std_bin3', 'Flow IAT Std_bin4', 'Fwd IAT Mean_bin1', 'Fwd IAT Mean_bin2', 'Fwd IAT Mean_bin3', 'Fwd IAT Mean_bin4', 'Bwd IAT Mean_bin1', 'Bwd IAT Mean_bin2', 'Bwd IAT Mean_bin3', 'Bwd IAT Mean_bin4', 'Active Mean_bin1', 'Active Mean_bin2', 'Active Mean_bin3', 'Active Mean_bin4', 'Idle Mean_bin1', 'Idle Mean_bin2', 'Idle Mean_bin3', 'Idle Mean_bin4', 'Total Fwd

In [None]:
print(f"Total training samples: {len(train_data)}")

print("Training set label distribution:\n", train_data['Label'].value_counts())

Total training samples: 113412
Training set label distribution:
 Label
1    66985
0    46427
Name: count, dtype: int64


In [9]:
print("X_train shape:", X_train.shape)  
print("X_test shape:", X_test.shape)


X_train shape: (113412, 97)
X_test shape: (286858, 97)


In [10]:
print("X_train shape:", y_train.shape)  
print("X_test shape:", y_test.shape)


X_train shape: (113412,)
X_test shape: (286858,)


to get statistically significant results, 
probably need to use the same training script multiple times, find a different best model each time, and find its perofrmance on test dataset each time, then take average of all of those times for final results.
how many tims to do it to make sure it is statistically significant

In [None]:
import numpy as np
import pickle
import time
import tracemalloc
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, ParameterGrid
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from joblib import Parallel, delayed

X_train_selected = X_train_selected.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_test_selected = X_test_selected.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

param_grid = {'n_estimators': [100, 200], 'max_depth': [10, 20], 'min_samples_split': [2, 5]}
kf = KFold(n_splits=5, shuffle=True, random_state=42)

best_params, best_f1, lowest_fpr, best_model = None, 0, float('inf'), None

def train_fold(n_estimators, max_depth, min_samples_split, X_train_fold, y_train_fold, X_val_fold, y_val_fold):
    """Train Random Forest on a single fold."""
    model = RandomForestClassifier(
        n_estimators=n_estimators, max_depth=max_depth, 
        min_samples_split=min_samples_split, random_state=42, n_jobs=-1
    )
    model.fit(X_train_fold, y_train_fold)
    
    y_val_pred = model.predict(X_val_fold)
    f1 = f1_score(y_val_fold, y_val_pred, zero_division=0)
    
    tn, fp, fn, tp = confusion_matrix(y_val_fold, y_val_pred).ravel()
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    return accuracy_score(y_val_fold, y_val_pred), f1, fpr

for params in ParameterGrid(param_grid):
    print(f"Testing params: {params}")
    results = Parallel(n_jobs=-1)(
        delayed(train_fold)(
            params['n_estimators'], params['max_depth'], params['min_samples_split'],
            X_train_selected.iloc[train], y_train.iloc[train],  # Using `.iloc` to ensure proper slicing
            X_train_selected.iloc[val], y_train.iloc[val]
        ) for train, val in kf.split(X_train_selected)
    )
    
    mean_acc, mean_f1, mean_fpr = np.mean(results, axis=0)
    if mean_f1 > best_f1 or (mean_f1 == best_f1 and mean_fpr < lowest_fpr):
        best_f1, lowest_fpr, best_params = mean_f1, mean_fpr, params

        with open("/kaggle/working/best_params_rf.pkl", "wb") as f:
            pickle.dump(best_params, f)

if best_params:
    print(f"\n Training final model with best params: {best_params}")
    best_model = RandomForestClassifier(**best_params, random_state=42, n_jobs=-1)
    best_model.fit(X_train_selected, y_train)

    with open("/kaggle/working/best_model_rf.pkl", "wb") as model_file:
        pickle.dump(best_model, model_file)

    tracemalloc.start()
    start_time = time.time()
    
    y_test_pred = best_model.predict(X_test_selected)
    
    end_time = time.time()
    current, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    
    inference_time = end_time - start_time
    peak_memory_kb = peak / 1024  

    print("\n Test Set Performance (Random Forest):")
    print(f" Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")
    print(f" Precision: {precision_score(y_test, y_test_pred, zero_division=0):.4f}")
    print(f" Recall: {recall_score(y_test, y_test_pred, zero_division=0):.4f}")
    print(f" F1 Score: {f1_score(y_test, y_test_pred, zero_division=0):.4f}")
    print("\n Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))
    
    print(f"\n Inference Time: {inference_time:.6f} seconds")
    print(f" Peak Memory Usage: {peak_memory_kb:.2f} KB")

else:
    print("No trained model found.")

Testing params: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}
Testing params: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}
Testing params: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}
Testing params: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 200}
Testing params: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 100}
Testing params: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}
Testing params: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 100}
Testing params: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 200}

 Training final model with best params: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 100}

 Test Set Performance (Random Forest):
 Accuracy: 0.8413
 Precision: 0.9997
 Recall: 0.8069
 F1 Score: 0.8930

 Confusion Matrix:
 [[ 51354     50]
 [ 45469 189985]]

 Inference Time: 0.934096 seconds
 Peak Memory Usage: 91753.24 KB


In [None]:
import numpy as np
import pickle
import time
import tracemalloc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold, ParameterGrid
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from joblib import Parallel, delayed

X_train_selected = X_train_selected.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_test_selected = X_test_selected.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

param_grid = {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']}
kf = KFold(n_splits=5, shuffle=True, random_state=42)

best_params, best_f1, lowest_fpr, best_model = None, 0, float('inf'), None

def train_fold(n_neighbors, weights, X_train_fold, y_train_fold, X_val_fold, y_val_fold):
    """Train KNN on a single fold."""
    model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)
    model.fit(X_train_fold, y_train_fold)
    
    y_val_pred = model.predict(X_val_fold)
    f1 = f1_score(y_val_fold, y_val_pred, zero_division=0)
    
    tn, fp, fn, tp = confusion_matrix(y_val_fold, y_val_pred).ravel()
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    return accuracy_score(y_val_fold, y_val_pred), f1, fpr

for params in ParameterGrid(param_grid):
    print(f" Testing params: {params}")
    results = Parallel(n_jobs=-1)(
        delayed(train_fold)(
            params['n_neighbors'], params['weights'],
            X_train_selected.iloc[train], y_train.iloc[train],  
            X_train_selected.iloc[val], y_train.iloc[val]
        ) for train, val in kf.split(X_train_selected)
    )
    
    mean_acc, mean_f1, mean_fpr = np.mean(results, axis=0)
    if mean_f1 > best_f1 or (mean_f1 == best_f1 and mean_fpr < lowest_fpr):
        best_f1, lowest_fpr, best_params = mean_f1, mean_fpr, params

        with open("/kaggle/working/best_params_knn.pkl", "wb") as f:
            pickle.dump(best_params, f)

if best_params:
    print(f"\n Training final model with best params: {best_params}")
    best_model = KNeighborsClassifier(**best_params)
    best_model.fit(X_train_selected, y_train)

    with open("/kaggle/working/best_model_knn.pkl", "wb") as model_file:
        pickle.dump(best_model, model_file)

    tracemalloc.start()
    start_time = time.time()
    
    y_test_pred = best_model.predict(X_test_selected)
    
    end_time = time.time()
    current, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    
    inference_time = end_time - start_time
    peak_memory_kb = peak / 1024  

    acc = accuracy_score(y_test, y_test_pred)
    prec = precision_score(y_test, y_test_pred, zero_division=0)
    rec = recall_score(y_test, y_test_pred, zero_division=0)
    f1 = f1_score(y_test, y_test_pred, zero_division=0)
    
    tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel()
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0  

    print("\n Test Set Performance (KNN):")
    print(f"Accuracy:            {acc:.4f}")
    print(f"Precision:           {prec:.4f}")
    print(f"Recall:              {rec:.4f}")
    print(f"F1 Score:            {f1:.4f}")
    print(f"False Positive Rate: {fpr:.4f}")
    print("\n Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))
    
    print(f"\n Inference Time: {inference_time:.6f} seconds")
    print(f" Peak Memory Usage: {peak_memory_kb:.2f} KB")

else:
    print(" No trained model found.")


 Testing params: {'n_neighbors': 3, 'weights': 'uniform'}
 Testing params: {'n_neighbors': 3, 'weights': 'distance'}
 Testing params: {'n_neighbors': 5, 'weights': 'uniform'}
 Testing params: {'n_neighbors': 5, 'weights': 'distance'}
 Testing params: {'n_neighbors': 7, 'weights': 'uniform'}
 Testing params: {'n_neighbors': 7, 'weights': 'distance'}

 Training final model with best params: {'n_neighbors': 5, 'weights': 'distance'}

 Test Set Performance (KNN):
Accuracy:            0.9812
Precision:           0.9996
Recall:              0.9775
F1 Score:            0.9884
False Positive Rate: 0.0017

 Confusion Matrix:
 [[ 51316     88]
 [  5294 230160]]

 Inference Time: 73.439659 seconds
 Peak Memory Usage: 144503.31 KB


In [None]:
import numpy as np
import pickle
import time
import tracemalloc
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import KFold, ParameterGrid
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from joblib import Parallel, delayed

X_train_selected = X_train_selected.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_test_selected = X_test_selected.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# our hyperparameter grid includes both 'linear' and 'rbf' kernels for fair comparison
param_grid = {'C': [1, 10], 'kernel': ['linear', 'rbf']}  # RBF added back

# 3-fold cross-validation for faster execution
kf = KFold(n_splits=3, shuffle=True, random_state=42)

best_params, best_f1, lowest_fpr, best_model = None, 0, float('inf'), None

def train_fold(C, kernel, X_train_fold, y_train_fold, X_val_fold, y_val_fold):
    """Train SVM on a single fold."""
    if kernel == 'linear':
        model = LinearSVC(C=C, random_state=42, dual=False)  
    else:
        model = SVC(C=C, kernel=kernel, random_state=42)

    model.fit(X_train_fold, y_train_fold)
    
    y_val_pred = model.predict(X_val_fold)
    f1 = f1_score(y_val_fold, y_val_pred, zero_division=0)
    
    tn, fp, fn, tp = confusion_matrix(y_val_fold, y_val_pred).ravel()
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    return accuracy_score(y_val_fold, y_val_pred), f1, fpr

for params in ParameterGrid(param_grid):
    print(f"Testing params: {params}")
    results = Parallel(n_jobs=-1)(
        delayed(train_fold)(
            params['C'], params['kernel'],
            X_train_selected.iloc[train], y_train.iloc[train],  
            X_train_selected.iloc[val], y_train.iloc[val]
        ) for train, val in kf.split(X_train_selected)
    )
    
    mean_acc, mean_f1, mean_fpr = np.mean(results, axis=0)
    if mean_f1 > best_f1 or (mean_f1 == best_f1 and mean_fpr < lowest_fpr):
        best_f1, lowest_fpr, best_params = mean_f1, mean_fpr, params

        with open("/kaggle/working/best_params_SVM_final.pkl", "wb") as f:
            pickle.dump(best_params, f)

if best_params:
    print(f"\nTraining final SVM model with best params: {best_params}")
    if best_params['kernel'] == 'linear':
        best_model = LinearSVC(C=best_params['C'], random_state=42, dual=False)  
    else:
        best_model = SVC(**best_params, random_state=42)

    best_model.fit(X_train_selected, y_train)

    with open("/kaggle/working/best_model_SVM_final.pkl", "wb") as model_file:
        pickle.dump(best_model, model_file)

    tracemalloc.start()
    start_time = time.time()
    
    y_test_pred = best_model.predict(X_test_selected)  
    end_time = time.time()
    current, peak_memory = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    
    inference_time = end_time - start_time
    peak_memory_kb = peak_memory / 1024  

    tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel()
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0 

    print("\nTest Set Performance (SVM):")
    print(f"Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_test_pred, zero_division=0):.4f}")
    print(f"Recall: {recall_score(y_test, y_test_pred, zero_division=0):.4f}")
    print(f"F1 Score: {f1_score(y_test, y_test_pred, zero_division=0):.4f}")
    print(f"False Positive Rate: {fpr:.4f}")
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_test_pred))
    print(f"\nInference Time: {inference_time:.6f} seconds")
    print(f"Peak Memory Usage: {peak_memory_kb:.2f} KB")

else:
    print("No trained model found.")


Testing params: {'C': 1, 'kernel': 'linear'}
Testing params: {'C': 1, 'kernel': 'rbf'}
Testing params: {'C': 10, 'kernel': 'linear'}
Testing params: {'C': 10, 'kernel': 'rbf'}

Training final SVM model with best params: {'C': 10, 'kernel': 'rbf'}

Test Set Performance (SVM):
Accuracy: 0.9259
Precision: 0.9994
Recall: 0.9103
F1 Score: 0.9528
False Positive Rate: 0.0026

Confusion Matrix:
 [[ 51270    134]
 [ 21111 214343]]

Inference Time: 33.142426 seconds
Peak Memory Usage: 126594.06 KB


In [None]:
import numpy as np
import pickle
import time
import tracemalloc
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import gc
gc.collect()


X_train_selected = X_train_selected.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_test_selected = X_test_selected.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

nb_model = GaussianNB()
nb_model.fit(X_train_selected, y_train)

with open("/kaggle/working/best_model_nb.pkl", "wb") as model_file:
    pickle.dump(nb_model, model_file)

tracemalloc.start()
start_time = time.time()

y_test_pred = nb_model.predict(X_test_selected)  

end_time = time.time()
current, peak_memory = tracemalloc.get_traced_memory()
tracemalloc.stop()

inference_time = end_time - start_time
peak_memory_kb = peak_memory / 1024  

tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel()
fpr = fp / (fp + tn) if (fp + tn) > 0 else 0  # False Positive Rate

print("\nTest Set Performance (Naïve Bayes):")
print(f"Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_test_pred, zero_division=0):.4f}")
print(f"Recall: {recall_score(y_test, y_test_pred, zero_division=0):.4f}")
print(f"F1 Score: {f1_score(y_test, y_test_pred, zero_division=0):.4f}")
print(f"False Positive Rate: {fpr:.4f}")
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_test_pred))
print(f"\nInference Time: {inference_time:.6f} seconds")
print(f"Peak Memory Usage: {peak_memory_kb:.2f} KB")



Test Set Performance (Naïve Bayes):
Accuracy: 0.7655
Precision: 0.9913
Recall: 0.7206
F1 Score: 0.8346
False Positive Rate: 0.0288

Confusion Matrix:
 [[ 49922   1482]
 [ 65778 169676]]

Inference Time: 0.206109 seconds
Peak Memory Usage: 247613.24 KB


In [None]:
import numpy as np
import pickle
import time
import tracemalloc
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import gc
gc.collect()

X_train_selected = X_train_selected.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_test_selected = X_test_selected.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

lr_model = LogisticRegression(solver='liblinear', random_state=42)  
lr_model.fit(X_train_selected, y_train)

with open("/kaggle/working/best_model_lr.pkl", "wb") as model_file:
    pickle.dump(lr_model, model_file)

tracemalloc.start()
start_time = time.time()

y_test_pred = lr_model.predict(X_test_selected)  

end_time = time.time()
current, peak_memory = tracemalloc.get_traced_memory()
tracemalloc.stop()

inference_time = end_time - start_time
peak_memory_kb = peak_memory / 1024

tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel()
fpr = fp / (fp + tn) if (fp + tn) > 0 else 0  

print("\nTest Set Performance (Logistic Regression):")
print(f"Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_test_pred, zero_division=0):.4f}")
print(f"Recall: {recall_score(y_test, y_test_pred, zero_division=0):.4f}")
print(f"F1 Score: {f1_score(y_test, y_test_pred, zero_division=0):.4f}")
print(f"False Positive Rate: {fpr:.4f}")
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_test_pred))
print(f"\nInference Time: {inference_time:.6f} seconds")
print(f"Peak Memory Usage: {peak_memory_kb:.2f} KB")



Test Set Performance (Logistic Regression):
Accuracy: 0.9537
Precision: 0.9959
Recall: 0.9475
F1 Score: 0.9711
False Positive Rate: 0.0177

Confusion Matrix:
 [[ 50495    909]
 [ 12368 223086]]

Inference Time: 0.023276 seconds
Peak Memory Usage: 7817.40 KB


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np
import time
import tracemalloc
import gc
gc.collect()

X_train_reshaped = X_train_selected.values.reshape(X_train_selected.shape[0], X_train_selected.shape[1], 1)
X_test_reshaped = X_test_selected.values.reshape(X_test_selected.shape[0], X_test_selected.shape[1], 1)

model = Sequential([
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train_selected.shape[1], 1)),
    BatchNormalization(),
    Dropout(0.2),
    LSTM(64, return_sequences=True),
    LSTM(32),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')  
])

model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model.fit(X_train_reshaped, y_train, validation_data=(X_test_reshaped, y_test), 
          epochs=20, batch_size=64, callbacks=[early_stop])

model.save("/kaggle/working/best_model_cnn_lstm.h5")

tracemalloc.start()
start_time = time.time()

y_test_pred_prob = model.predict(X_test_reshaped)  
y_test_pred = (y_test_pred_prob > 0.5).astype(int) 
end_time = time.time()
current, peak_memory = tracemalloc.get_traced_memory()
tracemalloc.stop()

inference_time = end_time - start_time
peak_memory_kb = peak_memory / 1024 

tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel()
fpr = fp / (fp + tn) if (fp + tn) > 0 else 0  

print("\nTest Set Performance (CNN-LSTM):")
print(f"Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_test_pred, zero_division=0):.4f}")
print(f"Recall: {recall_score(y_test, y_test_pred, zero_division=0):.4f}")
print(f"F1 Score: {f1_score(y_test, y_test_pred, zero_division=0):.4f}")
print(f"False Positive Rate: {fpr:.4f}")
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_test_pred))
print(f"\nInference Time: {inference_time:.6f} seconds")
print(f"Peak Memory Usage: {peak_memory_kb:.2f} KB")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m1773/1773[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 18ms/step - accuracy: 0.9530 - loss: 0.1325 - val_accuracy: 0.9934 - val_loss: 0.0376
Epoch 2/20
[1m1773/1773[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 17ms/step - accuracy: 0.9897 - loss: 0.0344 - val_accuracy: 0.8949 - val_loss: 0.3569
Epoch 3/20
[1m1773/1773[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 17ms/step - accuracy: 0.9961 - loss: 0.0161 - val_accuracy: 0.9275 - val_loss: 0.2360
Epoch 4/20
[1m1773/1773[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 17ms/step - accuracy: 0.9965 - loss: 0.0147 - val_accuracy: 0.9632 - val_loss: 0.0862
Epoch 5/20
[1m1773/1773[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 17ms/step - accuracy: 0.9973 - loss: 0.0114 - val_accuracy: 0.9610 - val_loss: 0.0640
Epoch 6/20
[1m1773/1773[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 17ms/step - accuracy: 0.9964 - loss: 0.0137 - val_accuracy: 0.9492 - val_loss: 0.0981
[1m

Key takeaways:

a) before training, use random forest feature selection method to find highly correlated features, and remove redundant features/apply other strategies like PCA or smthg else

b) use binarization strategies as used here, for the features, before training the Tsetlin machine

c) the dataset used here, is useful for academic purposes only, and as an exploratory purpose, to test the waters, so to speak. it is too well-labeled, too
detailed, and too separable.

d) Use CPU, not GPU, cuz the Python Package for Tsetlin machine is only supported by CPU