In [1]:
!pip install pyTsetlinMachine pandas scikit-learn pyarrow matplotlib seaborn

import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from pyTsetlinMachine.tm import MultiClassTsetlinMachine
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
import seaborn as sns
import glob


Collecting pyTsetlinMachine
  Downloading pyTsetlinMachine-0.6.6.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyTsetlinMachine
  Building wheel for pyTsetlinMachine (setup.py) ... [?25l[?25hdone
  Created wheel for pyTsetlinMachine: filename=pyTsetlinMachine-0.6.6-cp311-cp311-linux_x86_64.whl size=59490 sha256=3f31c163e52f00ebec10ed5acb21686ee87e3221aa2204aad7fe0ad143d83307
  Stored in directory: /root/.cache/pip/wheels/46/fb/7b/94130662b0133acfaf27f087dcaa857e5280c61190f90de342
Successfully built pyTsetlinMachine
Installing collected packages: pyTsetlinMachine
Successfully installed pyTsetlinMachine-0.6.6


This version of the script uses binarization strategies for features

Source of training dataset:

https://ieeexplore.ieee.org/abstract/document/8888419

https://www.kaggle.com/datasets/dhoogla/cicddos2019


(for preliminary testing, the test-train split in this dataset has been used)

In [3]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

data_dir = "/kaggle/input/myddosdata"
attack_data = pd.read_json(os.path.join(data_dir, "attack.json"), lines=True)
benign_train = pd.read_json(os.path.join(data_dir, "Benign_Train.json"), lines=True)
benign_test = pd.read_json(os.path.join(data_dir, "Benign_Test.json"), lines=True)

attack_data['Label'] = 1
benign_train['Label'] = 0
benign_test['Label'] = 0

attack_train = attack_data.sample(n=900, random_state=42)
attack_test = attack_data.drop(attack_train.index)

attack_test.to_json("attack_test_samples.json", orient="records", lines=True)

train_data = pd.concat([benign_train, attack_train], ignore_index=True)
test_data = pd.concat([benign_test, attack_test], ignore_index=True)

def rename_columns(df):
    return df.rename(columns=lambda x: x.replace('_', ' ').title() if isinstance(x, str) else x)

train_data = rename_columns(train_data)
test_data = rename_columns(test_data)

categorical_features = ['Protocol']
protocol_categories = [[0, 6, 17]]
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore', categories=protocol_categories)

encoded_train = one_hot_encoder.fit_transform(train_data[categorical_features])
encoded_test = one_hot_encoder.transform(test_data[categorical_features])
joblib.dump(one_hot_encoder, "onehot_encoder.pkl")

train_encoded_df = pd.DataFrame(encoded_train, columns=one_hot_encoder.get_feature_names_out(categorical_features), index=train_data.index)
test_encoded_df = pd.DataFrame(encoded_test, columns=one_hot_encoder.get_feature_names_out(categorical_features), index=test_data.index)

train_data = pd.concat([train_data.drop(columns=categorical_features), train_encoded_df], axis=1)
test_data = pd.concat([test_data.drop(columns=categorical_features), test_encoded_df], axis=1)

binary_flags = ["Fwd Psh Flags", "Bwd Psh Flags", "Fwd Urg Flags", "Bwd Urg Flags"]
for flag in binary_flags:
    if flag in train_data.columns:
        train_data[flag] = (train_data[flag] > 0).astype(int)
        test_data[flag] = (test_data[flag] > 0).astype(int)

final_features = [
    "Flow Iat Mean", "Idle Mean", "Fwd Iat Mean", "Packet Length Mean", "Fwd Packet Length Mean",
    "Flow Iat Std", "Fwd Packet Length Min", "Idle Min", "Flow Iat Min", "Init Fwd Win Bytes",
    "Packet Length Variance", "Cwe Flag Count", "Protocol_0", "Protocol_6", "Protocol_17",
    "Flow Packets Per S", "Fwd Packets Per S", "Fwd Psh Flags", "Fwd Act Data Packets", "Fwd Iat Std",
    "Avg Fwd Segment Size", "Flow Iat Max", "Total Fwd Packets", "Subflow Fwd Packets",
    "Fwd Iat Min", "Urg Flag Count", "Ack Flag Count", "Rst Flag Count", "Fwd Packet Length Std",
    "Fwd Iat Max", "Packet Length Min", "Active Max", "Label"
]

train_data = train_data[[f for f in final_features if f in train_data.columns]]
test_data = test_data[[f for f in final_features if f in test_data.columns]]

X_train = train_data.drop(columns=['Label'])
y_train = train_data['Label']
X_test = test_data.drop(columns=['Label'])
y_test = test_data['Label']

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
joblib.dump(scaler, "minmax_scaler.pkl")

print(" Preprocessing complete!")
print(f" Train shape: {X_train.shape}")
print(f" Test shape: {X_test.shape}")

print(f"Final number of features used for training: {X_train.shape[1]}")
print("Features:")
for feature in train_data.drop(columns=['Label']).columns:
    print(f"- {feature}")


 Preprocessing complete!
 Train shape: (2019, 32)
 Test shape: (237, 32)
Final number of features used for training: 32
Features:
- Flow Iat Mean
- Idle Mean
- Fwd Iat Mean
- Packet Length Mean
- Fwd Packet Length Mean
- Flow Iat Std
- Fwd Packet Length Min
- Idle Min
- Flow Iat Min
- Init Fwd Win Bytes
- Packet Length Variance
- Cwe Flag Count
- Protocol_0
- Protocol_6
- Protocol_17
- Flow Packets Per S
- Fwd Packets Per S
- Fwd Psh Flags
- Fwd Act Data Packets
- Fwd Iat Std
- Avg Fwd Segment Size
- Flow Iat Max
- Total Fwd Packets
- Subflow Fwd Packets
- Fwd Iat Min
- Urg Flag Count
- Ack Flag Count
- Rst Flag Count
- Fwd Packet Length Std
- Fwd Iat Max
- Packet Length Min
- Active Max


In [4]:
print(f"Total training samples: {len(train_data)}")

print("Training set label distribution:\n", train_data['Label'].value_counts())

Total training samples: 2019
Training set label distribution:
 Label
0    1119
1     900
Name: count, dtype: int64


In [5]:
final_feature_names = train_data.drop(columns=['Label']).columns.tolist()

print(f"Final number of features used for training: {len(final_feature_names)}")
print("Features:")
for feature in final_feature_names:
    print(f"- {feature}")

Final number of features used for training: 32
Features:
- Flow Iat Mean
- Idle Mean
- Fwd Iat Mean
- Packet Length Mean
- Fwd Packet Length Mean
- Flow Iat Std
- Fwd Packet Length Min
- Idle Min
- Flow Iat Min
- Init Fwd Win Bytes
- Packet Length Variance
- Cwe Flag Count
- Protocol_0
- Protocol_6
- Protocol_17
- Flow Packets Per S
- Fwd Packets Per S
- Fwd Psh Flags
- Fwd Act Data Packets
- Fwd Iat Std
- Avg Fwd Segment Size
- Flow Iat Max
- Total Fwd Packets
- Subflow Fwd Packets
- Fwd Iat Min
- Urg Flag Count
- Ack Flag Count
- Rst Flag Count
- Fwd Packet Length Std
- Fwd Iat Max
- Packet Length Min
- Active Max


In [6]:
print("Training set class distribution:")
print(y_train.value_counts())

print("Testing set class distribution:")
print(y_test.value_counts())


Training set class distribution:
Label
0    1119
1     900
Name: count, dtype: int64
Testing set class distribution:
Label
0    122
1    115
Name: count, dtype: int64


to get statistically significant results, 
hyperparameter tuning is used

In [15]:
import numpy as np
import pickle
import time
import tracemalloc
from sklearn.model_selection import KFold, ParameterGrid
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from pyTsetlinMachine.tm import MultiClassTsetlinMachine
from joblib import Parallel, delayed

X_train_np = np.array(X_train, dtype=np.float32)
y_train_np = np.array(y_train, dtype=np.int32)
X_test_np = np.array(X_test, dtype=np.float32)
y_test_np = np.array(y_test, dtype=np.int32)

param_grid = {'num_clauses': [200, 500, 1000], 'T': [15, 30, 60], 's': [3.0, 4.0, 6.0]}
kf = KFold(n_splits=5, shuffle=True, random_state=42)

best_params, best_f1, lowest_fpr, best_model = None, 0, float('inf'), None

def train_fold(num_clauses, T, s, X_train_fold, y_train_fold, X_val_fold, y_val_fold):
    tm = MultiClassTsetlinMachine(num_clauses, T, s)
    best_epoch_f1, early_stop_counter = 0, 0

    for epoch in range(1, 51):
        tm.fit(X_train_fold, y_train_fold, epochs=1)
        y_val_pred = tm.predict(X_val_fold)
        f1 = f1_score(y_val_fold, y_val_pred, zero_division=0)
        if f1 > best_epoch_f1:
            best_epoch_f1, early_stop_counter = f1, 0
        else:
            early_stop_counter += 1
        if early_stop_counter >= 3:
            break

    y_val_pred = tm.predict(X_val_fold)
    tn, fp, fn, tp = confusion_matrix(y_val_fold, y_val_pred).ravel()
    return accuracy_score(y_val_fold, y_val_pred), f1_score(y_val_fold, y_val_pred, zero_division=0), fp / (fp + tn) if (fp + tn) > 0 else 0

for params in ParameterGrid(param_grid):
    print(f"Testing params: {params}")
    results = Parallel(n_jobs=-1)(
        delayed(train_fold)(
            params['num_clauses'], params['T'], params['s'],
            X_train_np[train], y_train_np[train],
            X_train_np[val], y_train_np[val]
        ) 
        for train, val in kf.split(X_train_np)
    )

    mean_acc, mean_f1, mean_fpr = np.mean(results, axis=0)
    print(f" Params: {params} -> F1: {mean_f1:.4f}, FPR: {mean_fpr:.4f}")

    if mean_f1 > best_f1 or (mean_f1 == best_f1 and mean_fpr < lowest_fpr):
        best_f1, lowest_fpr, best_params = mean_f1, mean_fpr, params

        with open("/kaggle/working/params.pkl", "wb") as f:
            pickle.dump(best_params, f)
        print(" Best hyperparameters saved.")

if best_params:
    print("\n Re-training best model on full training set...")
    best_model = MultiClassTsetlinMachine(best_params['num_clauses'], best_params['T'], best_params['s'])
    best_model.fit(X_train_np, y_train_np, epochs=50)

    with open("/kaggle/working/model.pkl", "wb") as model_file:
        pickle.dump(best_model, model_file)
    print("Trained model saved.")

    def predict_chunk(model, X_chunk):
        return model.predict(X_chunk)

    num_jobs = -1
    num_chunks = max(2, abs(num_jobs) * 2)
    X_test_chunks = np.array_split(X_test_np, num_chunks)

    tracemalloc.start()
    start_time = time.time()

    y_test_pred_chunks = Parallel(n_jobs=num_jobs)(
        delayed(predict_chunk)(best_model, chunk) for chunk in X_test_chunks
    )
    y_test_pred = np.concatenate(y_test_pred_chunks)

    inference_time = time.time() - start_time
    current, peak_memory = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    print("\n Final Test Set Evaluation:")
    print(f"Accuracy  : {accuracy_score(y_test_np, y_test_pred):.4f}")
    print(f"Precision : {precision_score(y_test_np, y_test_pred, zero_division=0):.4f}")
    print(f"Recall    : {recall_score(y_test_np, y_test_pred, zero_division=0):.4f}")
    print(f"F1 Score  : {f1_score(y_test_np, y_test_pred, zero_division=0):.4f}")
    print(f" Inference Time (Parallel): {inference_time:.6f} seconds")
    print(f" Peak Memory during Inference: {peak_memory / 1024:.2f} KB")
    print("\nConfusion Matrix:\n", confusion_matrix(y_test_np, y_test_pred))

else:
    print(" No trained model found. Please check grid search or data.")


Testing params: {'T': 15, 'num_clauses': 200, 's': 3.0}
 Params: {'T': 15, 'num_clauses': 200, 's': 3.0} -> F1: 0.5774, FPR: 0.0190
 Best hyperparameters saved.
Testing params: {'T': 15, 'num_clauses': 200, 's': 4.0}
 Params: {'T': 15, 'num_clauses': 200, 's': 4.0} -> F1: 0.5738, FPR: 0.0177
Testing params: {'T': 15, 'num_clauses': 200, 's': 6.0}
 Params: {'T': 15, 'num_clauses': 200, 's': 6.0} -> F1: 0.5824, FPR: 0.0389
 Best hyperparameters saved.
Testing params: {'T': 15, 'num_clauses': 500, 's': 3.0}
 Params: {'T': 15, 'num_clauses': 500, 's': 3.0} -> F1: 0.5503, FPR: 0.0000
Testing params: {'T': 15, 'num_clauses': 500, 's': 4.0}
 Params: {'T': 15, 'num_clauses': 500, 's': 4.0} -> F1: 0.5492, FPR: 0.0000
Testing params: {'T': 15, 'num_clauses': 500, 's': 6.0}
 Params: {'T': 15, 'num_clauses': 500, 's': 6.0} -> F1: 0.5525, FPR: 0.0000
Testing params: {'T': 15, 'num_clauses': 1000, 's': 3.0}
 Params: {'T': 15, 'num_clauses': 1000, 's': 3.0} -> F1: 0.5492, FPR: 0.0000
Testing params: 

In [13]:
import numpy as np
import pickle
import time
import tracemalloc

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, ParameterGrid
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from joblib import Parallel, delayed

param_grid = {'n_estimators': [100, 200], 'max_depth': [10, 20], 'min_samples_split': [2, 5]}
kf = KFold(n_splits=5, shuffle=True, random_state=42)

best_params, best_f1, lowest_fpr, best_model = None, 0, float('inf'), None

def train_fold(n_estimators, max_depth, min_samples_split, X_train_fold, y_train_fold, X_val_fold, y_val_fold):
    model = RandomForestClassifier(
        n_estimators=n_estimators, max_depth=max_depth,
        min_samples_split=min_samples_split, random_state=42, n_jobs=-1
    )
    model.fit(X_train_fold, y_train_fold)

    y_val_pred = model.predict(X_val_fold)
    f1 = f1_score(y_val_fold, y_val_pred, zero_division=0)
    
    tn, fp, fn, tp = confusion_matrix(y_val_fold, y_val_pred).ravel()
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    return accuracy_score(y_val_fold, y_val_pred), f1, fpr

for params in ParameterGrid(param_grid):
    print(f" Testing params: {params}")
    results = Parallel(n_jobs=-1)(
        delayed(train_fold)(
            params['n_estimators'], params['max_depth'], params['min_samples_split'],
            X_train_np[train], y_train_np[train],
            X_train_np[val], y_train_np[val]
        )
        for train, val in kf.split(X_train_np)
    )

    mean_acc, mean_f1, mean_fpr = np.mean(results, axis=0)
    print(f" Params: {params} -> F1: {mean_f1:.4f}, FPR: {mean_fpr:.4f}")

    if mean_f1 > best_f1 or (mean_f1 == best_f1 and mean_fpr < lowest_fpr):
        best_f1, lowest_fpr, best_params = mean_f1, mean_fpr, params
        with open("/kaggle/working/best_params_rf.pkl", "wb") as f:
            pickle.dump(best_params, f)
        print("  Best hyperparameters saved.")

if best_params:
    print(f"\n Training final model with best params: {best_params}")
    best_model = RandomForestClassifier(**best_params, random_state=42, n_jobs=-1)
    best_model.fit(X_train_np, y_train_np)

    with open("/kaggle/working/best_model_rf.pkl", "wb") as model_file:
        pickle.dump(best_model, model_file)
    print("  Trained model saved.")

    tracemalloc.start()
    start_time = time.time()

    y_test_pred = best_model.predict(X_test_np)

    end_time = time.time()
    current, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    inference_time = end_time - start_time
    peak_memory_kb = peak / 1024  

    print("\n Test Set Performance (Random Forest):")
    print(f" Accuracy : {accuracy_score(y_test_np, y_test_pred):.4f}")
    print(f" Precision: {precision_score(y_test_np, y_test_pred, zero_division=0):.4f}")
    print(f" Recall   : {recall_score(y_test_np, y_test_pred, zero_division=0):.4f}")
    print(f" F1 Score : {f1_score(y_test_np, y_test_pred, zero_division=0):.4f}")
    print("\n Confusion Matrix:\n", confusion_matrix(y_test_np, y_test_pred))

    print(f"\n Inference Time: {inference_time:.6f} seconds")
    print(f" Peak Memory Usage: {peak_memory_kb:.2f} KB")

else:
    print(" No trained model found. Please check grid search or data.")


 Testing params: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}
 Params: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100} -> F1: 0.9586, FPR: 0.0260
  Best hyperparameters saved.
 Testing params: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}
 Params: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200} -> F1: 0.9572, FPR: 0.0306
 Testing params: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}
 Params: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100} -> F1: 0.9611, FPR: 0.0270
  Best hyperparameters saved.
 Testing params: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 200}
 Params: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 200} -> F1: 0.9572, FPR: 0.0288
 Testing params: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 100}
 Params: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 100} -> F1: 0.9589, FPR: 0.0287
 Testing params: {'max_depth': 20, 'min_sampl

In [8]:
import pickle
with open("/kaggle/working/params.pkl", "rb") as file:
    tm_loaded = pickle.load(file)

print("Model successfully loaded")


Model successfully loaded


In [9]:
import pickle
import numpy as np

# Load the trained Tsetlin Machine model
with open("/kaggle/working/model.pkl", "rb") as file:
    tm_loaded = pickle.load(file)

print(" Model successfully loaded!")

X_sample = np.array([[0, 1, 0, 1, 1, 0, 1, 0]])  # Replace with actual sample data

y_pred = tm_loaded.predict(X_sample)

print(f" Predicted Class: {y_pred[0]}")


 Model successfully loaded!
 Predicted Class: 0


Key takeaways:

a) before training, plot correlation matrix to find highly correlated features, and remove redundant features/apply other strategies like PCA or smthg else

b) use binarization strategies as used here, for the features, before training the Tsetlin machine

c) the dataset used here, is useful for academic purposes only, and as an exploratory purpose, to test the waters, so to speak. it is too well-labeled, too detailed, and too separable.

d) Use CPU, not GPU, cuz the Python Package for Tsetlin machine is only supported by CPU