In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Mount Google Drive at /content/drive; the dataset and results paths used by
# the later cells all live under /content/drive/MyDrive. Requires the
# google.colab package, i.e. a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# --- Load the pre-split train / test files (they ship without a header row) ---
TRAIN_PATH = "/content/drive/MyDrive/Research Project/KDDTrain+.txt"
TEST_PATH = "/content/drive/MyDrive/Research Project/KDDTest+.txt"

train_df = pd.read_csv(TRAIN_PATH, header=None)
test_df = pd.read_csv(TEST_PATH, header=None)

# --- Column schema: 41 features, then the attack name and a difficulty score ---
columns = [
    # basic connection features
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes",
    "land", "wrong_fragment", "urgent",
    # content features
    "hot", "num_failed_logins", "logged_in", "num_compromised", "root_shell",
    "su_attempted", "num_root", "num_file_creations", "num_shells",
    "num_access_files", "num_outbound_cmds", "is_host_login", "is_guest_login",
    # traffic features
    "count", "srv_count", "serror_rate", "srv_serror_rate", "rerror_rate",
    "srv_rerror_rate", "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    # destination-host features
    "dst_host_count", "dst_host_srv_count", "dst_host_same_srv_rate",
    "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate",
    "dst_host_srv_serror_rate", "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate",
    # target and metadata
    "attack_type", "difficulty",
]

# Name the columns and derive the binary target on both splits the same way:
# 0 for 'normal' traffic, 1 for every other attack_type.
for frame in (train_df, test_df):
    frame.columns = columns  # raises if the file's column count differs from the schema
    frame["label"] = frame["attack_type"].ne("normal").astype(int)

# --- Feature selection: drop the categorical columns, the targets and the metadata ---
drop_cols = ["protocol_type", "service", "flag", "attack_type", "difficulty"]
excluded = set(drop_cols) | {"label"}
feature_cols = [name for name in train_df.columns if name not in excluded]

X_train, y_train = train_df[feature_cols].to_numpy(), train_df["label"].to_numpy()
X_test, y_test = test_df[feature_cols].to_numpy(), test_df["label"].to_numpy()

# --- Fit the baseline tree; the fixed seed keeps the run reproducible ---
tree_params = {
    "criterion": "entropy",
    "max_depth": 10,
    "min_samples_leaf": 5,
    "random_state": 42,
}
clf = DecisionTreeClassifier(**tree_params).fit(X_train, y_train)

# --- Score on the separate test file ---
y_pred = clf.predict(X_test)
report_text = classification_report(y_test, y_pred, target_names=["normal", "attack"])
conf_mat = confusion_matrix(y_test, y_pred)
print("Classification Report:\n", report_text)
print("Confusion Matrix:\n", conf_mat)

Classification Report:
               precision    recall  f1-score   support

      normal       0.69      0.97      0.81      9711
      attack       0.97      0.67      0.79     12833

    accuracy                           0.80     22544
   macro avg       0.83      0.82      0.80     22544
weighted avg       0.85      0.80      0.80     22544

Confusion Matrix:
 [[9448  263]
 [4219 8614]]


In [None]:
from sklearn.metrics import roc_auc_score

def evaluate_model(clf, X_test, y_test):
    """Score a fitted binary classifier on a held-out set.

    Parameters
    ----------
    clf : fitted estimator
        Must expose ``predict``. ``predict_proba`` or ``decision_function``
        is used, when available, to obtain continuous scores for ROC-AUC.
    X_test : array-like
        Feature matrix, passed unchanged to the estimator.
    y_test : array-like
        True binary labels for ``X_test``.

    Returns
    -------
    dict
        Keys ``"Accuracy"``, ``"Precision"``, ``"Recall"``, ``"F1"``,
        ``"ROC-AUC"`` and ``"ConfusionMatrix"``. ``"ROC-AUC"`` is ``None``
        when the estimator offers neither ``predict_proba`` nor
        ``decision_function``. ``"ConfusionMatrix"`` is a nested list so the
        whole dict can be serialized with ``json``.
    """
    y_pred = clf.predict(X_test)

    # ROC-AUC needs a continuous score, not hard labels. Prefer the
    # probability of the second class (column 1); otherwise fall back to the
    # raw decision values, which roc_auc_score also accepts for binary
    # problems. Without this fallback, margin-based models that lack
    # predict_proba would silently report no ROC-AUC at all.
    if hasattr(clf, "predict_proba"):
        y_score = clf.predict_proba(X_test)[:, 1]
    elif hasattr(clf, "decision_function"):
        y_score = clf.decision_function(X_test)
    else:
        y_score = None

    results = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "ROC-AUC": roc_auc_score(y_test, y_score) if y_score is not None else None,
        # .tolist() turns the ndarray into plain lists for JSON output.
        "ConfusionMatrix": confusion_matrix(y_test, y_pred).tolist()
    }
    return results

In [None]:
import json
from pathlib import Path

# Evaluate the fitted baseline tree on the held-out test split.
results = evaluate_model(clf, X_test, y_test)

# The payload is JSON even though the file name ends in .txt.
results_path = Path("/content/drive/MyDrive/Results/Decision Tree/Baseline_Decision_Tree_Results.txt")

# Create the results folder if it is missing; open(..., "w") does not create
# parent directories and would otherwise raise FileNotFoundError.
results_path.parent.mkdir(parents=True, exist_ok=True)

with open(results_path, "w") as f:
    json.dump(results, f, indent=2)