In [34]:
import pandas as pd
import numpy as np
import os
import time
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import json

In [35]:
# Load the merged CSV and reduce it to a two-class (Attack vs. Benign) problem.
# The location can be overridden with the DATASET_CSV environment variable so the
# notebook is runnable on other machines; the original local path stays the
# default, so existing runs behave exactly as before.
csv_path = os.environ.get('DATASET_CSV', r'F:\dataset\merged csv\dataset_balance.csv')
df = pd.read_csv(csv_path)

# Only the two source labels listed here are kept: any other label maps to NaN
# and the corresponding rows are dropped.
dict_2classes = {'DDoS-SlowLoris': 'Attack', 'BenignTraffic': 'Benign'}
df['label'] = df['label'].map(dict_2classes)
df = df.dropna(subset=['label'])

# Fail early with a clear message instead of a confusing error later in the
# stratified split if one of the two classes is absent from the file.
missing_classes = set(dict_2classes.values()) - set(df['label'].unique())
if missing_classes:
    raise ValueError(
        f"No rows left for class(es) {sorted(missing_classes)} after label mapping; "
        f"check the 'label' column of {csv_path!r}"
    )

In [36]:
# Names of the feature columns, in the order they are selected from `df`.
# NOTE(review): 'Magnitue' is kept verbatim because it must match the CSV header;
# presumably the misspelling comes from the dataset itself - confirm.
X_columns = [
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
    'Rate', 'Srate', 'Drate',
    'fin_flag_number', 'syn_flag_number', 'rst_flag_number',
    'psh_flag_number', 'ack_flag_number', 'ece_flag_number', 'cwr_flag_number',
    'ack_count', 'syn_count', 'fin_count', 'urg_count', 'rst_count',
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC',
    'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC',
    'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size',
    'IAT', 'Number', 'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight',
]

# Feature matrix and target vector used by the cells that follow.
X = df.loc[:, X_columns]
y = df.loc[:, 'label']

In [37]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [38]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

In [39]:
# Hyper-parameter grid: 5 values of C x 2 penalties = 10 candidates, each
# evaluated with 5-fold cross-validation.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],  # single solver used for both the l1 and l2 candidates
    'class_weight': ['balanced']
}

# n_jobs is set on GridSearchCV rather than on LogisticRegression: with the
# liblinear solver the estimator-level n_jobs has no effect, whereas here the
# candidate/fold fits are run in parallel.
# random_state pins liblinear's data shuffling so repeated runs are reproducible.
grid_search = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000, random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=0
)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; both timestamps come from the same clock, so their
# difference is the training duration in seconds.
start_train_time = time.perf_counter()
grid_search.fit(X_train, y_train)
end_train_time = time.perf_counter()

# Best candidate, refitted on the whole training set by GridSearchCV (refit defaults to True).
best_model = grid_search.best_estimator_

In [40]:
# Time the test-set prediction with perf_counter(): the call finishes in about
# a millisecond, which is too short to measure reliably with the coarser,
# non-monotonic wall clock returned by time.time().
start_pred_time = time.perf_counter()
y_pred = best_model.predict(X_test)
end_pred_time = time.perf_counter()

In [41]:
# Elapsed seconds for the grid-search fit and for the test-set prediction.
training_duration = end_train_time - start_train_time
prediction_duration = end_pred_time - start_pred_time

# Scores reported with macro averaging, listed in print order.
macro_scores = [
    ("Recall:", recall_score),
    ("Precision:", precision_score),
    ("F1 Score:", f1_score),
]

print("##### Optimized Logistic Regression (2-class) #####")
print("Accuracy:", accuracy_score(y_test, y_pred))
for caption, scorer in macro_scores:
    print(caption, scorer(y_test, y_pred, average='macro'))
print("Training Time: {:.4f} seconds".format(training_duration))
print("Prediction Time: {:.4f} seconds".format(prediction_duration))

##### Optimized Logistic Regression (2-class) #####
Accuracy: 0.8283000746985381
Recall: 0.8283027032542267
Precision: 0.8290991462404536
F1 Score: 0.8281965805967706
Training Time: 43.7659 seconds
Prediction Time: 0.0010 seconds


In [42]:
# Recompute the test-set scores and persist them, together with the timings,
# as a small JSON report.
acc = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='macro')
    for scorer in (precision_score, recall_score, f1_score)
)

# Insertion order below defines the key order in the JSON file.
metric_values = {
    "Accuracy": acc,
    "Recall": recall,
    "Precision": precision,
    "F1 Score": f1,
}

result = {"Model": "LR_optimal"}
# Scores are stored as plain floats rounded to 6 decimals; timings to 4.
result.update({name: round(float(value), 6) for name, value in metric_values.items()})
result["Training Time (s)"] = round(float(training_duration), 4)
result["Prediction Time (s)"] = round(float(prediction_duration), 4)

with open("result_LR_optimal.json", "w") as f:
    f.write(json.dumps(result, indent=4))

print("Model evaluation results have been saved to result_LR_optimal.json")

Model evaluation results have been saved to result_LR_optimal.json
