In [8]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import warnings
warnings.filterwarnings('ignore')

# Train a binary logistic-regression classifier on 12 selected columns of the
# UNSW-NB15 CSV parts, print hold-out metrics, and persist the fitted model and
# scaler with joblib.

# Top 12 features
top_12_features = ['dmeansz', 'sloss', 'Sjit', 'Dpkts', 'Sload', 'Djit',
                   'Sintpkt', 'Spkts', 'ct_state_ttl', 'Dintpkt', 'sbytes', 'tcprtt']

# Load column names: they are taken from the second column of the features
# file; the first row is skipped (presumably that file's own header row).
features_df = pd.read_csv('NUSW-NB15_features.csv', header=None, encoding='ISO-8859-1')
column_names = features_df[1].tolist()[1:]

# Load and combine all parts of the dataset (the part files carry no header row)
files = ['UNSW-NB15_1.csv', 'UNSW-NB15_2.csv', 'UNSW-NB15_3.csv', 'UNSW-NB15_4.csv']
df = pd.concat([pd.read_csv(f, header=None, encoding='utf-8') for f in files], ignore_index=True)
df.columns = column_names

# Keep only selected features + target.
# An explicit copy makes the column assignments below operate on an independent
# frame rather than on a slice of the concatenated data.
df = df[top_12_features + ['Label']].copy()

# Encode any categorical columns (just in case)
for col in df.select_dtypes(include='object').columns:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# Split features and target
X = df.drop('Label', axis=1)
y = df['Label']

# Split BEFORE scaling so that the scaler's statistics come from the training
# rows only; fitting it on the full dataset would leak test-set information
# into preprocessing and bias the evaluation below.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features: fit on the training split, then reuse the same transform
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full scaled matrix, kept under its original name in case later code uses it.
X_scaled = scaler.transform(X)

# Train
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print(f"✅ Accuracy: {model.score(X_test, y_test):.4f}")
print("🧩 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))

# Save model and scaler to .pkl files
joblib.dump(model, 'logistic_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
print("💾 Model and scaler saved as 'logistic_model.pkl' and 'scaler.pkl'")


✅ Accuracy: 0.9737
🧩 Confusion Matrix:
 [[439931   3822]
 [  9559  54698]]

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99    443753
           1       0.93      0.85      0.89     64257

    accuracy                           0.97    508010
   macro avg       0.96      0.92      0.94    508010
weighted avg       0.97      0.97      0.97    508010

💾 Model and scaler saved as 'logistic_model.pkl' and 'scaler.pkl'
