In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Conv1D, MaxPooling1D, Flatten, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# ------------------------------- Load and Prepare Dataset -------------------------------
# Builds the arrays shared by every model section in this script:
#   X_train / y_train          - scaled training data (original class balance)
#   X_train_res / y_train_res  - SMOTE-balanced training data
#   X_test / y_test            - scaled hold-out data, never used for fitting
print("Loading dataset...")

# List of file paths to be merged
file_paths = ['UNSW-NB15_1.csv', 'UNSW-NB15_2.csv', 'UNSW-NB15_3.csv', 'UNSW-NB15_4.csv']
# Read and merge all CSV files into a single DataFrame
df = pd.concat([pd.read_csv(file, low_memory=False) for file in file_paths], ignore_index=True)

# Encode the label column as integers so it survives the numeric-only filter below
label_encoder = LabelEncoder()
df['Label'] = label_encoder.fit_transform(df['Label'])

# Drop non-numeric columns and fill NaNs
df = df.select_dtypes(include=[np.number]).fillna(0)

# Optionally reduce dataset size for faster testing
print("Using subset of the data (10,000 samples)...")
# Cap at the number of available rows so sampling cannot raise on a smaller dataset.
df = df.sample(n=min(10000, len(df)), random_state=42)

# Separate features and target
X = df.drop(columns=['Label'])
y = df['Label']

# Split BEFORE scaling: fitting the scaler on the full dataset would leak the
# test set's min/max into the training features. Stratify so both splits keep
# the same class ratio, which matters for an imbalanced target.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, then apply it to the test rows.
# Test values outside the training range may fall slightly outside [0, 1].
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept so that any code reading X_scaled keeps working; uses the train-fitted scaler.
X_scaled = scaler.transform(X)

# Apply SMOTE for class imbalance (training data only; the test set stays untouched)
print("Applying SMOTE...")
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Early stopping for deep learning models
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# ------------------------------- Random Forest -------------------------------
# Fit a small, shallow forest on the SMOTE-balanced training data and report
# per-class metrics on the hold-out set.
print("Training Random Forest...")
rf = RandomForestClassifier(
    n_estimators=50,
    max_depth=5,  # Reduced depth
    random_state=42,
)
# fit() returns the estimator itself, so training and prediction can be chained.
y_pred_rf = rf.fit(X_train_res, y_train_res).predict(X_test)
rf_report = classification_report(y_test, y_pred_rf)
print("Random Forest Classification Report:\n", rf_report)

# ------------------------------- XGBoost -------------------------------
# Gradient-boosted trees with library defaults (only the seed is pinned),
# trained on the SMOTE-balanced data and scored on the hold-out set.
print("Training XGBoost...")
xgboost = XGBClassifier(random_state=42)
xgboost.fit(X_train_res, y_train_res)

y_pred_xgb = xgboost.predict(X_test)
xgb_report = classification_report(y_test, y_pred_xgb)
print("XGBoost Classification Report:\n", xgb_report)

# ------------------------------- SVM -------------------------------
# Linear-kernel support vector classifier, trained on the SMOTE-balanced data
# and scored on the hold-out set.
print("Training SVM...")
svm = SVC(kernel='linear', random_state=42)
# fit() returns the estimator itself, so training and prediction can be chained.
y_pred_svm = svm.fit(X_train_res, y_train_res).predict(X_test)
svm_report = classification_report(y_test, y_pred_svm)
print("SVM Classification Report:\n", svm_report)

# ------------------------------- LSTM -------------------------------
# Binary classifier that feeds each sample to an LSTM as a sequence of
# single-value timesteps (one timestep per feature).
print("Training LSTM...")
# Add a trailing axis: (samples, n_features) -> (samples, n_features, 1)
X_train_lstm = X_train_res.reshape((X_train_res.shape[0], X_train_res.shape[1], 1))
X_test_lstm = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

# Carve the validation set out of the TRAINING data. Early stopping with
# restore_best_weights picks the weights that score best on the validation
# data, so validating on the test set would bias the reported test metrics.
# The validation rows include SMOTE-generated samples; they are only used to
# decide when to stop.
X_fit_lstm, X_val_lstm, y_fit_lstm, y_val_lstm = train_test_split(
    X_train_lstm, y_train_res, test_size=0.1, random_state=42, stratify=y_train_res
)

lstm_model = Sequential([
    LSTM(20, activation='relu', input_shape=(X_train_lstm.shape[1], 1)),  # Reduced units
    Dropout(0.2),
    Dense(1, activation='sigmoid')  # single sigmoid unit: assumes a binary 0/1 label
])
lstm_model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
lstm_model.fit(X_fit_lstm, y_fit_lstm, epochs=3, batch_size=32,
               validation_data=(X_val_lstm, y_val_lstm), callbacks=[early_stop], verbose=0)

# predict() returns probabilities of shape (n, 1); threshold at 0.5 and flatten
# to a 1-D integer label vector that matches y_test.
y_pred_lstm = (lstm_model.predict(X_test_lstm) > 0.5).astype(int).ravel()
print("LSTM Classification Report:\n", classification_report(y_test, y_pred_lstm))

# ------------------------------- CNN -------------------------------
# Binary classifier that slides 1-D convolutions across the feature axis.
print("Training CNN...")
# Add a trailing channel axis: (samples, n_features) -> (samples, n_features, 1)
X_train_cnn = X_train_res.reshape((X_train_res.shape[0], X_train_res.shape[1], 1))
X_test_cnn = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

# Carve the validation set out of the TRAINING data. Early stopping with
# restore_best_weights picks the weights that score best on the validation
# data, so validating on the test set would bias the reported test metrics.
# The validation rows include SMOTE-generated samples; they are only used to
# decide when to stop.
X_fit_cnn, X_val_cnn, y_fit_cnn, y_val_cnn = train_test_split(
    X_train_cnn, y_train_res, test_size=0.1, random_state=42, stratify=y_train_res
)

cnn_model = Sequential([
    Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(X_train_cnn.shape[1], 1)),  # Reduced filters
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')  # single sigmoid unit: assumes a binary 0/1 label
])
cnn_model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
cnn_model.fit(X_fit_cnn, y_fit_cnn, epochs=3, batch_size=32,
              validation_data=(X_val_cnn, y_val_cnn), callbacks=[early_stop], verbose=0)

# predict() returns probabilities of shape (n, 1); threshold at 0.5 and flatten
# to a 1-D integer label vector that matches y_test.
y_pred_cnn = (cnn_model.predict(X_test_cnn) > 0.5).astype(int).ravel()
print("CNN Classification Report:\n", classification_report(y_test, y_pred_cnn))

# ------------------------------- Autoencoder -------------------------------
# Anomaly detector: learn to reconstruct NORMAL traffic only, then flag test
# rows whose reconstruction error is unusually high.
print("Training Autoencoder...")

# Train on normal rows only. An autoencoder fitted on a class-balanced mix of
# both classes learns to reconstruct attacks as well as normal traffic, so the
# reconstruction error stops separating them.
# NOTE(review): assumes encoded label 0 is the normal class - confirm against
# label_encoder.classes_.
normal_mask = np.asarray(y_train) == 0
X_train_normal = X_train[normal_mask]

# Hold out part of the normal training rows for early stopping, so the test set
# plays no role in fitting or model selection.
X_fit_ae, X_val_ae = train_test_split(X_train_normal, test_size=0.1, random_state=42)

input_dim = X_train_normal.shape[1]
autoencoder = Sequential([
    Dense(64, activation='relu', input_shape=(input_dim,)),  # Reduced size
    Dropout(0.2),
    Dense(32, activation='relu'),  # bottleneck
    Dense(64, activation='relu'),
    Dense(input_dim, activation='sigmoid')  # sigmoid output: inputs are min-max scaled
])
autoencoder.compile(optimizer='adam', loss='mean_squared_error')
autoencoder.fit(X_fit_ae, X_fit_ae, epochs=3, batch_size=32,
                validation_data=(X_val_ae, X_val_ae), callbacks=[early_stop], verbose=0)

# Derive the threshold from reconstruction errors on normal TRAINING rows.
# A percentile of the test errors would use test data to tune the detector and
# would flag exactly 5% of test rows regardless of how many are anomalous.
X_train_normal_pred = autoencoder.predict(X_train_normal, verbose=0)
train_mse = np.mean(np.power(X_train_normal - X_train_normal_pred, 2), axis=1)
threshold = np.percentile(train_mse, 95)

# Per-row mean squared reconstruction error on the test set
X_test_pred = autoencoder.predict(X_test)
mse = np.mean(np.power(X_test - X_test_pred, 2), axis=1)
y_pred_autoencoder = (mse > threshold).astype(int)

print("Autoencoder Classification Report:\n", classification_report(y_test, y_pred_autoencoder))


Loading dataset...
Using subset of the data (10,000 samples)...
Applying SMOTE...
Training Random Forest...
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99      1720
           1       0.90      1.00      0.95       280

    accuracy                           0.98      2000
   macro avg       0.95      0.99      0.97      2000
weighted avg       0.99      0.98      0.98      2000

Training XGBoost...
XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      1720
           1       0.95      0.97      0.96       280

    accuracy                           0.99      2000
   macro avg       0.97      0.98      0.98      2000
weighted avg       0.99      0.99      0.99      2000

Training SVM...
SVM Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99      

  super().__init__(**kwargs)


63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step
LSTM Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99      1720
           1       0.87      1.00      0.93       280

    accuracy                           0.98      2000
   macro avg       0.93      0.99      0.96      2000
weighted avg       0.98      0.98      0.98      2000

Training CNN...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


63/63 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step
CNN Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99      1720
           1       0.89      0.99      0.94       280

    accuracy                           0.98      2000
   macro avg       0.94      0.99      0.96      2000
weighted avg       0.98      0.98      0.98      2000

Training Autoencoder...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


63/63 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step
Autoencoder Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.94      0.90      1720
           1       0.03      0.01      0.02       280

    accuracy                           0.81      2000
   macro avg       0.44      0.48      0.46      2000
weighted avg       0.74      0.81      0.77      2000

