In [1]:
pip install --upgrade tensorflow


Collecting tensorflow
  Downloading tensorflow-2.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Collecting tensorboard~=2.20.0 (from tensorflow)
  Downloading tensorboard-2.20.0-py3-none-any.whl.metadata (1.8 kB)
Downloading tensorflow-2.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (620.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m620.7/620.7 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tensorboard-2.20.0-py3-none-any.whl (5.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m85.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorboard, tensorflow
  Attempting uninstall: tensorboard
    Found existing installation: tensorboard 2.19.0
    Uninstalling tensorboard-2.19.0:
      Successfully uninstalled tensorboard-2.19.0
  Attempting uninstall: tensorflow
    Found existing installation: tensorflow 2.19.0
    Uninstalling tensorflow-2.

In [2]:
import pandas as pd
import numpy as np
import joblib
from tensorflow.keras.models import load_model
import matplotlib.pyplot as plt
from typing import Dict, Any

In [3]:
# Mount Google Drive at /content/drive (requires the Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Project folder on the mounted Google Drive (trailing slash included).
drive_path = '/content/drive/MyDrive/ANOMALY_DETECTION/'

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,LabelEncoder
from sklearn.decomposition import PCA
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.ensemble import RandomForestClassifier

# --- 1. Configuration ---
# All paths are derived from DRIVE_PATH so this cell is self-contained and
# does not depend on a variable defined in a different cell.
# The trailing slash matters: paths are built by string concatenation.
DRIVE_PATH = '/content/drive/MyDrive/ANOMALY_DETECTION/'
original_data_path = DRIVE_PATH + 'clean_data.csv'
synthetic_data_path = DRIVE_PATH + 'final_balanced_dataset.csv'
artifacts_path = DRIVE_PATH  # models, preprocessors and the hold-out set are written here

# Model & Preprocessing Parameters
TEST_SIZE = 0.2        # fraction of the original data held out for evaluation
RANDOM_STATE = 42      # seed shared by the splits, PCA and the RandomForest
PCA_COMPONENTS = 0.95  # passed to PCA(n_components=...)

def load_and_prepare_data(original_path: str, synthetic_path: str, test_size: float, random_state: int) -> Dict[str, Any]:
    """Build the training frame and the hold-out frame from two CSV files.

    The original CSV is split (stratified on its ``'Label'`` column) into a
    training part and a hold-out part. The hold-out part is pickled to
    ``artifacts_path + 'hold_out_test_set.pkl'`` (``artifacts_path`` is a
    module-level setting). Every row of the synthetic CSV whose label is not
    ``'BENIGN'`` is appended to the training part, and the result is shuffled.

    Args:
        original_path: CSV file with the original data.
        synthetic_path: CSV file with the synthetic data.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Seed used for both the split and the shuffle.

    Returns:
        ``{'train': <shuffled training frame>, 'test': <hold-out frame>}``.
    """
    print("--- Step 1: Loading and Preparing Data ---")
    real_frame = pd.read_csv(original_path)
    generated_frame = pd.read_csv(synthetic_path)

    # Stratified split of the original data; the second part is the hold-out set.
    real_train, hold_out = train_test_split(
        real_frame,
        test_size=test_size,
        random_state=random_state,
        stratify=real_frame['Label'],
    )
    joblib.dump(hold_out, artifacts_path + 'hold_out_test_set.pkl')
    print(f"Hold-out test set created and saved. Shape: {hold_out.shape}")

    # Only the non-benign synthetic rows are added to the training data.
    # NOTE(review): if the synthetic file contains rows copied from the
    # original data, hold-out rows could leak into training - confirm.
    is_attack = generated_frame['Label'] != 'BENIGN'
    generated_attacks = generated_frame[is_attack]

    combined = pd.concat([real_train, generated_attacks], ignore_index=True)
    shuffled = combined.sample(frac=1, random_state=random_state)
    print(f"Full training dataset assembled. Shape: {shuffled.shape}")

    return {'train': shuffled, 'test': hold_out}

def preprocess_data(train_df: pd.DataFrame, test_df: pd.DataFrame, pca_components: float) -> Dict[str, Any]:
    """Fit the scaler and PCA on benign training rows and project the training data.

    Steps:
      1. One-hot encode ``'Destination_port_group'`` (first level dropped).
      2. Fit ``MinMaxScaler`` and ``PCA`` on the rows labelled ``'BENIGN'`` only.
      3. Project the benign and the non-benign training rows with the fitted
         transformers.

    Args:
        train_df: Training frame; must contain ``'Label'`` and
            ``'Destination_port_group'`` columns.
        test_df: Hold-out frame. Kept for interface compatibility only: nothing
            derived from it is returned, so it is not encoded or transformed.
        pca_components: Forwarded to ``PCA(n_components=...)``.

    Returns:
        Dict with keys ``'benign_train_pca'``, ``'attack_train_pca'``,
        ``'attack_train_labels'``, ``'scaler'`` and ``'pca'``.

    Raises:
        ValueError: If ``train_df`` has no rows labelled ``'BENIGN'``.
    """
    print("--- Step 2: Preprocessing Data ---")
    # One-hot encode categorical features
    categorical_cols = ['Destination_port_group']
    train_encoded = pd.get_dummies(train_df, columns=categorical_cols, drop_first=True)

    train_labels = train_encoded['Label']
    train_features = train_encoded.drop(columns=['Label'])

    # Isolate benign training data to fit the preprocessors
    benign_train_features = train_features[train_labels == 'BENIGN']
    if len(benign_train_features) == 0:
        raise ValueError(
            "preprocess_data: no rows labelled 'BENIGN' in train_df; "
            "cannot fit the scaler and PCA."
        )
    print(f"Fitting Scaler and PCA on {len(benign_train_features)} benign samples...")

    # Fit Scaler and PCA ONLY on benign training data to prevent data leakage.
    # The scaled benign matrix is computed once and reused for both the PCA
    # fit and the PCA projection.
    scaler = MinMaxScaler()
    benign_scaled = scaler.fit_transform(benign_train_features)
    pca = PCA(n_components=pca_components, random_state=RANDOM_STATE)
    pca.fit(benign_scaled)
    benign_train_pca = pca.transform(benign_scaled)

    # Project the non-benign rows with the benign-fitted transformers.
    attack_mask = train_labels != 'BENIGN'
    attack_train_features = train_features[attack_mask]
    attack_train_labels = train_labels[attack_mask]
    attack_train_pca = pca.transform(scaler.transform(attack_train_features))

    return {
        'benign_train_pca': benign_train_pca,
        'attack_train_pca': attack_train_pca,
        'attack_train_labels': attack_train_labels,
        'scaler': scaler,
        'pca': pca
    }

def train_autoencoder(data: np.ndarray) -> (Model, float):
    """Train a dense autoencoder and derive a reconstruction-error threshold.

    ``data`` is split 80/20 into training and validation parts. The network
    narrows to roughly 75% and then 50% of the input width before expanding
    back to the input width. Training uses the Adam optimizer with MSE loss
    and stops early when validation loss stops improving, restoring the best
    weights.

    Args:
        data: 2-D array of samples to reconstruct (rows are samples).

    Returns:
        ``(autoencoder, threshold)`` where ``threshold`` is the mean plus three
        standard deviations of the per-sample reconstruction MSE on the
        validation part.
    """
    print("--- Step 3: Training Autoencoder ---")
    X_train, X_val = train_test_split(data, test_size=0.2, random_state=RANDOM_STATE)

    input_dim = X_train.shape[1]
    # Clamp to at least one unit: int(input_dim * 0.5) is 0 when input_dim is 1,
    # and a zero-width Dense layer is not valid.
    encoding_dim1 = max(1, int(input_dim * 0.75))
    encoding_dim2 = max(1, int(input_dim * 0.5))

    input_layer = Input(shape=(input_dim,))
    encoder = Dense(encoding_dim1, activation="relu")(input_layer)
    encoder = Dense(encoding_dim2, activation="relu")(encoder)
    decoder = Dense(encoding_dim1, activation="relu")(encoder)
    # Linear output layer: main() feeds this function PCA-projected features,
    # which are mean-centred and unbounded. A sigmoid output is confined to
    # (0, 1) and cannot reconstruct negative or large values.
    decoder = Dense(input_dim, activation="linear")(decoder)

    autoencoder = Model(inputs=input_layer, outputs=decoder)
    autoencoder.compile(optimizer='adam', loss='mean_squared_error')

    callbacks = [
        # Stop after 5 epochs without val_loss improvement and keep the best weights.
        EarlyStopping(patience=5, restore_best_weights=True, monitor='val_loss'),
        # Halve the learning rate after 3 epochs without val_loss improvement.
        ReduceLROnPlateau(patience=3, factor=0.5, monitor='val_loss')
    ]

    autoencoder.fit(X_train, X_train, epochs=50, batch_size=32, shuffle=True, validation_data=(X_val, X_val), callbacks=callbacks, verbose=1)

    # Determine anomaly threshold from the validation reconstruction errors
    reconstructions = autoencoder.predict(X_val)
    errors = np.mean(np.square(X_val - reconstructions), axis=1)
    threshold = np.mean(errors) + 3 * np.std(errors)
    print(f"Autoencoder Anomaly Threshold set to: {threshold}")

    return autoencoder, threshold

def train_classifier(features: np.ndarray, labels: pd.Series) -> (RandomForestClassifier, LabelEncoder):
    """Fit a RandomForest on integer-encoded labels.

    Args:
        features: Feature matrix, one row per sample.
        labels: Class label for each row of ``features``.

    Returns:
        ``(classifier, encoder)``: the fitted forest and the ``LabelEncoder``
        that maps the original labels to the integer classes it was trained on.
    """
    print("--- Step 4: Training RandomForest Classifier ---")

    # Map the labels to integer class ids.
    encoder = LabelEncoder()
    targets = encoder.fit_transform(labels)

    forest_settings = {
        'n_estimators': 100,
        'random_state': RANDOM_STATE,
        'n_jobs': -1,
        'class_weight': 'balanced',
    }
    forest = RandomForestClassifier(**forest_settings)
    forest.fit(features, targets)

    return forest, encoder

def save_artifacts(path: str, artifacts: Dict[str, Any]):
    """Write every artifact into the directory ``path``.

    The entry named ``'autoencoder'`` is saved through its own ``save`` method
    as ``autoencoder.h5``. Every other entry is written with ``joblib.dump``
    as ``<name>.pkl``.

    Args:
        path: Target directory, with or without a trailing separator.
        artifacts: Mapping of artifact name to the object to persist.
    """
    # Local import keeps this definition self-contained.
    import os

    print("--- Step 5: Saving All Artifacts ---")
    for name, artifact in artifacts.items():
        # os.path.join instead of string concatenation, so a directory given
        # without a trailing slash does not produce "<dir><name>.pkl".
        if name == 'autoencoder':
            artifact.save(os.path.join(path, f'{name}.h5'))
        else:
            joblib.dump(artifact, os.path.join(path, f'{name}.pkl'))
    print("✅ All artifacts saved successfully.")

def main():
    """Run the training pipeline end to end.

    Loads and splits the data, fits the preprocessors, trains the autoencoder
    on the benign projection and the RandomForest on the attack projection,
    then persists every fitted object under ``artifacts_path``.
    """
    # Step 1: load the CSVs and build the train / hold-out frames.
    splits = load_and_prepare_data(original_data_path, synthetic_data_path, TEST_SIZE, RANDOM_STATE)

    # Step 2: fit the preprocessors and project the training data.
    prepared = preprocess_data(splits['train'], splits['test'], PCA_COMPONENTS)

    # Step 3: anomaly detector, trained on the benign projection.
    autoencoder, threshold = train_autoencoder(prepared['benign_train_pca'])

    # Step 4: attack classifier, trained on the non-benign projection.
    rf_classifier, label_encoder = train_classifier(
        prepared['attack_train_pca'],
        prepared['attack_train_labels'],
    )

    # Step 5: persist everything that was fitted.
    fitted_scaler = prepared['scaler']
    bundle = dict(
        scaler=fitted_scaler,
        pca_unified=prepared['pca'],
        autoencoder=autoencoder,
        threshold=threshold,
        rf_classifier=rf_classifier,
        label_encoder=label_encoder,
        # Column names seen by the scaler, kept for later inference.
        training_columns=fitted_scaler.feature_names_in_,
    )
    save_artifacts(artifacts_path, bundle)
    print("\nTraining pipeline complete. You can now evaluate performance on 'hold_out_test_set.pkl'.")

# Run the full training pipeline when this cell (or script) is executed as the main module.
if __name__ == '__main__':
    main()

--- Step 1: Loading and Preparing Data ---
Hold-out test set created and saved. Shape: (565569, 78)
