In [1]:
import os
import pandas as pd
import numpy as np
import logging
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dense, Dropout, Flatten, BatchNormalization, GlobalAveragePooling1D
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import matplotlib.pyplot as plt
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

2025-11-07 23:27:05.916307: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-11-07 23:27:06.185962: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-11-07 23:27:07.524443: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.


In [2]:
# Configure TensorFlow for CPU optimization
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
tf.config.threading.set_intra_op_parallelism_threads(4)
tf.config.threading.set_inter_op_parallelism_threads(4)
tf.config.set_soft_device_placement(True)

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration
CONFIG = {
    'dataset_path': "../data/raw/CSE-CIC-IDS2018",
    'sample_size': 150000,  # Samples per file, set to None for all data
    'test_size': 0.2,
    'random_state': 42,
    'min_samples': 100,
    'batch_size': 256,  # Larger batch for CNN
    'epochs': 30,
    'model_path': 'best_1dcnn_model.keras'
}

In [3]:
def load_and_sample_data(dataset_path, sample_size=None):
    """
    Load CSV files with optional sampling.
    
    Args:
        dataset_path (str): Path to dataset directory
        sample_size (int): Number of samples per file (None = all data)
    
    Returns:
        pd.DataFrame: Combined dataframe
    """
    all_files = [
        os.path.join(dataset_path, f) 
        for f in os.listdir(dataset_path) 
        if f.endswith('.csv')
    ]
    
    if not all_files:
        raise ValueError(f"No CSV files found in {dataset_path}")
    
    processed_frames = []
    logger.info(f"Starting to process {len(all_files)} files...")
    
    for file in all_files:
        try:
            df = pd.read_csv(file, low_memory=False)
            
            # Sample if specified
            if sample_size and len(df) > sample_size:
                df = df.sample(n=sample_size, random_state=CONFIG['random_state'])
            
            processed_frames.append(df)
            logger.info(f"Processed: {os.path.basename(file)} ({len(df)} rows)")
            
        except Exception as e:
            logger.error(f"Error reading {file}: {e}")
            continue
    
    if not processed_frames:
        raise ValueError("No files were successfully processed")
    
    logger.info("Concatenating dataframes...")
    combined_df = pd.concat(processed_frames, ignore_index=True)
    logger.info(f"Combined dataset shape: {combined_df.shape}")
    
    return combined_df

In [4]:
def preprocess_data(df):
    """
    Clean and prepare data for modeling.
    
    Args:
        df (pd.DataFrame): Raw dataframe
    
    Returns:
        tuple: (X, y, label_counts, label_encoder)
    """
    # Drop unnecessary columns
    columns_to_drop = [
        'Flow ID', 'Source IP', 'Source Port', 
        'Destination IP', 'Destination Port', 'Timestamp'
    ]
    df = df.drop(columns=columns_to_drop, errors='ignore')
    
    # Handle missing values
    numeric_columns = df.select_dtypes(include=['number']).columns
    df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].median())
    
    # Replace infinite values
    df = df.replace([np.inf, -np.inf], np.nan)
    df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].median())
    
    # Encode categorical columns
    label_encoder = LabelEncoder()
    categorical_cols = df.select_dtypes(include=['object']).columns
    
    for col in categorical_cols:
        if col == 'Label':
            df[col] = label_encoder.fit_transform(df[col])
        else:
            temp_encoder = LabelEncoder()
            df[col] = temp_encoder.fit_transform(df[col])
    
    # Check label distribution
    logger.info("Original Label Distribution:")
    label_counts = df['Label'].value_counts()
    logger.info(f"\n{label_counts}")
    
    # Separate features and labels
    X = df.drop('Label', axis=1).values
    y = df['Label'].values
    
    # Normalize features
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)
    
    return X, y, label_counts, label_encoder

In [5]:
def prepare_data_for_cnn(X, y, label_counts, min_samples=100):
    """
    Prepare data for 1D-CNN model training.
    
    Args:
        X (np.array): Feature matrix
        y (np.array): Labels
        label_counts (pd.Series): Label distribution
        min_samples (int): Minimum samples per class
    
    Returns:
        tuple: (X_train, X_test, y_train, y_test, num_classes)
    """
    # Filter classes with sufficient samples
    valid_classes = label_counts[label_counts >= min_samples].index.tolist()
    mask = np.isin(y, valid_classes)
    X_filtered = X[mask]
    y_filtered = y[mask]
    
    logger.info(f"\nFiltered to {len(valid_classes)} classes")
    unique, counts = np.unique(y_filtered, return_counts=True)
    logger.info(f"Class distribution: {dict(zip(unique, counts))}")
    
    # Balance data with SMOTE
    logger.info("Applying SMOTE for class balancing...")
    smote = SMOTE(random_state=CONFIG['random_state'])
    X_resampled, y_resampled = smote.fit_resample(X_filtered, y_filtered)
    
    # Convert labels to categorical
    y_resampled_cat = to_categorical(y_resampled)
    num_classes = y_resampled_cat.shape[1]
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X_resampled, y_resampled_cat, 
        test_size=CONFIG['test_size'], 
        random_state=CONFIG['random_state'],
        stratify=y_resampled
    )
    
    # Reshape for 1D-CNN: (samples, timesteps, features)
    # For network traffic, we treat each feature as a time step
    X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
    X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)
    
    logger.info(f"Training set shape: {X_train.shape}")
    logger.info(f"Test set shape: {X_test.shape}")
    logger.info(f"Number of classes: {num_classes}")
    
    return X_train, X_test, y_train, y_test, num_classes

In [6]:
def create_1dcnn_model(input_shape, num_classes):
    """
    Create optimized 1D-CNN model for network intrusion detection.
    
    1D-CNN is ideal for:
    - Sequential pattern detection in network traffic
    - Fast inference (5-10x faster than LSTM/GRU)
    - Better feature extraction from flow-based data
    
    Args:
        input_shape (tuple): Shape of input data
        num_classes (int): Number of output classes
    
    Returns:
        Sequential: Compiled Keras model
    """
    model = Sequential([
        # First Conv Block - Extract low-level patterns
        Conv1D(filters=128, kernel_size=3, activation='relu', 
               padding='same', input_shape=input_shape),
        BatchNormalization(),
        Conv1D(filters=128, kernel_size=3, activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),
        Dropout(0.3),
        
        # Second Conv Block - Extract mid-level patterns
        Conv1D(filters=256, kernel_size=3, activation='relu', padding='same'),
        BatchNormalization(),
        Conv1D(filters=256, kernel_size=3, activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),
        Dropout(0.3),
        
        # Third Conv Block - Extract high-level patterns
        Conv1D(filters=512, kernel_size=3, activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),
        Dropout(0.4),
        
        # Global pooling instead of Flatten (reduces parameters)
        GlobalAveragePooling1D(),
        
        # Dense layers for classification
        Dense(256, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.4),
        
        # Output layer
        Dense(num_classes, activation='softmax')
    ])
    
    # Compile with Adam optimizer
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='categorical_crossentropy',
        metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
    )
    
    logger.info("\n" + "="*60)
    logger.info("1D-CNN MODEL ARCHITECTURE")
    logger.info("="*60)
    model.summary(print_fn=logger.info)
    logger.info("="*60 + "\n")
    
    return model

In [7]:
def train_model(model, X_train, X_test, y_train, y_test):
    """
    Train the 1D-CNN model with optimized callbacks.
    
    Args:
        model: Keras model
        X_train, X_test: Training and test features
        y_train, y_test: Training and test labels
    
    Returns:
        History: Training history
    """
    # Setup callbacks
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=7,
        restore_best_weights=True,
        verbose=1
    )
    
    model_checkpoint = ModelCheckpoint(
        CONFIG['model_path'],
        monitor='val_accuracy',
        save_best_only=True,
        verbose=1
    )
    
    reduce_lr = ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=3,
        min_lr=1e-7,
        verbose=1
    )
    
    logger.info("\n" + "="*60)
    logger.info("STARTING TRAINING")
    logger.info("="*60)
    
    history = model.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        epochs=CONFIG['epochs'],
        batch_size=CONFIG['batch_size'],
        callbacks=[early_stopping, model_checkpoint, reduce_lr],
        verbose=1
    )
    
    return history


In [8]:
def evaluate_model(model, X_test, y_test, label_encoder):
    """
    Comprehensive model evaluation with metrics and confusion matrix.
    
    Args:
        model: Trained model
        X_test, y_test: Test data
        label_encoder: Label encoder for class names
    """
    logger.info("\n" + "="*60)
    logger.info("MODEL EVALUATION")
    logger.info("="*60)
    
    # Evaluate
    results = model.evaluate(X_test, y_test, verbose=0)
    logger.info(f"Test Loss: {results[0]:.4f}")
    logger.info(f"Test Accuracy: {results[1]*100:.2f}%")
    logger.info(f"Test Precision: {results[2]*100:.2f}%")
    logger.info(f"Test Recall: {results[3]*100:.2f}%")
    
    # Predictions
    y_pred = model.predict(X_test, verbose=0)
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_test_classes = np.argmax(y_test, axis=1)
    
    # Classification Report
    logger.info("\n" + "="*60)
    logger.info("CLASSIFICATION REPORT")
    logger.info("="*60)
    try:
        class_names = label_encoder.inverse_transform(np.unique(y_test_classes))
        report = classification_report(y_test_classes, y_pred_classes, 
                                       target_names=class_names)
        logger.info(f"\n{report}")
    except:
        report = classification_report(y_test_classes, y_pred_classes)
        logger.info(f"\n{report}")
    
    # Confusion Matrix
    cm = confusion_matrix(y_test_classes, y_pred_classes)
    
    plt.figure(figsize=(12, 10))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=True)
    plt.title('Confusion Matrix - 1D-CNN Model', fontsize=16, fontweight='bold')
    plt.ylabel('True Label', fontsize=12)
    plt.xlabel('Predicted Label', fontsize=12)
    plt.tight_layout()
    plt.savefig('confusion_matrix_1dcnn.png', dpi=300, bbox_inches='tight')
    logger.info("Confusion matrix saved as 'confusion_matrix_1dcnn.png'")
    
    return results

In [9]:
def plot_training_history(history):
    """
    Create comprehensive training history plots.
    
    Args:
        history: Training history object
    """
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    
    # Accuracy plot
    axes[0, 0].plot(history.history['accuracy'], label='Training Accuracy', linewidth=2)
    axes[0, 0].plot(history.history['val_accuracy'], label='Validation Accuracy', linewidth=2)
    axes[0, 0].set_title('Model Accuracy', fontsize=14, fontweight='bold')
    axes[0, 0].set_xlabel('Epoch', fontsize=12)
    axes[0, 0].set_ylabel('Accuracy', fontsize=12)
    axes[0, 0].legend(loc='lower right')
    axes[0, 0].grid(True, alpha=0.3)
    
    # Loss plot
    axes[0, 1].plot(history.history['loss'], label='Training Loss', linewidth=2)
    axes[0, 1].plot(history.history['val_loss'], label='Validation Loss', linewidth=2)
    axes[0, 1].set_title('Model Loss', fontsize=14, fontweight='bold')
    axes[0, 1].set_xlabel('Epoch', fontsize=12)
    axes[0, 1].set_ylabel('Loss', fontsize=12)
    axes[0, 1].legend(loc='upper right')
    axes[0, 1].grid(True, alpha=0.3)
    
    # Precision plot
    axes[1, 0].plot(history.history['precision'], label='Training Precision', linewidth=2)
    axes[1, 0].plot(history.history['val_precision'], label='Validation Precision', linewidth=2)
    axes[1, 0].set_title('Model Precision', fontsize=14, fontweight='bold')
    axes[1, 0].set_xlabel('Epoch', fontsize=12)
    axes[1, 0].set_ylabel('Precision', fontsize=12)
    axes[1, 0].legend(loc='lower right')
    axes[1, 0].grid(True, alpha=0.3)
    
    # Recall plot
    axes[1, 1].plot(history.history['recall'], label='Training Recall', linewidth=2)
    axes[1, 1].plot(history.history['val_recall'], label='Validation Recall', linewidth=2)
    axes[1, 1].set_title('Model Recall', fontsize=14, fontweight='bold')
    axes[1, 1].set_xlabel('Epoch', fontsize=12)
    axes[1, 1].set_ylabel('Recall', fontsize=12)
    axes[1, 1].legend(loc='lower right')
    axes[1, 1].grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.savefig('training_history_1dcnn.png', dpi=300, bbox_inches='tight')
    logger.info("Training plots saved as 'training_history_1dcnn.png'")
    plt.show()

In [10]:
def main():
    """Main execution function."""
    try:
        logger.info("\n" + "="*60)
        logger.info("1D-CNN NETWORK INTRUSION DETECTION SYSTEM")
        logger.info("="*60 + "\n")
        
        # Step 1: Load data
        combined_df = load_and_sample_data(
            CONFIG['dataset_path'],
            CONFIG['sample_size']
        )
        
        # Step 2: Preprocess
        X, y, label_counts, label_encoder = preprocess_data(combined_df)
        
        # Step 3: Prepare for CNN
        X_train, X_test, y_train, y_test, num_classes = prepare_data_for_cnn(
            X, y, label_counts,
            min_samples=CONFIG['min_samples']
        )
        
        # Step 4: Create 1D-CNN model
        model = create_1dcnn_model(
            (X_train.shape[1], X_train.shape[2]),
            num_classes
        )
        
        # Step 5: Train
        history = train_model(model, X_train, X_test, y_train, y_test)
        
        # Step 6: Evaluate
        evaluate_model(model, X_test, y_test, label_encoder)
        
        # Step 7: Plot training history
        plot_training_history(history)
        
        logger.info("\n" + "="*60)
        logger.info("✅ TRAINING COMPLETED SUCCESSFULLY!")
        logger.info("="*60)
        logger.info(f"Model saved at: {CONFIG['model_path']}")
        logger.info("="*60 + "\n")
        
    except Exception as e:
        logger.error(f"An error occurred: {e}", exc_info=True)
        raise

In [None]:
if __name__ == "__main__":
    main()

2025-11-07 23:27:08,294 - INFO - 
2025-11-07 23:27:08,295 - INFO - 1D-CNN NETWORK INTRUSION DETECTION SYSTEM

2025-11-07 23:27:08,296 - INFO - Starting to process 14 files...
2025-11-07 23:27:08,557 - INFO - Processed: DoS attacks-Slowloris.csv (36754 rows)
2025-11-07 23:27:08,568 - INFO - Processed: Brute Force -Web.csv (2073 rows)
2025-11-07 23:27:24,379 - INFO - Processed: DDoS attacks-LOIC-HTTP.csv (150000 rows)
2025-11-07 23:27:24,403 - INFO - Processed: DDOS attack-LOIC-UDP.csv (5784 rows)
2025-11-07 23:27:27,620 - INFO - Processed: DoS attacks-SlowHTTPTest.csv (150000 rows)
2025-11-07 23:27:31,575 - INFO - Processed: Infilteration.csv (150000 rows)
2025-11-07 23:27:31,587 - INFO - Processed: Brute Force -XSS.csv (734 rows)
2025-11-07 23:27:32,614 - INFO - Processed: DoS attacks-GoldenEye.csv (139922 rows)
2025-11-07 23:27:37,057 - INFO - Processed: FTP-BruteForce.csv (150000 rows)
2025-11-07 23:27:41,609 - INFO - Processed: SSH-Bruteforce.csv (150000 rows)
2025-11-07 23:27:41,61

2025-11-07 23:28:45,838 - INFO - Model: "sequential"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ conv1d (Conv1D)                 │ (None, 78, 128)        │           512 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ batch_normalization             │ (None, 78, 128)        │           512 │
│ (BatchNormalization)            │                        │               │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ conv1d_1 (Conv1D)               │ (None, 78, 128)        │        49,280 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ batch_normalization_1           │ (None, 78, 128)        │           512 │
│ (BatchNormalization)            │                        │               │
├──────────────────────

Epoch 1/30
[1m43007/43007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 193ms/step - accuracy: 0.8606 - loss: 0.2901 - precision: 0.8713 - recall: 0.8522
Epoch 1: val_accuracy improved from -inf to 0.86662, saving model to best_1dcnn_model.keras
[1m43007/43007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8632s[0m 201ms/step - accuracy: 0.8606 - loss: 0.2901 - precision: 0.8713 - recall: 0.8522 - val_accuracy: 0.8666 - val_loss: 0.3255 - val_precision: 0.8668 - val_recall: 0.8665 - learning_rate: 0.0010
Epoch 2/30
[1m43007/43007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 191ms/step - accuracy: 0.8963 - loss: 0.1884 - precision: 0.8970 - recall: 0.8955
Epoch 2: val_accuracy improved from 0.86662 to 0.90015, saving model to best_1dcnn_model.keras
[1m43007/43007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8548s[0m 199ms/step - accuracy: 0.8963 - loss: 0.1884 - precision: 0.8970 - recall: 0.8955 - val_accuracy: 0.9001 - val_loss: 0.1778 - val_precision: 0.9004 