In [None]:

# # Machine Learning Pipeline for Network Intrusion Detection
# 
# This notebook implements a comprehensive machine learning pipeline for network intrusion detection using the CSE-CIC-IDS2018 dataset. The pipeline includes data preprocessing, exploratory data analysis, feature engineering, and various machine learning models for classification.
# 
# ## Table of Contents
# 1. [Setup and Imports](#setup)
# 2. [Data Loading and Preprocessing](#preprocessing)
# 3. [Exploratory Data Analysis (EDA)](#eda)
# 4. [Feature Engineering](#feature_engineering)
# 5. [Machine Learning Models](#ml_models)
# 6. [Model Evaluation](#evaluation)
# 7. [Model Deployment](#deployment)

# <a id="setup"></a>
# ## 1. Setup and Imports

In [None]:

# Configuration
import warnings
# NOTE(review): this installs a blanket 'ignore' filter, so every warning
# category (deprecations, numerical warnings, ...) is hidden for the rest of
# the session. Consider narrowing it while debugging.
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
# Shared seed: applied to TensorFlow and NumPy later in this cell and read by
# the helper functions defined in the following cells.
RANDOM_STATE = 42

# Basic data manipulation libraries
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

# Data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
import plotly.express as px
import plotly.graph_objects as go

# Machine learning libraries
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.ensemble import (
    IsolationForest, RandomForestClassifier, GradientBoostingClassifier, 
    AdaBoostClassifier, VotingClassifier
)
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    confusion_matrix, classification_report, roc_curve, auc, 
    precision_recall_curve, average_precision_score
)
from sklearn.pipeline import Pipeline

# Advanced ML models
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

# Neural Networks (optional)
# TensorFlow is an optional dependency: a failed import is reported and
# recorded in TENSORFLOW_AVAILABLE instead of aborting the notebook.
try:
    import tensorflow as tf
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, Dropout
    from tensorflow.keras.callbacks import EarlyStopping
    from tensorflow.keras.optimizers import Adam
    
    # Set random seed for TensorFlow
    tf.random.set_seed(RANDOM_STATE)
    TENSORFLOW_AVAILABLE = True
except ImportError:
    print("TensorFlow not available. Neural network models will be skipped.")
    # When this branch runs, `tf`, `Sequential`, etc. are undefined; presumably
    # later cells check this flag before using them - verify against callers.
    TENSORFLOW_AVAILABLE = False

# Set plotting style
# Order matters here: the matplotlib style is applied first, then the seaborn
# theme, and the explicit rcParams overrides come last so they are the final
# word on figure size and font sizes.
# NOTE(review): the 'seaborn-v0_8-*' style names depend on the installed
# matplotlib version - confirm against the target environment.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['axes.titlesize'] = 16
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Set NumPy random seed
# Seeds NumPy's global generator, which any later np.random.* call draws from.
np.random.seed(RANDOM_STATE)

print("Setup complete. All necessary libraries imported.")

# <a id="preprocessing"></a>
# ## 2. Data Loading and Preprocessing

In [None]:

def load_data(file_path=None, sample_size=None, create_sample=False, n_samples=1000):
    """
    Load data from a CSV file, falling back to a synthetic sample dataset.

    The synthetic dataset is returned when ``create_sample`` is True, when no
    CSV file can be located, or when reading the file raises an exception.

    Args:
        file_path (str, optional): Path to the CSV file. When None, the first
            ``*.csv`` file (in sorted name order) inside ``<cwd>/data`` is used.
        sample_size (int, optional): Number of rows to read from the top of the
            CSV file. This is the leading ``sample_size`` rows, not a random
            sample. A falsy value (None or 0) loads the whole file.
        create_sample (bool): Whether to create a synthetic sample dataset
        n_samples (int): Number of samples to generate whenever the synthetic
            dataset is used

    Returns:
        pd.DataFrame: Loaded or created DataFrame
    """
    if create_sample:
        print("Creating sample dataset for demonstration...")
        return create_sample_dataset(n_samples)

    if file_path is None:
        # Try to find CSV files in the data directory
        data_dir = os.path.join(os.getcwd(), 'data')
        # isdir (rather than exists) so that a plain file named "data" is
        # treated like a missing directory instead of crashing os.listdir.
        if not os.path.isdir(data_dir):
            print("Data directory not found. Creating sample dataset...")
            return create_sample_dataset(n_samples)

        # os.listdir returns entries in arbitrary order; sorting makes the
        # chosen file deterministic across runs and machines.
        csv_files = sorted(f for f in os.listdir(data_dir) if f.endswith('.csv'))
        if not csv_files:
            print("No CSV files found in the data directory. Creating sample dataset...")
            return create_sample_dataset(n_samples)

        file_path = os.path.join(data_dir, csv_files[0])
        print(f"Found CSV file: {file_path}")

    print(f"Loading data from {file_path}...")
    try:
        # Check if the file exists
        if not os.path.exists(file_path):
            print(f"File {file_path} not found. Creating sample dataset...")
            return create_sample_dataset(n_samples)

        # Load the data
        if sample_size:
            # nrows reads only the first `sample_size` rows of the file
            df = pd.read_csv(file_path, nrows=sample_size)
            print(f"Loaded {len(df)} rows (sample) from {file_path}")
        else:
            # Load the entire dataset
            df = pd.read_csv(file_path)
            print(f"Loaded {len(df)} rows from {file_path}")

        return df
    except Exception as e:
        # Deliberate best-effort: any read/parse failure is reported and the
        # notebook continues on synthetic data.
        print(f"Error loading data: {e}")
        print("Creating sample dataset instead...")
        return create_sample_dataset(n_samples)

def create_sample_dataset(n_samples=1000):
    """
    Create a synthetic, reproducible sample dataset for demonstration purposes.

    The frame has 21 feature columns plus a string 'Label' column.
    'Fwd Header Length.1' is an exact copy of 'Fwd Header Length' so that the
    duplicate-column removal step has something to find.

    Args:
        n_samples (int): Number of samples to generate

    Returns:
        pd.DataFrame: Sample DataFrame
    """
    # A private generator seeded with RANDOM_STATE keeps the output
    # reproducible without resetting NumPy's global random state as a side
    # effect of calling this function.
    rng = np.random.RandomState(RANDOM_STATE)

    # Create sample features
    sample_data = {
        'Dst Port': rng.randint(1, 65536, n_samples),
        # 6=TCP, 17=UDP; 0 stands for "other" here.
        # NOTE(review): IANA assigns ICMP protocol number 1, not 0 - confirm
        # what 0 is meant to represent in the real dataset.
        'Protocol': rng.choice([0, 6, 17], n_samples),
        'Flow Duration': rng.randint(1, 100000, n_samples),
        'Tot Fwd Pkts': rng.randint(1, 100, n_samples),
        'Tot Bwd Pkts': rng.randint(1, 100, n_samples),
        'TotLen Fwd Pkts': rng.randint(1, 10000, n_samples),
        'TotLen Bwd Pkts': rng.randint(1, 10000, n_samples),
        'Fwd Pkt Len Max': rng.randint(1, 1500, n_samples),
        'Fwd Pkt Len Min': rng.randint(0, 100, n_samples),
        'Fwd Pkt Len Mean': rng.uniform(10, 500, n_samples),
        'Bwd Pkt Len Max': rng.randint(1, 1500, n_samples),
        'Bwd Pkt Len Min': rng.randint(0, 100, n_samples),
        'Bwd Pkt Len Mean': rng.uniform(10, 500, n_samples),
        'Flow Byts/s': rng.uniform(0, 10000, n_samples),
        'Flow Pkts/s': rng.uniform(0, 1000, n_samples),
        'Flow IAT Mean': rng.uniform(0, 1000, n_samples),
        'Flow IAT Std': rng.uniform(0, 500, n_samples),
        'Flow IAT Max': rng.uniform(0, 2000, n_samples),
        'Flow IAT Min': rng.uniform(0, 100, n_samples),
    }

    # Draw the header lengths once and store them under both names so the
    # '.1' column really is a duplicate (column order: '.1' first, then base).
    fwd_header_length = rng.randint(20, 100, n_samples)
    sample_data['Fwd Header Length.1'] = fwd_header_length.copy()  # Duplicate column
    sample_data['Fwd Header Length'] = fwd_header_length

    # Imbalanced class mix: 70% benign traffic, 30% spread over four attacks
    sample_data['Label'] = rng.choice(
        ['Benign', 'DoS Hulk', 'PortScan', 'Brute Force-Web', 'Web Attack'],
        n_samples,
        p=[0.7, 0.1, 0.1, 0.05, 0.05]
    )

    # Create DataFrame
    df = pd.DataFrame(sample_data)
    print(f"Created sample dataset: {df.shape[0]} rows, {df.shape[1]} columns")

    return df

def preprocess_data(df, normalize_names=True, remove_duplicates=True,
                   encode_categorical=True, handle_missing=True,
                   handle_negative=True, remove_outliers=False,
                   scale_features=True):
    """
    Run the cleaning/transformation pipeline over a copy of the dataset.

    Each stage is optional and the enabled stages always run in this order:
    column-name normalization, duplicate-column removal, categorical encoding,
    inf/NaN handling, negative-value handling, outlier removal, scaling.

    Args:
        df (pd.DataFrame): Input DataFrame (left unmodified)
        normalize_names (bool): Whether to normalize column names
        remove_duplicates (bool): Whether to remove duplicate columns
        encode_categorical (bool): Whether to encode categorical features
        handle_missing (bool): Whether to replace infinities with NaN and impute
        handle_negative (bool): Whether to handle negative values
        remove_outliers (bool): Whether to remove outliers
        scale_features (bool): Whether to scale numerical features

    Returns:
        pd.DataFrame: Preprocessed DataFrame
        dict: Fitted preprocessing artifacts; holds 'label_encoders' when
            encoding ran and 'scaler' when scaling ran
    """
    print("Starting data preprocessing...")

    # All stages operate on a private copy so the caller's frame is untouched.
    working = df.copy()
    fitted = {}

    # Stage 1: column names
    if normalize_names:
        working = normalize_column_names(working)

    # Stage 2: duplicate columns
    if remove_duplicates:
        working = remove_duplicate_columns(working)

    # Stage 3: categorical encoding (keeps the fitted encoders)
    if encode_categorical:
        working, encoders = encode_categorical_features(working)
        fitted['label_encoders'] = encoders

    # Stage 4: infinities become NaN, then every NaN is imputed
    if handle_missing:
        working = impute_missing_values(replace_inf_with_nan(working))

    # Stage 5: negative values in columns that should be non-negative
    if handle_negative:
        working = replace_negative_values(working)

    # Stage 6: outlier rows
    if remove_outliers:
        working = remove_outliers_isolation_forest(working)

    # Stage 7: feature scaling (keeps the fitted scaler)
    if scale_features:
        working, fitted_scaler = scale_numerical_features(working)
        fitted['scaler'] = fitted_scaler

    print(f"Data preprocessing complete. Final shape: {working.shape}")
    return working, fitted

def normalize_column_names(df):
    """
    Normalize column names by removing leading/trailing whitespace,
    replacing spaces with underscores, and converting to lowercase.

    The input DataFrame is left untouched; a renamed copy is returned,
    matching the copy-on-entry convention of the other preprocessing helpers.
    Non-string column labels are converted with ``str()`` first.

    Args:
        df (pd.DataFrame): Input DataFrame

    Returns:
        pd.DataFrame: Copy of the DataFrame with normalized column names
    """
    print("Normalizing column names...")

    # Store original column names for reference
    original_columns = df.columns.tolist()

    # Plain-Python normalization works for any label type, whereas the
    # Index.str accessor fails (or yields NaN) on non-string labels.
    normalized_columns = [
        str(col).strip().replace(' ', '_').lower() for col in original_columns
    ]

    # Rename on a copy so the caller's DataFrame keeps its original names
    df_normalized = df.copy()
    df_normalized.columns = normalized_columns

    # Report how many names actually changed
    changed_columns = sum(
        1 for orig, norm in zip(original_columns, normalized_columns) if orig != norm
    )
    print(f"Normalized {changed_columns} column names")

    # Distinct originals (e.g. 'Label' and ' label') can collapse to one name
    if len(set(normalized_columns)) != len(normalized_columns):
        print("Warning: normalization produced duplicate column names")

    return df_normalized

def remove_duplicate_columns(df):
    """
    Drop columns whose contents exactly repeat another column.

    Two passes are made. The first looks at names ending in '.1' ... '.5' and
    drops them when they equal the column named by the part before the suffix.
    The second compares every remaining pair and drops the later column of any
    identical pair.

    Args:
        df (pd.DataFrame): Input DataFrame

    Returns:
        pd.DataFrame: DataFrame with duplicate columns removed (the input
            object itself when nothing was dropped)
    """
    print("Checking for duplicate columns...")

    names = df.columns.tolist()
    to_drop = []
    numbered_suffixes = ('.1', '.2', '.3', '.4', '.5')

    # Pass 1: suffixed copies such as 'x.1' of 'x'
    for name in names:
        if not name.endswith(numbered_suffixes):
            continue
        stem = name.rsplit('.', 1)[0]
        if stem in names and df[name].equals(df[stem]):
            to_drop.append(name)
            print(f"Found duplicate column: {name} (duplicate of {stem})")

    # Pass 2: identical content under unrelated names; skip anything already
    # scheduled for removal so each duplicate is reported once.
    for left_pos in range(len(names)):
        left = names[left_pos]
        for right in names[left_pos + 1:]:
            if left in to_drop or right in to_drop:
                continue
            if df[left].equals(df[right]):
                to_drop.append(right)
                print(f"Found duplicate column: {right} (duplicate of {left})")

    if not to_drop:
        print("No duplicate columns found")
        return df

    df = df.drop(columns=to_drop)
    print(f"Removed {len(to_drop)} duplicate columns")
    return df

def encode_categorical_features(df):
    """
    Encode categorical features using Label Encoding.

    Every object/category column is replaced by integer codes from its own
    LabelEncoder. The target column 'label' is encoded the same way but
    reported separately. Values are cast to ``str`` before encoding, so a
    missing value becomes its string form and is encoded as a class of its own.

    Args:
        df (pd.DataFrame): Input DataFrame

    Returns:
        pd.DataFrame: DataFrame with encoded categorical features
        dict: Dictionary mapping column names to their respective label encoders
    """
    print("Encoding categorical features...")

    # Create a copy of the DataFrame to avoid modifying the original
    df_encoded = df.copy()

    # Initialize dictionary to store label encoders
    label_encoders = {}

    # Identify categorical columns
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()

    # Decide whether the target needs encoding from the select_dtypes result.
    # Comparing a NumPy dtype against the string 'category' is fragile because
    # NumPy does not know that dtype name.
    target_is_categorical = 'label' in categorical_columns
    if target_is_categorical:
        categorical_columns.remove('label')

    print(f"Found {len(categorical_columns)} categorical columns")

    # Encode each categorical feature column
    for col in categorical_columns:
        le = LabelEncoder()
        df_encoded[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le

    # Handle the target variable separately
    if target_is_categorical:
        le = LabelEncoder()
        df_encoded['label'] = le.fit_transform(df['label'].astype(str))
        label_encoders['label'] = le
        print(f"Encoded target variable 'label' with {len(le.classes_)} classes")

    return df_encoded, label_encoders

def replace_inf_with_nan(df):
    """
    Return a copy of the DataFrame in which +inf and -inf are turned into NaN.

    Args:
        df (pd.DataFrame): Input DataFrame (not modified)

    Returns:
        pd.DataFrame: DataFrame with infinite values replaced with NaN
    """
    print("Replacing infinite values with NaN...")

    # replace() hands back a new frame, so the caller's data stays as it was
    cleaned = df.replace([np.inf, -np.inf], np.nan)

    # The report counts every NaN present after the replacement
    missing_per_column = cleaned.isna().sum()
    affected = missing_per_column[missing_per_column > 0]

    if len(affected) == 0:
        print("No infinite values found")
    else:
        print(f"Found {affected.sum()} NaN values across {len(affected)} columns")

    return cleaned

def impute_missing_values(df):
    """
    Impute missing values (NaN) with the column median for numerical features
    and the most frequent value for categorical features.

    All numeric dtypes are considered numerical (not only int64/float64).
    A column with no observed values at all cannot be imputed; it is left
    as-is and reported in the final warning.

    Args:
        df (pd.DataFrame): Input DataFrame

    Returns:
        pd.DataFrame: DataFrame with missing values imputed
    """
    print("Imputing missing values...")

    # Create a copy of the DataFrame to avoid modifying the original
    df_imputed = df.copy()

    # np.number covers every integer/float width, e.g. float32 columns
    numerical_columns = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()

    # Impute missing values in numerical columns with median
    for col in numerical_columns:
        if df_imputed[col].isna().any():
            median_value = df_imputed[col].median()
            if pd.isna(median_value):
                # Every value is missing, so there is no median to fill with
                print(f"Warning: '{col}' has no observed values; median imputation skipped")
                continue
            df_imputed[col] = df_imputed[col].fillna(median_value)
            print(f"Imputed missing values in '{col}' with median: {median_value:.4f}")

    # Impute missing values in categorical columns with most frequent value
    for col in categorical_columns:
        if df_imputed[col].isna().any():
            modes = df_imputed[col].mode()
            if modes.empty:
                # mode() is empty when every value is missing
                print(f"Warning: '{col}' has no observed values; mode imputation skipped")
                continue
            most_frequent = modes.iloc[0]
            df_imputed[col] = df_imputed[col].fillna(most_frequent)
            print(f"Imputed missing values in '{col}' with most frequent value: {most_frequent}")

    # Verify that there are no more missing values
    nan_counts = df_imputed.isna().sum()
    if nan_counts.sum() > 0:
        print("Warning: There are still missing values in the DataFrame")
        print(nan_counts[nan_counts > 0])
    else:
        print("All missing values have been imputed")

    return df_imputed

def replace_negative_values(df, keywords=None):
    """
    Replace nonsensical negative values in specific columns with the median.

    A column is treated as "should be non-negative" when its lowercased name
    contains one of ``keywords``. In each such numeric column, negative
    entries are replaced by the median of that column's non-negative entries.
    A column with no non-negative entries has no usable median and is left
    unchanged.

    Args:
        df (pd.DataFrame): Input DataFrame
        keywords (list of str, optional): Name fragments that mark a column as
            non-negative. Defaults to duration/length/packets/bytes/count/
            min/max/mean.

    Returns:
        pd.DataFrame: DataFrame with nonsensical negative values replaced
    """
    print("Checking for negative values in columns that should be non-negative...")

    if keywords is None:
        # These typically name counts, durations, lengths, etc.
        keywords = ['duration', 'length', 'packets', 'bytes', 'count', 'min', 'max', 'mean']

    # Create a copy of the DataFrame to avoid modifying the original
    df_no_neg = df.copy()

    # List of columns that should not have negative values
    non_negative_columns = [
        col for col in df.columns
        if any(keyword in str(col).lower() for keyword in keywords)
    ]

    # Replace negative values with the median in each column
    replaced_count = 0
    for col in non_negative_columns:
        series = df_no_neg[col]

        # Any numeric width qualifies; booleans and non-numeric columns do not
        if not pd.api.types.is_numeric_dtype(series) or pd.api.types.is_bool_dtype(series):
            continue

        negative_mask = series < 0
        if not negative_mask.any():
            continue

        neg_count = int(negative_mask.sum())
        median_value = series[~negative_mask].median()
        if pd.isna(median_value):
            # Nothing non-negative to take a median from; writing NaN here
            # would reintroduce missing values after imputation.
            print(f"Warning: '{col}' has no non-negative values; leaving it unchanged")
            continue

        # mask() builds a new column and upcasts when needed, so a fractional
        # median can be stored even if the column was integer-typed.
        df_no_neg[col] = series.mask(negative_mask, median_value)
        replaced_count += neg_count
        print(f"Replaced {neg_count} negative values in '{col}' with median: {median_value:.4f}")

    if replaced_count == 0:
        print("No negative values found in columns that should be non-negative")
    else:
        print(f"Replaced a total of {replaced_count} negative values")

    return df_no_neg

def remove_outliers_isolation_forest(df, contamination=0.05):
    """
    Drop the rows that an Isolation Forest flags as outliers.

    The forest is fitted on every column except 'label' (when present) and
    rows predicted as -1 are removed.

    Args:
        df (pd.DataFrame): Input DataFrame
        contamination (float): The proportion of outliers in the dataset

    Returns:
        pd.DataFrame: DataFrame with outliers removed
    """
    print(f"Detecting outliers using Isolation Forest (contamination={contamination})...")

    # The target must not influence which rows count as anomalous
    excluded = ['label'] if 'label' in df.columns else []
    features = df.drop(columns=excluded)

    # fit_predict labels each row: 1 = inlier, -1 = outlier
    detector = IsolationForest(contamination=contamination, random_state=RANDOM_STATE)
    row_flags = detector.fit_predict(features)

    outlier_count = (row_flags == -1).sum()
    print(f"Detected {outlier_count} outliers ({outlier_count/len(df)*100:.2f}% of the dataset)")

    # Keep inliers only, working from a copy so the input stays intact
    df_no_outliers = df.copy()[row_flags == 1]
    print(f"DataFrame shape after removing outliers: {df_no_outliers.shape}")

    return df_no_outliers

def scale_numerical_features(df):
    """
    Scale numerical features using StandardScaler.

    Every numeric column except the target 'label' is standardized, whatever
    its integer/float width. When the frame has no numeric feature columns
    nothing is scaled and the returned scaler is left unfitted.

    Args:
        df (pd.DataFrame): Input DataFrame

    Returns:
        pd.DataFrame: DataFrame with scaled numerical features
        StandardScaler: Scaler object (fitted unless there was nothing to scale)
    """
    print("Scaling numerical features...")

    # Create a copy of the DataFrame to avoid modifying the original
    df_scaled = df.copy()

    # np.number covers all numeric widths (int32, float32, ...), not just
    # int64/float64; the target is never scaled.
    numerical_columns = [
        col for col in df.select_dtypes(include=[np.number]).columns if col != 'label'
    ]

    scaler = StandardScaler()

    # Fitting on zero columns raises inside scikit-learn, so bail out early
    if not numerical_columns:
        print("No numerical features to scale")
        return df_scaled, scaler

    # Fit on the original values and write the standardized ones to the copy
    df_scaled[numerical_columns] = scaler.fit_transform(df[numerical_columns])

    print(f"Scaled {len(numerical_columns)} numerical features")

    return df_scaled, scaler

# <a id="eda"></a>
# ## 3. Exploratory Data Analysis (EDA)

In [None]:

def perform_eda(df, target_col='label'):
    """
    Perform exploratory data analysis on the dataset.

    Produces (and shows) a class-distribution bar chart, a correlation
    heatmap, histograms for up to six numeric features and 2-D/3-D PCA
    scatter plots, and collects the underlying numbers.

    Args:
        df (pd.DataFrame): Input DataFrame
        target_col (str): Name of the target column

    Returns:
        dict: EDA results. Always contains 'basic_stats'. Contains
            'class_distribution' when the target column exists,
            'correlation_matrix' and 'high_corr_features' when there are at
            least two numeric columns, and 'pca' when there are at least
            three numeric features and three rows.
    """
    print("Performing exploratory data analysis...")

    eda_results = {}
    has_target = target_col in df.columns

    # Basic statistics
    print("Calculating basic statistics...")
    eda_results['basic_stats'] = df.describe()

    # Class distribution
    if has_target:
        print("Analyzing class distribution...")
        class_counts = df[target_col].value_counts()
        class_percentages = class_counts / len(df) * 100

        eda_results['class_distribution'] = {
            'counts': class_counts,
            'percentages': class_percentages
        }

        # Plot with string tick labels in an explicit order so that bar i is
        # always the i-th entry of class_counts, even for integer-encoded
        # classes.
        class_names = [str(name) for name in class_counts.index]

        plt.figure(figsize=(12, 6))
        ax = sns.barplot(x=class_names, y=class_counts.values, order=class_names)
        plt.title('Class Distribution')
        plt.xlabel('Class')
        plt.ylabel('Count')
        plt.xticks(rotation=45, ha='right')

        # Add count labels on top of bars. Counts and percentages are paired
        # positionally; indexing the Series with [i] would be a label lookup
        # when the classes are integers.
        for i, (count, percentage) in enumerate(
                zip(class_counts.values, class_percentages.values)):
            ax.text(i, count + 0.1, f'{count} ({percentage:.1f}%)',
                    ha='center', va='bottom', fontsize=10)

        plt.tight_layout()
        plt.show()

    # Correlation analysis
    print("Calculating feature correlations...")
    numerical_df = df.select_dtypes(include=[np.number])
    if len(numerical_df.columns) > 1:  # Need at least 2 columns for correlation
        correlation_matrix = numerical_df.corr()
        eda_results['correlation_matrix'] = correlation_matrix

        # Hide the upper triangle (and diagonal) by position. A mask built
        # from the correlation values themselves would leave cells whose
        # correlation is exactly zero unmasked.
        mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))

        # Plot correlation heatmap
        plt.figure(figsize=(14, 12))
        sns.heatmap(correlation_matrix, annot=False, mask=mask, cmap='coolwarm',
                    linewidths=0.5, vmin=-1, vmax=1)
        plt.title('Feature Correlation Heatmap')
        plt.tight_layout()
        plt.show()

        # Find highly correlated features (each unordered pair once)
        high_corr_threshold = 0.8
        high_corr_features = []
        corr_columns = correlation_matrix.columns

        for i in range(len(corr_columns)):
            for j in range(i + 1, len(corr_columns)):
                pair_corr = correlation_matrix.iloc[i, j]
                if abs(pair_corr) > high_corr_threshold:
                    high_corr_features.append((corr_columns[i], corr_columns[j], pair_corr))

        eda_results['high_corr_features'] = high_corr_features

        if high_corr_features:
            print(f"Found {len(high_corr_features)} pairs of highly correlated features (|r| > {high_corr_threshold}):")
            for feat1, feat2, corr in high_corr_features[:10]:  # Show the first 10
                print(f"  {feat1} and {feat2}: r = {corr:.4f}")
            if len(high_corr_features) > 10:
                print(f"  ... and {len(high_corr_features) - 10} more")

    # Feature distributions
    print("Analyzing feature distributions...")
    numerical_columns = [col for col in numerical_df.columns if col != target_col]

    # Sample a subset of numerical features if there are too many.
    # Both branches yield a plain list so the checks below work either way.
    if len(numerical_columns) > 6:
        sampled_columns = list(np.random.choice(numerical_columns, 6, replace=False))
    else:
        sampled_columns = list(numerical_columns)

    if len(sampled_columns) > 0:
        # Plot distributions of sampled features on a fixed 2x3 grid
        fig, axes = plt.subplots(2, 3, figsize=(18, 10))
        axes = axes.flatten()

        for axis, col in zip(axes, sampled_columns):
            sns.histplot(df[col], kde=True, ax=axis)
            axis.set_title(f'Distribution of {col}')
            axis.set_xlabel(col)
            axis.set_ylabel('Frequency')

        # Hide grid cells that received no feature
        for axis in axes[len(sampled_columns):]:
            axis.set_visible(False)

        plt.tight_layout()
        plt.show()

    # PCA visualization: 3 components need at least 3 features and 3 rows
    if len(numerical_columns) >= 3 and len(df) >= 3:
        print("Performing PCA visualization...")
        # Select numerical features for PCA
        X = df[numerical_columns]

        # Standardize the features
        X_scaled = StandardScaler().fit_transform(X)

        # Apply PCA
        pca = PCA(n_components=3)
        principal_components = pca.fit_transform(X_scaled)

        # Create a DataFrame with the principal components
        pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2', 'PC3'])
        variance_ratio = pca.explained_variance_ratio_

        # Add the target variable
        if has_target:
            pca_df[target_col] = df[target_col].values
            # Integer code per class, used only to colour the points
            class_codes = pca_df[target_col].astype('category').cat.codes

            # Plot PCA results
            plt.figure(figsize=(10, 8))
            scatter = plt.scatter(pca_df['PC1'], pca_df['PC2'], c=class_codes,
                                 alpha=0.6, cmap='viridis')
            plt.title('PCA: First Two Principal Components')
            plt.xlabel(f'PC1 ({variance_ratio[0]:.2%} variance)')
            plt.ylabel(f'PC2 ({variance_ratio[1]:.2%} variance)')
            plt.colorbar(scatter, label=target_col)
            plt.grid(True, alpha=0.3)
            plt.tight_layout()
            plt.show()

            # 3D PCA plot
            fig = plt.figure(figsize=(10, 8))
            ax = fig.add_subplot(111, projection='3d')
            scatter = ax.scatter(pca_df['PC1'], pca_df['PC2'], pca_df['PC3'],
                               c=class_codes, alpha=0.6, cmap='viridis')
            ax.set_title('PCA: First Three Principal Components')
            ax.set_xlabel(f'PC1 ({variance_ratio[0]:.2%})')
            ax.set_ylabel(f'PC2 ({variance_ratio[1]:.2%})')
            ax.set_zlabel(f'PC3 ({variance_ratio[2]:.2%})')
            # Attach the colorbar to this figure/axes explicitly
            fig.colorbar(scatter, ax=ax, label=target_col)
            plt.tight_layout()
            plt.show()

        # Store PCA results
        eda_results['pca'] = {
            'pca_object': pca,
            'explained_variance_ratio': variance_ratio,
            'cumulative_variance': np.cumsum(variance_ratio)
        }

        print(f"PCA explained variance: {variance_ratio}")
        print(f"Cumulative explained variance: {np.cumsum(variance_ratio)}")

    print("Exploratory data analysis complete")
    return eda_results

# <a id="feature_engineering"></a>
# ## 4. Feature Engineering

In [None]:

def engineer_features(df, target_col='label', correlation_threshold=0.1, 
                     n_features=None, feature_selection_method='correlation'):
    """
    Select a subset of features and return it together with the target.

    Only numeric columns take part in the 'correlation', 'selectkbest' and
    'rfe' methods; any other method name keeps every column. The target
    column is always appended to the selection.

    Args:
        df (pd.DataFrame): Input DataFrame
        target_col (str): Name of the target column (must be numeric for the
            'correlation' method)
        correlation_threshold (float): Minimum absolute correlation with target for feature selection
        n_features (int, optional): Number of features to select (if None, use
            correlation_threshold for 'correlation', or up to 10 for
            'selectkbest'/'rfe'). Values larger than the number of numeric
            features are clamped for 'selectkbest'/'rfe'.
        feature_selection_method (str): Method for feature selection ('correlation', 'selectkbest', 'rfe')

    Returns:
        pd.DataFrame: DataFrame with selected features (the input itself when
            the target column is missing)
        list: List of selected feature names
    """
    print("Performing feature engineering...")

    # Without a target there is nothing to select against
    if target_col not in df.columns:
        print(f"Warning: Target column '{target_col}' not found in DataFrame")
        return df, df.columns.tolist()

    # Separate features and target
    X = df.drop(columns=[target_col])
    y = df[target_col]

    # Get numerical feature names (any integer/float width)
    numerical_features = X.select_dtypes(include=[np.number]).columns.tolist()

    # Feature selection based on correlation with target
    if feature_selection_method == 'correlation':
        print(f"Selecting features based on correlation with target (threshold={correlation_threshold})...")

        # Calculate correlation with target for each feature
        correlations = []
        # A constant feature has zero variance, which makes corrcoef divide by
        # zero and return NaN; silence that and score the feature as 0 so it
        # sorts last instead of corrupting the ordering.
        with np.errstate(divide='ignore', invalid='ignore'):
            for col in numerical_features:
                corr = np.corrcoef(X[col], y)[0, 1]
                correlations.append((col, 0.0 if np.isnan(corr) else abs(corr)))

        # Sort features by absolute correlation
        correlations.sort(key=lambda x: x[1], reverse=True)

        # Select features based on threshold or number
        if n_features is not None:
            selected_features = [col for col, _ in correlations[:n_features]]
            print(f"Selected top {len(selected_features)} features based on correlation with target")
        else:
            selected_features = [col for col, corr in correlations if corr >= correlation_threshold]
            print(f"Selected {len(selected_features)} features with |correlation| >= {correlation_threshold}")

        # Print top correlations
        print("Top feature correlations with target:")
        for col, corr in correlations[:10]:
            print(f"  {col}: |r| = {corr:.4f}")

    # Feature selection using SelectKBest
    elif feature_selection_method == 'selectkbest':
        # Never ask for more features than exist
        n_features = min(n_features or 10, len(numerical_features))
        print(f"Selecting top {n_features} features using SelectKBest (f_classif)...")

        # Initialize and fit SelectKBest
        selector = SelectKBest(f_classif, k=n_features)
        selector.fit(X[numerical_features], y)

        # Get selected feature indices and names
        selected_indices = selector.get_support(indices=True)
        selected_features = [numerical_features[i] for i in selected_indices]

        # Print selected features and their scores
        feature_scores = list(zip(numerical_features, selector.scores_))
        feature_scores.sort(key=lambda x: x[1], reverse=True)

        print("Top feature scores:")
        for col, score in feature_scores[:10]:
            print(f"  {col}: score = {score:.4f}")

    # Feature selection using RFE with Random Forest
    elif feature_selection_method == 'rfe':
        # Never ask for more features than exist
        n_features = min(n_features or 10, len(numerical_features))
        print(f"Selecting top {n_features} features using RFE with Random Forest...")

        # Initialize and fit RFE
        estimator = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
        selector = RFE(estimator, n_features_to_select=n_features, step=1)
        selector.fit(X[numerical_features], y)

        # Get selected feature names
        selected_features = [
            name for name, keep in zip(numerical_features, selector.support_) if keep
        ]

        print(f"Selected {len(selected_features)} features using RFE")

    else:
        # Unknown method name: keep everything
        selected_features = X.columns.tolist()
        print(f"Using all {len(selected_features)} features (no feature selection applied)")

    # Add target column back to selected features
    if target_col not in selected_features:
        selected_features.append(target_col)

    # Create DataFrame with selected features
    df_selected = df[selected_features].copy()

    print(f"Feature engineering complete. Selected {len(selected_features)} features")
    return df_selected, selected_features

# <a id="ml_models"></a>
# ## 5. Machine Learning Models

In [None]:

def split_data(df, target_col='label', test_size=0.2, val_size=0.25):
    """
    Produce stratified train / validation / test partitions of a DataFrame.

    The split is done in two passes: first ``test_size`` of all rows is held
    out as the test set, then ``val_size`` of the *remaining* rows is held out
    as the validation set. With the defaults (0.2 and 0.25) this yields a
    60% / 20% / 20% split of the original data.

    Args:
        df (pd.DataFrame): Input DataFrame containing features and target
        target_col (str): Name of the target column
        test_size (float): Fraction of all rows reserved for the test set
        val_size (float): Fraction of the non-test rows reserved for validation

    Returns:
        tuple: (X_train, X_val, X_test, y_train, y_val, y_test)

    Raises:
        ValueError: If ``target_col`` is not a column of ``df``
    """
    print("Splitting data into train, validation, and test sets...")

    # Fail fast when the target is missing
    if target_col not in df.columns:
        raise ValueError(f"Target column '{target_col}' not found in DataFrame")

    features = df.drop(columns=[target_col])
    target = df[target_col]

    # Pass 1: hold out the test set
    features_rest, X_test, target_rest, y_test = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=RANDOM_STATE,
        stratify=target,
    )

    # Pass 2: carve the validation set out of what is left
    X_train, X_val, y_train, y_val = train_test_split(
        features_rest,
        target_rest,
        test_size=val_size,
        random_state=RANDOM_STATE,
        stratify=target_rest,
    )

    # Report each partition's size as a share of the full dataset
    total_rows = len(df)
    n_train = X_train.shape[0]
    n_val = X_val.shape[0]
    n_test = X_test.shape[0]

    print("Data split complete:")
    print(f"  Training set: {n_train} samples ({n_train/total_rows:.1%})")
    print(f"  Validation set: {n_val} samples ({n_val/total_rows:.1%})")
    print(f"  Test set: {n_test} samples ({n_test/total_rows:.1%})")

    return X_train, X_val, X_test, y_train, y_val, y_test

def train_models(X_train, y_train, X_val, y_val, models_to_train=None):
    """
    Train multiple machine learning models on the dataset.

    Every classical model is tuned with a 5-fold stratified grid search on the
    training set (scored with weighted F1) and then scored once on the
    validation set. The neural network, when available, is trained by
    ``train_neural_network`` without hyperparameter search.

    A model whose training raises is reported and skipped, so the returned
    dictionary may contain fewer entries than were requested.

    Args:
        X_train (pd.DataFrame): Training features
        y_train (pd.Series): Training target
        X_val (pd.DataFrame): Validation features
        y_val (pd.Series): Validation target
        models_to_train (list, optional): Keys of the models to train
            (e.g. ``'random_forest'``). ``None`` trains every available model.
            Unknown keys are reported and ignored; the list passed in is never
            modified.

    Returns:
        dict: Maps each successfully trained model key to a dict with
            ``'name'``, ``'model'``, ``'metrics'`` (accuracy, precision,
            recall, f1 on the validation set) and either ``'best_params'`` /
            ``'cv_results'`` (grid-searched models) or ``'history'``
            (neural network).
    """
    print("Training machine learning models...")

    # Define available models
    # NOTE(review): assumes `xgb` and `lgb` are imported earlier in the
    # notebook; they are referenced eagerly here even when not requested.
    available_models = {
        'random_forest': {
            'name': 'Random Forest',
            'model': RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
            'params': {
                'n_estimators': [50, 100, 200],
                'max_depth': [None, 10, 20],
                'min_samples_split': [2, 5, 10]
            }
        },
        'gradient_boosting': {
            'name': 'Gradient Boosting',
            'model': GradientBoostingClassifier(random_state=RANDOM_STATE),
            'params': {
                'n_estimators': [50, 100, 200],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 5, 7]
            }
        },
        'xgboost': {
            'name': 'XGBoost',
            'model': xgb.XGBClassifier(random_state=RANDOM_STATE),
            'params': {
                'n_estimators': [50, 100, 200],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 5, 7]
            }
        },
        'lightgbm': {
            'name': 'LightGBM',
            'model': lgb.LGBMClassifier(random_state=RANDOM_STATE),
            'params': {
                'n_estimators': [50, 100, 200],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 5, 7]
            }
        },
        'logistic_regression': {
            'name': 'Logistic Regression',
            'model': LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
            'params': {
                'C': [0.1, 1.0, 10.0],
                'solver': ['liblinear', 'lbfgs']
            }
        },
        'svm': {
            'name': 'Support Vector Machine',
            'model': SVC(probability=True, random_state=RANDOM_STATE),
            'params': {
                'C': [0.1, 1.0, 10.0],
                'kernel': ['linear', 'rbf'],
                'gamma': ['scale', 'auto']
            }
        },
        'knn': {
            'name': 'K-Nearest Neighbors',
            'model': KNeighborsClassifier(),
            'params': {
                'n_neighbors': [3, 5, 7, 9],
                'weights': ['uniform', 'distance']
            }
        },
        'decision_tree': {
            'name': 'Decision Tree',
            'model': DecisionTreeClassifier(random_state=RANDOM_STATE),
            'params': {
                'max_depth': [None, 10, 20, 30],
                'min_samples_split': [2, 5, 10]
            }
        }
    }

    # Add neural network model if TensorFlow is available
    if TENSORFLOW_AVAILABLE:
        available_models['neural_network'] = {
            'name': 'Neural Network',
            'model': None,  # Will be created during training
            'params': {}  # No hyperparameter tuning for neural network
        }

    # Select models to train.
    # A new list is built instead of removing entries from `models_to_train`
    # while iterating over it: in-place removal skips the element that follows
    # each removed one (so a second unknown name could slip through and raise
    # KeyError below) and would also modify the caller's list.
    if models_to_train is None:
        # Train all available models
        selected_models = list(available_models.keys())
    else:
        selected_models = []
        for model_name in models_to_train:
            if model_name in available_models:
                selected_models.append(model_name)
            else:
                print(f"Warning: Model '{model_name}' not found. Available models: {list(available_models.keys())}")

    # Initialize dictionary to store trained models and their metrics
    trained_models = {}

    # Train each selected model
    for model_name in selected_models:
        print(f"\nTraining {available_models[model_name]['name']}...")

        # Special case for neural network
        if model_name == 'neural_network' and TENSORFLOW_AVAILABLE:
            # Create and train neural network
            model, history = train_neural_network(X_train, y_train, X_val, y_val)

            # The network outputs one score per class; take the arg-max as
            # the predicted class index
            y_val_pred = model.predict(X_val)
            y_val_pred_classes = np.argmax(y_val_pred, axis=1)

            # Calculate metrics
            accuracy = accuracy_score(y_val, y_val_pred_classes)
            precision = precision_score(y_val, y_val_pred_classes, average='weighted')
            recall = recall_score(y_val, y_val_pred_classes, average='weighted')
            f1 = f1_score(y_val, y_val_pred_classes, average='weighted')

            # Store model and metrics
            trained_models[model_name] = {
                'name': available_models[model_name]['name'],
                'model': model,
                'metrics': {
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1': f1
                },
                'history': history
            }

            print(f"  Validation Accuracy: {accuracy:.4f}")
            print(f"  Validation F1 Score: {f1:.4f}")

        else:
            # Get model and hyperparameters
            model_info = available_models[model_name]
            model = model_info['model']
            params = model_info['params']

            # Train model with cross-validation
            cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
            grid_search = GridSearchCV(
                model, params, cv=cv, scoring='f1_weighted', n_jobs=-1, verbose=0
            )

            # Best-effort: a failure in one model must not abort the others
            try:
                grid_search.fit(X_train, y_train)

                # Get best model
                best_model = grid_search.best_estimator_

                # Make predictions on validation set
                y_val_pred = best_model.predict(X_val)

                # Calculate metrics
                accuracy = accuracy_score(y_val, y_val_pred)
                precision = precision_score(y_val, y_val_pred, average='weighted')
                recall = recall_score(y_val, y_val_pred, average='weighted')
                f1 = f1_score(y_val, y_val_pred, average='weighted')

                # Store model and metrics
                trained_models[model_name] = {
                    'name': model_info['name'],
                    'model': best_model,
                    'metrics': {
                        'accuracy': accuracy,
                        'precision': precision,
                        'recall': recall,
                        'f1': f1
                    },
                    'best_params': grid_search.best_params_,
                    'cv_results': grid_search.cv_results_
                }

                print(f"  Best Parameters: {grid_search.best_params_}")
                print(f"  Validation Accuracy: {accuracy:.4f}")
                print(f"  Validation F1 Score: {f1:.4f}")

            except Exception as e:
                print(f"  Error training {model_info['name']}: {e}")

    print("\nModel training complete")
    return trained_models

def train_neural_network(X_train, y_train, X_val, y_val, epochs=50, batch_size=32):
    """
    Fit a small fully connected softmax classifier and plot its learning curves.

    The network has two hidden ReLU layers (128 and 64 units) with dropout
    after each, and one softmax output unit per class found in ``y_train``.
    Training stops early when the validation loss has not improved for 10
    epochs, and the best weights seen are restored.

    Args:
        X_train (pd.DataFrame): Training features
        y_train (pd.Series): Training target
        X_val (pd.DataFrame): Validation features
        y_val (pd.Series): Validation target
        epochs (int): Maximum number of training epochs
        batch_size (int): Batch size for training

    Returns:
        tuple: (trained_model, training_history), or (None, None) when
            TensorFlow is not available
    """
    if not TENSORFLOW_AVAILABLE:
        print("TensorFlow not available. Cannot train neural network.")
        return None, None

    print("Training neural network...")

    # One-hot encode both targets using the class count seen in training
    # NOTE(review): presumably labels are integer-encoded as 0..k-1 — confirm
    num_classes = np.unique(y_train).size
    train_targets = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
    val_targets = tf.keras.utils.to_categorical(y_val, num_classes=num_classes)

    # Architecture: 128 -> dropout -> 64 -> dropout -> softmax
    n_inputs = X_train.shape[1]
    layer_stack = [
        Dense(128, activation='relu', input_shape=(n_inputs,)),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(num_classes, activation='softmax'),
    ]
    model = Sequential(layer_stack)

    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    # Stop once validation loss stalls and roll back to the best weights
    stopper = EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
    )

    history = model.fit(
        X_train,
        train_targets,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_val, val_targets),
        callbacks=[stopper],
        verbose=0,
    )
    curves = history.history

    # Side-by-side learning curves: accuracy on the left, loss on the right.
    # Each entry: (panel position, training key, validation key,
    #              training legend, validation legend, title, y-axis label)
    panels = (
        (1, 'accuracy', 'val_accuracy',
         'Training Accuracy', 'Validation Accuracy', 'Model Accuracy', 'Accuracy'),
        (2, 'loss', 'val_loss',
         'Training Loss', 'Validation Loss', 'Model Loss', 'Loss'),
    )

    plt.figure(figsize=(12, 5))
    for position, train_key, val_key, train_legend, val_legend, title, y_label in panels:
        plt.subplot(1, 2, position)
        plt.plot(curves[train_key], label=train_legend)
        plt.plot(curves[val_key], label=val_legend)
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()

    plt.tight_layout()
    plt.show()

    epochs_run = len(curves['loss'])
    last_val_accuracy = curves['val_accuracy'][-1]
    print(f"Neural network training complete after {epochs_run} epochs")
    print(f"Final validation accuracy: {last_val_accuracy:.4f}")

    return model, history

# <a id="evaluation"></a>
# ## 6. Model Evaluation

In [None]:

def evaluate_models(trained_models, X_test, y_test):
    """
    Evaluate trained models on the test set.

    For every model this computes weighted accuracy / precision / recall / F1,
    a confusion matrix and a classification report, and plots the confusion
    matrix. When the test target has exactly two classes and class
    probabilities are available, a ROC curve is plotted and stored as well.
    Finally all models are compared in a table and a grouped bar chart,
    sorted by F1 score.

    Args:
        trained_models (dict): Dictionary of trained models, as returned by
            ``train_models`` (each value needs ``'name'`` and ``'model'``)
        X_test (pd.DataFrame): Test features
        y_test (pd.Series): Test target

    Returns:
        dict: Maps each model key to a dict with ``'name'``, ``'metrics'``,
            ``'confusion_matrix'``, ``'classification_report'``,
            ``'predictions'``, ``'probabilities'`` and, for binary problems
            with probabilities, ``'roc_curve'`` (``fpr``, ``tpr``, ``auc``).
    """
    print("Evaluating models on test set...")

    # Initialize dictionary to store evaluation results
    evaluation_results = {}

    # Loop-invariant: whether the test target is a binary problem
    is_binary = len(np.unique(y_test)) == 2

    # Evaluate each model
    for model_name, model_info in trained_models.items():
        print(f"\nEvaluating {model_info['name']}...")

        model = model_info['model']

        # Special case for neural network
        if model_name == 'neural_network' and TENSORFLOW_AVAILABLE:
            # The network outputs per-class scores; arg-max gives the class
            y_pred_proba = model.predict(X_test)
            y_pred = np.argmax(y_pred_proba, axis=1)
        else:
            # Make predictions
            y_pred = model.predict(X_test)

            # Get prediction probabilities if available
            if hasattr(model, 'predict_proba'):
                y_pred_proba = model.predict_proba(X_test)
            else:
                y_pred_proba = None

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')

        # Generate confusion matrix.
        # The matrix covers every label seen in either the truth or the
        # predictions, so the same label set must be used for the heatmap
        # ticks; using only the labels of y_test would mislabel the axes
        # whenever a model predicts a class that is absent from y_test.
        class_labels = np.union1d(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred, labels=class_labels)

        # Generate classification report
        cr = classification_report(y_test, y_pred, output_dict=True)

        # Store evaluation results
        evaluation_results[model_name] = {
            'name': model_info['name'],
            'metrics': {
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1': f1
            },
            'confusion_matrix': cm,
            'classification_report': cr,
            'predictions': y_pred,
            'probabilities': y_pred_proba
        }

        # Print metrics
        print(f"  Test Accuracy: {accuracy:.4f}")
        print(f"  Test Precision: {precision:.4f}")
        print(f"  Test Recall: {recall:.4f}")
        print(f"  Test F1 Score: {f1:.4f}")

        # Plot confusion matrix
        plt.figure(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                   xticklabels=class_labels,
                   yticklabels=class_labels)
        plt.title(f'Confusion Matrix - {model_info["name"]}')
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.tight_layout()
        plt.show()

        # Plot ROC curve for binary classification
        if is_binary and y_pred_proba is not None:
            plt.figure(figsize=(8, 6))

            # Calculate ROC curve and AUC from the second probability column
            fpr, tpr, _ = roc_curve(y_test, y_pred_proba[:, 1])
            roc_auc = auc(fpr, tpr)

            # Plot ROC curve against the chance diagonal
            plt.plot(fpr, tpr, lw=2, label=f'ROC curve (AUC = {roc_auc:.4f})')
            plt.plot([0, 1], [0, 1], 'k--', lw=2)
            plt.xlim([0.0, 1.0])
            plt.ylim([0.0, 1.05])
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title(f'ROC Curve - {model_info["name"]}')
            plt.legend(loc='lower right')
            plt.grid(True, alpha=0.3)
            plt.tight_layout()
            plt.show()

            # Store ROC curve data
            evaluation_results[model_name]['roc_curve'] = {
                'fpr': fpr,
                'tpr': tpr,
                'auc': roc_auc
            }

    # Compare models
    print("\nModel Comparison:")
    model_comparison = {
        'Model': [],
        'Accuracy': [],
        'Precision': [],
        'Recall': [],
        'F1 Score': []
    }

    for results in evaluation_results.values():
        model_comparison['Model'].append(results['name'])
        model_comparison['Accuracy'].append(results['metrics']['accuracy'])
        model_comparison['Precision'].append(results['metrics']['precision'])
        model_comparison['Recall'].append(results['metrics']['recall'])
        model_comparison['F1 Score'].append(results['metrics']['f1'])

    # Create DataFrame for comparison, best F1 first
    comparison_df = pd.DataFrame(model_comparison)
    comparison_df = comparison_df.sort_values('F1 Score', ascending=False).reset_index(drop=True)

    # Print comparison table
    print(comparison_df)

    # Plot model comparison
    plt.figure(figsize=(12, 8))

    # Grouped bar chart: four bars per model, centred on the model's tick
    x = np.arange(len(comparison_df))
    width = 0.2

    plt.bar(x - 1.5*width, comparison_df['Accuracy'], width, label='Accuracy')
    plt.bar(x - 0.5*width, comparison_df['Precision'], width, label='Precision')
    plt.bar(x + 0.5*width, comparison_df['Recall'], width, label='Recall')
    plt.bar(x + 1.5*width, comparison_df['F1 Score'], width, label='F1 Score')

    plt.xlabel('Model')
    plt.ylabel('Score')
    plt.title('Model Comparison')
    plt.xticks(x, comparison_df['Model'], rotation=45, ha='right')
    plt.legend()
    plt.grid(True, axis='y', alpha=0.3)
    plt.tight_layout()
    plt.show()

    print("\nModel evaluation complete")
    return evaluation_results

# <a id="deployment"></a>
# ## 7. Model Deployment

In [None]:

def save_model(model, model_name, artifacts=None, output_dir='models'):
    """
    Pickle a trained model, and optionally its artifacts, into ``output_dir``.

    The model is written to ``<output_dir>/<model_name>.pkl``. When
    ``artifacts`` is non-empty it is written next to it as
    ``<output_dir>/<model_name>_artifacts.pkl``. The directory is created if
    it does not exist yet.

    Args:
        model: Trained model object
        model_name (str): Name of the model; used as the file stem
        artifacts (dict, optional): Dictionary of model artifacts (e.g., scaler, encoders)
        output_dir (str): Directory to save the model

    Returns:
        str: Path to the saved model
    """
    import os
    import pickle

    def _dump(obj, destination):
        # Serialize one object to one file
        with open(destination, 'wb') as handle:
            pickle.dump(obj, handle)

    print(f"Saving model: {model_name}...")

    # Make sure the target directory is there before writing anything
    os.makedirs(output_dir, exist_ok=True)

    model_path = os.path.join(output_dir, f"{model_name}.pkl")
    _dump(model, model_path)

    # Artifacts are optional; None and an empty dict are both skipped
    if artifacts:
        artifacts_path = os.path.join(output_dir, f"{model_name}_artifacts.pkl")
        _dump(artifacts, artifacts_path)
        print(f"Model artifacts saved to: {artifacts_path}")

    print(f"Model saved to: {model_path}")
    return model_path

def load_model(model_path, artifacts_path=None):
    """
    Read back a pickled model and, optionally, its pickled artifacts.

    Note: unpickling can execute arbitrary code, so only load files that come
    from a trusted source (e.g. ones written by ``save_model``).

    Args:
        model_path (str): Path to the saved model
        artifacts_path (str, optional): Path to the saved artifacts

    Returns:
        tuple: (model, artifacts); ``artifacts`` is None when no
            ``artifacts_path`` was given
    """
    import pickle

    def _read(source):
        # Deserialize the single object stored in one file
        with open(source, 'rb') as handle:
            return pickle.load(handle)

    print(f"Loading model from: {model_path}...")
    model = _read(model_path)

    # Artifacts are only loaded when a path for them was supplied
    if artifacts_path:
        artifacts = _read(artifacts_path)
        print(f"Model artifacts loaded from: {artifacts_path}")
    else:
        artifacts = None

    print("Model loaded successfully")
    return model, artifacts

def create_prediction_pipeline(model, artifacts=None):
    """
    Create a prediction pipeline for the model.

    The returned function applies the preprocessing stored in ``artifacts``
    (label encoders, then scaler), runs the model, and decodes the predicted
    labels when a ``'label'`` encoder is available.

    Args:
        model: Trained model object exposing ``predict`` and, optionally,
            ``predict_proba``
        artifacts (dict, optional): Dictionary of model artifacts. Recognized
            keys are ``'label_encoders'`` (dict mapping column name to an
            encoder) and ``'scaler'``.

    Returns:
        function: Prediction function
    """
    def predict(data):
        """
        Make predictions using the trained model.

        The input is never modified: preprocessing is applied to a copy.

        Args:
            data (pd.DataFrame or dict): Input data for prediction. A dict is
                treated as a single row.

        Returns:
            dict: ``{'prediction': list, 'probabilities': list or None}``
        """
        # Convert dictionary to DataFrame if necessary. A DataFrame is copied
        # so that the encoding/scaling below does not overwrite the caller's
        # data (calling predict twice on the same frame would otherwise
        # encode and scale it twice).
        if isinstance(data, dict):
            data = pd.DataFrame([data])
        elif isinstance(data, pd.DataFrame):
            data = data.copy()

        # Preprocess data if artifacts are provided
        if artifacts:
            # Apply label encoding to categorical features (never the target)
            if 'label_encoders' in artifacts:
                for col, encoder in artifacts['label_encoders'].items():
                    if col in data.columns and col != 'label':
                        data[col] = encoder.transform(data[col].astype(str))

            # Apply scaling to numerical features
            # NOTE(review): presumably the scaler was fitted on exactly these
            # columns in this order — verify against preprocess_data
            if 'scaler' in artifacts:
                numerical_columns = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
                if numerical_columns:
                    data[numerical_columns] = artifacts['scaler'].transform(data[numerical_columns])

        # Make prediction
        prediction = np.asarray(model.predict(data))

        # Get prediction probabilities if available
        if hasattr(model, 'predict_proba'):
            probabilities = np.asarray(model.predict_proba(data))
        elif prediction.ndim == 2 and prediction.shape[1] > 1:
            # Models without predict_proba that return one score per class
            # (the softmax neural network): treat the scores as probabilities
            # and take the arg-max as the class, as train_models and
            # evaluate_models do.
            probabilities = prediction
            prediction = np.argmax(prediction, axis=1)
        else:
            probabilities = None

        # Decode prediction if label encoder is available
        if artifacts and 'label_encoders' in artifacts and 'label' in artifacts['label_encoders']:
            prediction = np.asarray(
                artifacts['label_encoders']['label'].inverse_transform(prediction)
            )

        # Create result dictionary
        result = {
            'prediction': prediction.tolist(),
            'probabilities': probabilities.tolist() if probabilities is not None else None
        }

        return result

    return predict

# <a id="main"></a>
# ## 8. Main Execution

In [None]:

def main(sample_size=10000, models_to_train=None):
    """
    Main function to execute the complete machine learning pipeline.

    Runs, in order: data loading, preprocessing, exploratory analysis,
    feature engineering, train/validation/test splitting, model training,
    test-set evaluation, and saving of the model with the best test F1 score.

    Args:
        sample_size (int): Value forwarded to ``load_data`` as ``sample_size``
        models_to_train (list, optional): Keys of the models to train.
            Defaults to a small subset (random forest, gradient boosting,
            logistic regression) for faster execution.

    Returns:
        dict: ``'best_model'``, ``'best_model_name'``, ``'model_path'``,
            ``'artifacts'``, ``'evaluation_results'`` and
            ``'prediction_pipeline'``

    Raises:
        ValueError: If no model could be trained and evaluated
    """
    print("Starting Network Intrusion Detection ML Pipeline...")

    # Step 1: Load data
    df = load_data(sample_size=sample_size)  # Adjust sample size as needed

    # Step 2: Preprocess data
    df_processed, artifacts = preprocess_data(df)

    # Step 3: Perform exploratory data analysis (its return value is not
    # needed by the later steps)
    perform_eda(df_processed)

    # Step 4: Engineer features
    df_selected, _ = engineer_features(df_processed)

    # Step 5: Split data
    X_train, X_val, X_test, y_train, y_val, y_test = split_data(df_selected)

    # Step 6: Train models
    if models_to_train is None:
        # Select a subset of models for faster execution
        models_to_train = ['random_forest', 'gradient_boosting', 'logistic_regression']
    # Pass a copy so the caller's list is not affected by downstream code
    trained_models = train_models(X_train, y_train, X_val, y_val, list(models_to_train))

    # Step 7: Evaluate models
    evaluation_results = evaluate_models(trained_models, X_test, y_test)

    # train_models skips models whose training failed, so the result can be
    # empty; fail with a clear message instead of max()'s generic
    # "arg is an empty sequence" error.
    if not evaluation_results:
        raise ValueError("No models were trained successfully; cannot select a best model")

    # Step 8: Save best model
    # Find best model based on F1 score
    best_model_name = max(evaluation_results, key=lambda x: evaluation_results[x]['metrics']['f1'])
    best_model = trained_models[best_model_name]['model']

    print(f"\nBest model: {trained_models[best_model_name]['name']}")
    print(f"F1 Score: {evaluation_results[best_model_name]['metrics']['f1']:.4f}")

    # Save best model
    model_path = save_model(best_model, best_model_name, artifacts)

    # Create prediction pipeline
    prediction_pipeline = create_prediction_pipeline(best_model, artifacts)

    print("\nNetwork Intrusion Detection ML Pipeline completed successfully!")

    return {
        'best_model': best_model,
        'best_model_name': best_model_name,
        'model_path': model_path,
        'artifacts': artifacts,
        'evaluation_results': evaluation_results,
        'prediction_pipeline': prediction_pipeline
    }

# Entry point: run the full pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()