# Machine Learning Pipeline for Network Intrusion Detection

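This notebook implements a machine learning pipeline for network intrusion detection, modeled on the CSE-CIC-IDS2018 dataset. To keep it runnable in a limited environment, the code below generates a small synthetic dataset with CIC-IDS-style columns instead of loading the real data. The pipeline covers data preprocessing, exploratory data analysis, feature engineering, and the training and evaluation of several classification models.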

## Table of Contents
1. [Setup and Imports](#setup)
2. [Data Preprocessing and Transformation](#preprocessing)
3. [Exploratory Data Analysis (EDA)](#eda)
4. [Feature Engineering](#feature_engineering)
5. [Machine Learning Phase](#ml_phase)
6. [Model Evaluation](#evaluation)
7. [Model Usage](#usage)

<a id="setup"></a>
## 1. Setup and Imports

First, we'll import all the necessary libraries for our machine learning pipeline.

In [None]:
# Basic data manipulation libraries
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

# Data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
import plotly.express as px
import plotly.graph_objects as go

# Machine learning libraries
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import IsolationForest, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    confusion_matrix, classification_report, roc_curve, auc, 
    silhouette_score, mean_squared_error, r2_score
)

# Advanced ML models
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

# Neural Networks
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Warnings
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
# The whitegrid look is applied twice: once through matplotlib's style sheet
# and once through seaborn's own theme settings.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set(style="whitegrid")

# Set random seed for reproducibility
# RANDOM_STATE is also passed to the scikit-learn estimators and to
# train_test_split further down, so changing it here changes every
# randomized step of the notebook.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)

<a id="preprocessing"></a>
## 2. Data Preprocessing and Transformation

In this section, we'll load the dataset, inspect it, clean it, and prepare it for analysis and modeling.

### 2.1 Create Sample Dataset

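Since we're working with a limited environment, we'll create a sample dataset for demonstration purposes. The values are randomly generated, so the labels are unrelated to the features; the model metrics reported later only demonstrate the workflow and say nothing about real detection performance.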

In [None]:
def create_sample_dataset(n_samples=1000):
    """
    Create a random sample dataset for demonstration purposes.

    Every column is drawn independently from NumPy's global random generator,
    which is re-seeded with 42 on each call, so the same ``n_samples`` always
    produces the same frame. The only exception is 'Fwd Header Length.1',
    which is an exact copy of 'Fwd Header Length' so that the
    duplicate-column removal step has something to find.

    Args:
        n_samples (int): Number of samples to generate

    Returns:
        pd.DataFrame: Sample DataFrame with 21 feature columns and 'Label'
    """
    # NOTE: this resets NumPy's *global* random state as a side effect.
    np.random.seed(42)

    # Drawn once and reused below so the '.1' column is a genuine duplicate
    # (previously the two columns were drawn independently and never matched).
    fwd_header_length = np.random.randint(20, 100, n_samples)

    # Create sample features
    sample_data = {
        'Flow Duration': np.random.randint(1, 100000, n_samples),
        'Total Fwd Packets': np.random.randint(1, 100, n_samples),
        'Total Backward Packets': np.random.randint(1, 100, n_samples),
        'Total Length of Fwd Packets': np.random.randint(1, 10000, n_samples),
        'Total Length of Bwd Packets': np.random.randint(1, 10000, n_samples),
        'Fwd Packet Length Max': np.random.randint(1, 1500, n_samples),
        'Fwd Packet Length Min': np.random.randint(0, 100, n_samples),
        'Fwd Packet Length Mean': np.random.uniform(10, 500, n_samples),
        'Bwd Packet Length Max': np.random.randint(1, 1500, n_samples),
        'Bwd Packet Length Min': np.random.randint(0, 100, n_samples),
        'Bwd Packet Length Mean': np.random.uniform(10, 500, n_samples),
        'Flow Bytes/s': np.random.uniform(0, 10000, n_samples),
        'Flow Packets/s': np.random.uniform(0, 1000, n_samples),
        'Flow IAT Mean': np.random.uniform(0, 1000, n_samples),
        'Flow IAT Std': np.random.uniform(0, 500, n_samples),
        'Flow IAT Max': np.random.uniform(0, 2000, n_samples),
        'Flow IAT Min': np.random.uniform(0, 100, n_samples),
        'Fwd Header Length.1': fwd_header_length.copy(),  # Duplicate column
        'Fwd Header Length': fwd_header_length,
        'Protocol': np.random.choice(['TCP', 'UDP', 'ICMP'], n_samples),
        'Destination Port': np.random.choice([80, 443, 22, 53, 8080], n_samples),
        # Class probabilities make the sample imbalanced (70% benign traffic)
        'Label': np.random.choice(
            ['BENIGN', 'DoS Hulk', 'PortScan', 'Brute Force-Web', 'Web Attack'],
            n_samples,
            p=[0.7, 0.1, 0.1, 0.05, 0.05],
        ),
    }

    # Create DataFrame
    df = pd.DataFrame(sample_data)
    print(f"Created sample dataset: {df.shape[0]} rows, {df.shape[1]} columns")

    return df

# Create a sample dataset for demonstration
df = create_sample_dataset(200)

### 2.2 Inspect Data Types and Count Non-Null Values

In [None]:
# Display basic information about the dataset
print("Dataset Information:")
df.info()

# Display the first few rows of the dataset
# A bare `df.head()` is only rendered when it is the last expression of a
# cell, so it is printed explicitly here.
print("\nFirst 5 rows of the dataset:")
print(df.head())

# Count non-null values for each column
print("\nNon-null value counts:")
non_null_counts = df.count()
print(non_null_counts)

# Calculate percentage of non-null values
non_null_percentage = (non_null_counts / len(df)) * 100
print("\nPercentage of non-null values:")
print(non_null_percentage)

# Display data types
print("\nData types:")
print(df.dtypes)

### 2.3 Normalize Column Names

In [None]:
# Function to normalize column names
def normalize_column_names(df):
    """
    Normalize column names by removing leading/trailing whitespace,
    replacing spaces with underscores, and converting to lowercase.

    The input DataFrame is left untouched; a renamed copy is returned, in
    line with the other preprocessing steps in this notebook.

    Args:
        df (pd.DataFrame): Input DataFrame (column labels must be strings)

    Returns:
        pd.DataFrame: Copy of the DataFrame with normalized column names
    """
    # Work on a copy so the caller's frame keeps its original labels
    normalized = df.copy()

    # regex=False: the space is a literal character, not a pattern
    normalized.columns = (
        normalized.columns.str.strip().str.replace(' ', '_', regex=False).str.lower()
    )

    # Print mapping of original to normalized column names (changed ones only)
    print("Column name mapping:")
    for orig, norm in zip(df.columns, normalized.columns):
        if orig != norm:
            print(f"  {orig} -> {norm}")

    return normalized

# Normalize column names
df = normalize_column_names(df)

# Display the first few rows with normalized column names
df.head()

### 2.4 Identify and Remove Duplicate Columns

In [None]:
# Function to identify and remove duplicate columns
def remove_duplicate_columns(df):
    """
    Identify and remove duplicate columns from the DataFrame.

    A column counts as a duplicate when its name is ``<base>.<N>`` (N being a
    positive integer without leading zeros, e.g. 'x.1' or 'x.12'), a column
    named ``<base>`` exists, and both columns hold identical values. Columns
    that merely look like numbered copies but differ in content are kept.

    Args:
        df (pd.DataFrame): Input DataFrame

    Returns:
        pd.DataFrame: DataFrame with duplicate columns removed
    """
    # Set lookup instead of scanning the column list for every candidate
    existing = set(df.columns)

    # Initialize list to store duplicate columns
    duplicate_columns = []

    for col in df.columns:
        # Only string labels can carry a '.N' suffix
        if not isinstance(col, str):
            continue

        base_col, dot, suffix = col.rpartition('.')
        is_numbered_copy = (
            bool(dot)
            and suffix.isascii()
            and suffix.isdigit()
            and not suffix.startswith('0')
        )
        if not is_numbered_copy or base_col not in existing:
            continue

        # Check if the columns are actually duplicates
        if df[col].equals(df[base_col]):
            duplicate_columns.append(col)
            print(f"Found duplicate column: {col} (duplicate of {base_col})")

    # Remove duplicate columns
    if duplicate_columns:
        df = df.drop(columns=duplicate_columns)
        print(f"Removed {len(duplicate_columns)} duplicate columns")
    else:
        print("No duplicate columns found")

    return df

# Remove duplicate columns
df = remove_duplicate_columns(df)

# Display the shape of the DataFrame after removing duplicate columns
print(f"DataFrame shape after removing duplicate columns: {df.shape}")

### 2.5 Convert Categorical Features using Label Encoding

In [None]:
# Function to encode categorical features
def encode_categorical_features(df):
    """
    Label-encode every categorical feature and the target column.

    Feature columns of dtype object/category are encoded first; the 'label'
    column, if present, is encoded last regardless of its dtype. Values are
    cast to strings before fitting, and each mapping is printed.

    Args:
        df (pd.DataFrame): Input DataFrame (not modified)

    Returns:
        pd.DataFrame: Copy of the input with the encoded columns replaced
        dict: Fitted LabelEncoder per encoded column, keyed by column name
    """
    encoded = df.copy()
    encoders = {}

    # Categorical feature columns; the target is handled separately below
    feature_columns = [
        name
        for name in df.select_dtypes(include=['object', 'category']).columns
        if name != 'label'
    ]
    print(f"Found {len(feature_columns)} categorical columns: {feature_columns}")

    # The target always goes through the encoder, after the features
    has_target = 'label' in df.columns
    to_encode = feature_columns + (['label'] if has_target else [])

    for name in to_encode:
        encoder = LabelEncoder()
        encoded[name] = encoder.fit_transform(df[name].astype(str))
        encoders[name] = encoder
        mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
        print(f"Encoded {name}: {mapping}")

    return encoded, encoders

# Encode categorical features
df_encoded, label_encoders = encode_categorical_features(df)

# Display the first few rows of the encoded DataFrame
df_encoded.head()

### 2.6 Convert All Data Types to Float

In [None]:
# Function to convert all data types to float
def convert_to_float(df):
    """
    Cast every column except 'label' to float.

    Values that cannot be parsed as numbers become NaN.

    Args:
        df (pd.DataFrame): Input DataFrame (not modified)

    Returns:
        pd.DataFrame: Copy of the input with float feature columns
    """
    converted = df.copy()

    for name in converted.columns:
        # The target keeps whatever dtype it already has
        if name == 'label':
            continue
        numeric = pd.to_numeric(df[name], errors='coerce')
        converted[name] = numeric.astype(float)

    return converted

# Convert all data types to float
df_float = convert_to_float(df_encoded)

# Display the data types after conversion
print("Data types after conversion:")
print(df_float.dtypes)

### 2.7 Replace Infinite Values with NaN

In [None]:
# Function to replace infinite values with NaN
def replace_inf_with_nan(df):
    """
    Turn +inf and -inf entries into NaN.

    Prints the per-column NaN totals of the result; the totals include NaNs
    that were already present before the replacement.

    Args:
        df (pd.DataFrame): Input DataFrame (not modified)

    Returns:
        pd.DataFrame: New DataFrame with infinite values replaced by NaN
    """
    # replace() already returns a new frame, so the input stays untouched
    cleaned = df.replace([np.inf, -np.inf], np.nan)

    missing_per_column = cleaned.isna().sum()
    print("NaN counts after replacing infinite values:")
    print(missing_per_column[missing_per_column > 0])

    return cleaned

# Replace infinite values with NaN
df_no_inf = replace_inf_with_nan(df_float)

### 2.8 Handle Missing Values (NaN) by Imputing with the Column Mean

In [None]:
# Function to impute missing values with column mean
def impute_missing_values(df):
    """
    Impute missing values (NaN) with the column mean.

    The 'label' column is never imputed. A column consisting only of NaN has
    no mean to fill with, so its gaps remain and are reported by the final
    check.

    Args:
        df (pd.DataFrame): Input DataFrame (not modified)

    Returns:
        pd.DataFrame: DataFrame with missing values imputed
    """
    # Create a copy of the DataFrame to avoid modifying the original
    df_imputed = df.copy()

    # Every column except the target variable
    feature_columns = [col for col in df_imputed.columns if col != 'label']

    # Impute missing values with column mean
    for col in feature_columns:
        # Count BEFORE filling; counting afterwards always reports 0
        missing_count = int(df_imputed[col].isna().sum())
        if missing_count == 0:
            continue

        mean_value = df_imputed[col].mean()
        df_imputed[col] = df_imputed[col].fillna(mean_value)
        print(f"Imputed {missing_count} missing values in {col} with mean: {mean_value:.4f}")

    # Verify that there are no more missing values
    nan_counts = df_imputed.isna().sum()
    if nan_counts.sum() > 0:
        print("Warning: There are still missing values in the DataFrame")
        print(nan_counts[nan_counts > 0])
    else:
        print("All missing values have been imputed")

    return df_imputed

# Impute missing values with column mean
df_imputed = impute_missing_values(df_no_inf)

### 2.9 Replace Nonsensical Negative Values with the Median

In [None]:
# Function to replace nonsensical negative values with the median
def replace_negative_values(df):
    """
    Overwrite negative entries in "non-negative" columns with a median.

    A column is treated as non-negative when its lowercased name contains one
    of: duration, length, packets, bytes, count, min, max, mean. Each negative
    entry is replaced by the median of that column's non-negative values.

    Args:
        df (pd.DataFrame): Input DataFrame (not modified)

    Returns:
        pd.DataFrame: Copy of the input with the negative values replaced
    """
    cleaned = df.copy()

    # Name fragments that mark counts, durations, lengths and similar
    keywords = ('duration', 'length', 'packets', 'bytes', 'count', 'min', 'max', 'mean')
    targets = []
    for name in df.columns:
        lowered = name.lower()
        if any(word in lowered for word in keywords):
            targets.append(name)

    print(f"Found {len(targets)} columns that should not have negative values")

    for name in targets:
        below_zero = cleaned[name] < 0
        if not below_zero.any():
            continue

        neg_total = below_zero.sum()
        # Median over the valid (>= 0) part of the column only
        replacement = cleaned.loc[cleaned[name] >= 0, name].median()
        cleaned.loc[below_zero, name] = replacement
        print(f"Replaced {neg_total} negative values in {name} with median: {replacement:.4f}")

    return cleaned

# Replace nonsensical negative values with the median
df_no_neg = replace_negative_values(df_imputed)

### 2.10 Detect and Remove Outliers using Isolation Forest

In [None]:
# Function to detect and remove outliers using Isolation Forest
def remove_outliers(df, contamination=0.05):
    """
    Drop the rows that an Isolation Forest flags as outliers.

    The forest is fitted on every column except 'label'. Surviving rows keep
    their original index values.

    Args:
        df (pd.DataFrame): Input DataFrame (not modified)
        contamination (float): The proportion of outliers in the dataset

    Returns:
        pd.DataFrame: DataFrame containing only the inlier rows
    """
    # The target must not influence which rows count as anomalous
    features = df.drop(columns='label', errors='ignore')

    detector = IsolationForest(contamination=contamination, random_state=RANDOM_STATE)
    verdicts = detector.fit_predict(features)  # 1 = inlier, -1 = outlier

    flagged = (verdicts == -1).sum()
    print(f"Detected {flagged} outliers ({flagged/len(df)*100:.2f}% of the dataset)")

    kept = df.loc[verdicts == 1].copy()
    print(f"DataFrame shape after removing outliers: {kept.shape}")

    return kept

# Detect and remove outliers
df_no_outliers = remove_outliers(df_no_neg)

### 2.11 Scale All Numerical Features using StandardScaler

In [None]:
# Function to scale numerical features
def scale_features(df):
    """
    Standardize every feature column with a freshly fitted StandardScaler.

    The 'label' column, when present, is left unscaled and re-attached as the
    last column. The returned frame has a new default integer index rather
    than the index of ``df``.

    Args:
        df (pd.DataFrame): Input DataFrame (not modified)

    Returns:
        pd.DataFrame: DataFrame with scaled features
        StandardScaler: Scaler fitted on all rows of ``df``
    """
    has_target = 'label' in df.columns
    features = df.drop(columns='label', errors='ignore')

    # NOTE: the scaler is fitted on every row it is given; to keep test-set
    # statistics out of training, pass only the training rows.
    scaler = StandardScaler()
    scaled = pd.DataFrame(scaler.fit_transform(features), columns=features.columns)

    if has_target:
        # Positional assignment: the scaled frame no longer shares df's index
        scaled['label'] = df['label'].values

    print("Features scaled using StandardScaler")

    return scaled, scaler

# Scale numerical features
df_scaled, scaler = scale_features(df_no_outliers)

# Display the first few rows of the scaled DataFrame
df_scaled.head()

### 2.12 Summary of Data Preprocessing

In [None]:
# Display summary of data preprocessing steps
# `df` was reassigned by the name-normalization and duplicate-removal steps,
# so the raw frame is no longer available; each label below names the step
# that actually produced the frame whose shape is printed.
print("Summary of Data Preprocessing Steps:")
print(f"1. After normalizing names and removing duplicate columns: {df.shape}")
print(f"2. After encoding categorical features: {df_encoded.shape}")
print(f"3. After handling missing values: {df_imputed.shape}")
print(f"4. After removing outliers: {df_no_outliers.shape}")
print(f"5. Final DataFrame shape: {df_scaled.shape}")

# Display class distribution
if 'label' in df_scaled.columns:
    print("\nClass Distribution:")
    class_counts = df_scaled['label'].value_counts()
    class_percentages = class_counts / len(df_scaled) * 100
    
    # Create a DataFrame to display class distribution
    class_distribution = pd.DataFrame({
        'Count': class_counts,
        'Percentage': class_percentages
    })
    
    # Map numeric labels back to original class names
    if 'label' in label_encoders:
        class_distribution.index = [label_encoders['label'].inverse_transform([i])[0] for i in class_distribution.index]
    
    print(class_distribution)

<a id="eda"></a>
## 3. Exploratory Data Analysis (EDA)

In this section, we'll perform exploratory data analysis to understand the dataset better.

### 3.1 Descriptive Statistics

In [None]:
# Function to compute descriptive statistics
def compute_descriptive_statistics(df):
    """
    Summarize the feature columns of a DataFrame.

    The result is the ``describe()`` table (count, mean, std, min, quartiles,
    max) with an extra 'mode' row. When a column has several modes, the first
    one returned by ``DataFrame.mode`` is used.

    Args:
        df (pd.DataFrame): Input DataFrame; 'label' is excluded if present

    Returns:
        pd.DataFrame: Statistics as rows, features as columns
    """
    features = df.drop(columns='label', errors='ignore')

    summary = features.describe(percentiles=[0.25, 0.5, 0.75])

    # First row of mode() holds one mode per column
    summary.loc['mode'] = features.mode().iloc[0]

    return summary

# Compute descriptive statistics
stats = compute_descriptive_statistics(df_no_outliers)

# Display descriptive statistics
print("Descriptive Statistics:")
stats

### 3.2 Visualize Descriptive Statistics

In [None]:
# Function to visualize descriptive statistics
def visualize_descriptive_statistics(stats):
    """
    Draw one bar chart per feature showing its summary statistics.

    Only the first five columns of ``stats`` are plotted, each with the rows
    min, 25%, 50%, 75%, max, mean and std, and every bar is annotated with
    its value.

    Args:
        stats (pd.DataFrame): DataFrame with descriptive statistics
    """
    shown_columns = stats.columns[:5]
    shown_stats = ['min', '25%', '50%', '75%', 'max', 'mean', 'std']

    # squeeze=False always yields a 2-D grid, so a single column needs no
    # special handling
    fig, grid = plt.subplots(
        len(shown_columns), 1, figsize=(12, 4 * len(shown_columns)), squeeze=False
    )

    for ax, col in zip(grid[:, 0], shown_columns):
        values = stats.loc[shown_stats, col]
        values.plot(kind='bar', ax=ax)

        ax.set_title(f'Descriptive Statistics for {col}')
        ax.set_ylabel('Value')
        ax.set_xlabel('Statistic')

        # Annotate each bar with its numeric value
        for position, value in enumerate(values):
            ax.text(position, value, f'{value:.2f}', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

# Visualize descriptive statistics
visualize_descriptive_statistics(stats)

### 3.3 Distribution Analysis

In [None]:
# Function to plot histograms for selected features
def plot_histograms(df, n_cols=5):
    """
    Plot histograms (with a KDE overlay) for the first few features.

    The number of subplots follows the number of columns actually available,
    so a frame with fewer than ``n_cols`` features no longer produces empty
    axes. Nothing is drawn when there is no feature column to plot.

    Args:
        df (pd.DataFrame): Input DataFrame; 'label' is excluded if present
        n_cols (int): Maximum number of columns to visualize
    """
    # Get all columns except the target variable
    features = df.drop(columns='label', errors='ignore')

    # Select a subset of columns for visualization
    selected_columns = features.columns[:n_cols]
    n_plots = len(selected_columns)
    if n_plots == 0:
        print("No feature columns available to plot")
        return

    # squeeze=False keeps `axes` 2-D even for a single subplot
    fig, axes = plt.subplots(n_plots, 1, figsize=(12, 4 * n_plots), squeeze=False)

    # Plot histogram for each column
    for ax, col in zip(axes[:, 0], selected_columns):
        sns.histplot(df[col], kde=True, ax=ax)

        # Set title and labels
        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel('Value')
        ax.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()

# Plot histograms for selected features
plot_histograms(df_no_outliers)

### 3.4 Correlation and Relationship Analysis

In [None]:
# Function to create a correlation matrix heatmap
def create_correlation_matrix(df):
    """
    Show a heatmap of the pairwise correlations between all columns.

    The target column, if present, is part of the matrix.

    Args:
        df (pd.DataFrame): Input DataFrame
    """
    correlations = df.corr()

    plt.figure(figsize=(12, 10))
    sns.heatmap(
        correlations,
        annot=False,
        cmap='coolwarm',
        center=0,
        linewidths=0.5,
    )
    plt.title('Correlation Matrix Heatmap')
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

# Create a correlation matrix heatmap
create_correlation_matrix(df_no_outliers)

In [None]:
# Function to create a filtered heatmap for high correlations
def create_filtered_heatmap(df, threshold=0.5):
    """
    Show a heatmap restricted to strong correlations.

    Cells with |corr| below ``threshold`` are blanked, as are the diagonal
    and the upper triangle, so each remaining pair appears once.

    Args:
        df (pd.DataFrame): Input DataFrame
        threshold (float): Correlation threshold
    """
    correlations = df.corr()

    # Upper triangle including the diagonal
    upper = np.triu(np.ones_like(correlations, dtype=bool))

    # Hide weak correlations and the redundant half in a single masking pass
    hidden = (correlations.abs() < threshold) | upper
    strong_only = correlations.mask(hidden)

    plt.figure(figsize=(12, 10))
    sns.heatmap(
        strong_only,
        annot=True,
        cmap='coolwarm',
        center=0,
        linewidths=0.5,
        fmt='.2f',
    )
    plt.title(f'Filtered Correlation Matrix Heatmap (|corr| >= {threshold})')
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

# Create a filtered heatmap for high correlations (threshold = 0.5)
create_filtered_heatmap(df_no_outliers, threshold=0.5)

<a id="feature_engineering"></a>
## 4. Feature Engineering

In this section, we'll perform feature engineering to prepare the data for machine learning.

### 4.1 Select Features Based on High Correlation with the Target

In [None]:
# Function to select features based on correlation with the target
def select_features(df, threshold=0.1):
    """
    Pick the features whose absolute correlation with 'label' is high enough.

    Features are returned in order of decreasing absolute correlation.
    Columns whose correlation is NaN are never selected. Without a 'label'
    column a warning is printed and every column name is returned.

    Args:
        df (pd.DataFrame): Input DataFrame
        threshold (float): Minimum absolute correlation (inclusive)

    Returns:
        list: List of selected feature names
    """
    if 'label' not in df.columns:
        print("Warning: Target variable 'label' not found in the DataFrame")
        return df.columns.tolist()

    # Strongest relationships first; the sign is irrelevant for selection
    strength = df.corr()['label'].abs().sort_values(ascending=False)

    chosen = [
        name
        for name, value in strength.items()
        if name != 'label' and value >= threshold
    ]

    print(f"Selected {len(chosen)} features with correlation >= {threshold}")

    return chosen

# Select features based on correlation with the target
selected_features = select_features(df_scaled, threshold=0.1)

# Display selected features and their correlation with the target
if 'label' in df_scaled.columns:
    corr_with_target = df_scaled.corr()['label'].abs().sort_values(ascending=False)
    print("\nSelected Features and Their Correlation with the Target:")
    print(corr_with_target[selected_features])

### 4.2 Split Dataset into Features (X) and Target (y)

In [None]:
# Function to split dataset into features and target
def split_features_target(df, selected_features=None):
    """
    Separate a DataFrame into a feature matrix and the 'label' vector.

    Without a 'label' column a warning is printed and ``(df, None)`` is
    returned.

    Args:
        df (pd.DataFrame): Input DataFrame
        selected_features (list): Columns to use as features; all columns
            except 'label' when None

    Returns:
        tuple: (X, y) where X is the feature matrix and y is the target vector
    """
    if 'label' not in df.columns:
        print("Warning: Target variable 'label' not found in the DataFrame")
        return df, None

    target = df['label']
    if selected_features is None:
        features = df.drop(columns=['label'])
    else:
        features = df[selected_features]

    print(f"Features shape: {features.shape}")
    print(f"Target shape: {target.shape}")

    return features, target

# Split dataset into features and target
X, y = split_features_target(df_scaled, selected_features)

<a id="ml_phase"></a>
## 5. Machine Learning Phase

In this section, we'll apply various machine learning algorithms to the dataset.

### 5.1 Data Splitting

In [None]:
# Function to split data into training and testing sets
def split_data(X, y, test_size=0.2, stratify=True):
    """
    Split features and target into training and testing sets.

    When ``y`` is None a warning is printed and ``(X, None, None, None)`` is
    returned.

    Args:
        X (pd.DataFrame): Feature matrix
        y (pd.Series): Target vector
        test_size (float): Proportion of the dataset to include in the test split
        stratify (bool): Whether to preserve the class proportions of ``y``

    Returns:
        tuple: (X_train, X_test, y_train, y_test)
    """
    if y is None:
        print("Warning: Target variable is None, cannot split data")
        return X, None, None, None

    # Passing stratify=None is the same as plain random sampling
    stratify_on = y if stratify else None
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=RANDOM_STATE,
        stratify=stratify_on,
    )

    if stratify:
        print(f"Data split using stratified sampling (test_size={test_size})")
    else:
        print(f"Data split using random sampling (test_size={test_size})")

    print(f"Training set: {X_train.shape[0]} samples")
    print(f"Testing set: {X_test.shape[0]} samples")

    return X_train, X_test, y_train, y_test

# Split data into training and testing sets
X_train, X_test, y_train, y_test = split_data(X, y, test_size=0.2, stratify=True)

### 5.2 Classification Models

In [None]:
# Function to train and evaluate classification models
def train_evaluate_classification_models(X_train, X_test, y_train, y_test):
    """
    Fit four classifiers and score each one on the test set.

    The models are K-Nearest Neighbors, Decision Tree, Random Forest and
    Logistic Regression. Precision, recall and F1 are weighted averages over
    the classes. Returns ``(None, None)`` when either target vector is None.

    Args:
        X_train (pd.DataFrame): Training feature matrix
        X_test (pd.DataFrame): Testing feature matrix
        y_train (pd.Series): Training target vector
        y_test (pd.Series): Testing target vector

    Returns:
        dict: Fitted models keyed by display name
        pd.DataFrame: One row per model, sorted by F1 score (best first)
    """
    if y_train is None or y_test is None:
        print("Warning: Target variable is None, cannot train classification models")
        return None, None

    candidates = {
        'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
        'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
        'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
        'Logistic Regression': LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)
    }

    fitted = {}
    rows = []

    for name, estimator in candidates.items():
        print(f"Training {name}...")

        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)

        # Insertion order here is also the order of the printed report
        scores = {
            'Accuracy': accuracy_score(y_test, predictions),
            'Precision': precision_score(y_test, predictions, average='weighted'),
            'Recall': recall_score(y_test, predictions, average='weighted'),
            'F1 Score': f1_score(y_test, predictions, average='weighted'),
        }

        fitted[name] = estimator
        rows.append({'Model': name, **scores})

        for metric_name, value in scores.items():
            print(f"  {metric_name}: {value:.4f}")
        print()

    table = pd.DataFrame(
        rows, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
    )
    table = table.sort_values('F1 Score', ascending=False).reset_index(drop=True)

    return fitted, table

# Train and evaluate classification models
classification_models, metrics_df = train_evaluate_classification_models(X_train, X_test, y_train, y_test)

<a id="evaluation"></a>
## 6. Model Evaluation

In this section, we'll evaluate the performance of the trained models.

### 6.1 Metrics

In [None]:
# Display model evaluation metrics
# A bare expression inside an `if` block is never rendered by the notebook,
# so the table is printed explicitly.
if metrics_df is not None:
    print("Model Evaluation Metrics:")
    print(metrics_df)

### 6.2 Confusion Matrix

In [None]:
# Function to plot confusion matrix
def plot_confusion_matrix(model, X_test, y_test, model_name):
    """
    Draw the confusion matrix of a fitted classifier as an annotated heatmap.

    Rows are the actual classes and columns the predicted ones.

    Args:
        model: Fitted classification model
        X_test (pd.DataFrame): Testing feature matrix
        y_test (pd.Series): Testing target vector
        model_name (str): Name of the model, used in the plot title
    """
    matrix = confusion_matrix(y_test, model.predict(X_test))

    plt.figure(figsize=(8, 6))
    # fmt='d' renders the cell counts as integers
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

# Plot confusion matrix for the best model
# metrics_df is sorted by F1 score in descending order when it is built, so
# its first row names the best-scoring model. Both objects are None when
# training was skipped, hence the guard.
if classification_models is not None and metrics_df is not None:
    best_model_name = metrics_df.iloc[0]['Model']
    best_model = classification_models[best_model_name]
    plot_confusion_matrix(best_model, X_test, y_test, best_model_name)

### 6.3 Model Comparison

In [None]:
# Function to visualize model comparison
def visualize_model_comparison(metrics_df):
    """
    Draw a 2x2 grid of bar charts comparing the models, one chart per metric.

    Args:
        metrics_df (pd.DataFrame): DataFrame with a 'Model' column and the
            'Accuracy', 'Precision', 'Recall' and 'F1 Score' columns
    """
    # Nothing to draw without a metrics table
    if metrics_df is None:
        print("Warning: metrics_df is None, cannot visualize model comparison")
        return

    metric_names = ['Accuracy', 'Precision', 'Recall', 'F1 Score']

    # One panel per metric, walked in row-major order
    fig, grid = plt.subplots(2, 2, figsize=(15, 12))

    for panel, metric in zip(grid.flatten(), metric_names):
        # Rank the models by the metric shown in this panel
        ranked = metrics_df.sort_values(metric, ascending=False).reset_index(drop=True)

        sns.barplot(x='Model', y=metric, data=ranked, ax=panel)

        panel.set_title(f'Model Comparison - {metric}')
        panel.set_xlabel('Model')
        panel.set_ylabel(metric)

        # Slant the model names so long labels do not overlap
        panel.set_xticklabels(panel.get_xticklabels(), rotation=45, ha='right')

        # Write each score just above its bar
        for position, score in enumerate(ranked[metric]):
            panel.text(position, score, f'{score:.4f}', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

# Visualize model comparison.
# The guard keeps this cell silent when no metrics table exists; calling the
# function with None would instead print its own warning message.
if metrics_df is not None:
    visualize_model_comparison(metrics_df)

<a id="usage"></a>
## 7. Model Usage

In this section, we'll demonstrate how to use the trained models for prediction.

In [None]:
# Function to predict using all models
def predict_with_all_models(sample, models, scaler, label_encoder=None):
    """
    Predict the class of a single data sample using all trained models.

    Args:
        sample (pd.DataFrame): Single data sample (only the first row's
            prediction is reported)
        models (dict): Dictionary mapping model name to fitted model
        scaler (StandardScaler): Fitted scaler object. Pass None when
            ``sample`` is already in the form the models expect, so that it
            is not transformed again.
        label_encoder (LabelEncoder): Fitted label encoder for the target variable

    Returns:
        pd.DataFrame: DataFrame with 'Model', 'Prediction' and 'Confidence'
            columns, one row per model; None when ``models`` is None.
            'Confidence' is None for models without ``predict_proba``.
    """
    # Check if models exist
    if models is None:
        print("Warning: models is None, cannot make predictions")
        return None

    # Scale the sample (skipped when no scaler is supplied)
    sample_scaled = sample if scaler is None else scaler.transform(sample)

    # Initialize dictionary to store predictions
    predictions = {
        'Model': [],
        'Prediction': [],
        'Confidence': []
    }

    # Make predictions with each model
    for name, model in models.items():
        # Make prediction
        pred = model.predict(sample_scaled)[0]

        # Get prediction confidence
        confidence = None
        if hasattr(model, 'predict_proba'):
            prob = model.predict_proba(sample_scaled)[0]
            if len(prob) > 1:
                # Probability columns are ordered like model.classes_, so the
                # predicted label must be mapped to its position there; using
                # the label value itself as an index is only right when the
                # labels happen to be exactly 0..n-1.
                classes = getattr(model, 'classes_', None)
                try:
                    column = pred if classes is None else list(classes).index(pred)
                except ValueError:
                    # Label not found in classes_: keep the legacy positional lookup
                    column = pred
                confidence = prob[column]
            else:
                confidence = prob[0]

        # Convert numeric prediction to class name
        if label_encoder is not None:
            pred_class = label_encoder.inverse_transform([pred])[0]
        else:
            pred_class = pred

        # Store prediction
        predictions['Model'].append(name)
        predictions['Prediction'].append(pred_class)
        predictions['Confidence'].append(confidence)

    # Create a DataFrame with predictions
    return pd.DataFrame(predictions)

# Demonstrate model usage with a sample from the test set
if classification_models is not None and X_test is not None and y_test is not None:
    # Get a sample from the test set (double brackets keep it a one-row DataFrame)
    sample_idx = 0
    sample = X_test.iloc[[sample_idx]]
    true_label = y_test.iloc[sample_idx]

    # Look up the target encoder once; it is used for both the true label
    # and the model predictions
    target_encoder = label_encoders['label'] if 'label' in label_encoders else None

    # Convert numeric label to class name
    if target_encoder is not None:
        true_class = target_encoder.inverse_transform([true_label])[0]
    else:
        true_class = true_label

    print(f"True label: {true_class}")

    # Predict with all models
    # NOTE(review): elsewhere in this notebook the models receive X_test
    # directly, while predict_with_all_models applies scaler.transform to the
    # sample first - confirm the sample is not being scaled twice here.
    predictions_df = predict_with_all_models(
        sample, classification_models, scaler, target_encoder
    )

    # Display predictions
    if predictions_df is not None:
        print("\nPredictions from all models:")
        # A bare expression inside an `if` is never rendered by Jupyter, so
        # print the table explicitly.
        print(predictions_df.to_string(index=False))

In [None]:
# Function to implement a prediction pipeline
def prediction_pipeline(data, selected_features, scaler, model, label_encoder=None):
    """
    Implement a prediction pipeline for a single data sample.

    Selects the requested feature columns, scales them, and reports the
    model's prediction for the first row of ``data``.

    Args:
        data (pd.DataFrame): Input data
        selected_features (list): List of selected feature names
        scaler (StandardScaler): Fitted scaler object
        model: Fitted model
        label_encoder (LabelEncoder): Fitted label encoder for the target variable

    Returns:
        tuple: (prediction, confidence). ``confidence`` is the probability the
            model assigns to the predicted class, or None when the model has
            no ``predict_proba``.
    """
    # Select features
    X = data[selected_features]

    # Scale features
    X_scaled = scaler.transform(X)

    # Make prediction
    pred = model.predict(X_scaled)[0]

    # Get prediction confidence
    confidence = None
    if hasattr(model, 'predict_proba'):
        prob = model.predict_proba(X_scaled)[0]
        if len(prob) > 1:
            # Probability columns are ordered like model.classes_, so map the
            # predicted label to its position there; indexing by the label
            # value itself is only right when labels are exactly 0..n-1.
            classes = getattr(model, 'classes_', None)
            try:
                column = pred if classes is None else list(classes).index(pred)
            except ValueError:
                # Label not found in classes_: keep the legacy positional lookup
                column = pred
            confidence = prob[column]
        else:
            confidence = prob[0]

    # Convert numeric prediction to class name
    if label_encoder is not None:
        pred_class = label_encoder.inverse_transform([pred])[0]
    else:
        pred_class = pred

    return pred_class, confidence

# Example usage of the prediction pipeline.
# The snippet below is only printed for the reader; the code inside the
# triple-quoted string is not executed by this cell.
# NOTE(review): the printed snippet formats `confidence` with `:.4f`, which
# would raise if prediction_pipeline returned None for the confidence (it does
# so for models without predict_proba) - consider guarding it in the example.
print("Example Usage of the Prediction Pipeline:")
print("""# Load and preprocess new data
new_data = pd.read_csv('new_data.csv')
new_data = preprocess_data(new_data)  # Apply the same preprocessing steps

# Make prediction
prediction, confidence = prediction_pipeline(
    new_data, selected_features, scaler, best_model, 
    label_encoders['label'] if 'label' in label_encoders else None
)

print(f"Prediction: {prediction}")
print(f"Confidence: {confidence:.4f}")
""")