<a href="https://colab.research.google.com/github/NikhitaB15/Intrusion-Detection-System/blob/main/IDS_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import scapy.all as scapy
import threading
import time
import os
import pickle

# ------ PART 1: DATA LOADING AND PREPROCESSING ------

def load_nsl_kdd_data(train_path='/content/KDDTrain+.txt', test_path='/content/KDDTest+.txt'):
    """Load the NSL-KDD train and test files and label every record.

    Both files are read as header-less CSV, stacked into one frame with a
    fresh 0..n-1 index, and extended with two derived columns:

    * ``attack_category`` -- one of ``normal``, ``dos``, ``probe``, ``r2l``,
      ``u2r`` or ``unknown`` (label not present in the mapping below).
    * ``is_attack`` -- ``0`` for ``normal`` records, ``1`` for everything else
      (including ``unknown``).

    Args:
        train_path: Path to the training file.
        test_path: Path to the test file.

    Returns:
        The combined ``pandas.DataFrame``, or ``None`` when either file is
        missing, unreadable or cannot be parsed.
    """

    # Column names based on NSL-KDD dataset documentation
    columns = [
        'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
        'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
        'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
        'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
        'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate',
        'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
        'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
        'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
        'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
        'dst_host_srv_rerror_rate', 'attack_type', 'difficulty_level'
    ]

    # Individual attack labels grouped into the 5 main classes
    attack_mapping = {
        'normal': 'normal',
        'back': 'dos', 'land': 'dos', 'neptune': 'dos', 'pod': 'dos',
        'smurf': 'dos', 'teardrop': 'dos', 'mailbomb': 'dos', 'apache2': 'dos',
        'processtable': 'dos', 'udpstorm': 'dos', 'worm': 'dos',
        'ipsweep': 'probe', 'nmap': 'probe', 'portsweep': 'probe', 'satan': 'probe',
        'mscan': 'probe', 'saint': 'probe',
        'ftp_write': 'r2l', 'guess_passwd': 'r2l', 'imap': 'r2l', 'multihop': 'r2l',
        'phf': 'r2l', 'spy': 'r2l', 'warezclient': 'r2l', 'warezmaster': 'r2l',
        'sendmail': 'r2l', 'named': 'r2l', 'snmpgetattack': 'r2l', 'snmpguess': 'r2l',
        'xlock': 'r2l', 'xsnoop': 'r2l', 'httptunnel': 'r2l',
        'buffer_overflow': 'u2r', 'loadmodule': 'u2r', 'perl': 'u2r', 'rootkit': 'u2r',
        'ps': 'u2r', 'sqlattack': 'u2r', 'xterm': 'u2r'
    }

    print("Loading NSL-KDD dataset...")

    # Only the file reads can legitimately fail; keeping the try this narrow
    # stops programming errors further down from being reported as a
    # "dataset not found" problem. pandas parser errors derive from ValueError.
    try:
        train_data = pd.read_csv(train_path, header=None, names=columns)
        test_data = pd.read_csv(test_path, header=None, names=columns)
    except (OSError, ValueError) as e:
        print(f"Error loading dataset: {e}")
        print("Please make sure the NSL-KDD dataset is downloaded and paths are correct.")
        return None

    # ignore_index avoids duplicate row labels (both files start at 0), which
    # would make label-based selection on the combined frame ambiguous.
    combined_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)

    # astype(str) keeps the lookup safe for missing / non-string labels;
    # anything outside the mapping becomes 'unknown'.
    labels = combined_data['attack_type'].astype(str).str.lower()
    combined_data['attack_category'] = labels.map(attack_mapping).fillna('unknown')

    # Binary classification: normal vs attack
    combined_data['is_attack'] = (combined_data['attack_category'] != 'normal').astype(int)

    return combined_data

def engineer_features(data):
    """Add the derived IDS features to ``data`` in place and return it.

    Nine columns are appended, in this order: ``bytes_ratio``,
    ``error_rate_sum``, ``srv_error_rate_sum``, ``host_error_rate_sum``,
    ``is_long_connection``, ``transfer_rate``, ``login_success_rate``,
    ``service_consistency`` and ``port_scan_indicator``.

    Args:
        data: DataFrame holding the raw NSL-KDD feature columns.

    Returns:
        The same DataFrame object that was passed in.
    """

    print("Performing feature engineering...")

    sent = data['src_bytes']
    received = data['dst_bytes']
    seconds = data['duration']

    # Source/destination byte ratio; +1 keeps the denominator non-zero
    data['bytes_ratio'] = sent / (received + 1)

    # Pairwise sums of the SYN-error and REJ-error rates
    rate_pairs = (
        ('error_rate_sum', 'serror_rate', 'rerror_rate'),
        ('srv_error_rate_sum', 'srv_serror_rate', 'srv_rerror_rate'),
        ('host_error_rate_sum', 'dst_host_serror_rate', 'dst_host_rerror_rate'),
    )
    for target, left, right in rate_pairs:
        data[target] = data[left] + data[right]

    # Flag connections lasting more than 300 time units
    data['is_long_connection'] = seconds.gt(300).astype(int)

    # Total bytes moved per unit of (duration + 1)
    data['transfer_rate'] = (sent + received) / (seconds + 1)

    # Logged-in flag scaled down by the number of failed logins
    data['login_success_rate'] = data['logged_in'] / (data['num_failed_logins'] + 1)

    # Gap between connection-level and host-level same-service rates
    rate_gap = data['same_srv_rate'] - data['dst_host_same_srv_rate']
    data['service_consistency'] = rate_gap.abs()

    # Many connections, a single same-service one, few host services
    many_connections = data['count'].gt(3)
    single_service = data['srv_count'].eq(1)
    few_host_services = data['dst_host_srv_count'].lt(5)
    data['port_scan_indicator'] = (
        many_connections & single_service & few_host_services
    ).astype(int)

    return data

def prepare_data(data):
    """Split ``data`` into train/test sets and build the preprocessing step.

    The label-related columns are removed from the feature matrix and
    ``is_attack`` is used as the binary target. 20% of the rows are held out
    with a fixed seed (42).

    Args:
        data: Labelled DataFrame including the engineered features.

    Returns:
        ``(X_train, X_test, y_train, y_test, preprocessor)`` where
        ``preprocessor`` is an unfitted ``ColumnTransformer`` that scales the
        numerical columns and one-hot encodes the categorical ones.
    """

    print("Preparing data for model training...")

    label_columns = ['attack_type', 'difficulty_level', 'attack_category', 'is_attack']
    categorical_features = ['protocol_type', 'service', 'flag']

    # Target first, then everything that is not label information
    target = data['is_attack']
    features = data.drop(columns=label_columns)

    # Whatever is not categorical is treated as numerical
    numerical_features = [
        name for name in features.columns if name not in categorical_features
    ]

    scaling_step = ('num', StandardScaler(), numerical_features)
    # Categories unseen during fit are encoded as all zeros instead of raising
    encoding_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    preprocessor = ColumnTransformer(transformers=[scaling_step, encoding_step])

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    return X_train, X_test, y_train, y_test, preprocessor

# ------ PART 2: MODEL TRAINING ------

def train_models(X_train, y_train, preprocessor):
    """Train a soft-voting ensemble of Random Forest and Gradient Boosting.

    Each classifier is wrapped in a pipeline together with ``preprocessor``.
    ``VotingClassifier.fit`` clones and fits every estimator it is given, so
    the pipelines are trained exactly once, as part of the ensemble, and the
    fitted clones are handed back. Fitting the pipelines separately as well
    would train every model twice with identical data and seeds.

    Args:
        X_train: Training feature DataFrame.
        y_train: Training labels. The ensemble label-encodes ``y`` before
            fitting its members, so the returned sub-pipelines predict the
            original labels only when those are already ``0..k-1`` (true for
            the 0/1 ``is_attack`` target used in this file).
        preprocessor: Unfitted ``ColumnTransformer`` used as the first
            pipeline step; it is cloned by the ensemble, not fitted in place.

    Returns:
        ``(ensemble_model, rf_pipeline, gb_pipeline)`` -- all fitted.
    """

    print("Training models...")

    # Define models
    rf_model = RandomForestClassifier(
        n_estimators=100,
        max_depth=20,
        min_samples_split=10,
        random_state=42
    )

    gb_model = GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        random_state=42
    )

    # Create pipelines with preprocessing (templates; the ensemble clones them)
    rf_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', rf_model)
    ])

    gb_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', gb_model)
    ])

    # Create ensemble model using voting
    ensemble_model = VotingClassifier(
        estimators=[
            ('rf', rf_pipeline),
            ('gb', gb_pipeline)
        ],
        voting='soft'  # Use probability estimates for voting
    )

    # Train ensemble model (this fits both member pipelines)
    print("Training ensemble model...")
    ensemble_model.fit(X_train, y_train)

    # Hand back the fitted clones so callers can evaluate the members too
    fitted_members = ensemble_model.named_estimators_
    return ensemble_model, fitted_members['rf'], fitted_members['gb']

def evaluate_model(model, X_test, y_test):
    """Print accuracy, classification report and confusion matrix for ``model``.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Held-out features.
        y_test: True labels for ``X_test``.

    Returns:
        ``(accuracy, y_pred)`` -- the accuracy score and the predictions it
        was computed from.
    """

    print("Evaluating model performance...")

    predicted = model.predict(X_test)

    score = accuracy_score(y_test, predicted)
    print(f"Accuracy: {score:.4f}")

    # Remaining metrics share the same call shape: heading, then the result
    detailed_metrics = (
        ("\nClassification Report:", classification_report),
        ("\nConfusion Matrix:", confusion_matrix),
    )
    for heading, metric in detailed_metrics:
        print(heading)
        print(metric(y_test, predicted))

    return score, predicted

# ------ PART 3: REAL-TIME PACKET SNIFFING WITH SCAPY ------

class NetworkSniffer:
    """Capture live packets with Scapy and classify them in batches.

    Two daemon threads are used: one runs ``scapy.sniff`` and appends one
    feature dict per IP packet to ``packet_buffer``; the other wakes every
    ``analysis_interval`` seconds and classifies whatever has accumulated.
    A batch is also classified immediately once ``buffer_size`` packets are
    buffered. Access to the buffer is serialised with a lock, and each
    analysis works on a private snapshot so packets captured while the model
    is running are kept for the next batch.

    Most NSL-KDD features describe whole connections and cannot be derived
    from a single packet; they are filled with neutral placeholders, so
    predictions on live traffic are only a rough approximation.

    NOTE: ``scapy.sniff`` evaluates ``stop_filter`` only when a packet
    arrives, so on an idle interface the capture thread ends with the next
    packet (or with the process, since it is a daemon thread).
    """

    # IP protocol numbers -> NSL-KDD ``protocol_type`` labels
    _PROTOCOL_NAMES = {1: 'icmp', 6: 'tcp', 17: 'udp'}

    # Well-known destination ports -> NSL-KDD ``service`` labels (subset)
    _TCP_SERVICES = {
        20: 'ftp_data', 21: 'ftp', 22: 'ssh', 23: 'telnet', 25: 'smtp',
        43: 'whois', 53: 'domain', 79: 'finger', 80: 'http', 109: 'pop_2',
        110: 'pop_3', 111: 'sunrpc', 113: 'auth', 119: 'nntp',
        139: 'netbios_ssn', 143: 'imap4', 179: 'bgp', 389: 'ldap',
        443: 'http_443', 513: 'login', 514: 'shell', 515: 'printer',
    }
    _UDP_SERVICES = {53: 'domain_u', 69: 'tftp_u', 123: 'ntp_u'}
    # ICMP message types (0 = echo reply, 8 = echo request).
    # NOTE(review): label names taken from the NSL-KDD service list -- confirm.
    _ICMP_SERVICES = {0: 'ecr_i', 8: 'eco_i'}

    # TCP header flag bits
    _TCP_SYN = 0x02
    _TCP_RST = 0x04
    _TCP_ACK = 0x10

    # Fallback labels for the categorical model inputs
    _CATEGORICAL_DEFAULTS = {'protocol_type': 'other', 'service': 'other', 'flag': 'OTH'}

    # Connection-level NSL-KDD features that a single packet cannot provide.
    # These would be calculated based on connection tracking in a real IDS.
    _PLACEHOLDER_FEATURES = {
        'duration': 0, 'land': 0, 'wrong_fragment': 0, 'urgent': 0, 'hot': 0,
        'num_failed_logins': 0, 'logged_in': 0, 'num_compromised': 0,
        'root_shell': 0, 'su_attempted': 0, 'num_root': 0,
        'num_file_creations': 0, 'num_shells': 0, 'num_access_files': 0,
        'num_outbound_cmds': 0, 'is_host_login': 0, 'is_guest_login': 0,
        'count': 1, 'srv_count': 1,
    }

    # Traffic-statistics columns the model expects; zero-filled when absent
    _STATISTIC_COLUMNS = (
        'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
        'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
        'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
        'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
        'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
        'dst_host_srv_rerror_rate',
    )

    def __init__(self, model, preprocessor):
        """Store the classifier and set up empty capture state.

        Args:
            model: Fitted estimator exposing ``predict`` and ``predict_proba``.
            preprocessor: Stored as-is; not used by this class (preprocessing
                is expected to live inside ``model``).
        """
        self.model = model
        self.preprocessor = preprocessor
        self.is_running = False
        self.packet_buffer = []
        self.buffer_size = 100  # Number of packets to analyze at once
        self.analysis_interval = 10  # Analysis interval in seconds
        self.sniffer_thread = None
        self.analyzer_thread = None
        self._buffer_lock = threading.Lock()  # guards packet_buffer
        self._stop_event = threading.Event()  # wakes the analyzer on stop

    def start_sniffing(self, interface=None):
        """Start the capture and analysis threads.

        Args:
            interface: Interface name passed to ``scapy.sniff``; ``None``
                lets Scapy pick its default.
        """

        if self.is_running:
            print("Packet sniffer is already running.")
            return

        self._stop_event.clear()
        self.is_running = True
        self.sniffer_thread = threading.Thread(
            target=self._sniff_packets, args=(interface,), daemon=True)
        self.analyzer_thread = threading.Thread(
            target=self._analyze_packets, daemon=True)

        print(f"Starting packet sniffer on interface: {interface or 'default'}")

        self.sniffer_thread.start()
        self.analyzer_thread.start()

    def stop_sniffing(self):
        """Signal both threads to stop and wait up to one second for each."""

        print("Stopping packet sniffer...")
        self.is_running = False
        self._stop_event.set()  # interrupt the analyzer's wait immediately
        for worker in (self.sniffer_thread, self.analyzer_thread):
            if worker is not None:
                worker.join(timeout=1)

    def _sniff_packets(self, interface):
        """Run ``scapy.sniff`` until ``is_running`` is cleared (thread target)."""

        try:
            scapy.sniff(
                iface=interface,
                prn=self._process_packet,
                store=False,
                stop_filter=lambda _: not self.is_running
            )
        except Exception as e:
            # Thread boundary: report (e.g. missing privileges, bad interface)
            # and shut the analyzer down too instead of dying silently.
            print(f"Error sniffing packets: {e}")
            self.is_running = False
            self._stop_event.set()

    def _process_packet(self, packet):
        """Buffer the features of one captured packet (Scapy ``prn`` callback)."""

        features = self._extract_packet_features(packet)
        if not features:
            return

        with self._buffer_lock:
            self.packet_buffer.append(features)
            buffer_full = len(self.packet_buffer) >= self.buffer_size

        # Analyse outside the lock so it is never held during model inference
        if buffer_full:
            self._analyze_current_buffer()

    @classmethod
    def _protocol_name(cls, proto):
        """Return the NSL-KDD protocol label for an IP protocol number."""
        return cls._PROTOCOL_NAMES.get(proto, 'other')

    @classmethod
    def _service_name(cls, protocol, port):
        """Return the NSL-KDD service label for a protocol label and port.

        For ``'icmp'`` the ``port`` argument is the ICMP message type.
        Unmapped combinations yield ``'other'``.
        """
        tables = {
            'tcp': cls._TCP_SERVICES,
            'udp': cls._UDP_SERVICES,
            'icmp': cls._ICMP_SERVICES,
        }
        return tables.get(protocol, {}).get(port, 'other')

    @classmethod
    def _flag_name(cls, tcp_flags):
        """Approximate the NSL-KDD connection-state flag from TCP header flags.

        The dataset's flag describes a whole connection; from one packet only
        a coarse guess is possible: RST -> ``'REJ'``, SYN without ACK ->
        ``'S0'``, anything else -> ``'SF'``.
        """
        bits = int(tcp_flags)
        if bits & cls._TCP_RST:
            return 'REJ'
        if bits & cls._TCP_SYN and not bits & cls._TCP_ACK:
            return 'S0'
        return 'SF'

    def _extract_packet_features(self, packet):
        """Build the feature dict for one packet.

        Categorical features are emitted as the string labels the model was
        trained on rather than raw protocol numbers, ports or flag objects.

        Returns:
            A dict of features, or ``None`` for packets without an IP layer.
        """

        if scapy.IP not in packet:
            return None

        ip = packet[scapy.IP]
        protocol = self._protocol_name(ip.proto)

        features = {
            'protocol_type': protocol,
            'src_bytes': len(packet),
            'dst_bytes': 0,  # Can't know response size from single packet
            'service': 'other',
            'flag': 'OTH',
            'src_port': 0,
            'dst_port': 0,
        }

        if scapy.TCP in packet:
            tcp = packet[scapy.TCP]
            features['service'] = self._service_name('tcp', tcp.dport)
            features['flag'] = self._flag_name(tcp.flags)
            features['src_port'] = tcp.sport
            features['dst_port'] = tcp.dport
        elif scapy.UDP in packet:
            udp = packet[scapy.UDP]
            features['service'] = self._service_name('udp', udp.dport)
            # Stateless protocol: no handshake to classify, treat as normal
            features['flag'] = 'SF'
            features['src_port'] = udp.sport
            features['dst_port'] = udp.dport
        elif scapy.ICMP in packet:
            features['service'] = self._service_name('icmp', packet[scapy.ICMP].type)
            features['flag'] = 'SF'

        features.update(self._PLACEHOLDER_FEATURES)
        return features

    def _analyze_packets(self):
        """Analyse the buffer every ``analysis_interval`` seconds (thread target)."""

        while self.is_running:
            self._analyze_current_buffer()
            # Unlike time.sleep this returns as soon as stop is requested
            self._stop_event.wait(self.analysis_interval)

    def _build_feature_frame(self, batch):
        """Turn buffered feature dicts into the DataFrame the model expects."""

        df = pd.DataFrame(batch)

        # Categorical inputs must stay strings to match the trained encoder
        for col, default in self._CATEGORICAL_DEFAULTS.items():
            if col in df.columns:
                df[col] = df[col].fillna(default).astype(str)
            else:
                df[col] = default

        for col, default in self._PLACEHOLDER_FEATURES.items():
            if col not in df.columns:
                df[col] = default
        for col in ('src_bytes', 'dst_bytes'):
            if col not in df.columns:
                df[col] = 0

        # Fill remaining missing values
        df = df.fillna(0)

        # Engineered features, mirroring the training-time definitions with
        # the placeholder inputs substituted
        df['bytes_ratio'] = df['src_bytes'] / (df['dst_bytes'] + 1)
        df['error_rate_sum'] = 0
        df['srv_error_rate_sum'] = 0
        df['host_error_rate_sum'] = 0
        df['is_long_connection'] = 0
        df['transfer_rate'] = df['src_bytes'] / 1
        df['login_success_rate'] = 0
        df['service_consistency'] = 0
        df['port_scan_indicator'] = 0

        for col in self._STATISTIC_COLUMNS:
            if col not in df.columns:
                df[col] = 0

        return df

    def _analyze_current_buffer(self):
        """Classify the buffered packets and print an alert for each attack."""

        # Take ownership of the current batch atomically; packets captured
        # from here on go into a fresh buffer and are not lost.
        with self._buffer_lock:
            batch, self.packet_buffer = self.packet_buffer, []

        if not batch:
            return

        print(f"Analyzing {len(batch)} packets...")

        try:
            df = self._build_feature_frame(batch)

            predictions = np.asarray(self.model.predict(df))
            probabilities = self.model.predict_proba(df)

            attack_indices = np.where(predictions == 1)[0]
            attack_count = len(attack_indices)

            if attack_count > 0:
                print(f"⚠️ ALERT: Detected {attack_count} potential attacks out of {len(predictions)} packets!")

                for idx in attack_indices:
                    # Column 1 is taken as the probability of the attack class
                    attack_prob = probabilities[idx][1]
                    packet_data = batch[idx]
                    print(f"Attack probability: {attack_prob:.4f}")
                    print(f"Packet details: src_bytes={packet_data.get('src_bytes')}, "
                          f"protocol={packet_data.get('protocol_type')}, "
                          f"service={packet_data.get('service')}")
                    print("-" * 50)
            else:
                print("No attacks detected in current buffer.")

        except Exception as e:
            # Thread boundary: a failing model must not kill capture/analysis
            print(f"Error analyzing packets: {e}")

# ------ PART 4: MAIN APPLICATION ------

def save_model(model, filename='ids_model.pkl'):
    """Pickle ``model`` to ``filename``, overwriting any existing file.

    Args:
        model: Object to serialise.
        filename: Destination path.
    """

    print(f"Saving model to {filename}...")
    with open(filename, 'wb') as sink:
        sink.write(pickle.dumps(model))

def load_model(filename='ids_model.pkl'):
    """Unpickle and return the model stored in ``filename``.

    SECURITY: unpickling can execute arbitrary code, so only load model
    files that come from a trusted source.

    Args:
        filename: Path of the pickled model.

    Returns:
        The unpickled object, or ``None`` when the file does not exist.
    """

    print(f"Loading model from {filename}...")
    try:
        with open(filename, 'rb') as source:
            return pickle.load(source)
    except FileNotFoundError:
        print(f"Model file {filename} not found.")
    return None

def _train_and_save_model(model_file):
    """Train, evaluate and persist a new ensemble; return it or ``None``."""

    # Load and preprocess data
    data = load_nsl_kdd_data()
    if data is None:
        return None

    # Engineer features
    data = engineer_features(data)

    # Prepare data for training
    X_train, X_test, y_train, y_test, preprocessor = prepare_data(data)

    # Train models
    ensemble_model, rf_pipeline, gb_pipeline = train_models(X_train, y_train, preprocessor)

    # Evaluate each fitted model on the held-out split
    print("\n--- Random Forest Model Evaluation ---")
    evaluate_model(rf_pipeline, X_test, y_test)

    print("\n--- Gradient Boosting Model Evaluation ---")
    evaluate_model(gb_pipeline, X_test, y_test)

    print("\n--- Ensemble Model Evaluation ---")
    evaluate_model(ensemble_model, X_test, y_test)

    # Save model
    save_model(ensemble_model, model_file)
    return ensemble_model


def _select_interface():
    """Ask the user for a capture interface; ``None`` means Scapy's default."""

    interfaces = scapy.get_if_list()  # queried once so index and list agree
    print("Available interfaces:")
    for i, iface in enumerate(interfaces):
        print(f"{i}: {iface}")

    try:
        choice = input("Select interface number (or press Enter for default): ").strip()
    except EOFError:
        # No interactive stdin available: fall back to the default interface
        return None

    if not choice:
        return None

    try:
        index = int(choice)
    except ValueError:
        index = -1
    # Reject negatives explicitly; Python would otherwise index from the end
    if not 0 <= index < len(interfaces):
        print(f"Invalid interface selection {choice!r}; using default interface.")
        return None
    return interfaces[index]


def main():
    """Load (or train) the IDS model, then sniff traffic until Ctrl+C."""

    model_file = 'ids_model.pkl'

    model = None
    if os.path.exists(model_file):
        # Load existing model
        model = load_model(model_file)

    if model is not None:
        print("Model loaded successfully!")
    else:
        # Either no file exists or it could not be loaded
        print("No existing model found. Training new model...")
        model = _train_and_save_model(model_file)
        if model is None:
            print("Failed to load dataset. Exiting.")
            return

    # Start real-time packet sniffing
    print("\nStarting real-time intrusion detection system...")
    sniffer = NetworkSniffer(model, None)  # We pass None as preprocessor is part of the pipeline

    try:
        interface = _select_interface()

        # Start sniffing
        sniffer.start_sniffing(interface)

        # Run until user stops
        print("IDS is running. Press Ctrl+C to stop.")
        while True:
            time.sleep(1)

    except KeyboardInterrupt:
        print("Stopping IDS...")
    finally:
        sniffer.stop_sniffing()
        print("IDS stopped.")

# Run the IDS only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

Loading model from ids_model.pkl...
Model loaded successfully!

Starting real-time intrusion detection system...
Available interfaces:
0: lo
1: eth0
Stopping IDS...
Stopping packet sniffer...
IDS stopped.
