<a href="https://colab.research.google.com/github/Shivam2Goyal/Advanced-Area-Computation-System/blob/main/NID_using_RF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from collections import Counter
import warnings

# Ignore warnings to keep the output clean
warnings.filterwarnings("ignore")

# Load the NSL-KDD dataset and define column names
column_names = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes",
    "land", "wrong_fragment", "urgent", "hot", "num_failed_logins",
    "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root",
    "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login", "count", "srv_count", "serror_rate",
    "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
    "diff_srv_rate", "srv_diff_host_rate", "dst_host_count",
    "dst_host_srv_count", "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate", "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate", "label"
]

# Read the dataset
df = pd.read_csv("/content/KDDTrain+_20Percent.txt", names=column_names)

# Remove duplicate rows
df.drop_duplicates(inplace=True)

# Drop any missing values (if present)
df.dropna(inplace=True)

# Convert categorical columns into numerical values
categorical_features = ["protocol_type", "service", "flag"]
encoders = {}

for col in categorical_features:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])  # Convert text labels to numbers
    encoders[col] = encoder  # Store encoders for potential inverse transformation

# Separate features (X) and target variable (y)
X = df.drop(columns=["label"])
y = df["label"]

# Extract only numerical columns for standardization
numeric_columns = X.select_dtypes(include=np.number).columns
X_numeric = X[numeric_columns]

# Apply standardization to scale numeric features
scaler = StandardScaler()
X_scaled_numeric = scaler.fit_transform(X_numeric)

# Create a DataFrame with scaled numerical data
X_scaled = pd.DataFrame(X_scaled_numeric, columns=numeric_columns, index=X.index)

# Merge back the categorical (already encoded) columns
X_scaled = pd.concat([X_scaled, X[categorical_features]], axis=1)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [4]:
# Train a Random Forest model using sklearn
rf_sklearn = RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_split=2, random_state=42)
rf_sklearn.fit(X_train, y_train)
y_pred_sklearn = rf_sklearn.predict(X_test)

# Print Sklearn Random Forest Accuracy
print(f"Sklearn Random Forest Accuracy: {accuracy_score(y_test, y_pred_sklearn)*100:.4f}%")

Sklearn Random Forest Accuracy: 84.9230%


In [6]:
# Implementing Random Forest from scratch (except using sklearn's DecisionTreeClassifier)
class CustomRandomForest:
    def __init__(self, n_trees=100, max_depth=None, min_samples_split=2, max_features="sqrt"):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.trees = []

    def bootstrap_sample(self, X, y):
        """Generates a bootstrapped dataset by randomly sampling with replacement."""
        n_samples = X.shape[0]
        sample_indices = np.random.choice(n_samples, n_samples, replace=True)
        return X.iloc[sample_indices], y.iloc[sample_indices]  # Select data using indices

    def fit(self, X, y):
        """Train multiple Decision Trees on different bootstrapped datasets."""
        self.trees = []
        n_features = X.shape[1]

        # Determine the number of features to consider for each tree
        if self.max_features == "sqrt":
            num_features = max(int(np.sqrt(n_features)), 10)  # Ensure a minimum of 10 features
        elif self.max_features == "log2":
            num_features = max(int(np.log2(n_features)), 10)
        else:
            num_features = n_features // 2  # Use half of the total features

        for _ in range(self.n_trees):
            X_sample, y_sample = self.bootstrap_sample(X, y)
            selected_features = np.random.choice(n_features, num_features, replace=False)

            # Train a Decision Tree using the selected subset of features
            tree = DecisionTreeClassifier(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                criterion="gini"
            )
            tree.fit(X_sample.iloc[:, selected_features], y_sample)
            self.trees.append((tree, selected_features))  # Store both the tree and selected features

    def predict(self, X):
        """Predict using majority voting across all Decision Trees."""
        predictions = np.zeros((X.shape[0], len(self.trees)))

        for i, (tree, selected_features) in enumerate(self.trees):
            predictions[:, i] = tree.predict(X.iloc[:, selected_features])

        # Majority voting: select the most common prediction among all trees
        final_predictions = np.apply_along_axis(lambda x: Counter(x).most_common(1)[0][0], axis=1, arr=predictions)
        return final_predictions.astype(int)

# Train the custom Random Forest model
rf_custom = CustomRandomForest(n_trees=100, max_depth=None, min_samples_split=2)
rf_custom.fit(X_train, y_train)
y_pred_custom = rf_custom.predict(X_test)

# Print Custom Random Forest Accuracy
print(f"Custom Random Forest Accuracy: {accuracy_score(y_test, y_pred_custom)*100:.4f}%")

Custom Random Forest Accuracy: 76.5447%
