In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
import time

# Read the NSL-KDD data from 'kdd.csv' (path is relative to the working
# directory; point it at the actual dataset file if it lives elsewhere).
dataset = pd.read_csv('kdd.csv')

# Every column except the last is treated as a feature; the last column is
# taken as the target label.
X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Columns handled as categorical; every remaining feature column is handled
# as numeric, keeping the order in which it appears in X.
categorical_features = ['protocol_type', 'service', 'flag']
numeric_features = [
    name for name in X.columns if name not in categorical_features
]

# Numeric columns: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' lets categories that were not
# seen during fitting pass through transform() without raising.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Fit the preprocessing steps on the training set and measure the time.
# The elapsed time is stored in fit_transform_time; previously the timer was
# started here but overwritten before any duration was taken from it.
# time.perf_counter() is used for all intervals because it is a monotonic,
# high-resolution clock that is unaffected by system clock adjustments.
start_time = time.perf_counter()
X_train_preprocessed = preprocessor.fit_transform(X_train)
fit_transform_time = time.perf_counter() - start_time

# Transform the testing set with the already-fitted preprocessing steps
# (no re-fitting on test data) and measure the time
start_time = time.perf_counter()
X_test_preprocessed = preprocessor.transform(X_test)
transformation_time = time.perf_counter() - start_time

# Train a Random Forest model and measure the time
# (model construction is included in the measured interval)
start_time = time.perf_counter()
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train_preprocessed, y_train)
training_time = time.perf_counter() - start_time

# Make predictions on the preprocessed test set and measure the time
start_time = time.perf_counter()
y_pred = classifier.predict(X_test_preprocessed)
prediction_time = time.perf_counter() - start_time

# Evaluate the model; accuracy is scaled to a percentage (0-100)
accuracy = 100 * accuracy_score(y_test, y_pred)

# Calculate the one-vs-rest False Positive Rate for each class present in
# the test labels
false_positive_rates = dict()
for class_label in np.unique(y_test):
    # labels=[False, True] pins the matrix to 2x2. Without it the matrix
    # collapses to 1x1 when both boolean vectors contain a single value,
    # and unpacking ravel() into four names raises ValueError.
    tn, fp, fn, tp = confusion_matrix(
        y_test == class_label,
        y_pred == class_label,
        labels=[False, True],
    ).ravel()
    # With no negative samples for this class a false positive is impossible;
    # report 0.0 instead of the NaN (and RuntimeWarning) produced by 0 / 0.
    negatives = fp + tn
    false_positive_rates[class_label] = fp / negatives if negatives else 0.0

# Average False Positive Rate across all classes (unweighted mean)
average_false_positive_rate = sum(false_positive_rates.values()) / len(false_positive_rates)

# Print results
print("Training Time: {:.2f} seconds".format(training_time))
print("Transformation Time: {:.2f} seconds".format(transformation_time))
print("Prediction Time: {:.2f} seconds".format(prediction_time))
print("Accuracy: {:.4f}".format(accuracy))
print("Average False Positive Rate: {:.4f}".format(average_false_positive_rate))


Training Time: 39.75 seconds
Transformation Time: 0.11 seconds
Prediction Time: 2.11 seconds
Accuracy: 99.6802
Average False Positive Rate: 0.0002
