In [2]:
# 1. Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

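# This code needs the NSL-KDD files 'KDDTrain+.txt' and 'KDDTest+.txt'.
# NOTE: the pd.read_csv calls below use absolute, machine-specific paths;
# point them at your local copies of the two files before running.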

# 2. Load the Training and Testing Datasets
# Neither file has a header row, so column names are synthesized here:
# 41 generic feature names followed by 'label' and 'difficulty'.
col_names = [*(f"feature_{i}" for i in range(41)), 'label', 'difficulty']

# Both splits are parsed identically, so keep the reader options in one place.
read_options = {'header': None, 'names': col_names}

# Training split
print("Loading training data...")
train_df = pd.read_csv(r"C:\Users\sande\Downloads\ids\nsl_kdd\KDDTrain+.txt", **read_options)

# Testing split
print("Loading testing data...")
test_df = pd.read_csv(r"C:\Users\sande\Downloads\ids\nsl_kdd\KDDTest+.txt", **read_options)

# 3. Prepare the Data
# 'difficulty' is neither a model input nor the target, so remove it from
# both splits before anything else.
train_df = train_df.drop(columns='difficulty')
test_df = test_df.drop(columns='difficulty')

# Split each frame into its feature matrix (X) and its target column (y).
X_train, y_train = train_df.drop(columns='label'), train_df['label']
X_test, y_test = test_df.drop(columns='label'), test_df['label']


# --- Important Preprocessing Steps ---

# A. Convert Categorical Features to Numbers
# Each split is one-hot encoded on its own, so the two resulting frames can
# end up with different dummy columns (handled in step B).
print("Preprocessing data...")
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

# B. Align columns between training and testing sets to ensure they match
# A category value that occurs in only one split produces a dummy column the
# other split lacks. Build ONE shared column list -- the training columns in
# their existing order, then any test-only columns in the order they appear
# in the test frame -- and reindex both frames onto it, zero-filling the
# columns a frame did not have.
#
# The list is built from the ordered column indexes rather than from set
# differences: iterating a set of strings has no stable order across
# interpreter runs, which would make the feature-matrix layout vary from run
# to run. A single reindex per frame also avoids inserting columns one at a
# time.
train_cols = X_train.columns
test_cols = X_test.columns

test_only_cols = [col for col in test_cols if col not in train_cols]
feature_cols = [*train_cols, *test_only_cols]

X_train = X_train.reindex(columns=feature_cols, fill_value=0)
X_test = X_test.reindex(columns=feature_cols, fill_value=0)

# C. Scale the Features
# k-NN ranks neighbours by distance, so a feature measured on a large scale
# would otherwise swamp the features measured on small scales. Standardizing
# puts every feature on a comparable footing.
# The scaler is fitted on the training features only; the statistics learned
# there are then applied unchanged to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


# 4. Create and Train the k-NN Model
# k = 5 is a conventional first choice; try other values to tune the model.
N_NEIGHBORS = 5
model = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)

# Fitting a k-NN model is cheap compared with querying it: the real work
# (finding the nearest training points) happens at prediction time.
print("Training the k-NN model...")
model.fit(X_train_scaled, y_train)
print("Model training complete.")


# 5. Make Predictions on the Test Data
# One predicted label per row of the scaled test matrix.
print("Making predictions on the test set...")
y_pred = model.predict(X_test_scaled)
print("Prediction complete.")


# 6. Evaluate the Model's Performance
print("\n--- k-NN Model Evaluation ---")

# Precision, recall and F1 are averaged over classes weighted by each class's
# support; zero_division=0 scores a metric as 0 (instead of warning) when its
# denominator is zero, e.g. for a class that is never predicted.
weighted_options = {'average': 'weighted', 'zero_division': 0}

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted_options)
recall = recall_score(y_test, y_pred, **weighted_options)
f1 = f1_score(y_test, y_pred, **weighted_options)

# Headline numbers, four decimal places each.
summary_rows = (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
)
for metric_label, metric_value in summary_rows:
    print(f"{metric_label}: {metric_value:.4f}")

# Per-class breakdown.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

Loading training data...
Loading testing data...
Preprocessing data...
Training the k-NN model...
Model training complete.
Making predictions on the test set...
Prediction complete.

--- k-NN Model Evaluation ---
Accuracy: 0.6930
Precision: 0.5841
Recall: 0.6930
F1-Score: 0.5987

Classification Report:
                 precision    recall  f1-score   support

        apache2       0.00      0.00      0.00       737
           back       0.33      0.67      0.44       359
buffer_overflow       0.55      0.55      0.55        20
      ftp_write       0.25      0.33      0.29         3
   guess_passwd       1.00      0.02      0.05      1231
     httptunnel       0.00      0.00      0.00       133
           imap       0.00      0.00      0.00         1
        ipsweep       0.90      0.99      0.94       141
           land       1.00      0.57      0.73         7
     loadmodule       0.50      1.00      0.67         2
       mailbomb       0.00      0.00      0.00       293
          m