In [64]:
import os
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [65]:
# Folder that holds the NSL-KDD ARFF files. Set the NSL_KDD_DIR environment
# variable to point somewhere else; when it is unset (or empty) the original
# hard-coded location is used, so existing setups keep working unchanged.
DATA_DIR = Path(os.environ.get('NSL_KDD_DIR') or r'C:\Users\HP\OneDrive\Desktop\nsl_kdd\nsl-kdd')

# comment='@' makes pandas treat everything from an '@' onward as a comment,
# which skips lines that begin with '@' (presumably the ARFF header
# declarations). header=None because the remaining rows carry no column names.
train_df = pd.read_csv(DATA_DIR / 'KDDTrain+.arff', comment='@', header=None)
test_df = pd.read_csv(DATA_DIR / 'KDDTest+.arff', comment='@', header=None)

In [66]:
# Name every column: generic f0, f1, ... for the features and 'label' for the
# last column. Later cells address columns by these names ('f1'..'f3', 'label').
num_features = len(train_df.columns) - 1
cols = [*(f"f{i}" for i in range(num_features)), 'label']
for frame in (train_df, test_df):
    # Both frames receive the same names; pandas raises if the counts differ.
    frame.columns = cols

In [67]:
# -----------------------------
# 3. Integer-encode the categorical feature columns
# -----------------------------
categorical_cols = ['f1', 'f2', 'f3']  # protocol_type, service, flag
for col in categorical_cols:
    # Fit on the values of both splits so a category that appears only in the
    # test split still has a code when it is transformed.
    combined = pd.concat([train_df[col], test_df[col]])
    le = LabelEncoder().fit(combined)
    for frame in (train_df, test_df):
        frame[col] = le.transform(frame[col])

In [68]:
# -----------------------------
# 4. Binarise the target: 'normal' -> 0, anything else -> 1
# -----------------------------
def _binary_label(raw_label):
    """Return 0 when the whitespace-stripped label is 'normal', otherwise 1."""
    if raw_label.strip() == 'normal':
        return 0
    return 1


# The helper calls .strip(), so it expects the labels to still be strings.
train_df['label'] = train_df['label'].map(_binary_label)
test_df['label'] = test_df['label'].map(_binary_label)

In [69]:
# -----------------------------
# 5. Separate the feature columns from the target column
# -----------------------------
# X_* holds every column except 'label'; y_* is the 'label' column itself.
X_train, y_train = train_df.drop(columns='label'), train_df['label']
X_test, y_test = test_df.drop(columns='label'), test_df['label']

In [70]:
# -----------------------------
# 6. Standardise the features
# -----------------------------
# The scaler is fitted on the training features only and then applied to
# both splits, so the test split does not influence the scaling parameters.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [71]:
# -----------------------------
# 7. Oversample the training split with SMOTE
# -----------------------------
# Only the training data is resampled; the test split is left as loaded.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Report the class counts before and after resampling.
for caption, labels in (("Before SMOTE:", y_train), ("After SMOTE:", y_train_res)):
    print(caption, Counter(labels))

Before SMOTE: Counter({0: 67343, 1: 58630})
After SMOTE: Counter({0: 67343, 1: 67343})


In [72]:
# -----------------------------
# 8. Keep the 30 highest-scoring features (ANOVA F-test)
# -----------------------------
# The selector is fitted on the resampled training data and the same column
# subset is then applied to the test matrix.
# NOTE(review): the recorded output shows a warning from the F-statistic
# division (f = msb / msw) -- presumably a zero-variance column; TODO confirm.
selector = SelectKBest(score_func=f_classif, k=30)
selector.fit(X_train_res, y_train_res)
X_train_res = selector.transform(X_train_res)
X_test = selector.transform(X_test)

  f = msb / msw


In [73]:
# -----------------------------
# 9. Fit the random forest classifier
# -----------------------------
# random_state is fixed so repeated runs build the same forest.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=15,
    class_weight='balanced',
    random_state=42,
)
rf.fit(X_train_res, y_train_res)

In [74]:
# -----------------------------
# 10. Predictions and Evaluation
# -----------------------------
# Predict a class for every test row with the fitted forest. X_test at this
# point is the scaled, feature-selected matrix produced by the earlier cells,
# and the training labels were encoded as 0 ('normal') / 1 (anything else).
y_pred = rf.predict(X_test)

In [75]:
# Compute the three evaluation artefacts first, then print them in order:
# overall accuracy, the confusion matrix and the per-class report.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("\nAccuracy:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)


Accuracy: 0.7662792760823279

Confusion Matrix:
 [[9445  266]
 [5003 7830]]

Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.97      0.78      9711
           1       0.97      0.61      0.75     12833

    accuracy                           0.77     22544
   macro avg       0.81      0.79      0.77     22544
weighted avg       0.83      0.77      0.76     22544

