In [2]:
import pandas as pd

# Column names (same as before)
columns = [
    "duration","protocol_type","service","flag","src_bytes","dst_bytes","land",
    "wrong_fragment","urgent","hot","num_failed_logins","logged_in","num_compromised",
    "root_shell","su_attempted","num_root","num_file_creations","num_shells",
    "num_access_files","num_outbound_cmds","is_host_login","is_guest_login",
    "count","srv_count","serror_rate","srv_serror_rate","rerror_rate","srv_rerror_rate",
    "same_srv_rate","diff_srv_rate","srv_diff_host_rate","dst_host_count",
    "dst_host_srv_count","dst_host_same_srv_rate","dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate","dst_host_srv_diff_host_rate","dst_host_serror_rate",
    "dst_host_srv_serror_rate","dst_host_rerror_rate","dst_host_srv_rerror_rate",
    "label", "difficulty_level"
]

# ✅ Corrected filenames
train_df = pd.read_csv("data/NSL_KDD_Train.csv", names=columns)
test_df = pd.read_csv("data/NSL_KDD_Test.csv", names=columns)

# Display the first few rows
train_df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label,difficulty_level
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,


In [6]:
from sklearn.preprocessing import LabelEncoder

# Step 1: (Skip difficulty_level — not present in your dataset)

# Step 2: Encode categorical features
categorical_cols = ['protocol_type', 'service', 'flag']
encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])
    encoders[col] = le

# Step 3: Convert attack labels into binary (Normal vs Attack)
train_df['label'] = train_df['label'].apply(lambda x: 0 if x == 'normal' else 1)
test_df['label'] = test_df['label'].apply(lambda x: 0 if x == 'normal' else 1)

# Step 4: Split features and labels
X_train = train_df.drop('label', axis=1)
y_train = train_df['label']

X_test = test_df.drop('label', axis=1)
y_test = test_df['label']

# ✅ Preview
print("X_train shape:", X_train.shape)
print("y_train value counts:")
print(y_train.value_counts())

X_train shape: (125973, 41)
y_train value counts:
label
1    125973
Name: count, dtype: int64


In [7]:
train_df['label'].unique()

array([1])

In [8]:
# Load raw file without column names to examine structure
df_raw = pd.read_csv("data/NSL_KDD_Train.csv", header=None)

# Show the last few columns to locate the actual label column
df_raw.iloc[0:5, -5:]  # display last 5 columns of first 5 rows

Unnamed: 0,37,38,39,40,41
0,0.0,0.0,0.05,0.0,normal
1,0.0,0.0,0.0,0.0,normal
2,1.0,1.0,0.0,0.0,neptune
3,0.03,0.01,0.0,0.01,normal
4,0.0,0.0,0.0,0.0,normal


In [9]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Step 1: Reload raw data without headers
train_df = pd.read_csv("data/NSL_KDD_Train.csv", header=None)
test_df = pd.read_csv("data/NSL_KDD_Test.csv", header=None)

# Step 2: Assign column names (42 features + label)
columns = [
    "duration","protocol_type","service","flag","src_bytes","dst_bytes","land",
    "wrong_fragment","urgent","hot","num_failed_logins","logged_in","num_compromised",
    "root_shell","su_attempted","num_root","num_file_creations","num_shells",
    "num_access_files","num_outbound_cmds","is_host_login","is_guest_login",
    "count","srv_count","serror_rate","srv_serror_rate","rerror_rate","srv_rerror_rate",
    "same_srv_rate","diff_srv_rate","srv_diff_host_rate","dst_host_count",
    "dst_host_srv_count","dst_host_same_srv_rate","dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate","dst_host_srv_diff_host_rate","dst_host_serror_rate",
    "dst_host_srv_serror_rate","dst_host_rerror_rate","dst_host_srv_rerror_rate",
    "label"
]

train_df = train_df.iloc[:, :42]  # discard the last column (difficulty if present)
test_df = test_df.iloc[:, :42]
train_df.columns = columns
test_df.columns = columns

# Step 3: Encode categorical features
categorical_cols = ['protocol_type', 'service', 'flag']
encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])
    encoders[col] = le

# Step 4: Encode the label column: normal = 0, attack = 1
train_df['label'] = train_df['label'].apply(lambda x: 0 if x == 'normal' else 1)
test_df['label'] = test_df['label'].apply(lambda x: 0 if x == 'normal' else 1)

# Step 5: Split into features and targets
X_train = train_df.drop('label', axis=1)
y_train = train_df['label']
X_test = test_df.drop('label', axis=1)
y_test = test_df['label']

# ✅ Preview
print("X_train shape:", X_train.shape)
print("y_train value counts:")
print(y_train.value_counts())

X_train shape: (125973, 41)
y_train value counts:
label
0    67343
1    58630
Name: count, dtype: int64


SyntaxError: invalid non-printable character U+00A0 (2739124422.py, line 14)

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Step 1: Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 2: Predict
y_pred = model.predict(X_test)

# Step 3: Evaluate
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.7706706884315118

Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.97      0.78      9711
           1       0.97      0.62      0.75     12833

    accuracy                           0.77     22544
   macro avg       0.81      0.80      0.77     22544
weighted avg       0.83      0.77      0.77     22544


Confusion Matrix:
 [[9434  277]
 [4893 7940]]
