In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [8]:
# Column names for the headerless data file: the traffic features, followed
# by the attack 'label' and a 'difficulty' column.
train_columns = [
    'duration', 'protocol_type', 'service', 'flag',
    'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
    'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
    'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
    'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login',
    'count', 'srv_count',
    'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
    'label', 'difficulty',
]

# NOTE(review): the file read here is 'KDDTest+.txt' even though the frame is
# named train_df -- confirm this is the intended split.
train_df = pd.read_csv('data/KDDTest+.txt', header=None)

# Assigning the names after loading raises if the file's column count does
# not match the schema above.
train_df.columns = train_columns

# Quick look at the data: first rows, overall shape, class balance, dtypes.
print(train_df.head())
print('\nDataset shape: {}'.format(train_df.shape))

label_counts = train_df['label'].value_counts()
print("\nLabel distribution:")
print(label_counts)

print("\nData types:")
print(train_df.dtypes)

   duration protocol_type   service  flag  src_bytes  dst_bytes  land  \
0         0           tcp   private   REJ          0          0     0   
1         0           tcp   private   REJ          0          0     0   
2         2           tcp  ftp_data    SF      12983          0     0   
3         0          icmp     eco_i    SF         20          0     0   
4         1           tcp    telnet  RSTO          0         15     0   

   wrong_fragment  urgent  hot  ...  dst_host_same_srv_rate  \
0               0       0    0  ...                    0.04   
1               0       0    0  ...                    0.00   
2               0       0    0  ...                    0.61   
3               0       0    0  ...                    1.00   
4               0       0    0  ...                    0.31   

   dst_host_diff_srv_rate  dst_host_same_src_port_rate  \
0                    0.06                         0.00   
1                    0.06                         0.00   
2       

In [9]:
# Binary classification (normal vs attack): 0 = 'normal', 1 = any other label.
# A vectorised comparison replaces the per-row Python lambda.
train_df['is_attack'] = train_df['label'].ne('normal').astype('int64')

# Double-quoted f-strings so the single-quoted column key inside the
# replacement field is valid on Python versions before 3.12 as well
# (reusing the enclosing quote character there is a SyntaxError pre-3.12).
print(f"\nNormal: {(train_df['is_attack'] == 0).sum()}")
print(f"Attack: {(train_df['is_attack'] == 1).sum()}")

# Get rid of columns we don't need to train the model: both forms of the
# target ('label', 'is_attack') and the 'difficulty' column.
X = train_df.drop(columns=['label', 'is_attack', 'difficulty'])
y = train_df['is_attack']


Normal: 9711
Attack: 12833


In [10]:
# Encode categorical features (convert text to numbers)
from sklearn.preprocessing import LabelEncoder

categorical_cols = ['protocol_type', 'service', 'flag']

# One fitted encoder per column, kept by column name so the same
# string -> integer mapping can later be applied to other data
# (encoders[col].transform) or reversed (encoders[col].inverse_transform).
encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    # Replaces the text column in X with its integer codes, in place.
    X[col] = le.fit_transform(X[col])
    encoders[col] = le

print("\nAfter encoding, all columns are numeric:")
print(X.dtypes.unique())


After encoding, all columns are numeric:
[dtype('int64') dtype('float64')]


In [11]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column; the scaler is fitted on, and applied to,
# the same matrix X.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Sanity check: the feature matrix and the label vector must agree on the
# number of samples.
n_samples, n_features = X_scaled.shape
print(f"\nData ready! Shape: {X_scaled.shape}")
print(f"Features (X): {n_samples} samples, {n_features} features")
print(f"Labels (y): {len(y)} samples")


Data ready! Shape: (22544, 41)
Features (X): 22544 samples, 41 features
Labels (y): 22544 samples


In [27]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Unsupervised detector: it is fitted on the scaled features only and never
# sees y. random_state is fixed so repeated runs give the same result.
model = IsolationForest(contamination=0.5, random_state=69)

print("Training the model...")
model.fit(X_scaled)

# The model is scored on the same samples it was fitted on.
raw_flags = model.predict(X_scaled)

# Map the detector's output onto the is_attack convention used for y:
# -1 becomes 1 (attack), everything else becomes 0 (normal).
predictions = [int(flag == -1) for flag in raw_flags]


Training the model...


In [28]:
# Compute every metric first, then print the report in one pass.
accuracy = accuracy_score(y, predictions)
matrix = confusion_matrix(y, predictions)
report = classification_report(y, predictions, target_names=['Normal', 'Attack'])

print("\n=== Results ===")
print(f"Accuracy: {accuracy:.2%}")

# Rows of the matrix are the true classes and columns the predicted ones,
# ordered 0 (normal) then 1 (attack) -- hence the legend printed below.
print("\nConfusion Matrix:")
print(matrix)
print("(Top-left: correctly identified normal)")
print("(Bottom-right: correctly identified attacks)")

print("\nDetailed Report:")
print(report)


=== Results ===
Accuracy: 77.02%

Confusion Matrix:
[[7901 1810]
 [3371 9462]]
(Top-left: correctly identified normal)
(Bottom-right: correctly identified attacks)

Detailed Report:
              precision    recall  f1-score   support

      Normal       0.70      0.81      0.75      9711
      Attack       0.84      0.74      0.79     12833

    accuracy                           0.77     22544
   macro avg       0.77      0.78      0.77     22544
weighted avg       0.78      0.77      0.77     22544

