# Network Intrusion Detection with a Random Forest

## 1. Import Libraries

In [32]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

## 2. Load Dataset

In [33]:
# Column names for the header-less data file: the feature columns followed by
# the attack `label` and a `difficulty` column.
columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes',
    'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
    'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
    'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
    'num_access_files', 'num_outbound_cmds', 'is_host_login',
    'is_guest_login', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate', 'label', 'difficulty',
]

# NOTE(review): the frame is named `train_df` but is read from the KDDTest+
# file -- confirm that training on (a split of) the test file is intentional.
DATA_PATH = 'data/KDDTest+.txt'
train_df = pd.read_csv(DATA_PATH, header=None, names=columns)

# shape[1] counts every column, including `label` and `difficulty`, so it is
# reported as a column count rather than a feature count.
n_rows, n_cols = train_df.shape
print(f"Dataset loaded: {n_rows} samples, {n_cols} columns")
print(f"\nLabel distribution:\n{train_df['label'].value_counts()}")

Dataset loaded: 22544 samples, 43 features

Label distribution:
label
normal             9711
neptune            4657
guess_passwd       1231
mscan               996
warezmaster         944
apache2             737
satan               735
processtable        685
smurf               665
back                359
snmpguess           331
saint               319
mailbomb            293
snmpgetattack       178
portsweep           157
ipsweep             141
httptunnel          133
nmap                 73
pod                  41
buffer_overflow      20
multihop             18
named                17
ps                   15
sendmail             14
xterm                13
rootkit              13
teardrop             12
xlock                 9
land                  7
xsnoop                4
ftp_write             3
loadmodule            2
worm                  2
perl                  2
sqlattack             2
udpstorm              2
phf                   2
imap                  1
Name: count, dtype

## 3. Preprocess Data

In [36]:
# Binary classification target: 0 for 'normal' rows, 1 for every other label.
train_df['is_attack'] = (train_df['label'] != 'normal').astype(int)

# The outer quotes are double so the single-quoted column key inside the braces
# also parses on Python versions before 3.12, where reusing the outer quote
# inside an f-string is a SyntaxError.
print(f"\nNormal: {(train_df['is_attack'] == 0).sum()}")
print(f"Attack: {(train_df['is_attack'] == 1).sum()}")

# Get rid of columns we don't need to train the model: the raw label, the
# derived target, and the difficulty column.
X = train_df.drop(columns=['label', 'is_attack', 'difficulty'])
y = train_df['is_attack']

# Encode categorical features (convert text to integer codes)
categorical_cols = ['protocol_type', 'service', 'flag']

# Keep each fitted encoder so the same text-to-code mapping can be reused or
# inverted later instead of being thrown away after the loop.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

# NOTE(review): the scaler is fit on every row, i.e. before the train/test
# split performed in the training step, so held-out rows contribute to the
# scaling statistics. Consider fitting it on the training rows only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(f"\nPreprocessing complete. Data shape: {X_scaled.shape}")


Normal: 9711
Attack: 12833

Preprocessing complete. Data shape: (22544, 41)


## 4. Train Model

In [None]:
# RandomForestClassifier and train_test_split are already imported in the
# notebook's import cell, so they are not re-imported here.

# Hold out 30% of the rows for evaluation; one fixed seed drives both the split
# and the forest so repeated runs give the same result.
TEST_SIZE = 0.3
SEED = 69

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=TEST_SIZE, random_state=SEED
)

print(f'Training set: {X_train.shape[0]} samples')
print(f'Test set: {X_test.shape[0]} samples')

# I tried 1, 10, and 100 estimators. Even 1 estimator had 97.5% accuracy, so
# the data might be too obvious here.
# Going with 50 just in case someone tries to use a more complex dataset.
model = RandomForestClassifier(n_estimators=50, random_state=SEED)

print('\nTraining the model...')
model.fit(X_train, y_train)
print("Training complete!")


Training set: 15780 samples
Test set: 6764 samples

Training the model...


## 5. Evaluate Model

In [38]:
# Score the held-out rows with the fitted model, then compute every metric
# up front so the reporting below is just printing.
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)
class_names = ['Normal', 'Attack']  # target 0 is normal, target 1 is attack
report = classification_report(y_test, predictions, target_names=class_names)

print("\n=== Results ===")
print(f"Accuracy: {accuracy:.2%}")

# Rows of the matrix are the true classes, columns the predicted classes.
print("\nConfusion Matrix:")
print(conf_matrix)
print("(Top-left: correctly identified normal)")
print("(Bottom-right: correctly identified attacks)")

print("\nClassification Report:")
print(report)


=== Results ===
Accuracy: 98.76%

Confusion Matrix:
[[2849   44]
 [  40 3831]]
(Top-left: correctly identified normal)
(Bottom-right: correctly identified attacks)

Classification Report:
              precision    recall  f1-score   support

      Normal       0.99      0.98      0.99      2893
      Attack       0.99      0.99      0.99      3871

    accuracy                           0.99      6764
   macro avg       0.99      0.99      0.99      6764
weighted avg       0.99      0.99      0.99      6764

