<a href="https://colab.research.google.com/github/TamirMareli/Network-Traffic-Anomaly-Detection/blob/main/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ==========================================
# MASTER UPGRADE: MULTICLASS MODEL
# ==========================================
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# 1. LOAD DATA
print("Loading NSL-KDD Data...")
col_names = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label", "difficulty_level"]

train_url = 'https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain+.txt'
test_url = 'https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTest+.txt'

df_train = pd.read_csv(train_url, header=None, names=col_names)
df_test = pd.read_csv(test_url, header=None, names=col_names)

# 2. DEFINE ATTACK MAPPING (The 4 Categories)
# This dictionary maps every weird attack name to a main category
attack_mapping = {
    'normal': 0,
    # DoS
    'back': 1, 'land': 1, 'neptune': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1, 'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1,
    # Probe
    'satan': 2, 'ipsweep': 2, 'nmap': 2, 'portsweep': 2, 'mscan': 2, 'saint': 2,
    # R2L
    'guess_passwd': 3, 'ftp_write': 3, 'imap': 3, 'phf': 3, 'multihop': 3, 'warezmaster': 3, 'warezclient': 3, 'spy': 3, 'xlock': 3, 'xsnoop': 3, 'snmpguess': 3, 'snmpgetattack': 3, 'httptunnel': 3, 'sendmail': 3, 'named': 3,
    # U2R
    'buffer_overflow': 4, 'loadmodule': 4, 'perl': 4, 'rootkit': 4, 'ps': 4, 'xterm': 4, 'sqlattack': 4
}

# Apply mapping
print("Mapping labels to 5 categories (Normal, DoS, Probe, R2L, U2R)...")
df_train['label_code'] = df_train['label'].map(lambda x: attack_mapping.get(x, 0)) # Default to normal if unknown
df_test['label_code'] = df_test['label'].map(lambda x: attack_mapping.get(x, 0))

# 3. ENCODING & SCALING
categorical_cols = ['protocol_type', 'service', 'flag']
df_train_encoded = pd.get_dummies(df_train, columns=categorical_cols)
df_test_encoded = pd.get_dummies(df_test, columns=categorical_cols)
df_test_encoded = df_test_encoded.reindex(columns=df_train_encoded.columns, fill_value=0)

X_train = df_train_encoded.drop(['label', 'label_code', 'difficulty_level'], axis=1)
y_train = df_train['label_code']
X_test = df_test_encoded.drop(['label', 'label_code', 'difficulty_level'], axis=1)
y_test = df_test['label_code']

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

# 4. TRAIN MULTICLASS XGBOOST
print("Training Multiclass XGBoost Model...")
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    objective='multi:softmax',  # This tells XGBoost to predict categories
    num_class=5,                # We have 5 categories (0-4)
    eval_metric='mlogloss'
)
xgb_model.fit(X_train_scaled, y_train)

# 5. EVALUATE
print("Evaluating...")
y_pred = xgb_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred, target_names=['Normal', 'DoS', 'Probe', 'R2L', 'U2R']))

# 6. SAVE ARTIFACTS
print("Saving files...")
joblib.dump(xgb_model, 'xgb_model_multi.pkl') # Note the new name
joblib.dump(scaler, 'scaler.pkl')

# Save a bigger sample for the live simulation (100 rows)
sample_data = X_test.head(100)
sample_data.to_csv('simulation_input.csv', index=False)

print("\nDONE! Download these 3 files:")
print("1. xgb_model_multi.pkl")
print("2. scaler.pkl")
print("3. simulation_input.csv")

Loading NSL-KDD Data...
Mapping labels to 5 categories (Normal, DoS, Probe, R2L, U2R)...
Training Multiclass XGBoost Model...
Evaluating...
              precision    recall  f1-score   support

      Normal       0.67      0.97      0.80      9713
         DoS       0.96      0.83      0.89      7458
       Probe       0.80      0.64      0.71      2421
         R2L       0.99      0.06      0.11      2885
         U2R       0.73      0.12      0.21        67

    accuracy                           0.77     22544
   macro avg       0.83      0.52      0.54     22544
weighted avg       0.82      0.77      0.73     22544

Saving files...

DONE! Download these 3 files:
1. xgb_model_multi.pkl
2. scaler.pkl
3. simulation_input.csv
