In [1]:
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay


In [None]:
# Load and merge dataset files
data_dir = 'CICIDS2017'

# glob yields paths in arbitrary, filesystem-dependent order. Sorting keeps the
# merged row order stable, which the seeded train/test split later relies on
# to be reproducible across machines.
all_files = sorted(glob.glob(os.path.join(data_dir, '*.csv')))
if not all_files:
    raise FileNotFoundError(f"No CSV files found in directory '{data_dir}'")

df_list = []
for file in all_files:
    try:
        # A separate name is used for the per-file frame so the merged `df`
        # is never confused with the last file read.
        frame = pd.read_csv(file, encoding="ISO-8859-1", encoding_errors="replace", low_memory=False)  # Fix encoding issues
    except Exception as e:
        # Best effort: report the unreadable file and continue with the rest.
        print(f"Error reading {file}: {e}")  # Debugging info
    else:
        df_list.append(frame)

if not df_list:
    # Fail with an explicit message instead of pd.concat's generic
    # "No objects to concatenate" error.
    raise ValueError(f"None of the {len(all_files)} CSV files in '{data_dir}' could be read")

df = pd.concat(df_list, ignore_index=True)


In [None]:
# Handle missing values
# Infinite entries are converted to NaN first so that a single dropna() removes
# both infinite and missing rows; the index is then renumbered from 0.
df = (
    df.replace([np.inf, -np.inf], np.nan)
      .dropna()
      .reset_index(drop=True)
)


In [None]:
# Checker
# Quick sanity check: print the first five rows of the cleaned frame.
print(df.iloc[:5])  # Show first few rows


In [None]:
# Strip spaces from all column names
# Leading/trailing whitespace in the CSV headers would otherwise break
# lookups by name such as df['Label'].
df.columns = df.columns.map(str.strip)


In [None]:
# Encode categorical labels
# Fit the encoder on the raw label column, then overwrite that column with
# the integer codes.
label_encoder = LabelEncoder().fit(df['Label'])
df['Label'] = label_encoder.transform(df['Label'])

# Original class name -> integer code, in the encoder's class order.
label_mapping = {
    class_name: class_code
    for class_name, class_code in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
}

In [None]:
import json

# Convert both keys and values to standard Python types
# (built-in str / int) before handing the mapping to the json module.
label_mapping_fixed = {}
for key, value in label_mapping.items():
    label_mapping_fixed[str(key)] = int(value)

# Save as JSON
with open("label_mapping.json", "w") as f:
    f.write(json.dumps(label_mapping_fixed))

print("Label encoding saved as label_mapping.json")


In [None]:
# Feature selection
# Remove the listed identifier/timestamp columns; names that are not present
# in the frame are skipped silently (errors='ignore').
drop_columns = ['Flow ID', 'Source IP', 'Destination IP', 'Timestamp']
df = df.drop(columns=drop_columns, errors='ignore')

In [None]:
# Normalize numerical features
# 'Label' holds the integer class codes produced by the LabelEncoder, so a plain
# dtype-based selection would pick it up and standardize the target into
# non-integer floats (breaking the classifiers and the saved label mapping).
# It is therefore excluded explicitly; errors='ignore' keeps this safe if
# 'Label' is not among the selected columns.
#
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the transform. Fit on the training portion
# only if a strictly leakage-free evaluation is required.
scaler = StandardScaler()
numerical_cols = (
    df.select_dtypes(include=['float64', 'int64'])
      .columns
      .drop('Label', errors='ignore')
)
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

In [None]:
# Split dataset
# 70/30 hold-out split, stratified on the label and seeded for repeatability.
y = df['Label']
X = df.drop(columns='Label')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
# Train Random Forest Model
# 100 trees, all available cores, fixed seed; fit() returns the estimator itself.
rf_model = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# Hard class predictions on the held-out set.
y_pred_rf = rf_model.predict(X_test)

In [None]:
# Train XGBoost Model
# NOTE(review): `use_label_encoder` is reported as deprecated by newer xgboost
# releases — confirm against the installed version before removing it.
xgb_model = XGBClassifier(
    n_jobs=-1,
    eval_metric='mlogloss',
    use_label_encoder=False,
)
xgb_model.fit(X_train, y_train)

# Hard class predictions on the held-out set.
y_pred_xgb = xgb_model.predict(X_test)

In [None]:
# Model Evaluation
def evaluate_model(y_true, y_pred, model_name, class_mapping=None):
    """Print a classification report and plot the confusion matrix for one model.

    Parameters
    ----------
    y_true : array-like
        Encoded ground-truth labels.
    y_pred : array-like
        Encoded labels predicted by the model.
    model_name : str
        Name used in the printed header and in the plot title.
    class_mapping : dict, optional
        Maps class name -> encoded label. When omitted, the notebook-level
        ``label_mapping`` is used.

    Returns
    -------
    None
        The report is printed and the figure is shown as side effects.
    """
    if class_mapping is None:
        class_mapping = label_mapping

    # Order names by encoded value so rows/columns and tick labels always line up.
    ordered = sorted(class_mapping.items(), key=lambda item: item[1])
    class_names = [str(name) for name, _ in ordered]
    class_ids = [code for _, code in ordered]

    print(f"{model_name} Classification Report:")
    print(classification_report(y_true, y_pred, labels=class_ids, target_names=class_names))

    # Passing `labels` fixes the matrix to one row/column per known class, even
    # when a class is missing from both y_true and y_pred; otherwise the matrix
    # could be smaller than the list of display labels.
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    disp.plot(cmap=plt.cm.Blues, xticks_rotation=45)
    plt.title(f"Confusion Matrix - {model_name}")
    plt.show()

from sklearn.metrics import accuracy_score

# Report and confusion matrix for each model, Random Forest first.
for model_name, predictions in (("Random Forest", y_pred_rf), ("XGBoost", y_pred_xgb)):
    evaluate_model(y_test, predictions, model_name)

# Overall accuracy on the held-out set.
rf_accuracy = accuracy_score(y_test, y_pred_rf)
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)

for model_name, accuracy in (("Random Forest", rf_accuracy), ("XGBoost", xgb_accuracy)):
    print(f"{model_name} Accuracy: {accuracy:.4f}")



In [None]:
# Feature Importance
rf_importances = rf_model.feature_importances_
# argsort is ascending, so the last ten positions are the ten largest importances.
rf_indices = np.argsort(rf_importances)[-10:]

bar_positions = range(len(rf_indices))
feature_names = [X.columns[i] for i in rf_indices]

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(bar_positions, rf_importances[rf_indices], color='skyblue')
ax.set_yticks(bar_positions)
ax.set_yticklabels(feature_names)
ax.set_title('Top 10 Feature Importances (Random Forest)')
plt.show()

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize

# Classes present in the held-out set, and the one-hot indicator matrix for them.
roc_classes = np.unique(y_test)
y_test_bin = label_binarize(y_test, classes=roc_classes)


def _macro_roc_auc(model, features, y_true, classes):
    """Macro-averaged one-vs-rest ROC-AUC from the model's class probabilities.

    ROC-AUC ranks samples by a continuous score. Feeding it binarized hard
    predictions reduces every curve to a single operating point, so the
    predicted probabilities are used as scores instead.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba`` and ``classes_``
    features : feature matrix to score
    y_true : encoded ground-truth labels for ``features``
    classes : sorted array of the labels present in ``y_true``

    Returns
    -------
    float
        ROC-AUC (macro average over classes when there are more than two).
    """
    proba = model.predict_proba(features)
    # Keep only the probability columns for classes that occur in y_true, so
    # the columns line up with the indicator matrix built from `classes`.
    proba = proba[:, np.isin(model.classes_, classes)]
    if len(classes) == 2:
        # Binary case: score with the probability of the larger (positive) label.
        return roc_auc_score(y_true, proba[:, 1])
    return roc_auc_score(label_binarize(y_true, classes=classes), proba, average="macro")


# Compute ROC-AUC Score for each class and take the average
rf_auc = _macro_roc_auc(rf_model, X_test, y_test, roc_classes)
xgb_auc = _macro_roc_auc(xgb_model, X_test, y_test, roc_classes)

print(f"Random Forest ROC-AUC Score: {rf_auc:.4f}")
print(f"XGBoost ROC-AUC Score: {xgb_auc:.4f}")
