In [None]:
!pip install pandas numpy joblib shap matplotlib seaborn scikit-learn xgboost lime

In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import joblib
import shap
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif
import xgboost as xgb
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import os

# ========== Load and Clean Function ==========
def load_and_prepare(file):
    """Read one CSV and normalise its header so the target column is 'label'.

    Column names are stripped of surrounding whitespace and lower-cased.
    A column named exactly 'label' is preferred; otherwise the first column
    whose name contains 'label' or 'class' is renamed to 'label'.

    Args:
        file: Path (or file-like object) accepted by ``pd.read_csv``.

    Returns:
        The loaded DataFrame with normalised column names and a 'label' column.

    Raises:
        ValueError: If no column name contains 'label' or 'class'.
    """
    df = pd.read_csv(file)
    df.columns = df.columns.str.strip().str.lower()

    if 'label' in df.columns:
        # An exact match must win; otherwise an earlier feature column such as
        # 'subclass count' would be renamed and yield two 'label' columns.
        return df

    label_col = next((col for col in df.columns if 'label' in col or 'class' in col), None)
    if label_col is None:
        raise ValueError(f"No label column found in {file}")
    return df.rename(columns={label_col: 'label'})

# ========== Load and Combine Data ==========
# Input CSVs; each one is read and header-normalised by load_and_prepare().
csv_files = [
    "data/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv",
    "data/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
    "data/Friday-WorkingHours-Morning.pcap_ISCX.csv",
    "data/Monday-WorkingHours.pcap_ISCX.csv",
    "data/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv",
    "data/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
    "data/Tuesday-WorkingHours.pcap_ISCX.csv",
    "data/Wednesday-workingHours.pcap_ISCX.csv"
]

print("\n📂 Reading and cleaning CSV files...")
dfs = []
for file in csv_files:
    # Best-effort loading: a file that cannot be read is reported and skipped
    # so the remaining files can still be used.
    try:
        df = load_and_prepare(file)
    except Exception as e:
        print(f"⚠️ Skipping {file}: {e}")
    else:
        dfs.append(df)

if not dfs:
    # pd.concat([]) would otherwise fail with "No objects to concatenate",
    # which does not point at the real cause.
    raise ValueError("No input CSV files could be loaded; check the paths in csv_files")

df_all = pd.concat(dfs, ignore_index=True)
print("Labels before encoding:", df_all['label'].unique())

# ========== Label Encoding ==========
# Replace the raw class names with the integer ids assigned by LabelEncoder.
label_encoder = LabelEncoder()
df_all['label'] = label_encoder.fit_transform(df_all['label'])
print("Encoded classes:", label_encoder.classes_)

# ========== Feature Prep ==========
# Target is the encoded label column; every other column is a feature.
y = df_all['label']
X = df_all.drop(columns=['label'])

# Infinities are turned into NaN first so one fillna() zeroes both kinds of
# invalid value.
X = X.replace([np.inf, -np.inf], np.nan).fillna(0)

# ========== Train-Test Split ==========
# 80/20 split with a fixed seed so repeated runs produce the same partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ========== Feature Scaling ==========
# The scaler is fitted on the training rows only and then applied to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
os.makedirs("models", exist_ok=True)
joblib.dump(scaler, "models/scaler.pkl")

# ========== Feature Selection ==========
print("\n🎯 Performing Feature Selection...")
# Keep the 20 features with the highest ANOVA F-score against the label.
selector = SelectKBest(f_classif, k=20)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)
selected_indices = selector.get_support(indices=True)
selected_feature_names = X.columns[selected_indices]
joblib.dump(selector, "models/selector.pkl")

# ========== Save test data and training artifacts ==========
# Everything is written under models/ because that is where the evaluation
# cell loads the selector, test split and training artifacts from.
joblib.dump(X_test, "models/X_test.pkl")
joblib.dump(y_test, "models/y_test.pkl")
# Needed by the evaluation cell to build its LIME explainer.
joblib.dump(X_train_selected, "models/X_train_selected.pkl")
joblib.dump(selected_feature_names, "models/selected_feature_names.pkl")
joblib.dump(y_train, "models/y_train.pkl")

# ========== Random Forest ==========
print("\n🌲 Training Random Forest...")
# Exhaustive 3-fold grid search over forest size and tree depth
# (2 x 3 = 6 candidate settings).
rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid={
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None],
    },
    cv=3,
    n_jobs=-1,
    verbose=1,
)
rf_grid.fit(X_train_selected, y_train)

# Score the best estimator found by the search on the held-out test split.
rf_best = rf_grid.best_estimator_
rf_preds = rf_best.predict(X_test_selected)
print(f"✅ RF Accuracy: {accuracy_score(y_test, rf_preds):.4f}")
print(classification_report(y_test, rf_preds))
joblib.dump(rf_best, "models/random_forest.pkl")

# ========== XGBoost ==========
print("\n📦 Training XGBoost...")
# Randomised search: 3 of the 2 x 2 x 2 = 8 possible settings are sampled and
# each is scored with 3-fold cross-validation.
xgb_random = RandomizedSearchCV(
    estimator=xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False),
    param_distributions={
        'n_estimators': [100, 150],
        'max_depth': [4, 6],
        'learning_rate': [0.1, 0.2],
    },
    n_iter=3,
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
xgb_random.fit(X_train_selected, y_train)

# Score the best estimator found by the search on the held-out test split.
xgb_best = xgb_random.best_estimator_
xgb_preds = xgb_best.predict(X_test_selected)
print(f"✅ XGB Accuracy: {accuracy_score(y_test, xgb_preds):.4f}")
print(classification_report(y_test, xgb_preds))
joblib.dump(xgb_best, "models/xgboost.pkl")

# ========== SHAP Explainability ==========
print("\n🧠 Generating SHAP summary plot...")
explainer = shap.Explainer(xgb_best)
shap_values = explainer(X_test_selected)
# The selected features are a bare array; wrap them in a DataFrame so the
# plot can show the selected column names.
shap.summary_plot(
    shap_values,
    pd.DataFrame(X_test_selected, columns=selected_feature_names),
    show=False,
)
# show=False keeps the figure open so it can be written to disk, then closed.
plt.savefig("shap_summary.png")
plt.close()
print("SHAP summary plot saved as shap_summary.png")

# ========== Deep Neural Network ==========
print("\n🧠 Training Deep Neural Network...")
# Size the softmax layer from the fitted encoder rather than from y_train:
# if a rare class is absent from the training split, counting the unique
# training labels would give fewer outputs than the largest label id needs.
num_classes = len(label_encoder.classes_)
# Two hidden ReLU layers with dropout, then one softmax output per class.
model = Sequential([
    Dense(128, input_shape=(X_train_selected.shape[1],), activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])
# Labels are integer ids (not one-hot), hence the sparse cross-entropy loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Add a callback to show progress after each epoch
class PrintEpochProgress(tf.keras.callbacks.Callback):
    """Keras callback that prints a one-line metric summary after every epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Print loss/accuracy and their validation counterparts for *epoch*.

        Args:
            epoch: Zero-based epoch index; printed one-based.
            logs: Metric dictionary for the epoch. May be None, in which case
                every metric is reported as 0.
        """
        # The signature allows logs=None, so never subscript it directly.
        logs = logs or {}
        print(f"Epoch {epoch+1}: loss={logs.get('loss', 0):.4f}, acc={logs.get('accuracy', 0):.4f}, val_loss={logs.get('val_loss', 0):.4f}, val_acc={logs.get('val_accuracy', 0):.4f}")

# Stop once validation loss has gone 3 epochs without improving, and restore
# the best weights seen during training.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# 10% of the training data is held back by Keras for the validation metrics
# that early stopping monitors; verbose=1 shows the per-batch progress bar.
history = model.fit(
    X_train_selected,
    y_train,
    validation_split=0.1,
    epochs=20,
    batch_size=32,
    verbose=1,
    callbacks=[PrintEpochProgress(), early_stop],
)

# The predicted class is the index of the largest softmax probability per row.
dnn_preds = model.predict(X_test_selected).argmax(axis=1)
print(f"✅ DNN Accuracy: {accuracy_score(y_test, dnn_preds):.4f}")
print(classification_report(y_test, dnn_preds))
model.save("models/dnn_model.keras")

# ========== Accuracy Summary ==========
print("\n📊 Final Accuracy Comparison:")
# Names are padded by hand so the colons line up in the printed table.
for display_name, predictions in (
    ("Random Forest ", rf_preds),
    ("XGBoost       ", xgb_preds),
    ("DNN           ", dnn_preds),
):
    print(f"{display_name}: {accuracy_score(y_test, predictions):.4f}")

In [None]:
import pandas as pd
import numpy as np
import joblib
import shap
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif
import xgboost as xgb
import os
import warnings
warnings.filterwarnings("ignore")

# ========== Load and Clean Function ==========
def load_and_prepare(file):
    """Read one CSV and normalise its header so the target column is 'label'.

    Column names are stripped of surrounding whitespace and lower-cased.
    A column named exactly 'label' is preferred; otherwise the first column
    whose name contains 'label' or 'class' is renamed to 'label'.

    Args:
        file: Path (or file-like object) accepted by ``pd.read_csv``.

    Returns:
        The loaded DataFrame with normalised column names and a 'label' column.

    Raises:
        ValueError: If no column name contains 'label' or 'class'.
    """
    df = pd.read_csv(file)
    df.columns = df.columns.str.strip().str.lower()

    if 'label' in df.columns:
        # An exact match must win; otherwise an earlier feature column such as
        # 'subclass count' would be renamed and yield two 'label' columns.
        return df

    label_col = next((col for col in df.columns if 'label' in col or 'class' in col), None)
    if label_col is None:
        raise ValueError(f"No label column found in {file}")
    return df.rename(columns={label_col: 'label'})

# ========== Load and Combine Data ==========
# Input CSVs; each one is read and header-normalised by load_and_prepare().
csv_files = [
    "data/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv",
    "data/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
    "data/Friday-WorkingHours-Morning.pcap_ISCX.csv",
    "data/Monday-WorkingHours.pcap_ISCX.csv",
    "data/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv",
    "data/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
    "data/Tuesday-WorkingHours.pcap_ISCX.csv",
    "data/Wednesday-workingHours.pcap_ISCX.csv"
]

print("\n📂 Reading and cleaning CSV files...")
dfs = []
for file in csv_files:
    # Best-effort loading: a file that cannot be read is reported and skipped
    # so the remaining files can still be used.
    try:
        df = load_and_prepare(file)
    except Exception as e:
        print(f"⚠️ Skipping {file}: {e}")
    else:
        dfs.append(df)

if not dfs:
    # pd.concat([]) would otherwise fail with "No objects to concatenate",
    # which does not point at the real cause.
    raise ValueError("No input CSV files could be loaded; check the paths in csv_files")

df_all = pd.concat(dfs, ignore_index=True)
print("Labels before encoding:", df_all['label'].unique())

# ========== Label Encoding ==========
# Replace the raw class names with the integer ids assigned by LabelEncoder.
label_encoder = LabelEncoder()
df_all['label'] = label_encoder.fit_transform(df_all['label'])
print("Encoded classes:", label_encoder.classes_)

# ========== Feature Prep ==========
# Target is the encoded label column; every other column is a feature.
y = df_all['label']
X = df_all.drop(columns=['label'])

# Infinities are turned into NaN first so one fillna() zeroes both kinds of
# invalid value.
X = X.replace([np.inf, -np.inf], np.nan).fillna(0)

# ========== Train-Test Split ==========
# 80/20 split with a fixed seed so repeated runs produce the same partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ========== Feature Scaling ==========
# The scaler is fitted on the training rows only and then applied to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
os.makedirs("models", exist_ok=True)
joblib.dump(scaler, "models/scaler.pkl")

# ========== Feature Selection ==========
print("\n🎯 Performing Feature Selection...")
# Keep the 20 features with the highest ANOVA F-score against the label.
selector = SelectKBest(f_classif, k=20)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)
selected_indices = selector.get_support(indices=True)
selected_feature_names = X.columns[selected_indices]
joblib.dump(selector, "models/selector.pkl")

# ========== Save test data and training artifacts ==========
joblib.dump(X_test, "models/X_test.pkl")
joblib.dump(y_test, "models/y_test.pkl")
# The evaluation cell loads these three files to build its LIME explainer,
# so they have to be written alongside the test split.
joblib.dump(X_train_selected, "models/X_train_selected.pkl")
joblib.dump(selected_feature_names, "models/selected_feature_names.pkl")
joblib.dump(y_train, "models/y_train.pkl")

# ========== XGBoost ==========
print("\n📦 Training XGBoost with early stopping and progress...")
# Early stopping is monitored on a validation slice carved out of the training
# data. Monitoring the test split would pick the number of trees using the
# same rows the accuracy below is reported on.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_selected, y_train, test_size=0.1, random_state=42
)
# 'logloss' is a binary metric and fails during evaluation when the model
# emits one probability per class, so use 'mlogloss' for more than two classes.
xgb_eval_metric = 'mlogloss' if len(label_encoder.classes_) > 2 else 'logloss'
# early_stopping_rounds is passed to the constructor because current XGBoost
# releases no longer accept it in fit(). use_label_encoder is omitted: it is
# deprecated, and the labels are already integer-encoded.
xgb_model = xgb.XGBClassifier(
    eval_metric=xgb_eval_metric,
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    early_stopping_rounds=10
)
# The last entry of eval_set is the one early stopping watches.
eval_set = [(X_fit, y_fit), (X_val, y_val)]
xgb_model.fit(
    X_fit, y_fit,
    eval_set=eval_set,
    verbose=True
)
xgb_preds = xgb_model.predict(X_test_selected)
print(f"✅ XGB Accuracy: {accuracy_score(y_test, xgb_preds):.4f}")
print(classification_report(y_test, xgb_preds))
joblib.dump(xgb_model, "models/xgboost.pkl")

# ========== SHAP Explainability ==========
print("\n🧠 Generating SHAP summary plot...")
explainer = shap.Explainer(xgb_model)
shap_values = explainer(X_test_selected)
# The selected features are a bare array; wrap them in a DataFrame so the
# plot can show the selected column names.
shap.summary_plot(
    shap_values,
    pd.DataFrame(X_test_selected, columns=selected_feature_names),
    show=False,
)
# show=False keeps the figure open so it can be written to disk, then closed.
plt.savefig("shap_summary.png")
plt.close()
print("SHAP summary plot saved as shap_summary.png")

In [None]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import xgboost as xgb
from tensorflow.keras.models import load_model, Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import shap
import lime
import lime.lime_tabular
import seaborn as sns
import matplotlib.pyplot as plt

# Load test data, selector, scaler, and models
# NOTE: joblib.load unpickles its input; only load artifacts produced by a
# trusted training run.
X_test = joblib.load("models/X_test.pkl")
y_test = joblib.load("models/y_test.pkl")
selector = joblib.load("models/selector.pkl")
scaler = joblib.load("models/scaler.pkl")
xgb_model = joblib.load("models/xgboost.pkl")
dnn_model = load_model("models/dnn_model.keras")
rf_model = joblib.load("models/random_forest.pkl")
# Training-side artifacts, used further down to build the LIME explainer.
X_train_selected = joblib.load("models/X_train_selected.pkl")
selected_feature_names = joblib.load("models/selected_feature_names.pkl")
y_train = joblib.load("models/y_train.pkl")

# Scale and select features
# Apply the fitted scaler, then keep only the columns the selector retained.
X_test_scaled = scaler.transform(X_test)
selected_idx = selector.get_support(indices=True)
X_test_selected = X_test_scaled[:, selected_idx]

# Random Forest predictions
rf_preds = rf_model.predict(X_test_selected)
print("Random Forest Test Results:")
print(f"Accuracy: {accuracy_score(y_test, rf_preds):.4f}")
print(classification_report(y_test, rf_preds))

# XGBoost predictions
xgb_preds = xgb_model.predict(X_test_selected)
print("XGBoost Test Results:")
print(f"Accuracy: {accuracy_score(y_test, xgb_preds):.4f}")
print(classification_report(y_test, xgb_preds))


# DNN predictions
# The network returns one probability per class; the predicted class is the
# index of the largest one in each row.
dnn_probs = dnn_model.predict(X_test_selected)
dnn_preds = dnn_probs.argmax(axis=1)
print("DNN Test Results:")
print(f"Accuracy: {accuracy_score(y_test, dnn_preds):.4f}")
print(classification_report(y_test, dnn_preds))


# Prepare LIME explainer
# The explanation pages are written under html/, so make sure it exists.
os.makedirs("html", exist_ok=True)
explainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=np.array(X_train_selected),
    feature_names=list(selected_feature_names),
    class_names=[str(cls) for cls in np.unique(y_train)],
    mode='classification'
)

# Explain the first 5 test samples, or fewer if the test set is smaller.
exp = None
for i in range(min(5, len(X_test_selected))):
    exp = explainer.explain_instance(
        X_test_selected[i],
        xgb_model.predict_proba,
        num_features=10
    )
    exp.save_to_file(f'html/lime_xgboost_explanation_{i}.html')
    print(f"LIME explanation for sample {i} saved as lime_xgboost_explanation_{i}.html")
    print(exp.as_list())

# Also write the last explanation under the un-numbered file name; skipped
# when there were no samples to explain.
if exp is not None:
    exp.save_to_file('html/lime_xgboost_explanation.html')
    print("LIME explanation saved as lime_xgboost_explanation.html")


# SHAP summary plot for XGBoost
# savefig does not create missing directories, so make sure Output/ exists.
os.makedirs("Output", exist_ok=True)
explainer = shap.Explainer(xgb_model)
shap_values = explainer(X_test_selected)
# Wrap the selected features in a DataFrame so the plot can show the names of
# the columns the selector kept.
shap.summary_plot(shap_values, pd.DataFrame(X_test_selected, columns=X_test.columns[selector.get_support()]), show=False)
plt.savefig("Output/shap_summary.png")
plt.close()
print("SHAP summary plot saved as shap_summary.png")


# Plot confusion matrix for XGBoost
# savefig does not create missing directories, so make sure Output/ exists.
os.makedirs("Output", exist_ok=True)
# The matrix is indexed by every class seen in either the true or the
# predicted labels, so the tick labels must use that same union. Taking them
# from y_test alone mislabels the axes when a class is predicted but absent
# from the test set.
cm_labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(xgb_preds)]))
cm = confusion_matrix(y_test, xgb_preds, labels=cm_labels)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='YlGnBu', cbar=True, xticklabels=cm_labels, yticklabels=cm_labels)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('XGBoost Confusion Matrix (Seaborn Heatmap)')
plt.tight_layout()
plt.savefig("Output/xgboost_confusion_matrix.png")
plt.close()
print("XGBoost confusion matrix plot saved as xgboost_confusion_matrix.png")

# Plot classification report as a heatmap
# savefig does not create missing directories, so make sure Output/ exists.
os.makedirs("Output", exist_ok=True)
report = classification_report(y_test, xgb_preds, output_dict=True)
report_df = pd.DataFrame(report).transpose()
plt.figure(figsize=(10, 6))
# Drop the last row and last column of the transposed report before plotting.
# NOTE(review): these are presumably the 'weighted avg' row and 'support'
# column - confirm against the report's actual layout.
sns.heatmap(report_df.iloc[:-1, :-1], annot=True, cmap="YlGnBu")
plt.title("XGBoost Classification Report Heatmap")
plt.savefig("Output/xgboost_classification_report.png")
plt.close()
print("XGBoost classification report heatmap saved as xgboost_classification_report.png")


# Save all accuracies to results.tex
rf_accuracy = accuracy_score(y_test, rf_preds)
xgb_accuracy = accuracy_score(y_test, xgb_preds)
dnn_accuracy = accuracy_score(y_test, dnn_preds)

# open() does not create missing directories, so make sure Result/ exists.
os.makedirs("Result", exist_ok=True)
with open("Result/results.tex", "w") as f:
    # One LaTeX section with a bold-labelled line per model; each line ends
    # in a LaTeX line break (\\).
    f.writelines([
        "\\section*{Model Accuracy Results}\n",
        f"\\textbf{{Random Forest Accuracy}}: {rf_accuracy:.4f}\\\\\n",
        f"\\textbf{{XGBoost Accuracy}}: {xgb_accuracy:.4f}\\\\\n",
        f"\\textbf{{DNN Accuracy}}: {dnn_accuracy:.4f}\\\\\n",
    ])