In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import warnings
import requests
import json
import os

warnings.filterwarnings("ignore")

In [None]:
# Step 1. Load Dataset
df = pd.read_csv('CIC_IDS_2017_cleaned.csv')

# Define identifiers
identifiers = ['Flow ID', 'Source IP', 'Destination IP', 'Timestamp', 'Source Port', 'Destination Port', 'Protocol']
label_col = 'Label'

# Label Encoding
label_encoder = LabelEncoder()
df[label_col] = label_encoder.fit_transform(df[label_col])
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mapping:", label_mapping)

Label Mapping: {'BENIGN': np.int64(0), 'Bot': np.int64(1), 'DDoS': np.int64(2), 'DoS Hulk': np.int64(3), 'DoS Slowhttptest': np.int64(4), 'DoS slowloris': np.int64(5), 'FTP-Patator': np.int64(6), 'Infiltration': np.int64(7), 'PortScan': np.int64(8), 'SSH-Patator': np.int64(9), 'Web Attack \x96 Brute Force': np.int64(10), 'Web Attack \x96 Sql Injection': np.int64(11), 'Web Attack \x96 XSS': np.int64(12)}


In [None]:
# Step 2. Dataset Downsample
X_full = df.drop(columns=[label_col])
y_full = df[label_col]

X_down, _, y_down, _ = train_test_split(
    X_full, y_full,
    train_size=0.05,  # 5% of original dataset
    stratify=y_full,
    random_state=42
)

In [None]:
# Step 3. Split Features, Metadata, Labels
# Separate ML features and metadata
X_down_features = X_down.drop(columns=identifiers)
meta_down = X_down[identifiers]

X_train, X_test, y_train, y_test, meta_train, meta_test = train_test_split(
    X_down_features, y_down, meta_down,
    test_size=0.05,  # small test set
    random_state=42
)

In [None]:
# Step 4. Train Models
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)

xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb.fit(X_train, y_train)
xgb_preds = xgb.predict(X_test)

def evaluate_model(name, y_true, y_pred):
    print(f"\n=== {name} Evaluation ===")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("Classification Report:\n", classification_report(y_true, y_pred))

evaluate_model("Random Forest", y_test, rf_preds)
evaluate_model("XGBoost", y_test, xgb_preds)


=== Random Forest Evaluation ===
Accuracy: 0.9980882864094542
Confusion Matrix:
 [[4799    0    0    1    0    1    0    0    0    0]
 [   3    3    0    0    0    0    0    0    0    0]
 [   2    0  318    0    0    0    0    0    0    0]
 [   4    0    0  215    0    0    0    0    0    0]
 [   0    0    0    0   14    0    0    0    0    0]
 [   0    0    0    0    0   11    0    0    0    0]
 [   0    0    0    0    0    0   15    0    0    0]
 [   0    0    0    0    0    0    0  356    0    0]
 [   0    0    0    0    0    0    0    0   11    0]
 [   0    0    0    0    0    0    0    0    0    1]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4801
           1       1.00      0.50      0.67         6
           2       1.00      0.99      1.00       320
           3       1.00      0.98      0.99       219
           4       1.00      1.00      1.00        14
           5       0.92      1.00     

In [None]:
# Step 5. Prepare Predictions with Metadata
predicted_label_names = [label_encoder.inverse_transform([lbl])[0] for lbl in xgb_preds]

predictions_df = pd.concat([
    meta_test.reset_index(drop=True),
    pd.DataFrame({
        "True_Label": label_encoder.inverse_transform(y_test),
        "Predicted_Label": predicted_label_names
    })
], axis=1)

In [None]:
# Step 6. Gemini API Setup
GEMINI_API_KEY = "AIzaSyBNnV4Qix9NDA8UQpCov9SaurNWGy2dqzU"
GEMINI_API_URL = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key=" + GEMINI_API_KEY
headers = {"Content-Type": "application/json"}

def generate_playbook(incident_type, src_ip='Unknown', dst_ip='Unknown', protocol='Unknown'):
    prompt = f"""
A '{incident_type}' incident has been detected.
Source IP: {src_ip}
Destination IP: {dst_ip}
Protocol: {protocol}

Suggest a concise, practical response playbook in strictly less that 2500 characters, covering:
- Identification
- Containment
- Eradication
- Recovery
"""
    body = {"contents": [{"parts": [{"text": prompt.strip()}]}]}

    try:
        response = requests.post(GEMINI_API_URL, headers=headers, json=body)
        response.raise_for_status()
        data = response.json()
        return data["candidates"][0]["content"]["parts"][0]["text"]
    except Exception as e:
        print("Error from Gemini:", e)
        return "Failed to generate response"

In [None]:
#Step 7. Generate Playbooks for CSV output
max_playbook_length = 2500  # approximate max characters per playbook
incident_playbooks = {}

unique_incidents = predictions_df["Predicted_Label"].unique()

for incident in unique_incidents:
    if incident.lower() == "benign":
        continue

    # Take first occurrence for metadata
    row = predictions_df[predictions_df["Predicted_Label"] == incident].iloc[0]
    playbook_text = generate_playbook(
        incident,
        src_ip=row.get("Source IP", "Unknown"),
        dst_ip=row.get("Destination IP", "Unknown"),
        protocol=row.get("Protocol", "Unknown")
    )

    # Shorten if too long
    if len(playbook_text) > max_playbook_length:
        playbook_text = playbook_text[:max_playbook_length] + "..."

    incident_playbooks[incident] = playbook_text

# Map the playbooks to all predictions
predictions_df["Response_Playbook"] = predictions_df["Predicted_Label"].map(incident_playbooks)

# Save as single CSV
output_csv = "Playbooks.csv"
predictions_df.to_csv(output_csv, index=False)
print(f"\n✅ All predictions and playbooks saved in {output_csv}")


✅ All predictions and playbooks saved in Playbooks.csv
