In [None]:
!pip uninstall torch torchvision torchaudio sentence-transformers transformers

In [None]:

!pip install sentence-transformers torch pandas scikit-learn joblib requests lightgbm torchvision

In [None]:
import pandas as pd
import json
import gzip
import os
import requests
import joblib
import numpy as np
from pathlib import Path
from datetime import datetime
import csv
import urllib.request

# Text Embedding and ML Models
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# The classifiers we will compare
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from lightgbm.callback import early_stopping

# Import SMOTE for handling class imbalance
# You may need to install this: pip install imbalanced-learn
from imblearn.over_sampling import SMOTE


# --- Filesystem layout ---
# Root folders; every artefact lives under a VERSION sub-folder of one of them.
DATA_DIR = Path("data")
MODELS_DIR = Path("models")
VERSION = "1.0"


# Versioned input/output folders
NVD_DATA_DIR = DATA_DIR.joinpath(VERSION, "nvd_data")
GARAK_DATA_DIR = DATA_DIR.joinpath(VERSION, "garak")

# Data artefacts: parsed NVD table, raw Garak report, flattened Garak report
PARSED_DATA_PATH = NVD_DATA_DIR.joinpath("all_nvd_cves.pkl")
GARAK_REPORT_JSONL = GARAK_DATA_DIR.joinpath("garak.report.jsonl")
GARAK_REPORT_CSV = GARAK_DATA_DIR.joinpath("garak_report_flat.csv")

# Persisted classifier and its label encoder
MODEL_PATH = MODELS_DIR.joinpath(VERSION, "best_cvss_classifier_historic.pkl")
LABEL_ENCODER_PATH = MODELS_DIR.joinpath(VERSION, "cvss_label_encoder_historic.pkl")


# ----------------------------------------
# STEP 1A: Process Garak Report
# ----------------------------------------
def process_garak_report():
    """
    Downloads a sample Garak report if not present, and converts it
    from .jsonl format to a flattened .csv file.

    Only records whose "entry_type" is "attempt" are written. Each CSV row
    holds the uuid, probe class name, attacker input, bot response, a
    readable status ("Pass", "Fail" or "Not Evaluated"), the goal and the
    trigger. Newlines inside texts are replaced by spaces and multiple
    messages are joined with " | ".

    Blank lines in the report are skipped, and missing/null "notes",
    "prompt", "outputs" or turn messages are treated as empty text instead
    of raising.

    Raises:
        urllib.error.URLError: if the sample report cannot be downloaded.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    GARAK_DATA_DIR.mkdir(parents=True, exist_ok=True)

    url = "https://gist.githubusercontent.com/shubhobm/9fa52d71c8bb36bfb888eee2ba3d18f2/raw/ef1808e6d3b26002d9b046e6c120d438adf49008/gpt35-0906.report.jsonl"
    if not GARAK_REPORT_JSONL.exists():
        print("Downloading sample Garak report...")
        # Download to a side file and rename on success, so an interrupted
        # transfer never leaves a truncated report that later runs would
        # mistake for a complete one.
        partial_path = GARAK_REPORT_JSONL.with_name(GARAK_REPORT_JSONL.name + ".part")
        try:
            urllib.request.urlretrieve(url, partial_path)
            partial_path.replace(GARAK_REPORT_JSONL)
        finally:
            if partial_path.exists():
                partial_path.unlink()
        print(f"✅ Downloaded: {GARAK_REPORT_JSONL}")

    def parse_status(status_code):
        # Map the numeric status code to a label; any other value is "Not Evaluated".
        return {1: "Pass", 2: "Fail"}.get(status_code, "Not Evaluated")

    def flatten(text):
        # Single-line form of a text field; None (JSON null) becomes "".
        if text is None:
            return ""
        return str(text).strip().replace("\n", " ")

    def extract_input_output(record):
        # Prefer the multi-turn conversation in notes["turns"]; fall back to
        # the record's prompt/outputs fields when there are no turns.
        notes = record.get("notes") or {}
        turns = notes.get("turns") or []
        if turns:
            attacker = " | ".join(flatten(msg) for role, msg in turns if role == "probe")
            bot = " | ".join(flatten(msg) for role, msg in turns if role == "model")
            return attacker, bot
        prompt = flatten(record.get("prompt"))
        outputs = " | ".join(flatten(o) for o in (record.get("outputs") or []))
        return prompt, outputs

    with open(GARAK_REPORT_JSONL, "r", encoding="utf-8") as infile, \
         open(GARAK_REPORT_CSV, "w", newline='', encoding="utf-8") as outfile:

        fieldnames = ["uuid", "probe_classname", "attacker_input", "target_bot_response", "status", "goal", "trigger"]
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()

        for line in infile:
            # A trailing or stray blank line is not a JSON document.
            if not line.strip():
                continue
            record = json.loads(line)
            if record.get("entry_type") != "attempt":
                continue

            attacker_input, bot_response = extract_input_output(record)
            writer.writerow({
                "uuid": record.get("uuid", ""),
                "probe_classname": record.get("probe_classname", ""),
                "attacker_input": attacker_input,
                "target_bot_response": bot_response,
                "status": parse_status(record.get("status")),
                "goal": record.get("goal", ""),
                "trigger": (record.get("notes") or {}).get("trigger", "")
            })
    print(f"✅ Garak report successfully converted to: {GARAK_REPORT_CSV}")


# ----------------------------------------
# STEP 1B: Download and Parse All Historical NVD Data
# ----------------------------------------
def download_and_parse_all_nvd_data(start_year=2025):
    """
    Downloads the yearly NVD CVE feeds, parses them, removes duplicates, and
    saves the result to a pickle file.

    Feeds from ``start_year`` through the current year are downloaded into
    NVD_DATA_DIR unless the file is already there. Every ``*.json.gz`` file
    in that folder is then parsed (including ones from earlier runs), keeping
    the English description and the CVSS v3 base severity (falling back to
    the v2 severity). Entries without a description or severity are dropped,
    duplicate descriptions keep the last occurrence, and the table is written
    to PARSED_DATA_PATH.

    Args:
        start_year: First feed year to download. Defaults to 2025; pass an
            earlier year to fetch more history.

    Download failures and unreadable feed files are reported and skipped.
    If nothing usable was parsed, no pickle is written.
    """
    NVD_DATA_DIR.mkdir(parents=True, exist_ok=True)
    # NOTE(review): confirm this legacy 1.1 feed endpoint is still served;
    # a non-200 response is only reported below, not raised.
    BASE_URL = "https://nvd.nist.gov/feeds/json/cve/1.1/"
    current_year = datetime.now().year

    print("--- Starting NVD Data Download ---")
    for year in range(start_year, current_year + 1):
        filename = f"nvdcve-1.1-{year}.json.gz"
        download_path = NVD_DATA_DIR / filename
        if download_path.exists():
            continue
        print(f"Downloading: {filename}")
        # Stream into a ".part" file and rename on success so a broken
        # transfer is never mistaken for a finished feed on the next run.
        # The ".part" name does not match the '*.json.gz' glob used below.
        partial_path = download_path.with_name(filename + ".part")
        try:
            # The context manager releases the streamed connection.
            with requests.get(f"{BASE_URL}{filename}", stream=True, timeout=30) as response:
                if response.status_code != 200:
                    print(f" -> Failed: HTTP {response.status_code}")
                    continue
                with open(partial_path, 'wb') as f:
                    for chunk in response.iter_content(8192):
                        f.write(chunk)
            partial_path.replace(download_path)
        except requests.RequestException as e:
            print(f" -> Error: {e}")
        finally:
            if partial_path.exists():
                partial_path.unlink()

    print("\n--- Starting NVD Data Parsing ---")
    parsed_cve_list = []
    for file_path in sorted(NVD_DATA_DIR.glob('*.json.gz')):
        try:
            with gzip.open(file_path, 'rt', encoding='utf-8') as f:
                cve_data = json.load(f)
        except (OSError, EOFError, ValueError) as e:
            # Corrupt/truncated archive or invalid JSON: skip this feed only.
            print(f" -> Skipping unreadable feed {file_path.name}: {e}")
            continue

        for item in cve_data.get("CVE_Items", []):
            description_entries = item.get("cve", {}).get("description", {}).get("description_data", [])
            description = next((d["value"] for d in description_entries if d.get("lang") == "en"), "")
            impact = item.get("impact", {})
            # Prefer the CVSS v3 base severity, fall back to the v2 severity.
            severity = impact.get('baseMetricV3', {}).get('cvssV3', {}).get('baseSeverity') or impact.get('baseMetricV2', {}).get('severity')
            if description and severity:
                parsed_cve_list.append({"description": description.strip(), "severity": severity.strip().capitalize()})

    if not parsed_cve_list:
        # An empty frame has no 'description' column to de-duplicate on, and
        # an empty pickle would only fail later during training.
        print("No usable CVE entries were parsed; nothing was saved.")
        return

    df = pd.DataFrame(parsed_cve_list, columns=["description", "severity"])
    print(f"\nEntries before duplicate removal: {len(df)}")
    df.drop_duplicates(subset=['description'], keep='last', inplace=True)
    print(f"Entries after duplicate removal: {len(df)}")
    df.to_pickle(PARSED_DATA_PATH)
    print(f"✅ Parsing Complete. Saved {len(df)} unique entries to {PARSED_DATA_PATH}")


# ----------------------------------------
# STEP 2: Find the Best Classifier and Train It
# ----------------------------------------
def train_and_evaluate_models():
    """
    Loads the parsed NVD data, trains several classifiers on a SMOTE-balanced
    training set, picks the one with the best weighted F1 on the held-out
    test set, and saves it along with the label encoder.

    Data handling:
      * 25% of the data is held out as the test set (stratified).
      * 15% of the remaining training data is held out as a validation set
        used only for LightGBM early stopping, so the test set is never seen
        while fitting.
      * SMOTE is applied to the fitting portion only; validation and test
        data keep their original class distribution.

    Returns:
        A tuple ``(embed_model, best_classifier, label_encoder)``, or
        ``(None, None, None)`` when the parsed data file is missing or has
        no valid rows.
    """
    if not PARSED_DATA_PATH.exists():
        print("Error: Parsed data not found. Run data preparation first.")
        return None, None, None

    print("\n--- Training and Evaluation ---")
    # NOTE: read_pickle executes pickle data; only load files produced by
    # this pipeline's own data-preparation step.
    df = pd.read_pickle(PARSED_DATA_PATH).dropna()
    if df.empty:
        print("Error: Parsed data contains no valid entries. Run data preparation first.")
        return None, None, None
    print(f"Training on {len(df)} valid NVD entries.")

    le = LabelEncoder()
    y = le.fit_transform(df['severity'])

    print("Loading embedding model: 'all-MiniLM-L6-v2'...")
    embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    print("\nEncoding all descriptions... (This may take a while)")
    X = embed_model.encode(df['description'].tolist(), show_progress_bar=True)

    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25, random_state=42)

    # Early stopping needs data to monitor; take it from the training side
    # (before oversampling) so the test metrics stay unbiased.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, stratify=y_train, test_size=0.15, random_state=42
    )

    print("\nApplying SMOTE to balance the training data...")
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_fit, y_fit)

    classifiers = {
        "Logistic Regression": LogisticRegression(max_iter=2000, random_state=42),
        "Random Forest": RandomForestClassifier(n_jobs=-1, random_state=42),
        "LightGBM (Tuned)": lgb.LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            num_leaves=31,
            random_state=42,
            feature_fraction=0.8,
            bagging_fraction=0.8,
            bagging_freq=1
        )
    }

    best_f1, best_model_name, best_classifier_obj = -1, "", None
    for name, clf in classifiers.items():
        print(f"\n--- Training {name} ---")
        fit_params = {}
        if "LightGBM" in name:
            # Stop adding trees once the validation metric has not improved
            # for 10 rounds.
            fit_params = {"eval_set": [(X_val, y_val)], "callbacks": [early_stopping(10, verbose=False)]}

        clf.fit(X_train_resampled, y_train_resampled, **fit_params)

        y_pred = clf.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        weighted_f1 = report["weighted avg"]["f1-score"]
        if weighted_f1 > best_f1:
            best_f1, best_model_name, best_classifier_obj = weighted_f1, name, clf
        print(classification_report(y_test, y_pred, target_names=le.classes_, zero_division=0))

    print(f"\n🏆 Best performing model is: {best_model_name} with F1-Score: {best_f1:.4f}")

    # Save the best model that was trained and validated
    MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
    LABEL_ENCODER_PATH.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(best_classifier_obj, MODEL_PATH)
    joblib.dump(le, LABEL_ENCODER_PATH)
    print(f"✅ Best model saved to {MODEL_PATH}")

    return embed_model, best_classifier_obj, le

# ----------------------------------------
# STEP 3: Predict using a saved model
# ----------------------------------------
def predict_on_garak(embed_model=None, classifier=None, label_encoder=None):
    """
    Predicts a severity for every row of the flattened Garak report and
    prints an overall vulnerability score for the failed test cases.

    Args:
        embed_model: Sentence embedding model exposing ``encode``.
        classifier: Fitted classifier exposing ``predict_proba`` and ``classes_``.
        label_encoder: Fitted encoder mapping encoded classes to severity names.

    If any of the three is ``None``, all three are loaded from disk
    (MODEL_PATH, LABEL_ENCODER_PATH and the default embedding model).

    Side effects:
        Writes "garak_with_severity_historic.csv" into GARAK_DATA_DIR with the
        predicted severity, a confidence score and one ``prob_<severity>``
        column per class, then prints the score summary. Returns ``None``;
        missing inputs are reported with a message rather than raised.
    """
    if not GARAK_REPORT_CSV.exists():
        print("Error: Garak CSV not found. Run data preparation first.")
        return

    # Load models if they weren't passed from the training step.
    # Identity checks are used because model objects may define their own
    # truthiness (e.g. via __len__).
    if embed_model is None or classifier is None or label_encoder is None:
        print("Loading models from disk...")
        if not MODEL_PATH.exists():
            print(f"Error: Model file not found at {MODEL_PATH}. Please train a model first.")
            return
        if not LABEL_ENCODER_PATH.exists():
            print(f"Error: Label encoder file not found at {LABEL_ENCODER_PATH}. Please train a model first.")
            return
        # NOTE: joblib.load unpickles; only load files written by this pipeline.
        classifier = joblib.load(MODEL_PATH)
        label_encoder = joblib.load(LABEL_ENCODER_PATH)
        embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    print("\n--- Prediction on Garak Report ---")
    df = pd.read_csv(GARAK_REPORT_CSV)
    if df.empty:
        # Nothing to embed; predict_proba cannot run on zero rows.
        print("Garak report contains no attempts. Nothing to predict.")
        return

    # astype(str) keeps the concatenation valid even if pandas parsed a text
    # column as numeric.
    df["full_text"] = (
        df["attacker_input"].fillna('').astype(str)
        + " "
        + df["target_bot_response"].fillna('').astype(str)
    )

    print("Embedding Garak report for prediction...")
    embeddings = embed_model.encode(df["full_text"].tolist(), show_progress_bar=True)

    probabilities = classifier.predict_proba(embeddings)
    # predict_proba columns follow classifier.classes_, so map through it
    # rather than assuming column i is encoded label i.
    encoded_classes = np.asarray(classifier.classes_)
    df["predicted_severity"] = label_encoder.inverse_transform(
        encoded_classes[np.argmax(probabilities, axis=1)]
    )
    df["confidence_score"] = np.round(np.max(probabilities, axis=1), 4)
    for col_idx, name in enumerate(label_encoder.inverse_transform(encoded_classes)):
        df[f'prob_{name.lower()}'] = np.round(probabilities[:, col_idx], 4)

    output_path = GARAK_DATA_DIR / "garak_with_severity_historic.csv"
    df.to_csv(output_path, index=False)
    print(f"✅ Predictions saved to {output_path}")

    # --- Generate Final Vulnerability Score for the Entire Report ---
    print("\n--- Final Vulnerability Score (for FAILED test cases) ---")

    # Filter for only failed test cases
    failed_df = df[df['status'] == 'Fail'].copy()

    if failed_df.empty:
        print("No failed test cases found in the report. No score to calculate.")
        return

    print(f"Calculating score based on {len(failed_df)} failed test cases (out of {len(df)} total).")

    # Numerical weight per severity; severities not listed here count as 0.
    severity_map = {'Critical': 10, 'High': 7, 'Medium': 4, 'Low': 1}

    # Get the counts of each predicted severity from the FAILED cases
    severity_counts = failed_df['predicted_severity'].value_counts()

    # Calculate the final score by weighting the counts
    total_score = sum(count * severity_map.get(s, 0) for s, count in severity_counts.items())

    # Normalize against the worst case: every failure at the highest weight.
    max_possible_score = len(failed_df) * max(severity_map.values())
    normalized_score = (total_score / max_possible_score) * 100 if max_possible_score > 0 else 0

    print("\nSeverity Distribution (of Failures):")
    print(severity_counts)
    print(f"\nTotal Raw Risk Score (from Failures): {total_score}")
    print(f"Normalized Report Vulnerability Score (0-100): {normalized_score:.2f}")

# ----------------------------------------
# MAIN EXECUTION WORKFLOW
# ----------------------------------------
def run_workflow(run_data_prep=False, run_training=False, run_prediction=False):
    """
    Runs the selected pipeline stages in order: data preparation, model
    training, then prediction. Stages whose flag is False are skipped.
    """
    print("--- CVE Severity Prediction Workflow ---")

    # Stage 1: build the Garak CSV and the parsed NVD table.
    if run_data_prep:
        print("\n=== STAGE 1: DATA PREPARATION ===")
        process_garak_report()
        download_and_parse_all_nvd_data()

    # (embedding model, classifier, label encoder); stays empty unless
    # training runs in this call.
    trained = (None, None, None)

    # Stage 2: train and keep the resulting objects in memory.
    if run_training:
        print("\n=== STAGE 2: MODEL TRAINING ===")
        trained = train_and_evaluate_models()

    # Stage 3: hand over whatever training produced so prediction can reuse
    # it instead of loading from disk.
    if run_prediction:
        print("\n=== STAGE 3: PREDICTION ===")
        model_for_embedding, severity_classifier, severity_encoder = trained
        predict_on_garak(model_for_embedding, severity_classifier, severity_encoder)

    print("\n--- Workflow Finished ---")


if __name__ == "__main__":
    # --- Choose which stages to execute ---
    # Each keyword flag switches one stage on; omitted flags default to False.

    # Full pipeline: data preparation, training, then prediction.
    run_workflow(
        run_data_prep=True,
        run_training=True,
        run_prediction=True,
    )

    # Alternative: prediction only, using models already saved on disk.
    #run_workflow(run_prediction=True)

    # Alternative: data preparation and training, without prediction.
    # run_workflow(run_data_prep=True, run_training=True)


In [None]:
# Recursively archive the data/ and models/ directories into data.zip and models.zip.
!zip -r data.zip data/
!zip -r models.zip models/

In [None]:
# Permanently delete the data/ and models/ directories and everything in them.
!rm -rf data/
!rm -rf models/