In [None]:
!pip uninstall torch torchvision torchaudio sentence-transformers transformers

In [None]:

!pip install sentence-transformers torch pandas scikit-learn joblib requests lightgbm torchvision

In [23]:
import pandas as pd
import json
import gzip
import os
import requests
import joblib
import numpy as np
from pathlib import Path
from datetime import datetime
import csv
import urllib.request
import zipfile

# Text Embedding and ML Models
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# The classifiers we will compare
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from lightgbm.callback import early_stopping

# imblearn is no longer needed for undersampling as we are doing it manually
# from imblearn.over_sampling import SMOTE


# --- Configuration for file paths ---
# Base directories
DATA_DIR = Path("data")
MODELS_DIR = Path("models")
VERSION = "5.0"


# Subdirectories for data
NVD_DATA_DIR = DATA_DIR / VERSION / "nvd_data"
GARAK_DATA_DIR = DATA_DIR / VERSION / "garak"

# Specific file paths
PARSED_DATA_PATH = NVD_DATA_DIR / f"all_nvd_cves.pkl"
GARAK_REPORT_JSONL = GARAK_DATA_DIR / "garak.report.jsonl"
GARAK_REPORT_CSV = GARAK_DATA_DIR / "garak_report_flat.csv"

# Model file paths
MODEL_PATH = MODELS_DIR /VERSION/ "best_cvss_classifier_historic.pkl"
LABEL_ENCODER_PATH = MODELS_DIR /VERSION /"cvss_label_encoder_historic.pkl"


# ----------------------------------------
# STEP 1A: Process Garak Report
# ----------------------------------------
def process_garak_report():
    """
    Downloads a sample Garak report if not present, and converts it
    from .jsonl format to a flattened .csv file.
    """
    GARAK_DATA_DIR.mkdir(parents=True, exist_ok=True)

    url = "https://gist.githubusercontent.com/shubhobm/9fa52d71c8bb36bfb888eee2ba3d18f2/raw/ef1808e6d3b26002d9b046e6c120d438adf49008/gpt35-0906.report.jsonl"
    if not GARAK_REPORT_JSONL.exists():
        print("Downloading sample Garak report...")
        urllib.request.urlretrieve(url, GARAK_REPORT_JSONL)
        print(f"✅ Downloaded: {GARAK_REPORT_JSONL}")

    def parse_status(status_code):
        return {1: "Pass", 2: "Fail"}.get(status_code, "Not Evaluated")

    def extract_input_output(record):
        turns = record.get("notes", {}).get("turns", [])
        if turns:
            attacker = " | ".join([msg.strip().replace("\n", " ") for role, msg in turns if role == "probe"])
            bot = " | ".join([msg.strip().replace("\n", " ") for role, msg in turns if role == "model"])
            return attacker, bot
        prompt = record.get("prompt", "").strip().replace("\n", " ")
        outputs = " | ".join([o.strip().replace("\n", " ") for o in record.get("outputs", [])])
        return prompt, outputs

    with open(GARAK_REPORT_JSONL, "r", encoding="utf-8") as infile, \
         open(GARAK_REPORT_CSV, "w", newline='', encoding="utf-8") as outfile:

        fieldnames = ["uuid", "probe_classname", "attacker_input", "target_bot_response", "status", "goal", "trigger"]
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()

        for line in infile:
            record = json.loads(line)
            if record.get("entry_type") != "attempt":
                continue

            attacker_input, bot_response = extract_input_output(record)
            writer.writerow({
                "uuid": record.get("uuid", ""),
                "probe_classname": record.get("probe_classname", ""),
                "attacker_input": attacker_input,
                "target_bot_response": bot_response,
                "status": parse_status(record.get("status")),
                "goal": record.get("goal", ""),
                "trigger": record.get("notes", {}).get("trigger", "")
            })
    print(f"✅ Garak report successfully converted to: {GARAK_REPORT_CSV}")


# ----------------------------------------
# STEP 1B: Download and Parse All Historical NVD Data
# ----------------------------------------
def download_and_parse_all_nvd_data():
    """
    Downloads all NVD CVE data, parses them, removes duplicates, and saves
    the result to a pickle file.
    """
    NVD_DATA_DIR.mkdir(parents=True, exist_ok=True)
    BASE_URL = "https://nvd.nist.gov/feeds/json/cve/1.1/"
    START_YEAR, CURRENT_YEAR = 2002, datetime.now().year

    print("--- Starting NVD Data Download ---")
    for year in range(START_YEAR, CURRENT_YEAR + 1):
        filename = f"nvdcve-1.1-{year}.json.gz"
        download_path = NVD_DATA_DIR / filename
        if download_path.exists(): continue
        print(f"Downloading: {filename}")
        try:
            response = requests.get(f"{BASE_URL}{filename}", stream=True, timeout=30)
            if response.status_code == 200:
                with open(download_path, 'wb') as f: f.writelines(response.iter_content(8192))
            else: print(f" -> Failed: HTTP {response.status_code}")
        except requests.RequestException as e: print(f" -> Error: {e}")

    print("\n--- Starting NVD Data Parsing ---")
    parsed_cve_list = []
    for file_path in sorted(NVD_DATA_DIR.glob('*.json.gz')):
        print(f"Parsing {file_path.name}...")
        with gzip.open(file_path, 'rt', encoding='utf-8') as f:
            cve_data = json.load(f)
        for item in cve_data.get("CVE_Items", []):
            description = next((d["value"] for d in item.get("cve", {}).get("description", {}).get("description_data", []) if d.get("lang") == "en"), "")
            impact = item.get("impact", {})
            severity = impact.get('baseMetricV3', {}).get('cvssV3', {}).get('baseSeverity') or impact.get('baseMetricV2', {}).get('severity')
            # MODIFIED: Also extract the published date to identify recent CVEs
            published_date = item.get('publishedDate')
            if description and severity and published_date:
                parsed_cve_list.append({
                    "description": description.strip(),
                    "severity": severity.strip().capitalize(),
                    "publishedDate": published_date
                })

    df = pd.DataFrame(parsed_cve_list)
    print(f"\nEntries before duplicate removal: {len(df)}")
    df.drop_duplicates(subset=['description'], keep='last', inplace=True)
    print(f"Entries after duplicate removal: {len(df)}")
    df.to_pickle(PARSED_DATA_PATH)
    print(f"✅ Parsing Complete. Saved {len(df)} unique entries to {PARSED_DATA_PATH}")


# ----------------------------------------
# STEP 2: Find the Best Classifier and Train It
# ----------------------------------------
def train_and_evaluate_models():
    """
    Loads data, creates a balanced dataset by undersampling the majority classes
    (keeping the most recent data), then splits this balanced set for training
    and evaluation.
    """
    if not PARSED_DATA_PATH.exists():
        print(f"Error: Parsed data not found at {PARSED_DATA_PATH}. Please run the 'data_prep' workflow first.")
        return None, None, None

    print(f"\n--- Training and Evaluation with Undersampling ---")
    df = pd.read_pickle(PARSED_DATA_PATH).dropna()
    print(f"Loaded {len(df)} valid NVD entries.")

    # --- CORRECTED METHOD: Undersample BEFORE splitting ---
    # 1. Convert 'publishedDate' to datetime objects to allow sorting
    df['publishedDate'] = pd.to_datetime(df['publishedDate'])

    # 2. Sort by date so newest entries are first
    df = df.sort_values('publishedDate', ascending=False)

    # 3. Determine the size of the smallest class
    n_samples = df['severity'].value_counts().min()
    print(f"\nSmallest class size is {n_samples}. Undersampling all classes to this size.")
    print("Original dataset distribution:\n", df['severity'].value_counts())

    # 4. Group by severity and take the `n_samples` most recent entries from each group
    df_balanced = df.groupby('severity').head(n_samples)

    # 5. Shuffle the balanced dataset to ensure randomness
    df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

    print("\nBalanced dataset distribution:\n", df_balanced['severity'].value_counts())

    # 6. Prepare the final text list and labels from the balanced dataframe
    X_text = df_balanced['description'].tolist()
    y_labels = df_balanced['severity'].tolist()

    # --- End of Undersampling Modification ---

    print("\nLoading embedding model: 'all-mpnet-base-v2'...")
    embed_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

    print("Encoding all descriptions from the balanced dataset...")
    X_embeddings = embed_model.encode(X_text, show_progress_bar=True)

    # Encode the labels
    le = LabelEncoder()
    y_encoded = le.fit_transform(y_labels)

    # 7. Now split the balanced and encoded data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X_embeddings, y_encoded, test_size=0.25, random_state=42, stratify=y_encoded
    )
    print(f"\nTraining on {len(X_train)} samples, testing on {len(X_test)} samples.")

    classifiers = {
        "Logistic Regression": LogisticRegression(max_iter=2000, random_state=42),
        "Random Forest": RandomForestClassifier(n_jobs=-1, random_state=42, n_estimators=100),
        "LightGBM (Tuned)": lgb.LGBMClassifier(
            n_estimators=1000,
            learning_rate=0.05,
            num_leaves=31,
            random_state=42,
            n_jobs=-1,
            colsample_bytree=0.8,
            subsample=0.8,
            reg_alpha=0.1,
            reg_lambda=0.1
        )
    }

    best_f1, best_model_name, best_classifier_obj = -1, "", None
    for name, clf in classifiers.items():
        print(f"\n--- Training {name} ---")
        fit_params = {}
        if "LightGBM" in name:
            # The validation set for early stopping should come from the split data
            fit_params = {"eval_set": [(X_test, y_test)], "callbacks": [early_stopping(20, verbose=False)]}

        clf.fit(X_train, y_train, **fit_params)

        y_pred = clf.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        f1_score = report["weighted avg"]["f1-score"]
        if f1_score > best_f1:
            best_f1, best_model_name, best_classifier_obj = f1_score, name, clf
        print(classification_report(y_test, y_pred, target_names=le.classes_, zero_division=0))

    print(f"\n🏆 Best performing model is: {best_model_name} with F1-Score: {best_f1:.4f}")

    MODELS_DIR.mkdir(parents=True, exist_ok=True)
    MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(best_classifier_obj, MODEL_PATH)
    joblib.dump(le, LABEL_ENCODER_PATH)
    print(f"✅ Best model saved to {MODEL_PATH}")

    return embed_model, best_classifier_obj, le

# ----------------------------------------
# STEP 3: Predict using a saved model
# ----------------------------------------
def predict_on_garak(embed_model=None, classifier=None, label_encoder=None):
    if not GARAK_REPORT_CSV.exists():
        print(f"Error: Garak CSV not found. Please run the 'data_prep' workflow first.")
        return

    if not all([embed_model, classifier, label_encoder]):
        print(f"Loading models from disk...")
        if not MODEL_PATH.exists():
            print(f"Error: Model file not found at {MODEL_PATH}. Please run the 'train' workflow first.")
            return
        classifier = joblib.load(MODEL_PATH)
        label_encoder = joblib.load(LABEL_ENCODER_PATH)
        # Use the same model for consistency
        embed_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

    print("\n--- Prediction on Garak Report ---")
    df = pd.read_csv(GARAK_REPORT_CSV)
    df["full_text"] = df["attacker_input"].fillna('') + " " + df["target_bot_response"].fillna('')

    print("Embedding Garak report for prediction...")
    embeddings = embed_model.encode(df["full_text"].tolist(), show_progress_bar=True)

    probabilities = classifier.predict_proba(embeddings)
    df["predicted_severity"] = label_encoder.inverse_transform(np.argmax(probabilities, axis=1))
    df["confidence_score"] = np.round(np.max(probabilities, axis=1), 4)
    for i, name in enumerate(label_encoder.classes_):
        df[f'prob_{name.lower()}'] = np.round(probabilities[:, i], 4)

    output_path = GARAK_DATA_DIR / "garak_with_severity_historic.csv"
    df.to_csv(output_path, index=False)
    print(f"✅ Predictions saved to {output_path}")

    print("\n--- Final Vulnerability Score (for FAILED test cases) ---")
    failed_df = df[df['status'] == 'Fail'].copy()

    if failed_df.empty:
        print("No failed test cases found in the report. No score to calculate.")
        return

    print(f"Calculating score based on {len(failed_df)} failed test cases (out of {len(df)} total).")
    severity_map = {'Critical': 10, 'High': 7, 'Medium': 4, 'Low': 1}
    severity_counts = failed_df['predicted_severity'].value_counts()
    total_score = sum(count * severity_map.get(s, 0) for s, count in severity_counts.items())
    max_possible_score = len(failed_df) * 10
    normalized_score = (total_score / max_possible_score) * 100 if max_possible_score > 0 else 0

    print("\nSeverity Distribution (of Failures):")
    print(severity_counts)
    print(f"\nTotal Raw Risk Score (from Failures): {total_score}")
    print(f"Normalized Report Vulnerability Score (0-100): {normalized_score:.2f}")

# ----------------------------------------
# STEP 4: Create a ZIP Archive of the Results
# ----------------------------------------
def create_archive():
    print("\n--- Creating ZIP Archive ---")
    archive_name = "cve_prediction_archive"

    with zipfile.ZipFile(f"{archive_name}.zip", 'w', zipfile.ZIP_DEFLATED) as zipf:
        for folder in [DATA_DIR, MODELS_DIR]:
            if folder.exists() and folder.is_dir():
                for file_path in folder.rglob('*'):
                    zipf.write(file_path, arcname=file_path.relative_to(Path.cwd()))
                print(f"Archived folder: {folder}")
            else:
                print(f"Warning: Folder '{folder}' not found. Skipping.")

    print(f"✅ Archive created successfully: {archive_name}.zip")


# ----------------------------------------
# MAIN EXECUTION WORKFLOW
# ----------------------------------------
def run_workflow(data_prep=False, train=False, predict=False, archive=False):
    """
    Controls the main execution flow of the script.
    Set flags to True for the steps you want to run.
    """
    print("--- CVE Severity Prediction Workflow ---")

    if data_prep:
        print("\n=== STAGE 1: DATA PREPARATION ===")
        process_garak_report()
        download_and_parse_all_nvd_data()

    embed_model, classifier, label_encoder = None, None, None
    if train:
        print("\n=== STAGE 2: MODEL TRAINING ===")
        embed_model, classifier, label_encoder = train_and_evaluate_models()

    if predict:
        print("\n=== STAGE 3: PREDICTION ===")
        predict_on_garak(embed_model, classifier, label_encoder)

    if archive:
        print("\n=== STAGE 4: ARCHIVING ===")
        create_archive()

    print("\n--- Workflow Finished ---")


if __name__ == "__main__":
    # --- Configure your desired workflow here ---

    # To get good results, you MUST run data_prep=True first.
    # On the first run, it's recommended to do all three main steps.
    run_workflow(data_prep=True, train=True, predict=True)

    # After the first run, you can comment out the line above and
    # uncomment one of the following lines to run specific tasks.

    # Example: Just run prediction using existing models
    # run_workflow(predict=True)

    # Example: Just create an archive of existing results
    # run_workflow(archive=True)

--- CVE Severity Prediction Workflow ---

=== STAGE 1: DATA PREPARATION ===
Downloading sample Garak report...
✅ Downloaded: data/5.0/garak/garak.report.jsonl
✅ Garak report successfully converted to: data/5.0/garak/garak_report_flat.csv
--- Starting NVD Data Download ---
Downloading: nvdcve-1.1-2002.json.gz
Downloading: nvdcve-1.1-2003.json.gz
Downloading: nvdcve-1.1-2004.json.gz
Downloading: nvdcve-1.1-2005.json.gz
Downloading: nvdcve-1.1-2006.json.gz
Downloading: nvdcve-1.1-2007.json.gz
Downloading: nvdcve-1.1-2008.json.gz
Downloading: nvdcve-1.1-2009.json.gz
Downloading: nvdcve-1.1-2010.json.gz
Downloading: nvdcve-1.1-2011.json.gz
Downloading: nvdcve-1.1-2012.json.gz
Downloading: nvdcve-1.1-2013.json.gz
Downloading: nvdcve-1.1-2014.json.gz
Downloading: nvdcve-1.1-2015.json.gz
Downloading: nvdcve-1.1-2016.json.gz
Downloading: nvdcve-1.1-2017.json.gz
Downloading: nvdcve-1.1-2018.json.gz
Downloading: nvdcve-1.1-2019.json.gz
Downloading: nvdcve-1.1-2020.json.gz
Downloading: nvdcve-1.1-

Batches:   0%|          | 0/357 [00:00<?, ?it/s]


Training on 8565 samples, testing on 2855 samples.

--- Training Logistic Regression ---
              precision    recall  f1-score   support

    Critical       0.65      0.74      0.69       714
        High       0.53      0.49      0.51       713
         Low       0.73      0.70      0.71       714
      Medium       0.64      0.64      0.64       714

    accuracy                           0.64      2855
   macro avg       0.64      0.64      0.64      2855
weighted avg       0.64      0.64      0.64      2855


--- Training Random Forest ---
              precision    recall  f1-score   support

    Critical       0.69      0.68      0.69       714
        High       0.56      0.50      0.53       713
         Low       0.67      0.77      0.72       714
      Medium       0.67      0.65      0.66       714

    accuracy                           0.65      2855
   macro avg       0.65      0.65      0.65      2855
weighted avg       0.65      0.65      0.65      2855


--- Tra



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.168445 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 8565, number of used features: 768
[LightGBM] [Info] Start training from score -1.386411
[LightGBM] [Info] Start training from score -1.385944
[LightGBM] [Info] Start training from score -1.386411
[LightGBM] [Info] Start training from score -1.386411




              precision    recall  f1-score   support

    Critical       0.72      0.75      0.73       714
        High       0.59      0.57      0.58       713
         Low       0.75      0.76      0.76       714
      Medium       0.71      0.70      0.71       714

    accuracy                           0.69      2855
   macro avg       0.69      0.69      0.69      2855
weighted avg       0.69      0.69      0.69      2855


🏆 Best performing model is: LightGBM (Tuned) with F1-Score: 0.6940
✅ Best model saved to models/5.0/best_cvss_classifier_historic.pkl

=== STAGE 3: PREDICTION ===

--- Prediction on Garak Report ---
Embedding Garak report for prediction...


Batches:   0%|          | 0/190 [00:00<?, ?it/s]



✅ Predictions saved to data/5.0/garak/garak_with_severity_historic.csv

--- Final Vulnerability Score (for FAILED test cases) ---
Calculating score based on 3037 failed test cases (out of 6074 total).

Severity Distribution (of Failures):
predicted_severity
Low         2867
High          85
Medium        60
Critical      25
Name: count, dtype: int64

Total Raw Risk Score (from Failures): 3952
Normalized Report Vulnerability Score (0-100): 13.01

--- Workflow Finished ---


In [24]:
!zip -r data.zip data/
!zip -r models.zip models/

  adding: data/ (stored 0%)
  adding: data/5.0/ (stored 0%)
  adding: data/5.0/garak/ (stored 0%)
  adding: data/5.0/garak/garak_with_severity_historic.csv (deflated 90%)
  adding: data/5.0/garak/garak.report.jsonl (deflated 85%)
  adding: data/5.0/garak/garak_report_flat.csv (deflated 83%)
  adding: data/5.0/nvd_data/ (stored 0%)
  adding: data/5.0/nvd_data/nvdcve-1.1-2018.json.gz (deflated 2%)
  adding: data/5.0/nvd_data/nvdcve-1.1-2003.json.gz (stored 0%)
  adding: data/5.0/nvd_data/nvdcve-1.1-2006.json.gz (deflated 2%)
  adding: data/5.0/nvd_data/nvdcve-1.1-2023.json.gz (deflated 3%)
  adding: data/5.0/nvd_data/nvdcve-1.1-2022.json.gz (deflated 3%)
  adding: data/5.0/nvd_data/nvdcve-1.1-2017.json.gz (deflated 4%)
  adding: data/5.0/nvd_data/nvdcve-1.1-2016.json.gz (deflated 3%)
  adding: data/5.0/nvd_data/nvdcve-1.1-2014.json.gz (deflated 2%)
  adding: data/5.0/nvd_data/nvdcve-1.1-2013.json.gz (deflated 2%)
  adding: data/5.0/nvd_data/all_nvd_cves.pkl (deflated 78%)
  adding: data/

In [22]:
!rm -rf data/
!rm -rf models/