In [1]:
import json
import os

import torch
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Choose the compute device: use CUDA when torch reports it, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

# Load the sentence-embedding model onto that device and hand it to KeyBERT
# so keyword extraction reuses the same model instance.
sentence_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
kw_model = KeyBERT(model=sentence_model)

def extract_keywords(kw_model, text):
    """Return the key phrases that *kw_model* extracts from *text*.

    The model is asked for the top 5 phrases of one or two words, with
    English stop words removed and MMR diversification (diversity 0.5).
    Only the first element of each returned entry is kept (presumably the
    phrase of a ``(phrase, score)`` pair).

    Args:
        kw_model: Object exposing an ``extract_keywords(text, **options)``
            method.
        text: Text to analyse. ``None``, empty, or whitespace-only text
            short-circuits to an empty list without calling the model.

    Returns:
        A list of extracted phrases, or ``[]`` when there is no text or the
        extraction raised; in the latter case the error is printed.
    """
    if not (text and text.strip()):
        return []

    options = {
        "keyphrase_ngram_range": (1, 2),
        "stop_words": 'english',
        "top_n": 5,
        "use_mmr": True,
        "diversity": 0.5,
    }
    try:
        scored_phrases = kw_model.extract_keywords(text, **options)
        return [entry[0] for entry in scored_phrases]
    except Exception as e:
        # Best effort: a failed extraction must not abort the caller's run.
        print(f"\nError extracting keywords: {str(e)}")
        return []

def extract_platform(cpe_uri):
    """Return the ``vendor:product`` pair of a CPE 2.3 formatted string.

    The string is split on ``:`` separators, but a colon preceded by a
    backslash escape (``\\:``) is part of the field value and does not
    split it. A plain ``str.split(":")`` would cut such a vendor or
    product name in half.

    Args:
        cpe_uri: A colon-separated CPE string such as
            ``cpe:2.3:a:vendor:product:version:...``.

    Returns:
        ``"<field 3>:<field 4>"`` (escape sequences kept verbatim), or
        ``""`` when the string has fewer than five fields.
    """
    fields = []
    current = []
    escaped = False
    for ch in cpe_uri:
        if escaped:
            # The character after a backslash is literal, even if it is ':'.
            current.append(ch)
            escaped = False
        elif ch == "\\":
            current.append(ch)
            escaped = True
        elif ch == ":":
            fields.append("".join(current))
            current = []
        else:
            current.append(ch)
    fields.append("".join(current))

    if len(fields) >= 5:
        return f"{fields[3]}:{fields[4]}"
    return ""

def _first_present(candidates, default="N/A"):
    """Return ``mapping[key]`` for the first ``(mapping, key)`` pair whose key exists, else *default*."""
    for mapping, key in candidates:
        if key in mapping:
            return mapping[key]
    return default


def get_cvss_metrics(item):
    """Extract CWE and CVSS metrics from a CVE item.

    CVSS v3 values are preferred; the CVSS v2 equivalent is used when the
    v3 value is absent. Per the NVD 1.1 feed schema, ``exploitabilityScore``
    and ``impactScore`` (and the v2 ``severity``) sit on the
    ``baseMetricV3`` / ``baseMetricV2`` objects, next to ``cvssV3`` /
    ``cvssV2`` rather than inside them, so both levels are consulted.

    Args:
        item: One CVE entry (a dict). Any of the ``impact`` or ``cve``
            sub-structures may be missing.

    Returns:
        A dict with the keys ``cwe_id``, ``base_score``, ``severity``,
        ``attack_vector``, ``attack_complexity``, ``privileges_required``,
        ``user_interaction``, ``scope``, ``confidentiality_impact``,
        ``integrity_impact``, ``availability_impact``,
        ``exploitability_score`` and ``impact_score``. Missing values are
        ``"N/A"``, except ``base_score`` which is ``None``.
    """
    impact = item.get("impact", {})
    metric_v3 = impact.get("baseMetricV3", {})
    metric_v2 = impact.get("baseMetricV2", {})
    cvss_v3 = metric_v3.get("cvssV3", {})
    cvss_v2 = metric_v2.get("cvssV2", {})

    # Get CWE if available. `or` guards against a present-but-empty list,
    # which would otherwise raise IndexError on [0].
    problemtype_data = (
        item.get("cve", {}).get("problemtype", {}).get("problemtype_data") or [{}]
    )
    cwe_descriptions = problemtype_data[0].get("description") or []
    cwe_id = cwe_descriptions[0].get("value", "N/A") if cwe_descriptions else "N/A"

    def both_versions(key):
        # Lookup order for keys spelled the same in v3 and v2, checking the
        # cvss object first and then its enclosing baseMetric object.
        return [(cvss_v3, key), (metric_v3, key), (cvss_v2, key), (metric_v2, key)]

    return {
        "cwe_id": cwe_id,
        "base_score": _first_present(
            [(cvss_v3, "baseScore"), (cvss_v2, "baseScore")], default=None
        ),
        "severity": _first_present(
            [(cvss_v3, "baseSeverity"), (cvss_v2, "severity"), (metric_v2, "severity")]
        ),
        "attack_vector": _first_present(
            [(cvss_v3, "attackVector"), (cvss_v2, "accessVector")]
        ),
        "attack_complexity": _first_present(
            [(cvss_v3, "attackComplexity"), (cvss_v2, "accessComplexity")]
        ),
        # The original had no v2 fallback for these three; keep it that way.
        "privileges_required": cvss_v3.get("privilegesRequired", "N/A"),
        "user_interaction": cvss_v3.get("userInteraction", "N/A"),
        "scope": cvss_v3.get("scope", "N/A"),
        "confidentiality_impact": _first_present(
            [(cvss_v3, "confidentialityImpact"), (cvss_v2, "confidentialityImpact")]
        ),
        "integrity_impact": _first_present(
            [(cvss_v3, "integrityImpact"), (cvss_v2, "integrityImpact")]
        ),
        "availability_impact": _first_present(
            [(cvss_v3, "availabilityImpact"), (cvss_v2, "availabilityImpact")]
        ),
        "exploitability_score": _first_present(both_versions("exploitabilityScore")),
        "impact_score": _first_present(both_versions("impactScore")),
    }

# Read the whole NVD feed file into memory and pull out its list of CVE entries.
input_file = "Non_Processed_CVEs/nvdcve-1.1-2025.json"
print(f"Loading data from {input_file}...")
with open(input_file, mode="r", encoding="utf-8") as feed:
    data = json.load(feed)

# The entries live under the top-level "CVE_Items" key; the count drives the
# progress bar and the final summary.
cve_items = data["CVE_Items"]
total_cves = len(cve_items)
print(f"Found {total_cves} CVE entries")

# Process CVEs
def _iter_vulnerable_cpe_uris(nodes):
    """Yield the ``cpe23Uri`` of every vulnerable CPE match under *nodes*.

    Each node's own ``cpe_match`` entries are yielded first, then those of
    its nested ``children`` nodes (depth-first), so matches that only
    appear below a parent configuration node are not missed.
    """
    for node in nodes:
        for cpe in node.get("cpe_match", []):
            if cpe.get("vulnerable"):
                yield cpe.get("cpe23Uri", "")
        yield from _iter_vulnerable_cpe_uris(node.get("children", []))


# One output record per CVE entry; entries that fail are reported and skipped.
records = []
with tqdm(total=total_cves, desc="Processing CVEs", unit="CVE") as pbar:
    for i, item in enumerate(cve_items, 1):
        # Reset per entry: if reading the ID itself fails, the error must not
        # be attributed to the previous entry's ID.
        cve_id = "unknown CVE"
        try:
            cve_id = item["cve"]["CVE_data_meta"]["ID"]
            pbar.set_description(f"({i}/{total_cves}) Processing CVE ID: {cve_id}")

            # Description: first English entry, or "" when there is none
            desc_data = item["cve"]["description"]["description_data"]
            description = next(
                (d["value"] for d in desc_data if d.get("lang") == "en"), ""
            )

            # Keywords
            keywords = extract_keywords(kw_model, description)

            # Extract all CVSS metrics at once
            cvss_metrics = get_cvss_metrics(item)

            # Affected platforms and products. `products` keeps every
            # vulnerable URI in document order; `platforms` is de-duplicated.
            platforms = set()
            products = []
            config_nodes = item.get("configurations", {}).get("nodes", [])
            for uri in _iter_vulnerable_cpe_uris(config_nodes):
                products.append(uri)
                platform = extract_platform(uri)
                if platform:
                    platforms.add(platform)

            # Combine all data
            record = {
                "cve_id": cve_id,
                "description": description,
                "keywords": keywords,
                # Sorted so the output file is identical across runs; plain
                # set iteration order for strings varies with hash seeding.
                "platforms": sorted(platforms),
                "affected_products": products,
                "published_date": item.get("publishedDate"),
                "last_modified_date": item.get("lastModifiedDate"),
                **cvss_metrics  # This adds all CVSS metrics to the record
            }

            records.append(record)

        except Exception as e:
            # Per-entry boundary: report the failure and move on so one
            # malformed entry does not abort the whole run.
            print(f"\nError processing {cve_id}: {e}")
        finally:
            # Advance the bar for failed entries too.
            pbar.update(1)

# Save the processed data
output_file = "Processed_CVEs/processed_cves_2025.json"
print(f"\nSaving processed data to {output_file}...")

# Create the target directory if needed; without this a fresh checkout fails
# with FileNotFoundError only after all the processing work has been done.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(records, f, ensure_ascii=False, indent=2)

print("Processing complete!")
print(f"Successfully processed {len(records)} out of {total_cves} CVEs")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu
Loading data from Non_Processed_CVEs/nvdcve-1.1-2025.json...


FileNotFoundError: [Errno 2] No such file or directory: 'Non_Processed_CVEs/nvdcve-1.1-2025.json'