In [1]:
from pymongo import MongoClient

# Connect to the local MongoDB instance and confirm the server responds.
uri = "mongodb://localhost:27017"
client = MongoClient(uri)

try:
    # Issue a cheap admin "ping" so an unreachable server is reported here
    # rather than on the first real query in a later cell.
    client.admin.command("ping")
    print("Connected to MongoDB!")
except Exception as exc:
    # Catch Exception instead of a bare `except:` so KeyboardInterrupt and
    # SystemExit still propagate, and show the cause instead of discarding it.
    print("❌ Could not connect to MongoDB:", exc)



Connected to MongoDB!


In [2]:
# Show every database the connected server exposes.
db_list = client.list_database_names()
print("Databases:", db_list)

# Report whether the expected database is present; only when it is do we
# go on to look for the expected collection inside it.
db_name = "nvdcve"
if db_name not in db_list:
    print(f"❌ Database '{db_name}' does not exist.")
else:
    db = client[db_name]
    print(f"✅ Database '{db_name}' exists.")

    # Show the collections held by that database.
    collection_list = db.list_collection_names()
    print("Collections:", collection_list)

    # Report whether the expected collection is present.
    collection_name = "nvdcve"
    if collection_name not in collection_list:
        print(f"❌ Collection '{collection_name}' does not exist.")
    else:
        print(f"✅ Collection '{collection_name}' exists.")


Databases: ['admin', 'config', 'foss_github_data', 'local', 'nvdcve']
✅ Database 'nvdcve' exists.
Collections: ['nvdcve']
✅ Collection 'nvdcve' exists.


In [3]:
### Handle for the CVE database ###
CVE_DB = client.get_database("nvdcve")

### Handle for the NVD CVE collection inside it ###
NVD_CVE_COLLECTION = CVE_DB.get_collection("nvdcve")


### Print at most one document to show what a record looks like
for doc in NVD_CVE_COLLECTION.find(limit=1):
    print(doc)


{'_id': ObjectId('68000746fe426ba2c401febe'), 'cve': {'data_type': 'CVE', 'data_format': 'MITRE', 'data_version': '4.0', 'CVE_data_meta': {'ID': 'CVE-1999-0001', 'ASSIGNER': 'cve@mitre.org'}, 'problemtype': {'problemtype_data': [{'description': [{'lang': 'en', 'value': 'CWE-20'}]}]}, 'references': {'reference_data': [{'url': 'http://www.openbsd.org/errata23.html#tcpfix', 'name': 'http://www.openbsd.org/errata23.html#tcpfix', 'refsource': '', 'tags': []}, {'url': 'http://www.openbsd.org/errata23.html#tcpfix', 'name': 'http://www.openbsd.org/errata23.html#tcpfix', 'refsource': '', 'tags': []}, {'url': 'http://www.osvdb.org/5707', 'name': '5707', 'refsource': '', 'tags': []}, {'url': 'http://www.osvdb.org/5707', 'name': '5707', 'refsource': '', 'tags': []}]}, 'description': {'description_data': [{'lang': 'en', 'value': 'ip_input.c in BSD-derived TCP/IP implementations allows remote attackers to cause a denial of service (crash or hang) via crafted packets.'}]}}, 'configurations': {'CVE_da

In [None]:
import pandas as pd
import re


### Regex pattern for getting vendor product combos
# Captures (vendor, product) from a CPE 2.3 URI whose part field is a, h or o.
pattern = re.compile(r'cpe:2\.3:[aho]:([^:]+):([^:]+)')

# Number of extracted records to echo below; printing the whole list floods
# the cell output with one entry per document in the collection.
PREVIEW_SIZE = 3


def _dig(doc, *path):
    """Follow `path` (dict keys / list indexes) into `doc`.

    Returns the value reached, or None as soon as a step is missing or the
    current value cannot be indexed (KeyError, IndexError or TypeError).
    """
    current = doc
    for step in path:
        try:
            current = current[step]
        except (KeyError, IndexError, TypeError):
            return None
    return current


def _collect_cpes(nodes):
    """Return the `cpe23Uri` strings found in `nodes`, in document order.

    Entries that are not dicts, and matches without a string `cpe23Uri`, are
    skipped individually so one malformed entry does not discard the rest.
    """
    if not isinstance(nodes, list):
        return []
    cpes = []
    for node in nodes:
        if not isinstance(node, dict):
            continue
        for match in node.get("cpe_match") or []:
            uri = match.get("cpe23Uri") if isinstance(match, dict) else None
            if isinstance(uri, str) and uri:
                cpes.append(uri)
        # NOTE(review): the NVD 1.1 JSON feed is expected to nest further
        # nodes under "children" -- confirm against the imported data. When
        # the key is absent this recursion is a no-op.
        cpes.extend(_collect_cpes(node.get("children")))
    return cpes


def _vendor_products(cpes):
    """Return the set of "vendor:product" strings for the given CPE 2.3 URIs.

    URIs that do not match `pattern` are ignored.
    """
    pairs = set()
    for cpe in cpes:
        found = pattern.match(cpe)
        if found:
            pairs.add(":".join(found.groups()))
    return pairs


### Extract Product Vendor Combos (CPE fields) ###
### Extract CVE Descriptions ###

results = []
for doc in NVD_CVE_COLLECTION.find({}):
    results.append({
        "cve_id": _dig(doc, "cve", "CVE_data_meta", "ID"),
        # Only the first problemtype / description entry is kept.
        "cwe": _dig(doc, "cve", "problemtype", "problemtype_data", 0,
                    "description", 0, "value"),
        "description": _dig(doc, "cve", "description", "description_data", 0,
                            "value"),
        "cpes": _collect_cpes(_dig(doc, "configurations", "nodes")),
    })

print(f"Extracted {len(results)} CVE records")
print(results[:PREVIEW_SIZE])

# Only the first record is summarised here (the original left a commented-out
# loop over every entry unfinished). Guard against an empty collection, where
# results[0] would raise IndexError.
vendor_product_set = _vendor_products(results[0]["cpes"]) if results else set()
print(vendor_product_set)

[{'cve_id': 'CVE-1999-0001', 'cwe': 'CWE-20', 'description': 'ip_input.c in BSD-derived TCP/IP implementations allows remote attackers to cause a denial of service (crash or hang) via crafted packets.', 'cpes': ['cpe:2.3:o:freebsd:freebsd:2.2.5:*:*:*:*:*:*:*', 'cpe:2.3:o:freebsd:freebsd:2.2.2:*:*:*:*:*:*:*', 'cpe:2.3:o:freebsd:freebsd:2.1.7:*:*:*:*:*:*:*', 'cpe:2.3:o:freebsd:freebsd:2.2.3:*:*:*:*:*:*:*', 'cpe:2.3:o:freebsd:freebsd:2.0.5:*:*:*:*:*:*:*', 'cpe:2.3:o:freebsd:freebsd:1.1.5.1:*:*:*:*:*:*:*', 'cpe:2.3:o:bsdi:bsd_os:3.1:*:*:*:*:*:*:*', 'cpe:2.3:o:freebsd:freebsd:1.0:*:*:*:*:*:*:*', 'cpe:2.3:o:freebsd:freebsd:2.2:*:*:*:*:*:*:*', 'cpe:2.3:o:freebsd:freebsd:2.2.8:*:*:*:*:*:*:*', 'cpe:2.3:o:freebsd:freebsd:2.2.4:*:*:*:*:*:*:*', 'cpe:2.3:o:freebsd:freebsd:2.2.6:*:*:*:*:*:*:*', 'cpe:2.3:o:freebsd:freebsd:2.1.6:*:*:*:*:*:*:*', 'cpe:2.3:o:freebsd:freebsd:1.1:*:*:*:*:*:*:*', 'cpe:2.3:o:freebsd:freebsd:2.1.6.1:*:*:*:*:*:*:*', 'cpe:2.3:o:freebsd:freebsd:2.1.7.1:*:*:*:*:*:*:*', 'cpe:2.3:o

In [None]:
"""
Per-CVE record layout (example values shown for foss_vec_matches):

cve_id: id,
cwe_val: cwe,
cve description: text,
set of unique cpes with versions: dict[vendor product: list[versions]],
foss_vec_matches: list[dict[name of model (Nomic): {
    'foss_project_name': 'FFmpeg FFmpeg',
    'vector_certainty': 0.9790698885917664,
    'vector_distance': 0.041860222816467285}]]
"""