### Creating queries
### Next step is to query all 300K-ish CVE entires in the NVD and store the top 3 matches into mongo."

In [1]:
import sys
from pathlib import Path

project_root = Path().resolve()
while not (project_root / 'embedding_pipeline').exists() and project_root != project_root.parent:
    project_root = project_root.parent

sys.path.append(str(project_root))

In [2]:
### Local query test first ###
NAME = "FOSS_vectors"

from embedding_pipeline.weaviate_db.weaviate_config import connect_to_local_weaviate_client
from embedding_pipeline.weaviate_db.weaviate_config import verify_weaviate_client_ready
from embedding_pipeline.weaviate_db.weaviate_config import close_weaviate_client
from embedding_pipeline.weaviate_db.weaviate_config import list_weaviate_collections, inspect_collection_properties, retrieve_existing_weaviate_collection

local_client = connect_to_local_weaviate_client()
list_weaviate_collections(local_client)


Connecting to local client...
Connected to local weaviate client--> is ready: True
Collection names:
- FOSS_vectors


In [4]:
from embedding_pipeline.weaviate_db.weaviate_query_operations import query_weaviate_collection
from embedding_pipeline.weaviate_db.weaviate_query_operations import get_query_vector_responses
from embedding_pipeline.embedding_models.nomic_embed import embed_prompt_with_nomic
from embedding_pipeline.embedding_models.DISTIL_BERT_embed import embed_prompt_with_distil_bert
from embedding_pipeline.embedding_models.SBERT_mini_lm_l6_embed import embed_prompt_with_sbert_mini_l6
from embedding_pipeline.embedding_models.SBERT_mini_lm_l12_embed import embed_prompt_with_sbert_mini_l12
from pprint import pprint

local_foss_collection = retrieve_existing_weaviate_collection(collection_name=NAME,weaviate_client=local_client)

### Embed test query
TEST_PROMPT = "freebsd freebsd"

nomic_vec = embed_prompt_with_nomic(prompt=TEST_PROMPT)

### embedding with distil bert --> acutually embedding with sbert l6
### Dimension of l6 should be 384
MIXED_UP_sbert_l6_vec= embed_prompt_with_distil_bert(prompt=TEST_PROMPT)

### embedding with sbert l6 --> actually embedding with distil bert *** I'll fix error later
### Dimension of distil should be 768
MIXED_UP_distil_vec = embed_prompt_with_sbert_mini_l6(prompt=TEST_PROMPT)

sbert_l12_vec = embed_prompt_with_sbert_mini_l12(prompt=TEST_PROMPT)

# Check the dimensionality of your embedding vectors
print(f"Nomic vector dimension: {len(nomic_vec)}")
print(f"DistilBERT vector dimension: {len(MIXED_UP_distil_vec)}")
print(f"SBERT L6 vector dimension: {len(MIXED_UP_sbert_l6_vec)}")
print(f"SBERT L12 vector dimension: {len(sbert_l12_vec)}")


schema = local_foss_collection.config.get()

# # Print the vector configurations
# for vector_name, vector_config in schema.vector_config.items():
#     print(f"Vector name: {vector_name}")
#     if hasattr(vector_config, "dimensions") and vector_config.dimensions is not None:
#         print(f"Dimensions: {vector_config.dimensions}")
#     else:
#         print("Dimensions not explicitly set (using model default)")

for item in local_foss_collection.iterator(include_vector=True):
    for vector_name, vector in item.vector.items():
        print(f"Vector name: {vector_name}")
        print(f"Dimensions: {len(vector)}")
    break  # Just check the first object


Nomic vector dimension: 768
DistilBERT vector dimension: 384
SBERT L6 vector dimension: 768
SBERT L12 vector dimension: 384
Vector name: bge_large_description_vec
Dimensions: 1024
Vector name: e5_large_description_vec
Dimensions: 1024
Vector name: gte_large_description_vec
Dimensions: 1024
Vector name: roberta_large_description_vec
Dimensions: 1024
Vector name: sbert_minilm_l12_v2_name_vec
Dimensions: 384
Vector name: sbert_minilm_l6_v2_name_vec
Dimensions: 384
Vector name: sbert_mpnet_base_v2_description_vec
Dimensions: 768
Vector name: distil_bert_name_vec
Dimensions: 768
Vector name: gte_large_name_vec
Dimensions: 1024
Vector name: ollama_nomic_name_vec
Dimensions: 768


In [5]:
### Get the query object ###
nomic_query_return = query_weaviate_collection(nomic_vec,target_name_vector_query="ollama_nomic_name_vec",weaviate_client=local_client,collection_name=NAME)
distil_query_return = query_weaviate_collection(MIXED_UP_distil_vec,target_name_vector_query="sbert_minilm_l6_v2_name_vec",weaviate_client=local_client,collection_name=NAME)
sbert_l6_query_return = query_weaviate_collection(MIXED_UP_sbert_l6_vec,target_name_vector_query="distil_bert_name_vec",weaviate_client=local_client,collection_name=NAME)
sbert_l12_query_return = query_weaviate_collection(sbert_l12_vec,target_name_vector_query="sbert_minilm_l12_v2_name_vec",weaviate_client=local_client,collection_name=NAME)


In [5]:
### Get the import vector info ###
nomic_response = get_query_vector_responses(response=nomic_query_return)
distil_response = get_query_vector_responses(response=distil_query_return)
sbert_l6_response = get_query_vector_responses(response=sbert_l6_query_return)
sbert_l12_response = get_query_vector_responses(response=sbert_l12_query_return)



### Check out the research results!!! ###
print("#######################################")
print("#######################################")
print("NOMIC")
pprint(nomic_response)
print("#######################################")
print("#######################################")
print("DISTIL")
pprint(distil_response)
print("#######################################")
print("#######################################")
print("SBERT L6")
pprint(sbert_l6_response)
print("#######################################")
print("#######################################")
print("SBERT L12")
pprint(sbert_l12_response)

#######################################
#######################################
NOMIC
[{'foss_project_name': 'TensorFlow',
  'vector_certainty': 0.48803067207336426,
  'vector_distance': 1.0239386558532715}]
#######################################
#######################################
DISTIL
[{'foss_project_name': 'TensorFlow',
  'vector_certainty': 0.5127314329147339,
  'vector_distance': 0.9745371341705322}]
#######################################
#######################################
SBERT L6
[{'foss_project_name': 'TensorFlow',
  'vector_certainty': 0.4823462963104248,
  'vector_distance': 1.0353074073791504}]
#######################################
#######################################
SBERT L12
[{'foss_project_name': 'TensorFlow',
  'vector_certainty': 0.4771720767021179,
  'vector_distance': 1.0456558465957642}]


In [7]:
### Querying remote weaviate database ###
from embedding_pipeline.weaviate_db.weaviate_config import create_remote_weaviate_client

remote_client = create_remote_weaviate_client()
### Get the query object ###
nomic_query_return_remote = query_weaviate_collection(nomic_vec,target_name_vector_query="ollama_nomic_name_vec",weaviate_client=remote_client,collection_name=NAME)
distil_query_return_remote = query_weaviate_collection(MIXED_UP_distil_vec,target_name_vector_query="distil_bert_name_vec",weaviate_client=remote_client,collection_name=NAME)
sbert_l6_query_return_remote = query_weaviate_collection(MIXED_UP_sbert_l6_vec,target_name_vector_query="sbert_minilm_l6_v2_name_vec",weaviate_client=remote_client,collection_name=NAME)
sbert_l12_query_return_remote = query_weaviate_collection(sbert_l12_vec,target_name_vector_query="sbert_minilm_l12_v2_name_vec",weaviate_client=remote_client,collection_name=NAME)


            Please make sure to close the connection using `client.close()`.
  remote_client = create_remote_weaviate_client()


In [8]:
### Get the import vector info ###
nomic_response = get_query_vector_responses(response=nomic_query_return_remote)
distil_response = get_query_vector_responses(response=distil_query_return_remote)
sbert_l6_response = get_query_vector_responses(response=sbert_l6_query_return_remote)
sbert_l12_response = get_query_vector_responses(response=sbert_l12_query_return_remote)



### Check out the research results!!! ###
print("#######################################")
print("#######################################")
print("NOMIC")
pprint(nomic_response)
print("#######################################")
print("#######################################")
print("DISTIL")
pprint(distil_response)
print("#######################################")
print("#######################################")
print("SBERT L6")
pprint(sbert_l6_response)
print("#######################################")
print("#######################################")
print("SBERT L12")
pprint(sbert_l12_response)

#######################################
#######################################
NOMIC
[{'foss_project_name': 'freebsd freebsd-src',
  'vector_certainty': 0.8863022327423096,
  'vector_distance': 0.22739553451538086},
 {'foss_project_name': 'FreeCAD FreeCAD',
  'vector_certainty': 0.8728582262992859,
  'vector_distance': 0.2542835474014282},
 {'foss_project_name': 'Freeboard freeboard',
  'vector_certainty': 0.833430826663971,
  'vector_distance': 0.3331383466720581}]
#######################################
#######################################
DISTIL
[{'foss_project_name': 'freebsd freebsd-src',
  'vector_certainty': 0.675735592842102,
  'vector_distance': 0.6485288143157959},
 {'foss_project_name': 'briannesbitt Carbon',
  'vector_certainty': 0.6681805849075317,
  'vector_distance': 0.6636387705802917},
 {'foss_project_name': 'DIYgod RSSHub',
  'vector_certainty': 0.666953980922699,
  'vector_distance': 0.666092038154602}]
#######################################
####################

In [None]:
### Try name + description queries ###
from embedding_pipeline.embedding_models.BGE_large_embed import embed_prompt_with_bge_large
from embedding_pipeline.embedding_models.E5_large_embed import embed_prompt_with_e5_large
from embedding_pipeline.embedding_models.SBERT_mpnet_embed import embed_prompt_with_sbert_mpnet
from embedding_pipeline.embedding_models.ROBERTA_large_embed import embed_prompt_with_roberta_large



DESCR_TEST_PROMPT_1 = "Linux, BSD, OSX and Windows"
DESCR_TEST_PROMPT_2 = "A vulnerability was found in Nothings stb up to f056911. It has been declared as critical. Affected by this vulnerability is the function stbhw_build_tileset_from_image. The manipulation of the argument h_count/v_count leads to out-of-bounds read. The attack can be launched remotely. This product takes the approach of rolling releases to provide continious delivery. Therefore, version details for affected and updated releases are not available. The vendor was contacted early about this disclosure but did not respond in any way."

bge_vec = embed_prompt_with_bge_large(prompt=DESCR_TEST_PROMPT_1)
e5_vec = embed_prompt_with_e5_large(prompt=DESCR_TEST_PROMPT_1)
sbert_mpnet_vec = embed_prompt_with_sbert_mpnet(prompt=DESCR_TEST_PROMPT_1)
roberta_vec = embed_prompt_with_roberta_large(prompt=DESCR_TEST_PROMPT_1)

bge_query_return = query_weaviate_collection(vector_query=bge_vec,target_name_vector_query="bge_large_description_vec",weaviate_client=local_client,collection_name=NAME)
e5_query_return = query_weaviate_collection(vector_query=e5_vec,target_name_vector_query="e5_large_description_vec",weaviate_client=local_client,collection_name=NAME)
sbert_mpnet_query_return = query_weaviate_collection(vector_query=sbert_mpnet_vec,target_name_vector_query="sbert_mpnet_base_v2_description_vec",weaviate_client=local_client,collection_name=NAME)
roberta_query_return = query_weaviate_collection(vector_query=roberta_vec,target_name_vector_query="roberta_large_description_vec",weaviate_client=local_client,collection_name=NAME)

In [None]:
### Get the import vector info ###
bge_response = get_query_vector_responses(response=bge_query_return)
e5_response = get_query_vector_responses(response=e5_query_return)
sbert_mpnet_response = get_query_vector_responses(response=sbert_mpnet_query_return)
roberta_response = get_query_vector_responses(response=roberta_query_return)



### Check out the research results!!! ###
print("#######################################")
print("#######################################")
print("BGE")
pprint(bge_response)
print("#######################################")
print("#######################################")
print("E5")
pprint(e5_response)
print("#######################################")
print("#######################################")
print("SBERT MPNET")
pprint(sbert_mpnet_response)
print("#######################################")
print("#######################################")
print("ROBERTA")
pprint(roberta_response)

In [6]:
### About to query all the CVE / CPEs ###

import json

with open("../../enriched_cve_data/cve-cpe-foss_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Example: print the first entry
print(data[0])

{'cve_id': 'CVE-1999-0001', 'cwe_val': ['CWE-20'], 'cve_description': 'ip_input.c in BSD-derived TCP/IP implementations allows remote attackers to cause a denial of service (crash or hang) via crafted packets.', 'cpes_with_versions': {'freebsd freebsd': ['2.2.5', '2.2.2', '2.1.7', '2.2.3', '2.0.5', '1.1.5.1', '1.0', '2.2', '2.2.8', '2.2.4', '2.2.6', '2.1.6', '1.1', '2.1.6.1', '2.1.7.1', '3.0', '2.1.5', '2.0.1', '1.2', '2.0'], 'bsdi bsd_os': ['3.1'], 'openbsd openbsd': ['2.4', '2.3']}, 'foss_vec_matches': []}


In [2]:
### Create new MongoDB collection ###
from pymongo import MongoClient

uri = "mongodb://localhost:27017"
mongodb_client = MongoClient(uri)


try:
    mongodb_client.admin.command("ping")
    print("Connected to MongoDB!")
except :
    print("❌ Could not connect to MongoDB:")

### Get the CVE database ###
CVE_DB = mongodb_client["nvdcve"]

enriched_cve_collection = CVE_DB["enriched_cve_entries"]



Connected to MongoDB!


In [None]:
from embedding_pipeline.embedding_models.GTE_large_embed import embed_prompt_with_gte_large
import copy
### set the batch size ###
BATCH_SIZE = 500
BATCH = []


for entry in data:
    try:
        # all processing code here

        enriched_mongo_entry = copy.deepcopy(entry)

        ### Temporary list to hold vector responses for individual CPEs (ven:product combos)
        enriched_cpe_matches = []


        if 'cpes_with_versions' not in entry or not entry['cpes_with_versions']:
            continue

        for cpe in entry['cpes_with_versions']:


            
            ### Get the CPE identifier ###
            vendor_query:str = cpe


            nomic_vec = embed_prompt_with_nomic(prompt=vendor_query)
            MIXED_UP_sbert_l6_vec= embed_prompt_with_distil_bert(prompt=vendor_query)
            MIXED_UP_distil_vec = embed_prompt_with_sbert_mini_l6(prompt=vendor_query)
            sbert_l12_vec = embed_prompt_with_sbert_mini_l12(prompt=vendor_query)
            gte_large_vec = embed_prompt_with_gte_large(prompt= vendor_query)


            ### Get the query object ###
            nomic_query_return = query_weaviate_collection(nomic_vec,target_name_vector_query="ollama_nomic_name_vec",weaviate_client=local_client,collection_name=NAME)
            distil_query_return = query_weaviate_collection(MIXED_UP_distil_vec,target_name_vector_query="distil_bert_name_vec",weaviate_client=local_client,collection_name=NAME)
            sbert_l6_query_return = query_weaviate_collection(MIXED_UP_sbert_l6_vec,target_name_vector_query="sbert_minilm_l6_v2_name_vec",weaviate_client=local_client,collection_name=NAME)
            sbert_l12_query_return = query_weaviate_collection(sbert_l12_vec,target_name_vector_query="sbert_minilm_l12_v2_name_vec",weaviate_client=local_client,collection_name=NAME)
            gte_large_query_return = query_weaviate_collection(gte_large_vec,target_name_vector_query="gte_large_name_vec",weaviate_client=local_client,collection_name=NAME)

            ### Get the import vector info ###
            nomic_response = get_query_vector_responses(response=nomic_query_return)
            distil_response = get_query_vector_responses(response=distil_query_return)
            sbert_l6_response = get_query_vector_responses(response=sbert_l6_query_return)
            sbert_l12_response = get_query_vector_responses(response=sbert_l12_query_return)
            gte_large_response = get_query_vector_responses(response=gte_large_query_return)
            
        

            
            ### Getting the first vector because this will have the highest cosine similarity / distance  ### 
            # nomic_final = nomic_response[0]
            # distil_final = distil_response[0]
            # sbert_l6_final = sbert_l6_response[0]
            # sbert_12_final = sbert_l12_response[0]
            # gte_large_final = gte_large_response[0]
            
            enriched_cpe_matches.append({
                "cpe": vendor_query,
                "nomic": nomic_response,
                "distil": distil_response,
                "sbert_l6": sbert_l6_response,
                "sbert_l12": sbert_l12_response,
                "gte_large": gte_large_response
            })

        
        ### An 'enriched_mongo_entry' represents 1 entire CVE (that has multiple cpes) ###
        enriched_mongo_entry["foss_vec_matches"] = enriched_cpe_matches
        
        
        
        BATCH.append(enriched_mongo_entry)

        # Batch insert every 500 entries
        if len(BATCH) >= BATCH_SIZE:
            enriched_cve_collection.insert_many(BATCH)
            BATCH = []


    except Exception as e:
        print(f"Failed on entry {entry.get('id', 'unknown')}: {e}")
        continue
        
# Insert any remaining
if BATCH:
    enriched_cve_collection.insert_many(BATCH)
    
    



ModuleNotFoundError: No module named 'embedding_pipeline'

In [None]:
close_weaviate_client(local_client)
#close_weaviate_client(remote_client)

Closing connection to weaviate client


NameError: name 'remote_client' is not defined