### Creating queries
### Next step: query all ~300K CVE entries in the NVD and store the top 3 matches for each in MongoDB.

In [1]:
import sys
from pathlib import Path

# Locate the repository root by walking upward from the current working
# directory until a directory containing `embedding_pipeline` is found.
# The loop stops at the filesystem root (where a path equals its own parent).
project_root = Path().resolve()
while not (project_root / 'embedding_pipeline').exists() and project_root != project_root.parent:
    project_root = project_root.parent

# If the walk ran out of parents without finding the package, say so instead
# of failing later with an opaque ImportError. The path is still appended
# below, so the original best-effort behaviour is preserved.
if not (project_root / 'embedding_pipeline').exists():
    print(
        f"Warning: no 'embedding_pipeline' directory found above {Path().resolve()}; "
        "imports from it will likely fail.",
        file=sys.stderr,
    )

# Guarded so that re-running this cell does not stack duplicate entries
# onto sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [2]:
### Local query test first ###
# Name of the Weaviate collection used by the cells in this notebook.
NAME = "FOSS_vectors"

# All helpers come from the same config module, so import them in one statement.
from embedding_pipeline.weaviate_db.weaviate_config import (
    close_weaviate_client,
    connect_to_local_weaviate_client,
    create_weaviate_collection,
    inspect_collection_properties,
    list_weaviate_collections,
    retrieve_existing_weaviate_collection,
    verify_weaviate_client_ready,
)

# Connect to the local client, then list the collections it holds as a
# sanity check that NAME is among them.
local_client = connect_to_local_weaviate_client()
list_weaviate_collections(local_client)


Connecting to local client...
Connected to local weaviate client--> is ready: True
Collection names:
- FOSS_vectors
Collections: FOSS_vectors


In [12]:
from embedding_pipeline.weaviate_db.weaviate_query_operations import query_weaviate_collection
from embedding_pipeline.weaviate_db.weaviate_query_operations import get_query_vector_responses
from embedding_pipeline.embedding_models.nomic_embed import embed_prompt_with_nomic
from embedding_pipeline.embedding_models.DISTIL_BERT_embed import embed_prompt_with_distil_bert
from embedding_pipeline.embedding_models.SBERT_mini_lm_l6_embed import embed_prompt_with_sbert_mini_l6
from embedding_pipeline.embedding_models.SBERT_mini_lm_l12_embed import embed_prompt_with_sbert_mini_l12
from pprint import pprint
import json
# Handle to the already-existing collection on the local client.
local_foss_collection = retrieve_existing_weaviate_collection(
    weaviate_client=local_client,
    collection_name=NAME,
)

### Embed test query
TEST_PROMPT = "freebsd:freebsd"

nomic_vec = embed_prompt_with_nomic(prompt=TEST_PROMPT)

### KNOWN MIX-UP (to be fixed later): DistilBERT and SBERT MiniLM-L6 are swapped.
### The two variables below are named after the stored named vector they are later
### used to query, NOT after the embedding function that produced them:
###   MIXED_UP_sbert_l6_vec <- embed_prompt_with_distil_bert
###   MIXED_UP_distil_vec   <- embed_prompt_with_sbert_mini_l6
### Expected native sizes: SBERT L6 = 384, DistilBERT = 768.
### NOTE(review): in the output recorded with this notebook the stored
### `distil_bert_name_vec` is 384-d and `sbert_minilm_l6_v2_name_vec` is 768-d, which
### suggests the swap happened when the collection was populated -- confirm before fixing.
MIXED_UP_sbert_l6_vec = embed_prompt_with_distil_bert(prompt=TEST_PROMPT)
MIXED_UP_distil_vec = embed_prompt_with_sbert_mini_l6(prompt=TEST_PROMPT)

sbert_l12_vec = embed_prompt_with_sbert_mini_l12(prompt=TEST_PROMPT)

# Report the length of every query vector. Because of the mix-up above, the
# "DistilBERT" and "SBERT L6" labels follow the variable names, not the
# embedding functions that were called.
print(
    f"Nomic vector dimension: {len(nomic_vec)}",
    f"DistilBERT vector dimension: {len(MIXED_UP_distil_vec)}",
    f"SBERT L6 vector dimension: {len(MIXED_UP_sbert_l6_vec)}",
    f"SBERT L12 vector dimension: {len(sbert_l12_vec)}",
    sep="\n",
)

# Collection configuration; fetched here but not used further in this cell.
schema = local_foss_collection.config.get()

# Look at a single stored object only: its named vectors show the dimensions
# actually present in the collection, to compare with the query vectors above.
first_object = next(iter(local_foss_collection.iterator(include_vector=True)), None)
if first_object is not None:
    for vector_name, vector in first_object.vector.items():
        print(f"Vector name: {vector_name}")
        print(f"Dimensions: {len(vector)}")


Nomic vector dimension: 768
DistilBERT vector dimension: 384
SBERT L6 vector dimension: 768
SBERT L12 vector dimension: 384
Vector name: bge_large_description_vec
Dimensions: 1024
Vector name: gte_large_description_vec
Dimensions: 1024
Vector name: roberta_large_description_vec
Dimensions: 1024
Vector name: sbert_minilm_l12_v2_name_vec
Dimensions: 384
Vector name: sbert_mpnet_base_v2_description_vec
Dimensions: 768
Vector name: distil_bert_name_vec
Dimensions: 384
Vector name: e5_large_description_vec
Dimensions: 1024
Vector name: gte_large_name_vec
Dimensions: 1024
Vector name: ollama_nomic_name_vec
Dimensions: 768
Vector name: sbert_minilm_l6_v2_name_vec
Dimensions: 768


In [14]:
### Get the query object ###
def _query_name_vector(query_vec, target_vector):
    """Query collection NAME on the local client against one named vector.

    `query_vec` is the embedded prompt; `target_vector` is the name of the
    stored named vector to search against.
    """
    return query_weaviate_collection(
        query_vec,
        target_name_vector_query=target_vector,
        weaviate_client=local_client,
        collection_name=NAME,
    )


nomic_query_return = _query_name_vector(nomic_vec, "ollama_nomic_name_vec")
# The two MIXED_UP_* vectors are paired with the named vector in their own
# variable name, which is the deliberate workaround for the DistilBERT / SBERT L6 swap.
distil_query_return = _query_name_vector(MIXED_UP_distil_vec, "distil_bert_name_vec")
sbert_l6_query_return = _query_name_vector(MIXED_UP_sbert_l6_vec, "sbert_minilm_l6_v2_name_vec")
sbert_l12_query_return = _query_name_vector(sbert_l12_vec, "sbert_minilm_l12_v2_name_vec")


In [16]:
### Get the import vector info ###
nomic_response = get_query_vector_responses(response=nomic_query_return)
distil_response = get_query_vector_responses(response=distil_query_return)
sbert_l6_response = get_query_vector_responses(response=sbert_l6_query_return)
sbert_l12_response = get_query_vector_responses(response=sbert_l12_query_return)


def _show_model_hits(title, hits):
    """Print a two-line banner, the model title, then pretty-print its hits."""
    banner = "#######################################"
    print(banner)
    print(banner)
    print(title)
    pprint(hits)


### Check out the research results!!! ###
# One section per embedding model, in the same order as the queries were run.
_show_model_hits("NOMIC", nomic_response)
_show_model_hits("DISTIL", distil_response)
_show_model_hits("SBERT L6", sbert_l6_response)
_show_model_hits("SBERT L12", sbert_l12_response)

#######################################
#######################################
NOMIC
[{'foss_project_name': 'freebsd freebsd-src',
  'vector_certainty': 0.8797721862792969,
  'vector_distance': 0.24045562744140625},
 {'foss_project_name': 'FreeCAD FreeCAD',
  'vector_certainty': 0.8617587685585022,
  'vector_distance': 0.2764824628829956},
 {'foss_project_name': 'Freeboard freeboard',
  'vector_certainty': 0.8310739398002625,
  'vector_distance': 0.3378521203994751},
 {'foss_project_name': 'lTbgykio Books-Free-Books',
  'vector_certainty': 0.8278905749320984,
  'vector_distance': 0.3442188501358032},
 {'foss_project_name': 'FreeRDP FreeRDP',
  'vector_certainty': 0.8252442479133606,
  'vector_distance': 0.3495115041732788}]
#######################################
#######################################
DISTIL
[{'foss_project_name': 'freebsd freebsd-src',
  'vector_certainty': 0.677076518535614,
  'vector_distance': 0.645846962928772},
 {'foss_project_name': 'tobspr-games shapez.io',
 