In [1]:
import marqo
import sys
import pathlib
import pickle

In [2]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [3]:
sys.path.insert(0, "src")
import data_functions
import embedding_functions
import cluster_functions
import graph_functions
import util_functions
from MarqoExternalUtils import MarqoEmbeddingStore

  from tqdm.autonotebook import tqdm





---

In [4]:
base_name = "grascco_lokal_test"


def path_name(x):
    """Return the pickle path of pipeline artefact ``x`` for the current run.

    The path has the form ``./tmp/<base_name>/<base_name>_<x>.pickle``.
    ``base_name`` is read from the notebook namespace at call time, so
    re-assigning it changes the result of every later call.
    """
    return f"./tmp/{base_name}/{base_name}_{x}.pickle"

In [5]:
# Load the data object through DataProcessingFactory from this run's
# "data" pickle path (see path_name).
grascco_data = data_functions.DataProcessingFactory.load(
    file_path=path_name("data"),
)

In [6]:
# Path of the "embedding" artefact; only its directory and its stem are used.
_embedding_artefact = pathlib.Path(path_name("embedding"))

# ANN settings handed to the vector store (spaceType default would be
# prenormalized-angular according to the original inline note).
_ann_parameters = {
    "spaceType": "dotproduct",
    "parameters": {"efConstruction": 1024, "m": 16},
}

grascco_embedding = embedding_functions.SentenceEmbeddingsFactory.create(
    data_obj=grascco_data,
    cache_path=_embedding_artefact.parent,
    # cache name = file stem without its last "_"-separated token
    cache_name="_".join(_embedding_artefact.stem.split("_")[:-1]),
    model_name="Sahajtomar/German-semantic",
    storage_method=("vector_store", {}),
    vectorstore_normalizeEmbeddings=False,
    vectorstore_annParameters=_ann_parameters,
)

INFO:root:Creating Sentence Embedding with 'None'
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: Sahajtomar/German-semantic
Batches: 100%|██████████| 28/28 [00:11<00:00,  2.34it/s]
2025-05-19 11:01:26,185 logger:'marqo' INFO     add_documents batch 0: took 2.520s for Marqo to process & index 128 docs. Roundtrip time: 2.711s.
INFO:marqo:    add_documents batch 0: took 2.520s for Marqo to process & index 128 docs. Roundtrip time: 2.711s.
2025-05-19 11:01:27,065 logger:'marqo' INFO     add_documents batch 1: took 0.690s for Marqo to process & index 128 docs. Roundtrip time: 0.874s.
INFO:marqo:    add_documents batch 1: took 0.690s for Marqo to process & index 128 docs. Roundtrip time: 0.874s.
2025-05-19 11:01:28,085 logger:'marqo' INFO     add_documents batch 2: took 0.841s for Marqo to process & index 128 docs. Roundtrip time: 1.026s.
INFO:marqo:    add_documents batch 2: took

Saved under: C:\Users\fra3066mat\PycharmProjects\concept-graphs\tmp\grascco_lokal_test\grascco_lokal_test.pickle


In [6]:
# Re-load an embedding object from the run's "data" and "vec" artefact paths,
# again with the ("vector_store", {}) storage method.
grascco_embedding_from_vec = embedding_functions.SentenceEmbeddingsFactory.load(
    data_obj_path=path_name("data"),
    embeddings_path=path_name("vec"),
    storage_method=("vector_store", {},),
)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: Sahajtomar/German-semantic


In [8]:
# Load the phrase clustering through PhraseClusterFactory from the run's
# "clustering" pickle path.
grascco_clustering = cluster_functions.PhraseClusterFactory.load(
    data_obj_path=path_name("clustering"),
)

In [10]:
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load graph pickles that were produced by this project.
# The context manager closes the file handle even if unpickling fails.
with open(path_name("graph"), 'rb') as _graph_file:
    grascco_graph = pickle.load(_graph_file)

---

In [5]:
# Store wrapper used by the scratch cells below.
# Arguments are presumably (Marqo URL, index name) — TODO confirm against MarqoExternalUtils.
mqs = MarqoEmbeddingStore("http://localhost:8882", "grascco_lokal_test")



In [None]:
# Scratch cell for exploring the `mqs` API (intentionally contains no code).

In [16]:
# Recommendations for document "10"; with exclude_input_documents=False the
# recorded hits include document "10" itself as the first entry.
mqs.marqo_index.recommend(["10"], exclude_input_documents=False)

{'hits': [{'graph_cluster': ['2'],
   '_id': '10',
   'phrase': 'handamputation',
   '_highlights': [{'phrase_vector': ''}],
   '_score': 491.0687255859375},
  {'graph_cluster': ['2'],
   '_id': '238',
   'phrase': 'handchirurgen',
   '_highlights': [{'phrase_vector': ''}],
   '_score': 359.72576904296875},
  {'_id': '51',
   'phrase': 'abgetrennte hand',
   '_highlights': [{'phrase_vector': ''}],
   '_score': 345.06219482421875},
  {'_id': '55',
   'phrase': 'komplette amputation',
   '_highlights': [{'phrase_vector': ''}],
   '_score': 344.34820556640625},
  {'graph_cluster': ['2'],
   '_id': '20',
   'phrase': 'handreplantaticm',
   '_highlights': [{'phrase_vector': ''}],
   '_score': 337.96893310546875},
  {'graph_cluster': ['2'],
   '_id': '229',
   'phrase': 'handchirurgischer sicht',
   '_highlights': [{'phrase_vector': ''}],
   '_score': 313.2591247558594},
  {'graph_cluster': ['2'],
   '_id': '647',
   'phrase': 'handgelenksfraktur',
   '_highlights': [{'phrase_vector': ''}],


In [15]:
# Materialise everything best_hits_for_field yields for the string "308"
# (presumably a document id — TODO confirm); the recorded run returned [].
list(mqs.best_hits_for_field("308"))

[]

In [7]:
# Same helper, this time fed a constant 1024-element vector of 0.3.
# NOTE(review): the recorded log shows an add_documents call during this
# lookup, so the helper appears to write to the index — confirm.
list(mqs.best_hits_for_field([0.3]*1024))

2025-05-20 12:27:10,727 logger:'marqo' INFO     add_documents batch 0: took 0.019s for Marqo to process & index 1 docs. Roundtrip time: 0.031s.
INFO:marqo:    add_documents batch 0: took 0.019s for Marqo to process & index 1 docs. Roundtrip time: 0.031s.


[('1', 5.395548584233482)]

In [27]:
# Fetch document "877" by id. In the recorded run this raised MarqoWebError
# (404, document_not_found), so this cell stops a top-to-bottom run when the
# document is absent.
mqs.marqo_index.get_document("877")

MarqoWebError: MarqoWebError: MarqoWebError Error message: {'message': 'Document with ID 877 not found in index grascco_lokal_test', 'code': 'document_not_found', 'type': 'invalid_request', 'link': None}
status_code: 404, type: invalid_request, code: document_not_found, link: 

In [29]:
# Destructive: removes document "878" from the index.
mqs.marqo_index.delete_documents(["878"])

{'index_name': 'grascco_lokal_test',
 'status': 'succeeded',
 'type': 'documentDeletion',
 'items': [{'_id': '878', 'status': 200, 'result': 'deleted'}],
 'details': {'receivedDocumentIds': 1, 'deletedDocuments': 1},
 'duration': 'PT0.014993S',
 'startedAt': '2025-05-21T09:53:19.424416Z',
 'finishedAt': '2025-05-21T09:53:19.439409Z'}

In [21]:
# Destructive: removes document "877" from the index.
mqs.marqo_index.delete_documents(["877"])

{'index_name': 'grascco_lokal_test',
 'status': 'succeeded',
 'type': 'documentDeletion',
 'items': [{'_id': '877', 'status': 200, 'result': 'deleted'}],
 'details': {'receivedDocumentIds': 1, 'deletedDocuments': 1},
 'duration': 'PT0.017481S',
 'startedAt': '2025-05-20T13:31:59.757878Z',
 'finishedAt': '2025-05-20T13:31:59.775359Z'}

In [12]:
# Tag every node of each element of grascco_graph with that element's
# positional index, stored as a one-item string list under "graph_cluster".
# NOTE(review): `mq` is only created in a cell further down this notebook,
# so this cell fails on a fresh kernel that is run top to bottom.
for cluster_idx, graph in enumerate(grascco_graph):
    # One partial document per node: its id plus the owning graph's index.
    doc_list = [
        {"_id": str(node_id), "graph_cluster": [str(cluster_idx)]}
        for node_id, _node_attrs in graph.nodes(True)
    ]
    _res = mq.index("grascco_lokal_test").update_documents(documents=doc_list)

In [16]:
# doc_list is rebuilt on every loop iteration, so this shows only the
# documents of the last graph that was processed.
doc_list

[{'_id': '128', 'graph_cluster': ['4']},
 {'_id': '147', 'graph_cluster': ['4']},
 {'_id': '153', 'graph_cluster': ['4']},
 {'_id': '160', 'graph_cluster': ['4']},
 {'_id': '161', 'graph_cluster': ['4']},
 {'_id': '164', 'graph_cluster': ['4']},
 {'_id': '187', 'graph_cluster': ['4']},
 {'_id': '202', 'graph_cluster': ['4']},
 {'_id': '203', 'graph_cluster': ['4']},
 {'_id': '204', 'graph_cluster': ['4']},
 {'_id': '206', 'graph_cluster': ['4']},
 {'_id': '212', 'graph_cluster': ['4']},
 {'_id': '483', 'graph_cluster': ['4']},
 {'_id': '487', 'graph_cluster': ['4']},
 {'_id': '758', 'graph_cluster': ['4']}]

---

In [None]:
# Inspect the chunk-set entry at index 1 of the embedding's data object.
grascco_embedding.data_processing_obj.data_chunk_sets[1]

In [None]:
vector_name = "phrase_vector"


def create_index_document(sentence_embedding_obj: 'embedding_functions.SentenceEmbeddingsFactory.SentenceEmbeddings'):
    """Lazily yield one index document per entry of ``sentence_embeddings``.

    The embedding at position ``p`` is paired with
    ``data_processing_obj.data_chunk_sets[p]``. Each yielded dict holds:

    * ``"_id"``: the position as a string,
    * ``"documents"``: the stringified ``'id'`` of every item in the chunk's ``'doc'``,
    * ``vector_name``: a dict with the chunk's ``'text'`` as ``"content"`` and
      the embedding converted to a list of floats as ``"vector"``.
    """
    for position, vector in enumerate(sentence_embedding_obj.sentence_embeddings):
        # Looked up per item, so nothing is touched when there are no embeddings.
        chunk = sentence_embedding_obj.data_processing_obj.data_chunk_sets[position]
        document = {"_id": str(position)}
        document["documents"] = [str(source['id']) for source in chunk['doc']]
        document[vector_name] = {
            "content": chunk['text'],
            "vector": list(map(float, vector)),
        }
        yield document

In [None]:
# Marqo client for the instance at localhost:8882; used by the cells below
# and by the graph_cluster update loop.
mq = marqo.Client(url="http://localhost:8882")

In [None]:
index_name = "grascco_index"

# Index settings: "no_model" with 1024 dimensions — presumably because the
# vectors are supplied by the client as custom vectors rather than computed
# by Marqo (TODO confirm).
settings = {
    "treatUrlsAndPointersAsImages": False,
    "model": "no_model",
    "modelProperties": {
        "dimensions": 1024,
        "type": "no_model",
    },
    "annParameters": {
        "spaceType": "dotproduct", #prenormalized-angular(default)
        "parameters": {
            "efConstruction": 1024,
            "m": 16
        }
    }
}

# (Re)create the index from scratch.
try:
    mq.create_index(
        index_name=index_name,
        settings_dict=settings,
    )
except marqo.errors.MarqoWebError:
    # The server rejected the request — presumably because the index already
    # exists (TODO confirm via the error code). Drop it and create it again.
    # Only Marqo web errors are handled here, so KeyboardInterrupt and
    # connection problems propagate instead of triggering a delete.
    mq.delete_index(index_name)
    mq.create_index(
        index_name=index_name,
        settings_dict=settings,
    )

In [None]:
# Index every generated document in client-side batches of 128. The
# `vector_name` field is declared as a tensor field and mapped to the
# "custom_vector" type.
mq.index(index_name).add_documents(
    list(create_index_document(grascco_embedding)),
    client_batch_size=128,
    tensor_fields=[vector_name],
    mappings={
        vector_name: {
            "type": "custom_vector"
        }
    },
)

In [None]:
# Fetch document "100" with expose_facets=True (other cells read
# "_tensor_facets" from documents fetched this way).
mq.index("grascco_lokal_test").get_document("100", expose_facets=True)

In [35]:
# Fetch documents "10" and "11" in one call; in the recorded output only
# "10" carries a graph_cluster entry.
mq.index("grascco_lokal_test").get_documents(["10", "11"])

{'errors': False,
 'results': [{'_found': True,
   'graph_cluster': ['2'],
   '_id': '10',
   'phrase': 'handamputation'},
  {'_found': True, '_id': '11', 'phrase': 'li'}]}

In [37]:
# Up to 50 recommendations for document "10" based on the "phrase_vector"
# tensor field; only the "hits" list of the response is displayed.
mq.index("grascco_lokal_test").recommend(
    documents=['10'],
    tensor_fields=["phrase_vector"],
    limit=50,
).get("hits")

[{'graph_cluster': ['2'],
  '_id': '238',
  'phrase': 'handchirurgen',
  '_highlights': [{'phrase_vector': ''}],
  '_score': 359.72576904296875},
 {'_id': '51',
  'phrase': 'abgetrennte hand',
  '_highlights': [{'phrase_vector': ''}],
  '_score': 345.06219482421875},
 {'_id': '55',
  'phrase': 'komplette amputation',
  '_highlights': [{'phrase_vector': ''}],
  '_score': 344.34820556640625},
 {'graph_cluster': ['2'],
  '_id': '20',
  'phrase': 'handreplantaticm',
  '_highlights': [{'phrase_vector': ''}],
  '_score': 337.96893310546875},
 {'graph_cluster': ['2'],
  '_id': '229',
  'phrase': 'handchirurgischer sicht',
  '_highlights': [{'phrase_vector': ''}],
  '_score': 313.2591247558594},
 {'graph_cluster': ['2'],
  '_id': '647',
  'phrase': 'handgelenksfraktur',
  '_highlights': [{'phrase_vector': ''}],
  '_score': 310.511962890625},
 {'graph_cluster': ['2'],
  '_id': '68',
  'phrase': 'hand',
  '_highlights': [{'phrase_vector': ''}],
  '_score': 302.1364440917969},
 {'graph_cluster': 

In [None]:
# Example document "10" including its tensor facets; reused by the
# custom-vector search cell.
ex_doc = mq.index("grascco_lokal_test").get_document("10", expose_facets=True)
ex_doc

In [None]:
# HYBRID search driven by a custom vector: the embedding of ex_doc's first
# tensor facet is the query vector and ex_doc's phrase is the text content.
mq.index("grascco_lokal_test").search(
    q={
        "customVector" : {
            "vector": ex_doc["_tensor_facets"][0]["_embedding"],
            "content": ex_doc["phrase"],
        }
    },
    search_method="HYBRID"
)

In [None]:
# Ids "0" .. "5" as strings.
_ids = list(map(str, range(6)))
# Fetch those documents with their tensor facets, then collect the embedding
# of each document's first facet into a nested Python list (via numpy).
_facet_docs = mq.index(index_name).get_documents(_ids, expose_facets=True)["results"]
np.asarray(
    [doc["_tensor_facets"][0]["_embedding"] for doc in _facet_docs]
).tolist()

In [None]:
# Number of documents currently reported for the `index_name` index.
mq.index(index_name).get_stats()["numberOfDocuments"]

In [None]:
# Fetch document "5180" from the `index_name` index.
mq.index(index_name).get_document("5180")

In [None]:
# `MarqoEmbeddingExternal` is never imported in this notebook; the class
# imported from MarqoExternalUtils is `MarqoEmbeddingStore`.
# Arguments are passed positionally, matching the call that creates `mqs`;
# presumably (client URL, index name) — TODO confirm against MarqoExternalUtils.
marqo_store = MarqoEmbeddingStore(
    "http://localhost:8882",
    "grascco_lokal_test",
)

In [None]:
# LEXICAL search for the term 'psychose' in the grascco_lokal_test index.
mq.index("grascco_lokal_test").search(
    q='psychose',
    search_method="LEXICAL",
)