In [1]:
import marqo
import sys
import pathlib
import pickle

In [2]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [3]:
sys.path.insert(0, "src")
import data_functions
import embedding_functions
import cluster_functions
import graph_functions
import util_functions
from MarqoEmbedding import MarqoEmbeddingExternal

  from tqdm.autonotebook import tqdm





In [4]:
# Common prefix for every artifact of this experiment; also the cache sub-directory name.
base_name = "grascco_lokal_test"


def path_name(x: str) -> str:
    """Return the relative pickle path for one artifact of the current experiment.

    Args:
        x: Artifact suffix, e.g. "data", "embedding", "vec" or "graph".

    Returns:
        A path of the form ``./tmp/<base_name>/<base_name>_<x>.pickle``.
        ``base_name`` is read at call time, so rebinding it later changes the result.
    """
    return f"./tmp/{base_name}/{base_name}_{x}.pickle"

In [5]:
# Load the data-processing object from the file at path_name("data")
# using the project's DataProcessingFactory.
grascco_data = data_functions.DataProcessingFactory.load(
    file_path=path_name("data"),
)

In [22]:
# Create sentence embeddings for `grascco_data` with the "Sahajtomar/German-semantic" model
# and store them through the "vector_store" storage method.
# The cache directory is the parent folder of the "embedding" path; the cache name is that
# path's file stem with everything from the last underscore onwards removed.
grascco_embedding = embedding_functions.SentenceEmbeddingsFactory.create(
    data_obj=grascco_data,
    cache_path=pathlib.Path(path_name("embedding")).parent,
    cache_name=pathlib.Path(path_name("embedding")).stem.rpartition("_")[0],
    model_name="Sahajtomar/German-semantic",
    storage_method=("vector_store", {}),
    vectorstore_normalizeEmbeddings=False,
    vectorstore_annParameters={
        # "dotproduct" is chosen explicitly; the default would be prenormalized-angular.
        "spaceType": "dotproduct",
        "parameters": {"efConstruction": 1024, "m": 16},
    },
)

INFO:root:Creating Sentence Embedding with 'None'
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: Sahajtomar/German-semantic
Batches: 100%|██████████| 28/28 [00:10<00:00,  2.74it/s]
2025-05-17 13:11:22,943 logger:'marqo' INFO     add_documents batch 0: took 2.506s for Marqo to process & index 128 docs. Roundtrip time: 2.690s.
INFO:marqo:    add_documents batch 0: took 2.506s for Marqo to process & index 128 docs. Roundtrip time: 2.690s.
2025-05-17 13:11:23,730 logger:'marqo' INFO     add_documents batch 1: took 0.582s for Marqo to process & index 128 docs. Roundtrip time: 0.769s.
INFO:marqo:    add_documents batch 1: took 0.582s for Marqo to process & index 128 docs. Roundtrip time: 0.769s.
2025-05-17 13:11:24,486 logger:'marqo' INFO     add_documents batch 2: took 0.585s for Marqo to process & index 128 docs. Roundtrip time: 0.764s.
INFO:marqo:    add_documents batch 2: took

Saved under: C:\Users\fra3066mat\PycharmProjects\concept-graphs\tmp\grascco_lokal_test\grascco_lokal_test.pickle


In [8]:
# Load an embedding object through the factory's `load`, pointing it at the "data" and "vec"
# paths and selecting the "vector_store" storage method with an empty options dict.
grascco_embedding_from_vec = embedding_functions.SentenceEmbeddingsFactory.load(
    data_obj_path=path_name("data"),
    embeddings_path=path_name("vec"),
    storage_method=("vector_store", {},),
)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: Sahajtomar/German-semantic


In [9]:
# Display the embeddings of the object loaded via the "vec" path, for comparison with
# `grascco_embedding.sentence_embeddings`.
grascco_embedding_from_vec.sentence_embeddings

array([[ 0.37018681,  0.31832868,  0.08824699, ...,  0.09354548,
         0.56552267, -0.23733595],
       [-0.19035454,  0.17692763,  0.29991016, ..., -0.71456957,
        -0.16699083, -0.54000551],
       [-0.79879856,  0.94766831,  0.06038243, ..., -0.07941884,
         0.19758023, -0.34084257],
       ...,
       [ 0.16397977,  0.25955284,  0.69989491, ..., -0.23332289,
        -0.23595901,  0.13436836],
       [ 1.63289261, -0.31345105, -0.57402569, ..., -0.48689675,
        -0.37795183, -2.13355231],
       [ 1.07342374,  0.44288728, -0.42544514, ..., -0.72875023,
         0.76730382, -0.42274478]])

In [8]:
# Display the embeddings of `grascco_embedding`, for comparison with
# `grascco_embedding_from_vec.sentence_embeddings`.
grascco_embedding.sentence_embeddings

array([[ 0.3701868 ,  0.31832868,  0.08824699, ...,  0.09354548,
         0.5655227 , -0.23733595],
       [-0.19035454,  0.17692763,  0.29991016, ..., -0.71456957,
        -0.16699083, -0.5400055 ],
       [-0.79879856,  0.9476683 ,  0.06038243, ..., -0.07941884,
         0.19758023, -0.34084257],
       ...,
       [ 0.16397977,  0.25955284,  0.6998949 , ..., -0.23332289,
        -0.23595901,  0.13436836],
       [ 1.6328926 , -0.31345105, -0.5740257 , ..., -0.48689675,
        -0.37795183, -2.1335523 ],
       [ 1.0734237 ,  0.44288728, -0.42544514, ..., -0.7287502 ,
         0.7673038 , -0.42274478]], dtype=float32)

In [None]:
# Load the embedding object from the "data" and "embedding" pickle paths (passed positionally
# as pathlib.Path objects).
# NOTE(review): this rebinds `grascco_embedding`, replacing the object produced by the
# `SentenceEmbeddingsFactory.create(...)` cell — confirm that only one of the two cells is meant to run.
grascco_embedding = embedding_functions.SentenceEmbeddingsFactory.load(
    pathlib.Path(path_name("data")),
    pathlib.Path(path_name("embedding"))
)

In [None]:
# Unpickle the graph object stored at path_name("graph").
# The context manager guarantees the file handle is closed, even if unpickling raises.
# NOTE: pickle can execute arbitrary code while loading — only open files produced by
# this project on a trusted machine.
with open(path_name("graph"), "rb") as graph_file:
    grascco_graph = pickle.load(graph_file)

In [None]:
# Inspect the chunk-set entry at index 1 of the data-processing object attached to the embedding.
grascco_embedding.data_processing_obj.data_chunk_sets[1]

In [None]:
# Name of the document field that carries the custom vector.
vector_name = "phrase_vector"


def create_index_document(sentence_embedding_obj: 'embedding_functions.SentenceEmbeddingsFactory.SentenceEmbeddings'):
    """Lazily yield one index document per embedding row.

    For the row at position ``n`` the chunk entry ``data_chunk_sets[n]`` of the attached
    data-processing object is looked up, and a dict is produced with:

    * ``"_id"``        - the row position as a string,
    * ``"documents"``  - the stringified ``'id'`` of every entry in the chunk's ``'doc'`` list,
    * ``vector_name``  - a mapping holding the chunk ``'text'`` as ``"content"`` and the
      embedding values converted to Python floats as ``"vector"``.

    Args:
        sentence_embedding_obj: Object exposing ``sentence_embeddings`` (iterable of numeric
            rows) and ``data_processing_obj.data_chunk_sets`` (indexable by row position).

    Yields:
        dict: One document per embedding row, in row order.
    """
    for position, row in enumerate(sentence_embedding_obj.sentence_embeddings):
        chunk = sentence_embedding_obj.data_processing_obj.data_chunk_sets[position]
        source_ids = [str(entry['id']) for entry in chunk['doc']]
        # A separate top-level "phrase" field holding chunk['text'] is currently disabled.
        vector_payload = {
            "content": chunk['text'],
            "vector": list(map(float, row)),
        }
        yield {
            "_id": str(position),
            "documents": source_ids,
            vector_name: vector_payload,
        }

In [8]:
# Marqo client for the server at http://localhost:8882; the index and search cells use this `mq`.
# NOTE(review): the URL is hard-coded — consider a configuration constant.
mq = marqo.Client(url="http://localhost:8882")

In [None]:
# (Re)create the Marqo index that receives the pre-computed phrase vectors.
index_name = "grascco_index"
settings = {
    "treatUrlsAndPointersAsImages": False,
    # "no_model": Marqo does not embed anything itself; vectors are supplied by the client.
    "model": "no_model",
    "modelProperties": {
        # NOTE(review): must equal the length of the uploaded vectors — confirm against the model.
        "dimensions": 1024,
        "type": "no_model",
    },
    "annParameters": {
        # "dotproduct" is chosen explicitly; the default would be prenormalized-angular.
        "spaceType": "dotproduct",
        "parameters": {
            "efConstruction": 1024,
            "m": 16
        }
    }
}
try:
    mq.create_index(
        index_name=index_name,
        settings_dict=settings,
    )
except Exception:
    # Creation failed — the expected cause is that the index already exists, so drop it and
    # start from a clean index. `Exception` (instead of a bare `except:`) lets
    # KeyboardInterrupt/SystemExit propagate, so interrupting the cell can no longer
    # trigger the index deletion.
    # NOTE(review): consider narrowing this further to the client's "index already exists" error.
    mq.delete_index(index_name)
    mq.create_index(
        index_name=index_name,
        settings_dict=settings,
    )

In [None]:
# Materialise every document generated from `grascco_embedding` and upload it to the index
# in client-side batches of 128. `vector_name` is the only tensor field and is mapped to the
# "custom_vector" type, i.e. the vectors come from the documents themselves.
mq.index(index_name).add_documents(
    list(create_index_document(grascco_embedding)),
    client_batch_size=128,
    tensor_fields=[vector_name],
    mappings={
        vector_name: {
            "type": "custom_vector"
        }
    },
)

In [17]:
# Fetch document "100" together with its tensor facets (which include the stored embedding).
# NOTE(review): the index name is hard-coded as "grascco_lokal_test" rather than `index_name`
# ("grascco_index") — confirm which index is intended.
mq.index("grascco_lokal_test").get_document("100", expose_facets=True)

{'_id': '100',
 '_tensor_facets': [{'phrase_vector': '',
   '_embedding': [1.2465311288833618,
    0.0051830909214913845,
    -0.4766988754272461,
    -0.7763726711273193,
    -0.4204809367656708,
    -0.3822985887527466,
    0.6792799830436707,
    -1.2419755458831787,
    0.15514884889125824,
    0.892401933670044,
    -0.25824201107025146,
    0.11779597401618958,
    -0.3055914342403412,
    0.1317112147808075,
    1.6687474250793457,
    -0.22460587322711945,
    0.2978264093399048,
    -0.7987778186798096,
    -0.7297893762588501,
    1.055659532546997,
    0.031436167657375336,
    -0.7863857746124268,
    0.5765108466148376,
    0.3118683099746704,
    0.07113844156265259,
    -0.040275249630212784,
    -0.8426237106323242,
    0.272452175617218,
    0.23435313999652863,
    0.7937514781951904,
    -0.7250549793243408,
    -0.519151508808136,
    -0.0374252013862133,
    0.678233802318573,
    -0.2907734811306,
    0.4647919833660126,
    0.5209764838218689,
    -0.929605901241

In [16]:
# Request up to 50 recommendations based on document "100", restricted to the
# "phrase_vector" tensor field.
mq.index("grascco_lokal_test").recommend(
    documents=['100'],
    tensor_fields=["phrase_vector"],
    limit=50,
)

{'hits': [{'_id': '101',
   '_highlights': [{'phrase_vector': ''}],
   '_score': 0.6402169712938903},
  {'_id': '819',
   '_highlights': [{'phrase_vector': ''}],
   '_score': 0.6302630550269199},
  {'_id': '223',
   '_highlights': [{'phrase_vector': ''}],
   '_score': 0.629973271164629},
  {'_id': '91',
   '_highlights': [{'phrase_vector': ''}],
   '_score': 0.6283698861084067},
  {'_id': '110',
   '_highlights': [{'phrase_vector': ''}],
   '_score': 0.6126386125508531},
  {'_id': '308',
   '_highlights': [{'phrase_vector': ''}],
   '_score': 0.6114135187540254},
  {'_id': '512',
   '_highlights': [{'phrase_vector': ''}],
   '_score': 0.6101491259212647},
  {'_id': '796',
   '_highlights': [{'phrase_vector': ''}],
   '_score': 0.6089959687370734},
  {'_id': '266',
   '_highlights': [{'phrase_vector': ''}],
   '_score': 0.6086899532639627},
  {'_id': '575',
   '_highlights': [{'phrase_vector': ''}],
   '_score': 0.6064146705173943},
  {'_id': '143',
   '_highlights': [{'phrase_vector': 

In [24]:
# Fetch document "10" with its tensor facets and keep it as `ex_doc`; the hybrid-search cell
# reads its embedding and "phrase". The trailing bare name displays the document.
ex_doc = mq.index("grascco_lokal_test").get_document("10", expose_facets=True)
ex_doc

{'_id': '10',
 'phrase': 'handamputation',
 '_tensor_facets': [{'phrase_vector': '',
   '_embedding': [0.4533662497997284,
    0.09949952363967896,
    -0.5357570648193359,
    -0.03132299706339836,
    -0.9523380398750305,
    -0.16063173115253448,
    -0.9059784412384033,
    -0.7324122786521912,
    -0.9460609555244446,
    0.5539281964302063,
    -0.9374650120735168,
    0.5457251071929932,
    -1.1647920608520508,
    -0.680462658405304,
    0.3145199120044708,
    -1.1129482984542847,
    1.0397886037826538,
    -1.0934596061706543,
    0.16691578924655914,
    -0.5487030148506165,
    0.41993510723114014,
    0.36982718110084534,
    -0.7017555832862854,
    1.1234169006347656,
    -0.21901510655879974,
    0.6956271529197693,
    -0.8515775799751282,
    -1.0247939825057983,
    0.15074367821216583,
    -0.06135685369372368,
    -1.2718614339828491,
    -1.2504996061325073,
    1.5353994369506836,
    -0.21241247653961182,
    0.2173781543970108,
    0.4966448247432709,
    0.2

In [25]:
# Hybrid search with a custom-vector query: the vector is the stored embedding of `ex_doc`
# (first tensor facet) and the content is its "phrase" text.
mq.index("grascco_lokal_test").search(
    q={
        "customVector" : {
            "vector": ex_doc["_tensor_facets"][0]["_embedding"],
            "content": ex_doc["phrase"],
        }
    },
    search_method="HYBRID"
)

{'hits': [{'_id': '10',
   'phrase': 'handamputation',
   '_tensor_score': 491.0687255859375,
   '_lexical_score': 7.227573969880332,
   '_highlights': [{'phrase_vector': ''}],
   '_score': 0.01639344262295082},
  {'_id': '238',
   'phrase': 'handchirurgen',
   '_tensor_score': 359.72576904296875,
   '_highlights': [{'phrase_vector': ''}],
   '_score': 0.008064516129032258},
  {'_id': '51',
   'phrase': 'abgetrennte hand',
   '_tensor_score': 345.06219482421875,
   '_highlights': [{'phrase_vector': ''}],
   '_score': 0.007936507936507936},
  {'_id': '55',
   'phrase': 'komplette amputation',
   '_tensor_score': 344.34820556640625,
   '_highlights': [{'phrase_vector': ''}],
   '_score': 0.0078125},
  {'_id': '20',
   'phrase': 'handreplantaticm',
   '_tensor_score': 337.96893310546875,
   '_highlights': [{'phrase_vector': ''}],
   '_score': 0.007692307692307693},
  {'_id': '229',
   'phrase': 'handchirurgischer sicht',
   '_tensor_score': 313.2591247558594,
   '_highlights': [{'phrase_v

In [None]:
# Read the stored embeddings of documents "0".."5" back from the index and show them as
# nested Python lists (first tensor facet of each returned document).
_ids = list(map(str, range(6)))
np.asarray([
    fetched["_tensor_facets"][0]["_embedding"]
    for fetched in mq.index(index_name).get_documents(_ids, expose_facets=True)["results"]
]).tolist()

In [None]:
# Show the "numberOfDocuments" value from the index statistics.
mq.index(index_name).get_stats()["numberOfDocuments"]

In [None]:
# Look up the document with id "5180" (tensor facets are not requested).
mq.index(index_name).get_document("5180")

In [11]:
# Construct a MarqoEmbeddingExternal for the local Marqo server and the
# "grascco_lokal_test" index.
marqo_store = MarqoEmbeddingExternal(
    client_url="http://localhost:8882",
    index_name="grascco_lokal_test"
)

In [9]:
# Lexical (keyword) search for the term 'psychose' in the "grascco_lokal_test" index.
mq.index("grascco_lokal_test").search(
    q='psychose',
    search_method="LEXICAL",
)

{'hits': [{'_id': '12',
   'phrase': 'akute psychose',
   '_score': 5.435213741645274,
   '_highlights': []}],
 'query': 'psychose',
 'limit': 10,
 'offset': 0,
 'processingTimeMs': 8}