In [1]:
import marqo
import sys
import pathlib
import pickle

In [2]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Put the relative directory "src" at the front of the module search path so the
# project-local modules below can be imported.
# NOTE(review): the path is relative, so this presumably requires the kernel's
# working directory to be the repository root — confirm.
sys.path.insert(0, "src")
import data_functions
import embedding_functions
import cluster_functions
import graph_functions
import util_functions

  from tqdm.autonotebook import tqdm





In [4]:
# Common prefix for every artifact (directory name and file-name stem) of this run.
base_name = "grascco_lokal_test"


def path_name(x):
    """Return the pickle path of artifact ``x`` for the current ``base_name``.

    Args:
        x: Artifact suffix appended to the file stem, e.g. ``"data"``,
            ``"embedding"``, ``"vec"`` or ``"graph"``.

    Returns:
        str: A relative path of the form
        ``./tmp/<base_name>/<base_name>_<x>.pickle``.
    """
    # ``base_name`` is resolved at call time, exactly as the former lambda did.
    return f"./tmp/{base_name}/{base_name}_{x}.pickle"

In [6]:
# Load the data-processing object from its pickle path.
_data_pickle = path_name("data")
grascco_data = data_functions.DataProcessingFactory.load(file_path=_data_pickle)

In [7]:
# Create sentence embeddings for the loaded data. The cache directory and the cache
# name are both derived from the "embedding" pickle path.
_embedding_pickle = pathlib.Path(path_name("embedding"))
# Drop the last "_"-separated token of the file stem (the artifact suffix).
_cache_name_tokens = _embedding_pickle.stem.split("_")[:-1]
grascco_embedding = embedding_functions.SentenceEmbeddingsFactory.create(
    data_obj=grascco_data,
    cache_path=_embedding_pickle.parent,
    cache_name="_".join(_cache_name_tokens),
    model_name="Sahajtomar/German-semantic",
    storage_method=("vector_store", {}),
)

INFO:root:Creating Sentence Embedding with 'None'
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: Sahajtomar/German-semantic
Batches: 100%|██████████| 28/28 [00:10<00:00,  2.64it/s]
2025-05-16 13:25:47,697 logger:'marqo' INFO     add_documents batch 0: took 2.845s for Marqo to process & index 128 docs. Roundtrip time: 3.026s.
INFO:marqo:    add_documents batch 0: took 2.845s for Marqo to process & index 128 docs. Roundtrip time: 3.026s.
2025-05-16 13:25:48,555 logger:'marqo' INFO     add_documents batch 1: took 0.667s for Marqo to process & index 128 docs. Roundtrip time: 0.852s.
INFO:marqo:    add_documents batch 1: took 0.667s for Marqo to process & index 128 docs. Roundtrip time: 0.852s.
2025-05-16 13:25:49,433 logger:'marqo' INFO     add_documents batch 2: took 0.705s for Marqo to process & index 128 docs. Roundtrip time: 0.886s.
INFO:marqo:    add_documents batch 2: took

Saved under: C:\Users\fra3066mat\PycharmProjects\concept-graphs\tmp\grascco_lokal_test\grascco_lokal_test.pickle


In [14]:
# Load an embedding object from the "data" and "vec" artifacts, using the
# "vector_store" storage method with an empty option dict.
_vec_load_kwargs = {
    "data_obj_path": path_name("data"),
    "embeddings_path": path_name("vec"),
    "storage_method": ("vector_store", {}),
}
grascco_embedding_from_vec = embedding_functions.SentenceEmbeddingsFactory.load(
    **_vec_load_kwargs
)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: Sahajtomar/German-semantic


In [16]:
# Display the embeddings of the object loaded via the vector store.
# NOTE(review): the recorded values differ from `grascco_embedding.sentence_embeddings`
# by a per-row factor, which looks like the store hands back normalized vectors — confirm.
grascco_embedding_from_vec.sentence_embeddings

array([[ 0.01791377,  0.0154043 ,  0.00427037, ...,  0.00452677,
         0.0273663 , -0.01148496],
       [-0.00875529,  0.00813772,  0.01379426, ..., -0.03286636,
        -0.00768068, -0.02483735],
       [-0.03856981,  0.04575796,  0.00291555, ..., -0.00383472,
         0.00954012, -0.01645751],
       ...,
       [ 0.00778285,  0.01231896,  0.03321858, ..., -0.01107403,
        -0.01119914,  0.00637742],
       [ 0.07469263, -0.01433804, -0.02625738, ..., -0.02227189,
        -0.01728847, -0.09759407],
       [ 0.05090078,  0.02100131, -0.02017422, ..., -0.03455668,
         0.03638485, -0.02004617]])

In [8]:
# Display the embeddings of the object created in this session, for comparison with
# the ones read back from the vector store.
grascco_embedding.sentence_embeddings

array([[ 0.3701868 ,  0.31832868,  0.08824699, ...,  0.09354548,
         0.5655227 , -0.23733595],
       [-0.19035454,  0.17692763,  0.29991016, ..., -0.71456957,
        -0.16699083, -0.5400055 ],
       [-0.79879856,  0.9476683 ,  0.06038243, ..., -0.07941884,
         0.19758023, -0.34084257],
       ...,
       [ 0.16397977,  0.25955284,  0.6998949 , ..., -0.23332289,
        -0.23595901,  0.13436836],
       [ 1.6328926 , -0.31345105, -0.5740257 , ..., -0.48689675,
        -0.37795183, -2.1335523 ],
       [ 1.0734237 ,  0.44288728, -0.42544514, ..., -0.7287502 ,
         0.7673038 , -0.42274478]], dtype=float32)

In [None]:
# Load the embedding object from the "data" and "embedding" pickle paths.
# Note: this rebinds `grascco_embedding`, replacing whatever it referred to before.
_data_path = pathlib.Path(path_name("data"))
_embedding_path = pathlib.Path(path_name("embedding"))
grascco_embedding = embedding_functions.SentenceEmbeddingsFactory.load(
    _data_path,
    _embedding_path,
)

In [None]:
# Load the pickled graph object. The context manager closes the file handle as soon
# as loading finishes (a bare `open(...)` passed to `pickle.load` is never closed
# explicitly).
# SECURITY: unpickling can execute arbitrary code — only load pickles from a trusted source.
with open(path_name("graph"), "rb") as _graph_file:
    grascco_graph = pickle.load(_graph_file)

In [None]:
# Inspect the entry at index 1 of the data-processing object's chunk sets.
grascco_embedding.data_processing_obj.data_chunk_sets[1]

In [None]:
# Name of the document field that carries the custom vector in the index.
vector_name = "phrase_vector"


def create_index_document(
    sentence_embedding_obj: 'embedding_functions.SentenceEmbeddingsFactory.SentenceEmbeddings',
    vector_field: 'str | None' = None,
):
    """Yield one index document per sentence embedding.

    Args:
        sentence_embedding_obj: Object exposing ``sentence_embeddings`` (an iterable
            of vectors) and ``data_processing_obj.data_chunk_sets`` (indexable by the
            position of each embedding; every entry provides ``'doc'`` and ``'text'``).
        vector_field: Name of the field that holds the custom vector. When omitted,
            the module-level ``vector_name`` is used.

    Yields:
        dict: ``{"_id": <position as str>, "documents": [<doc ids as str>],
        <vector_field>: {"content": <chunk text>, "vector": [<floats>]}}``.

    Raises:
        IndexError: If there are more embeddings than chunk-set entries
            (when ``data_chunk_sets`` is a list).
    """
    field = vector_name if vector_field is None else vector_field
    for position, vector in enumerate(sentence_embedding_obj.sentence_embeddings):
        chunk = sentence_embedding_obj.data_processing_obj.data_chunk_sets[position]
        yield {
            "_id": str(position),
            "documents": [str(doc['id']) for doc in chunk['doc']],
            field: {
                "content": chunk['text'],
                # Cast every component to a built-in float (presumably so numpy
                # scalars become serializable for the client — confirm).
                "vector": [float(value) for value in vector],
            },
        }

In [None]:
# Client for the Marqo instance expected at localhost:8882.
_marqo_url = "http://localhost:8882"
mq = marqo.Client(url=_marqo_url)

In [None]:
index_name = "grascco_index"
# Index settings: no embedding model on the Marqo side ("no_model"); the vectors are
# supplied by this notebook as custom vectors.
settings = {
    "treatUrlsAndPointersAsImages": False,
    "model": "no_model",
    "modelProperties": {
        # NOTE(review): presumably has to equal the length of the uploaded vectors — confirm.
        "dimensions": 1024,
        "type": "no_model",
    },
    "annParameters": {
        "spaceType": "dotproduct", #prenormalized-angular(default)
        "parameters": {
            "efConstruction": 1024,
            "m": 16
        }
    }
}
# (Re-)create the index. Only errors reported by the Marqo server are treated as
# "index already exists"; connection problems, KeyboardInterrupt etc. now propagate
# instead of triggering a delete.
# WARNING: the fallback deletes the existing index together with all its documents.
try:
    mq.create_index(
        index_name=index_name,
        settings_dict=settings,
    )
except marqo.errors.MarqoWebError:
    mq.delete_index(index_name)
    mq.create_index(
        index_name=index_name,
        settings_dict=settings,
    )

In [None]:
# Materialize all documents first, then upload them in client-side batches of 128.
# The vector field is declared as a tensor field of type "custom_vector".
_index_documents = list(create_index_document(grascco_embedding))
_custom_vector_mappings = {vector_name: {"type": "custom_vector"}}
mq.index(index_name).add_documents(
    _index_documents,
    client_batch_size=128,
    tensor_fields=[vector_name],
    mappings=_custom_vector_mappings,
)

In [None]:
# Fetch document "100" with its tensor facets and show the first facet's embedding
# as a numpy array.
_doc_100_with_facets = mq.index(index_name).get_document("100", expose_facets=True)
_first_facet = _doc_100_with_facets["_tensor_facets"][0]
np.asarray(_first_facet["_embedding"])

In [None]:

# Request up to 50 recommendations for document "100", based on the vector field.
_grascco_index = mq.index(index_name)
_grascco_index.recommend(documents=['100'], tensor_fields=[vector_name], limit=50)

In [None]:
# Pick the first tensor facet of document "100" that carries the vector field.
# The field name comes from `vector_name` (value "phrase_vector") rather than being
# repeated as a literal, and the search stops at the first match instead of building
# the whole list.
_doc_100_facets = mq.index(index_name).get_document("100", expose_facets=True)["_tensor_facets"]
doc_100 = next(
    (facet for facet in _doc_100_facets if facet.get(vector_name, False)),
    None,
)
if doc_100 is None:
    # Same exception type as indexing an empty list, but with an explanatory message.
    raise IndexError(f"document '100' has no tensor facet for field {vector_name!r}")
doc_100

In [None]:
# Query the index with document 100's own embedding and content as a custom vector,
# using the "HYBRID" search method.
_custom_vector_query = {
    "customVector": {
        "vector": doc_100["_embedding"],
        "content": doc_100["phrase_vector"],
    }
}
mq.index(index_name).search(q=_custom_vector_query, search_method="HYBRID")

In [None]:
# Fetch documents "0".."5" with their tensor facets and list the embedding of the
# first facet of each as nested Python lists.
_ids = list(map(str, range(6)))
_fetched_docs = mq.index(index_name).get_documents(_ids, expose_facets=True)["results"]
_first_embeddings = [_res["_tensor_facets"][0]["_embedding"] for _res in _fetched_docs]
np.asarray(_first_embeddings).tolist()

In [None]:
# Show the "numberOfDocuments" entry of the index statistics.
mq.index(index_name).get_stats()["numberOfDocuments"]

In [None]:
# Look up the document with id "5180" (tensor facets are not requested here).
mq.index(index_name).get_document("5180")