In [17]:
import os
import sys

# Put the parent directory first on the import path so that packages living
# one level above the current working directory can be imported below.
project_root = os.path.abspath("..")
sys.path.insert(0, project_root)

In [18]:
import glob

# Root folder of the rules corpus, relative to the current working directory.
folder = "../data/DND rules"

# Collect every Markdown file below `folder`, at any depth.
# glob returns matches in arbitrary (filesystem-dependent) order, so the list
# is sorted to make the split order and the upload batches reproducible
# between runs and machines.
md_files = sorted(glob.glob(f"{folder}/**/*.md", recursive=True))

In [19]:
# Histogram of path depth: maps the number of "/"-separated components in a
# file path to how many files have that depth.
test = {}
for depth in (len(path.split("/")) for path in md_files):
    test[depth] = test.get(depth, 0) + 1

In [20]:
# Sanity check: number of Markdown files matched by the glob.
len(md_files)

1028

In [21]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

# Markdown header prefixes and the metadata key under which the splitter
# stores the matching header text of each chunk.
headers_to_split_on = [
    ("#", "h1"),
    ("##", "h2"),
    ("###", "h3"),
    ("####", "h4"),
    ("#####", "h5"),
    ("######", "h6"),
    ("#######", "h7"),
]
paragraphs = []
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Files whose path contains one of these names are not indexed.
skipped_names = ("Changelog.md", "README.md", "Legal.md")

# Flat list of every chunk produced from every file. Each chunk's page_content
# is prefixed with a plain-text header of "key: value" lines followed by a
# blank line, so the category and header trail are part of the embedded text.
all_splits = []
for file in md_files:
    if any(name in file for name in skipped_names):
        continue

    # Path components; the extension is stripped once per file (it used to be
    # re-done for every chunk) and only as a suffix.
    splits = file.split("/")
    splits[-1] = splits[-1].removesuffix(".md")

    # NOTE(review): with folder = "../data/DND rules", component 2 is the root
    # folder name itself, so "Category" is the same for every file and
    # "Sub category" is the first directory (or file name) below the root.
    # Confirm this is intended before relying on these fields for filtering.
    category = splits[2]
    sub_category = splits[3] if len(splits) > 3 else None

    # Explicit encoding so decoding does not depend on the platform locale.
    with open(file, "r", encoding="utf-8") as f:
        md_header_splits = markdown_splitter.split_text(f.read())

    for text in md_header_splits:
        # Header trail produced by the splitter, captured before the category
        # keys are added to the metadata so they are not listed twice.
        header_lines = [f"{k}: {v}" for k, v in text.metadata.items()]

        text.metadata["Category"] = category
        txt_metadata = [f"Category: {category}"]
        if sub_category is not None:
            text.metadata["Sub category"] = sub_category
            # Only emitted when present: an empty placeholder here would put a
            # stray blank line in the middle of the header.
            txt_metadata.append(f"Sub category: {sub_category}")
        txt_metadata.extend(header_lines)

        # Joined outside the f-string: a backslash inside an f-string
        # expression is a SyntaxError before Python 3.12.
        metadata_header = "\n".join(txt_metadata)
        text.page_content = f"{metadata_header}\n\n{text.page_content}"

    all_splits.extend(md_header_splits)

In [22]:
# Infomaniak AI API settings.
# The API token must never be committed in a notebook: it is read from the
# environment instead. The token that was previously hardcoded here should be
# considered leaked and be revoked.
# NOTE(review): the environment variable names below were chosen for this
# notebook; align them with whatever the rest of the project reads.
TOKEN = os.environ.get("INFOMANIAK_API_TOKEN", "")
PRODUCT_ID = os.environ.get("INFOMANIAK_PRODUCT_ID", "101420")
URL = f"https://api.infomaniak.com/1/ai/{PRODUCT_ID}/openai/v1"

In [23]:
from common.infomaniak.ik_embeddings import IKEmbeddings

# Name of the embedding model to use. "mini_lm_l12_v2" is the alternative
# value that was noted next to this setting.
model = "bge_multilingual_gemma2"

# The index name embeds the model name, so each model gets its own index.
index = "dnd_rules_" + model

# Embedding client used later when building the vector store.
embeder = IKEmbeddings(model=model)

In [24]:
import os

import meilisearch
from langchain.vectorstores import Meilisearch

# Meilisearch connection settings come from the environment, with local
# defaults. The same checks as in `setup.py` can be used to detect missing
# environment variables.
# NOTE(review): the fallback API key is hardcoded; consider requiring
# MEILI_API_KEY to be set instead.
meili_addr = os.environ.get("MEILI_HTTP_ADDR", "http://localhost:7700")
meili_api_key = os.environ.get("MEILI_API_KEY", "2AB8F223884CE")

# Client shared by the cells below (index deletion, vector store).
client = meilisearch.Client(url=meili_addr, api_key=meili_api_key)

In [26]:
# Drop the index named by `index` before documents are added to it again below.
# The call only enqueues the deletion: the recorded output is a TaskInfo with
# status='enqueued' and type='indexDeletion'.
client.delete_index(index)

TaskInfo(task_uid=0, index_uid='dnd_rules_bge_multilingual_gemma2', status='enqueued', type='indexDeletion', enqueued_at=datetime.datetime(2025, 2, 15, 17, 23, 5, 425377))

In [27]:
from requests import patch

# Turn on the `vectorStore` experimental feature on the Meilisearch server.
# The base URL is read from MEILI_HTTP_ADDR, exactly like `client`, so the flag
# is set on the server that is actually indexed rather than always on
# localhost. A trailing "/" is dropped to avoid a double slash in the path.
meili_base_url = os.environ.get("MEILI_HTTP_ADDR", "http://localhost:7700").rstrip("/")

# The timeout keeps the cell from hanging forever if the server is unreachable.
# The raw response body is the cell's output so the resulting feature flags
# can be inspected.
patch(
    f"{meili_base_url}/experimental-features",
    headers={
        "Authorization": f"Bearer {os.environ.get('MEILI_API_KEY', '2AB8F223884CE')}"
    },
    json={"vectorStore": True},
    timeout=10,
).text

'{"vectorStore":true,"metrics":false,"logsRoute":false,"editDocumentsByFunction":false,"containsFilter":false}'

In [28]:
from requests import get

# Ask the Meilisearch server for its version.
# The base URL is read from MEILI_HTTP_ADDR, exactly like `client`, so the
# version shown is that of the server that is actually indexed rather than
# always localhost. A trailing "/" is dropped to avoid a double slash.
meili_base_url = os.environ.get("MEILI_HTTP_ADDR", "http://localhost:7700").rstrip("/")

# The timeout keeps the cell from hanging forever if the server is unreachable.
get(
    f"{meili_base_url}/version",
    headers={
        "Authorization": f"Bearer {os.environ.get('MEILI_API_KEY', '2AB8F223884CE')}"
    },
    timeout=10,
).text

'{"commitSha":"876084d48004e04ac0720cb064915d53c4743a1a","commitDate":"2025-01-30T10:52:23.000000000Z","pkgVersion":"1.12.8"}'

In [29]:
# Embedder configuration passed to the vector store: a single embedder whose
# source is "userProvided".
# NOTE(review): 3584 presumably is the vector size of the embedding model
# selected above; confirm and update it whenever the model changes.
embedder_name = "custom"
embedders = {embedder_name: {"source": "userProvided", "dimensions": 3584}}

vector_store = Meilisearch(
    client=client,
    embedding=embeder,
    index_name=index,
    embedders=embedders,
)

# Upload the chunks in fixed-size slices instead of in one single call.
batch_size = 100
for start in range(0, len(all_splits), batch_size):
    batch = all_splits[start : start + batch_size]
    vector_store.add_documents(
        batch, embedding=embeder, embedder_name=embedder_name
    )

100%|██████████| 1/1 [00:02<00:00,  2.60s/it]
100%|██████████| 1/1 [00:02<00:00,  2.77s/it]
100%|██████████| 1/1 [00:05<00:00,  5.10s/it]
100%|██████████| 1/1 [00:03<00:00,  3.48s/it]
100%|██████████| 1/1 [00:03<00:00,  3.19s/it]
100%|██████████| 1/1 [00:02<00:00,  2.61s/it]
100%|██████████| 1/1 [00:02<00:00,  2.63s/it]
100%|██████████| 1/1 [00:03<00:00,  3.04s/it]
100%|██████████| 1/1 [00:03<00:00,  3.15s/it]
100%|██████████| 1/1 [00:02<00:00,  2.83s/it]
100%|██████████| 1/1 [00:02<00:00,  2.05s/it]
100%|██████████| 1/1 [00:02<00:00,  2.94s/it]
100%|██████████| 1/1 [00:02<00:00,  2.81s/it]
100%|██████████| 1/1 [00:02<00:00,  2.97s/it]
100%|██████████| 1/1 [00:02<00:00,  2.95s/it]
100%|██████████| 1/1 [00:02<00:00,  2.89s/it]
100%|██████████| 1/1 [00:02<00:00,  2.63s/it]
100%|██████████| 1/1 [00:02<00:00,  2.77s/it]
100%|██████████| 1/1 [00:02<00:00,  2.66s/it]
100%|██████████| 1/1 [00:02<00:00,  2.07s/it]
100%|██████████| 1/1 [00:02<00:00,  2.01s/it]
100%|██████████| 1/1 [00:02<00:00,

In [30]:
# Show the name of the index the documents were written to.
index

'dnd_rules_bge_multilingual_gemma2'