In [None]:
import os
import sys

# Put the parent of the current working directory at the front of the import
# path so that modules living one level above this notebook can be imported.
# The membership check keeps the cell idempotent: re-running it does not pile
# up duplicate entries in sys.path.
parent_dir = os.path.abspath("..")
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

In [None]:
import glob

# Root folder that is scanned for markdown files.
folder = "../data/DND rules"

# Recursively collect every markdown file below `folder`.
# glob returns matches in arbitrary (filesystem-dependent) order, so the list is
# sorted to make the chunking and indexing order reproducible between runs.
md_files = sorted(glob.glob(f"{folder}/**/*.md", recursive=True))

In [None]:
# Histogram of path depth: maps the number of "/"-separated components of a
# file path to how many collected markdown files have that depth.
test = {}
for md_path in md_files:
    # N separators delimit N + 1 path components.
    depth = md_path.count("/") + 1
    if depth in test:
        test[depth] += 1
    else:
        test[depth] = 1

In [None]:
# Display how many markdown files were collected (shown as the cell output).
len(md_files)

In [None]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

# Header markers the splitter cuts on, paired with the metadata key under which
# the matching header text is stored on each chunk.
# NOTE(review): Markdown only defines six heading levels; the seven-hash entry
# is kept unchanged from the original configuration.
headers_to_split_on = [
    ("#", "h1"),
    ("##", "h2"),
    ("###", "h3"),
    ("####", "h4"),
    ("#####", "h5"),
    ("######", "h6"),
    ("#######", "h7"),
]
# Not referenced by any of the visible cells; kept in case other code relies on it.
paragraphs = []
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Files whose path contains one of these names are not indexed.
SKIPPED_FILES = ("Changelog.md", "README.md", "Legal.md")

# Every chunk of every processed file, in processing order.
all_splits = []
for file in md_files:
    if any(name in file for name in SKIPPED_FILES):
        continue

    # Normalise OS-specific separators so the positional lookups below also
    # work for the backslash-separated paths glob yields on Windows.
    splits = file.replace(os.sep, "/").split("/")
    # Strip the extension from the file name once per file (loop-invariant).
    splits[-1] = os.path.splitext(splits[-1])[0]

    # Positions are counted from the start of the globbed path.
    # NOTE(review): with folder "../data/DND rules", index 2 is the "DND rules"
    # folder itself and index 3 is the first entry below it - confirm that this
    # is the intended Category / Sub category mapping.
    category = splits[2]
    sub_category = splits[3] if len(splits) > 3 else None

    # Read with an explicit encoding instead of the platform default.
    # TODO confirm the rule files are UTF-8 encoded.
    with open(file, "r", encoding="utf-8") as f:
        markdown_text = f.read()
    md_header_splits = markdown_splitter.split_text(markdown_text)

    for text in md_header_splits:
        # Capture the header metadata before the path-derived keys are added,
        # so those keys are not listed twice in the text preamble.
        header_lines = [f"{k}: {v}" for k, v in text.metadata.items()]

        text.metadata["Category"] = category
        path_lines = [f"Category: {category}"]
        if sub_category is not None:
            text.metadata["Sub category"] = sub_category
            path_lines.append(f"Sub category: {sub_category}")

        # Prefix the chunk text with its metadata so the embedded text carries
        # its own context. The join happens outside the f-string because a
        # backslash inside a replacement field is a syntax error before
        # Python 3.12. When there is no sub category, no empty placeholder line
        # is emitted.
        preamble = "\n".join([*path_lines, *header_lines])
        text.page_content = f"{preamble}\n\n{text.page_content}"

    all_splits.extend(md_header_splits)

In [None]:
import os

from dotenv import load_dotenv

# Let python-dotenv populate the environment before the variables are read.
load_dotenv()

# Both settings fall back to an empty string when the variable is not set.
TOKEN = os.environ.get("IK_API_KEY", "")
PRODUCT_ID = os.environ.get("IK_PRODUCT_ID", "")

# Endpoint URL built around the product id.
URL = "https://api.infomaniak.com/1/ai/" + PRODUCT_ID + "/openai/v1"

In [None]:
from common.infomaniak.ik_embeddings import IKEmbeddings

# Name of the embedding model; "mini_lm_l12_v2" is the alternative that was
# noted next to this setting.
model = "bge_multilingual_gemma2"

# The index name carries the model name as a suffix.
index = "dnd_rules_" + model

embeder = IKEmbeddings(model=model)

In [None]:
import os

import meilisearch
from langchain.vectorstores import Meilisearch

# The same code as `setup.py` can be used to check for missing env vars.

# Connection settings for the Meilisearch server, read from the environment
# with local-development fallbacks.
# NOTE(review): the fallback API key is hard-coded in the notebook - consider
# requiring MEILI_API_KEY to be set instead.
meili_url = os.environ.get("MEILI_HTTP_ADDR", "http://localhost:7700")
meili_api_key = os.environ.get("MEILI_API_KEY", "2AB8F223884CE")

# Create the Meilisearch client (the vector store itself is built in a later cell).
client = meilisearch.Client(url=meili_url, api_key=meili_api_key)

In [None]:
# Delete the model-specific index before the documents are (re-)added further
# down. The commented-out call targets the fixed name "dnd_rules" instead -
# presumably an earlier, un-suffixed index name; verify before re-enabling.
# client.delete_index("dnd_rules")
client.delete_index(index)

In [None]:
from requests import patch

# Switch on the `vectorStore` flag through the server's experimental-features
# endpoint. The server address comes from the same MEILI_HTTP_ADDR setting (and
# default) the Meilisearch client uses, so both always talk to the same
# instance. A timeout keeps the cell from hanging forever if the server is
# unreachable.
# The trailing `.text` makes the raw response body the cell output.
meili_url = os.environ.get("MEILI_HTTP_ADDR", "http://localhost:7700").rstrip("/")
patch(
    f"{meili_url}/experimental-features",
    headers={
        "Authorization": f"Bearer {os.environ.get('MEILI_API_KEY', '2AB8F223884CE')}"
    },
    json={"vectorStore": True},
    timeout=10,
).text

In [None]:
from requests import get

# Ask the server for its version information. The server address comes from
# the same MEILI_HTTP_ADDR setting (and default) the Meilisearch client uses,
# so the answer describes the instance that is actually being indexed into.
# A timeout keeps the cell from hanging forever if the server is unreachable.
# The trailing `.text` makes the raw response body the cell output.
meili_url = os.environ.get("MEILI_HTTP_ADDR", "http://localhost:7700").rstrip("/")
get(
    f"{meili_url}/version",
    headers={
        "Authorization": f"Bearer {os.environ.get('MEILI_API_KEY', '2AB8F223884CE')}"
    },
    timeout=10,
).text

In [None]:
# Embedder configuration handed to the vector store: a single user-provided
# embedder registered under the name "custom".
# NOTE(review): the dimension count is hard-coded and presumably has to match
# the output size of the selected embedding model - confirm when switching models.
embedder_name = "custom"
embedders = {"custom": {"source": "userProvided", "dimensions": 3584}}

vector_store = Meilisearch(
    client=client,
    embedding=embeder,
    index_name=index,
    embedders=embedders,
)

# Add the chunks in fixed-size batches rather than in a single call.
batch_size = 100
for start in range(0, len(all_splits), batch_size):
    batch = all_splits[start : start + batch_size]
    vector_store.add_documents(batch, embedding=embeder, embedder_name=embedder_name)

In [None]:
# Display the name of the index the documents were written to (cell output).
index