In [None]:
import os
from dotenv import load_dotenv
import weaviate
from tqdm import tqdm

# Load variables from a .env file (python-dotenv) before reading the settings.
load_dotenv()

# Connection settings; each one is None when its environment variable is unset.
WEAVIATE_URL = os.environ.get("WEAVIATE_URL")
WEAVIATE_APIKEY = os.environ.get("WEAVIATE_APIKEY")
# Only the URL is echoed; the API key is deliberately not printed.
print(WEAVIATE_URL)

Create client

In [None]:
# Authenticate with the API key read from the environment.
api_key_auth = weaviate.AuthApiKey(api_key=WEAVIATE_APIKEY)
client = weaviate.Client(url=WEAVIATE_URL, auth_client_secret=api_key_auth)

# Fetch the schema, then display the object count of the "Passage" class
# (the last expression is rendered as the cell output).
schema = client.schema.get()
client.query.aggregate("Passage").with_meta_count().do()

Check properties

In [None]:
# Fetch the full schema and list the property names of the "Passage" class.
schema = client.schema.get()

# Look the class up by name instead of by position, so the listing stays
# correct when the instance holds other classes or their order changes.
passage_schema = next(
    (cls for cls in schema["classes"] if cls.get("class") == "Passage"), None
)
if passage_schema is None:
    raise LookupError('Class "Passage" not found in the Weaviate schema')

all_properties = [prop["name"] for prop in passage_schema["properties"]]
print(all_properties)

In [None]:
def get_batch_with_cursor(client, class_name, class_properties, batch_size, cursor):
    """Fetch one page of objects of ``class_name`` through the cursor API.

    Args:
        client: Object exposing ``client.query.get(...)`` (the Weaviate client).
        class_name: Name of the class to read from.
        class_properties: Property names to return for each object.
        batch_size: Maximum number of objects in the page.
        cursor: Value passed to ``with_after``; ``None`` requests the first page.

    Returns:
        Whatever the query's ``do()`` call returns. Each object additionally
        carries its ``id`` under ``_additional``.
    """
    request = client.query.get(class_name, class_properties)
    request = request.with_additional(["id"])
    request = request.with_limit(batch_size)

    # Without a cursor there is nothing to continue from: run the query as is.
    if cursor is None:
        return request.do()

    return request.with_after(cursor).do()

In [None]:
def patch_all(
    client, batch_size: int = 5000, class_name: str = "Passage", resume_from: int = 0
) -> None:
    """Set ``doc_type`` to ``"paragraph"`` on every ``climate_change`` object.

    Walks the whole class page by page with ``get_batch_with_cursor`` and
    patches each object whose ``topic`` is ``"climate_change"`` and whose
    ``doc_type`` is not already ``"paragraph"``.

    Args:
        client: Weaviate client used for the count, the reads and the updates.
        batch_size: Number of objects requested per page.
        class_name: Class whose objects are patched.
        resume_from: Number of already-handled objects to skip. Skipping is
            done in whole batches: a batch is skipped while the running
            progress count is below this value. Skipped batches are still
            fetched, because the cursor has to be advanced through them.

    Returns:
        None. Progress is reported through a tqdm bar and the total object
        count is printed once at the start.
    """
    cursor = None
    n = (
        client.query.aggregate(class_name)
        .with_meta_count()
        .do()["data"]["Aggregate"][class_name][0]["meta"]["count"]
    )
    print(f"Total number of objects: {n}")

    with tqdm(total=n) as progress_bar:
        while True:
            # Fetch a batch of objects
            results = get_batch_with_cursor(
                client, class_name, ["topic", "doc_type"], batch_size, cursor
            )
            objects = results["data"]["Get"][class_name]

            # Stop if there are no more results
            if not objects:
                break

            # The id of the last object is the cursor for the next page.
            cursor = objects[-1]["_additional"]["id"]

            # Only patch once the resume point has been reached.
            if progress_bar.n >= resume_from:
                for obj in objects:
                    if obj["topic"] != "climate_change":
                        continue
                    # Already patched: skip the redundant write request.
                    if obj.get("doc_type") == "paragraph":
                        continue
                    client.data_object.update(
                        uuid=obj["_additional"]["id"],
                        class_name=class_name,
                        data_object={"doc_type": "paragraph"},
                    )

            # Advance by the real batch length so a short final batch does not
            # push the bar past the total.
            progress_bar.update(len(objects))

In [None]:
# Resume a previous run: batches are skipped until 7,179,000 objects have been
# counted, then patching continues in pages of 1,000.
patch_all(client=client, batch_size=1_000, resume_from=7_179_000)

- Double-check that `doc_type` is not None for any object
- Keep the `type` field for now, even though it is no longer useful


In [None]:
# Matches climate_change objects whose doc_type is something other than
# "paragraph", i.e. objects the patch should have changed but did not.
# NOTE(review): objects whose doc_type is null may not match a NotEqual
# filter — confirm against the Weaviate filter docs before relying on an
# empty result.
topic_is_climate_change = {
    "path": ["topic"],
    "operator": "Equal",
    "valueText": "climate_change",
}
doc_type_not_paragraph = {
    "path": ["doc_type"],
    "operator": "NotEqual",
    "valueText": "paragraph",
}
where_filter = {
    "operator": "And",
    "operands": [topic_is_climate_change, doc_type_not_paragraph],
}

In [None]:
# Show up to 10 objects that still match the filter; the result of the final
# expression is rendered as the cell output.
(
    client.query.get("Passage", ["paper_id", "topic", "doc_type"])
    .with_where(where_filter)
    .with_limit(10)
    .do()
)