# Add paragraph ordering

1. Gather a master list of docid.
1. Subset docid from Geoarchive, CriticalMAAS.
1. For each docid, call preprocessorv2 (v1 + paragraph ordering).
1. Compare hashed_text for each paragraph. If unchanged, retrieve embedding data from existing Weaviate.
1. If changed, drop paragraphs with the same docid and reprocess everything in it.

### Step 1: Gather a master list of docid.

In [None]:
import sys
# NOTE(review): machine-specific absolute path, presumably added so that the
# `askem` imports in later cells resolve — adjust it for your own checkout.
sys.path.append("/hdd/clo36/repo/ask-xDD/askem/retriever")

In [None]:
# import pickle
# from askem.elastic import DocumentTopicFactory
# id2topics_factory = DocumentTopicFactory()
# id2topics = id2topics_factory.run()
# with open("tmp/id2topics.pkl", "wb") as f:
#     pickle.dump(id2topics, f)

In [None]:
import pickle
# Load the cached id2topics mapping from the pickle file under tmp/.
# NOTE(review): pickle.load can execute arbitrary code; only load a file
# that was produced by a trusted run.
with open("tmp/id2topics.pkl", "rb") as pkl:
    id2topics = pickle.load(pkl)

### Step 2: Subset docid from Geoarchive, CriticalMAAS.

In [None]:
# Keep every docid whose topics contain at least one of the target topics.
target_topics = ["criticalmaas", "geoarchive"]
ids_to_patch = [
    doc_id
    for doc_id, topics in id2topics.items()
    if any(topic in topics for topic in target_topics)
]

print(f"Found {len(ids_to_patch)} documents to patch")

### Step 3: For each docid, call preprocessorv2 (v1 + paragraph ordering).

In [None]:
from askem.preprocessing import HaystackPreprocessor
from askem.ingest_v2 import WeaviateIngester
from askem.retriever.base import get_client


In [None]:
# Shared objects used by the cells below: the preprocessor, the Weaviate
# client, and an ingester that is given that client.
preprocessor = HaystackPreprocessor()
weaviate_client = get_client()
# The ingester is bound to the "Paragraph" class and receives the full
# id2topics mapping together with an empty `ingested` set.
# NOTE(review): the empty set presumably means no document is treated as
# already ingested — confirm against WeaviateIngester.
ingester = WeaviateIngester(
    client=weaviate_client,
    class_name="Paragraph",
    id2topics=id2topics,
    ingested=set(),
)

In [None]:
# Add new paragraph_order property

# add_prop = {
#   "name": "paragraph_order",
#   "dataType": ["int"],
#   "moduleConfig": {"text2vec-transformers": {"skip": True}}
# }

# weaviate_client.schema.property.create("Paragraph", add_prop)

In [None]:
def get_weaviate_paragraph(doc_id: str, hashed_text: str | None = None) -> dict:
    """Query Weaviate for the "Paragraph" objects of one document.

    The query always filters on ``paper_id == doc_id``; when ``hashed_text``
    is truthy it additionally filters on ``hashed_text``. The properties
    ``paper_id``, ``hashed_text`` and ``paragraph_order`` are requested along
    with each object's ``id``, and at most 10000 objects are requested.

    Returns:
        Whatever the query's ``.do()`` call returns, unmodified.
    """
    # Build the operand list first, then wrap it in the "And" filter.
    conditions = [{"path":"paper_id", "operator":"Equal", "valueText": doc_id}]
    if hashed_text:
        conditions.append(
            {"path":"hashed_text", "operator":"Equal", "valueText": hashed_text}
        )
    where_filter = {"operator": "And", "operands": conditions}

    query = weaviate_client.query.get(
        "Paragraph", ["paper_id", "hashed_text", "paragraph_order"]
    )
    return (
        query.with_where(where_filter)
        .with_additional("id")
        .with_limit(10000)
        .do()
    )

In [None]:
from tqdm import tqdm

def patch(doc_id: str) -> None:
    """Write ``paragraph_order`` onto the stored Weaviate paragraphs of a document.

    The document is written to the ingest folder and re-run through the
    preprocessor. Each resulting paragraph is matched to its stored
    "Paragraph" object by ``hashed_text``, and that object's
    ``paragraph_order`` is updated. Stored objects whose ``paragraph_order``
    is already set are skipped, so an interrupted run can be resumed.

    Args:
        doc_id: Document id; must be a key of ``id2topics``.

    Raises:
        ValueError: If the document yields more than 10000 distinct paragraph
            hashes (the limit used by ``get_weaviate_paragraph``), or if the
            set of stored hashes differs from the set of new hashes.
    """
    ingester.write_batch_to_file([doc_id])
    input_file = ingester.files_to_ingest[0]
    new_paragraphs = preprocessor.run(
        input_file=input_file, topics=id2topics[doc_id], doc_type="paragraph"
    )

    # Explicit raises instead of `assert`: these checks guard live data and
    # must survive `python -O`, and the messages end up in the failure log.
    new_hashes = {p["hashed_text"] for p in new_paragraphs}
    if len(new_hashes) > 10000:
        raise ValueError(
            f"{doc_id}: {len(new_hashes)} paragraph hashes exceed the "
            "10000-object query limit"
        )

    # Only patch documents whose paragraph text is unchanged.
    old_paragraphs = get_weaviate_paragraph(doc_id=doc_id)
    old_records = old_paragraphs["data"]["Get"]["Paragraph"]
    old_hashes = {p["hashed_text"] for p in old_records}
    if old_hashes != new_hashes:
        raise ValueError(
            f"Old hashes: {len(old_hashes)}, New hashes: {len(new_hashes)}"
        )

    # Map each hash to the uuid of its stored object.
    # NOTE(review): if two stored paragraphs share a hash, only the last
    # uuid is kept and the other object is never updated — confirm whether
    # duplicate paragraphs within one document can occur.
    hash2uuid = {p["hashed_text"]: p["_additional"]["id"] for p in old_records}

    # Hashes that already carry an order (lets a resumed run skip them).
    skip = {p["hashed_text"] for p in old_records if p["paragraph_order"] is not None}

    # One update call per paragraph that still lacks an order.
    for new in tqdm(new_paragraphs):
        if new["hashed_text"] in skip:
            continue

        weaviate_client.data_object.update(
            uuid=hash2uuid[new["hashed_text"]],
            class_name="Paragraph",
            data_object={
                "paragraph_order": new["paragraph_order"],
            },
        )
    


In [None]:
import json
# Patch every selected document and record the outcome per docid.
status = {'success': [], 'fail': []}

try:
    for doc_id in tqdm(ids_to_patch):
        # Empty the ingest folder before each document.
        # NOTE(review): presumably so that patch() finds this document's
        # file at files_to_ingest[0] — confirm against WeaviateIngester.
        ingester.purge_ingest_folder()
        try:
            patch(doc_id)
        except Exception as e:
            # Best effort: log the failure and move on to the next document.
            print(f"Failed to patch {doc_id}: {e}")
            status['fail'].append(doc_id)
        else:
            status['success'].append(doc_id)
finally:
    # Write the status file even when the loop is interrupted (e.g.
    # KeyboardInterrupt) or purge_ingest_folder() raises, so the progress
    # recorded so far is not lost.
    with open("tmp/patch_status.json", "w") as f:
        json.dump(status, f)