# Revamp data ingest pipeline

New requirements:
1. Have a resume mechanism to avoid duplication of text
2. Have an append-topic mechanism to modify the topics of existing paragraphs.
3. Don't break production demo.
4. Ingest directly from the Elasticsearch service without writing large amounts of text to disk.
5. Make a cron job to do this automatically.

Files to be ingested:

- /hdd/iaross/askem/criticalmaas_text

Steps:

1. Create a new class, `Paragraph`, to replace the `Passage` class. `Passage` will be deprecated once the entire migration is done.
1. Create canonical `id2topics` pickle file. Hopefully it is small enough to be loaded into memory.
1. Use a batch mechanism to ingest data from the Elasticsearch service, e.g., 1000 paragraphs per batch.
1. Upgrade frontend to use `Paragraph` class.
1. Set up a cron job to do this automatically.


In [None]:
# !ln -s /hdd/iaross/askem/criticalmaas_text ./ingest/

### Preparations

Dump the current paper ids and their topics to a file

In [None]:
from dotenv import load_dotenv
import os
from tqdm import tqdm
import weaviate
import hashlib

load_dotenv()

In [None]:
# Build the Weaviate client from the WEAVIATE_URL and WEAVIATE_APIKEY
# environment variables (os.getenv yields None for a variable that is unset).
weaviate_url = os.getenv("WEAVIATE_URL")
auth = weaviate.auth.AuthApiKey(os.getenv("WEAVIATE_APIKEY"))
client = weaviate.Client(weaviate_url, auth)

In [None]:
# Check backup status; the returned status is displayed as the cell output.
# NOTE(review): the id is spelled "pre_duduplication" -- presumably it has to
# match the id the backup was created with, so it is kept exactly as is.
backup_status_request = {
    "backup_id": "pre_duduplication",
    "backend": "filesystem",
}
client.backup.get_create_status(**backup_status_request)

In [None]:
def get_batch_with_cursor(
    client, class_name, class_properties, batch_size, cursor=None
):
    """Run one paginated `Get` query and return its raw result.

    The query asks for `class_properties` of `class_name`, requests the
    object `id` as an additional field, and is limited to `batch_size`.

    Args:
        client: Object exposing `query.get(...)`; here the Weaviate client.
        class_name: Name of the class to read from.
        class_properties: Properties to request for each object.
        batch_size: Value passed to `with_limit`.
        cursor: Value passed to `with_after`; when None, `with_after` is
            not applied at all.

    Returns:
        Whatever the query's `.do()` call returns.
    """
    page_query = client.query.get(class_name, class_properties)
    page_query = page_query.with_additional(["id"])
    page_query = page_query.with_limit(batch_size)

    # Without a cursor the query is executed as built.
    if cursor is None:
        return page_query.do()

    return page_query.with_after(cursor).do()

In [None]:
def get_hash(text):
    """Return the SHA-256 hex digest of `text` encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8"))
    return hasher.hexdigest()

In [None]:
# Get number of documents: run an aggregate query with a meta count and keep
# the count in `n`, which is also displayed as the cell output.
metadata = (
    client.query
    .aggregate("passage")
    .with_meta_count()
    .do()
)
passage_aggregate = metadata["data"]["Aggregate"]["Passage"][0]
n = passage_aggregate["meta"]["count"]
n

Dump topic to a file

In [None]:
# Page through every object of the class and build `id2topic`, a mapping from
# paper_id to the list of distinct topics seen for that paper (in first-seen
# order).
cursor = None
class_name = "Passage"
id2topic = {}

# Using the progress bar as a context manager closes it even when a request
# raises part-way through the scan.
with tqdm(total=n) as pbar:
    while True:
        # From the SOURCE instance, get the next group of objects
        results = get_batch_with_cursor(
            client,
            class_name,
            class_properties=["paper_id", "topic"],
            batch_size=1024,
            cursor=cursor,
        )

        # A batch of objects
        objects = results["data"]["Get"][class_name]

        # If empty, we're finished. len() is used on purpose instead of a
        # truthiness test: a None payload then raises instead of silently
        # ending the scan as if it were complete.
        if len(objects) == 0:
            break

        # The loop variable is `obj`, not `object`: at notebook top level the
        # name is a global and would shadow the builtin for all later cells.
        for obj in objects:
            paper_id = obj["paper_id"]
            topic = obj["topic"]

            topics = id2topic.setdefault(paper_id, [])
            if topic not in topics:
                topics.append(topic)

        # Update the cursor to the id of the last retrieved object
        cursor = objects[-1]["_additional"]["id"]
        pbar.update(len(objects))

In [None]:
import pickle
import datetime

# Today's date as a YYMMDD string (local time); displayed as the cell output.
today = format(datetime.datetime.now(), "%y%m%d")
today

In [None]:
# Pickle the id2topic mapping into a file whose name carries the `today` stamp.
dump_path = f"topic_dump_{today}.pkl"
with open(dump_path, "wb") as dump_file:
    pickle.dump(id2topic, dump_file)

Deduplicate with text hash

In [None]:
# Deduplicate objects by the SHA-256 hash of their text. The first object seen
# with a given hash is kept and updated with `topic_list` and `text_hash`;
# every later object with the same hash is deleted.
class_name = "Passage"
existing_hash = set()
batch_size = 32
cursor = None
deleted = 0  # number of successful deletions

# Using the progress bar as a context manager closes it even when a request
# raises part-way through the scan.
with tqdm(total=n) as pbar:
    while True:
        # From the SOURCE instance, get the next group of objects
        results = get_batch_with_cursor(
            client,
            class_name,
            class_properties=["paper_id", "text_content"],
            batch_size=batch_size,
            cursor=cursor,
        )

        # A batch of objects
        objects = results["data"]["Get"][class_name]

        # If empty, we're finished. len() is used on purpose instead of a
        # truthiness test: a None payload then raises instead of silently
        # ending the scan as if it were complete.
        if len(objects) == 0:
            break

        # The loop variable is `obj`, not `object`: at notebook top level the
        # name is a global and would shadow the builtin for all later cells.
        for obj in objects:
            uuid = obj["_additional"]["id"]
            paper_id = obj["paper_id"]
            text = obj["text_content"]
            hashed_text = get_hash(text)

            if hashed_text not in existing_hash:
                print(f"Updating object: {uuid}")
                existing_hash.add(hashed_text)
                # id2topic[paper_id] raises KeyError for a paper that is not
                # in the mapping built by the topic-dump cell.
                client.data_object.update(
                    uuid=uuid,
                    class_name=class_name,
                    data_object={
                        "topic_list": id2topic[paper_id],
                        "text_hash": hashed_text,
                    },
                )
            else:
                # Delete the duplicated object
                print(f"Deleting object: {uuid}")
                try:
                    client.data_object.delete(uuid, class_name)
                    deleted += 1
                except weaviate.exceptions.UnexpectedStatusCodeException as e:
                    # Best effort: report the failure and carry on with the
                    # next object.
                    print(e)

        # Update the cursor to the id of the last retrieved object
        cursor = objects[-1]["_additional"]["id"]
        pbar.update(len(objects))