# Revamp data ingest pipeline

New requirements:
1. Have a resume mechanism to avoid duplication of text
2. Have an append-topic mechanism to modify the topics of existing paragraphs.
3. Don't break production demo.

Files to be ingested:

- /hdd/iaross/askem/criticalmaas_text

In [None]:
# !ln -s /hdd/iaross/askem/criticalmaas_text ./ingest/

### Preparations

1. Dump current id and topic to a file

In [1]:
from dotenv import load_dotenv
import os
from tqdm import tqdm
import weaviate
import hashlib

load_dotenv()

True

In [2]:
# Build the API-key credential and connect to the Weaviate instance; both the
# key and the URL are read from environment variables.
auth = weaviate.auth.AuthApiKey(api_key=os.getenv("WEAVIATE_APIKEY"))
client = weaviate.Client(
    url=os.getenv("WEAVIATE_URL"),
    auth_client_secret=auth,
)

In [None]:
# Ask the server for the creation status of the "pre_duduplication" backup on
# the filesystem backend. The bare name on the last line keeps the result as
# the cell's displayed value.
backup_status = client.backup.get_create_status(
    backend="filesystem",
    backup_id="pre_duduplication",
)
backup_status

In [3]:
def get_batch_with_cursor(
    client, class_name, class_properties, batch_size, cursor=None
):
    """Fetch one page of objects of ``class_name``.

    Args:
        client: Object exposing ``client.query.get(...)`` (the Weaviate
            client in this notebook).
        class_name: Name of the class to read from.
        class_properties: Properties to return for each object.
        batch_size: Maximum number of objects in the page.
        cursor: Value passed to ``with_after`` so the page starts after that
            object; ``None`` requests the first page.

    Returns:
        Whatever the query's ``do()`` call returns. Each object also carries
        its ``id`` as an additional property.
    """
    page_query = client.query.get(class_name, class_properties)
    page_query = page_query.with_additional(["id"])
    page_query = page_query.with_limit(batch_size)

    # First page: there is nothing to resume from.
    if cursor is None:
        return page_query.do()

    return page_query.with_after(cursor).do()

In [None]:
def get_hash(text):
    """Return the hexadecimal SHA-256 digest of ``text`` encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8"))
    return hasher.hexdigest()

In [4]:
# Get number of documents

# Spell the class name "Passage" exactly as the response key read below and
# as the other cells of this notebook do.
metadata = client.query.aggregate("Passage").with_meta_count().do()
n = metadata["data"]["Aggregate"]["Passage"][0]["meta"]["count"]
n

21996410

Dump topic to a file

In [7]:
cursor = None
class_name = "Passage"
id2topic = {}  # paper_id -> list of its distinct topics, in first-seen order

# The context manager closes the progress bar even when a query raises midway.
with tqdm(total=n) as pbar:
    while True:
        # Fetch the next page of objects, starting after the current cursor
        results = get_batch_with_cursor(
            client,
            class_name,
            class_properties=["paper_id", "topic"],
            batch_size=1024,
            cursor=cursor,
        )

        # Fail with the server's message instead of a cryptic lookup error
        if "errors" in results:
            raise RuntimeError(f"Weaviate query failed: {results['errors']}")

        objects = results["data"]["Get"][class_name]

        # An empty page means every object has been visited
        if not objects:
            break

        for obj in objects:
            topics = id2topic.setdefault(obj["paper_id"], [])
            topic = obj["topic"]
            if topic not in topics:
                topics.append(topic)

        # Resume the next request after the last object of this page
        cursor = objects[-1]["_additional"]["id"]
        pbar.update(len(objects))

100%|██████████| 21996410/21996410 [32:09<00:00, 11400.17it/s]


In [8]:
import pickle

# Persist the paper_id -> topics mapping collected above.
dump_path = "topic_dump_240129.pkl"
with open(dump_path, "wb") as dump_file:
    pickle.dump(id2topic, dump_file)

Deduplicate with text hash

In [None]:
cursor = None
class_name = "Passage"
existing_hash = set()  # hashes of every distinct text seen so far
deleted = 0  # number of duplicates successfully deleted

# The context manager closes the progress bar even when a request raises midway.
with tqdm(total=n) as pbar:
    while True:
        # Fetch the next page of objects, starting after the current cursor.
        # Only the text is requested: it is the sole property used here.
        results = get_batch_with_cursor(
            client,
            class_name,
            class_properties=["text_content"],
            batch_size=64,
            cursor=cursor,
        )

        # Fail with the server's message instead of a cryptic lookup error
        if "errors" in results:
            raise RuntimeError(f"Weaviate query failed: {results['errors']}")

        objects = results["data"]["Get"][class_name]

        # An empty page means every object has been visited
        if not objects:
            break

        for obj in objects:
            hashed_text = get_hash(obj["text_content"])

            # First occurrence of this text: remember it and keep the object
            if hashed_text not in existing_hash:
                existing_hash.add(hashed_text)
                continue

            # Same text as an earlier object: delete this later copy
            object_id = obj["_additional"]["id"]
            try:
                client.data_object.delete(object_id, class_name)
            except weaviate.exceptions.UnexpectedStatusCodeException as e:
                # Best effort: report the failure and keep scanning
                print(e)
            else:
                deleted += 1

        # Resume the next request after the last object of this page
        cursor = objects[-1]["_additional"]["id"]
        pbar.update(len(objects))

{'backend': 'filesystem',
 'id': 'pre_duduplication',
 'path': '/tmp/backups/pre_duduplication',
 'status': 'STARTED'}