# Revamp data ingest pipeline

New requirements:
1. Have a resume mechanism to avoid duplication of text
2. Have an append-topic mechanism to modify the topics of existing paragraphs.
3. Don't break production demo.

Files to be ingested:

- /hdd/iaross/askem/criticalmaas_text

In [None]:
# !ln -s /hdd/iaross/askem/criticalmaas_text ./ingest/

### Preparations

1. Dump current id and topic to a file

In [18]:
from dotenv import load_dotenv
import os
from tqdm import tqdm
import weaviate

load_dotenv()

True

In [8]:
# Credentials and endpoint come from the environment (WEAVIATE_APIKEY and
# WEAVIATE_URL) so that no secret is stored in the notebook itself.
auth = weaviate.auth.AuthApiKey(api_key=os.getenv("WEAVIATE_APIKEY"))
client = weaviate.Client(
    url=os.getenv("WEAVIATE_URL"),
    auth_client_secret=auth,
)

In [9]:
def get_batch_with_cursor(
    client, class_name, class_properties, batch_size, cursor=None
):
    """Run one paginated ``Get`` query and return its raw result.

    Args:
        client: Object exposing ``client.query.get(...)``.
        class_name: Name of the class to query.
        class_properties: Properties to request for each object.
        batch_size: Value passed to ``with_limit``.
        cursor: Value passed to ``with_after``; when ``None`` the query is
            issued without an ``after`` clause.

    Returns:
        Whatever the query's ``do()`` call returns.
    """
    page_query = client.query.get(class_name, class_properties)
    # The object id is always requested in addition to the given properties.
    page_query = page_query.with_additional(["id"])
    page_query = page_query.with_limit(batch_size)

    # No cursor: run the query as-is.
    if cursor is None:
        return page_query.do()

    return page_query.with_after(cursor).do()

In [21]:
# Get number of documents

# Spell the class name exactly as it appears in the response key read below,
# rather than relying on a differently-cased variant being accepted.
metadata = client.query.aggregate("Passage").with_meta_count().do()
n = metadata["data"]["Aggregate"]["Passage"][0]["meta"]["count"]
n

21996410

In [None]:
cursor = None
class_name = "Passage"
id2topic = {}  # paper_id -> first topic seen for that paper

# Walk the whole class page by page. The `with` block closes the progress bar
# even if a request raises or the cell is interrupted.
with tqdm(total=n) as pbar:
    while True:
        # Get the next group of objects, starting after the current cursor
        results = get_batch_with_cursor(
            client,
            class_name,
            class_properties=["paper_id", "topic"],
            batch_size=1024,
            cursor=cursor,
        )
        objects = results["data"]["Get"][class_name]

        # If empty, we're finished
        if not objects:
            break

        # Record each paper's topic. Named `obj` rather than `object` so the
        # builtin is not shadowed for the rest of the notebook.
        for obj in objects:
            paper_id = obj["paper_id"]
            topic = obj["topic"]

            if paper_id not in id2topic:
                id2topic[paper_id] = topic
            elif id2topic[paper_id] != topic:
                # Keep the first topic seen and only report the mismatch
                print(
                    f"Conflicts: paper_id {paper_id} has {id2topic[paper_id]} and {topic}"
                )

        # Update the cursor to the id of the last retrieved object
        cursor = objects[-1]["_additional"]["id"]
        pbar.update(len(objects))

In [17]:
id2topic

{'5b04d247cf58f16373bb2333': 'covid-19',
 '558d0a6ae13823109f3edbf1': 'criticalmaas',
 '5b6a8dffcf58f18a7913fa4c': 'dolomites',
 '615a12e467467f7269718eea': 'climate_change',
 '5fd3792178f934caa36cef9f': 'climate_change',
 '558d6d3be13823109f3eddb6': 'dolomites',
 '56e49ac7cf58f11f60a3f451': 'climate_change',
 '5fecab3bea8bd37226fe451e': 'covid-19',
 '606b7cbc3f2ac7e70180c968': 'covid-19',
 '55068d7de1382326932d91b8': 'dolomites',
 '5c39de341faed655488c79dc': 'climate_change',
 '621f96db2688b711353407fa': 'covid-19',
 '55bfd7becf58f16e99a370ec': 'dolomites',
 '54cfd68be13823b501cdbb2b': 'dolomites',
 '5d24fa060b45c76caf9212bc': 'climate_change',
 '54b4328fe138239d86856352': 'dolomites',
 '57c53033cf58f1bcd9f85fc4': 'climate_change',
 '63b9b34274bed2df5c4300e8': 'covid-19',
 '5d2531f60b45c76caf921ba7': 'criticalmaas',
 '558f029ae13823109f3ee6fa': 'dolomites',
 '5d4f244d0b45c76cafa45569': 'dolomites',
 '557fb874e13823bc80ba27f6': 'dolomites',
 '620e6886ad0e9c819b0c3c48': 'dolomites',
 '5