# Data Producer for Covitext API

## Setup

In [1]:
import asyncio
import concurrent.futures
import requests
import glob
import json
import copy

# Configuration: base URL of the API that receives the requests, and the
# root folder the input documents are searched under.
api_url = "http://localhost:8000"
data_path = "./../data/"


### Get all JSON paths

In [2]:
# Recursively collect the path of every .json file below <data_path>/documents.
all_json_paths = list(
    glob.iglob(f'{data_path}/documents/**/*.json', recursive=True)
)

# Cell output: number of files found.
len(all_json_paths)


### Process and post document snippets

In [None]:
class Document:
    """Holds the text of one paper, identified by ``paper_id``.

    ``title`` starts as an empty string; ``abstract`` and ``body_text`` start
    as separate empty lists that the caller fills with paragraph strings.
    """

    def __init__(self, paper_id):
        self.paper_id = paper_id
        self.title, self.abstract, self.body_text = '', [], []

    def __repr__(self):
        # Debug form: bracketed id, quoted title, then the abstract list.
        return "[{}] '{}': {}...".format(
            self.paper_id, self.title, self.abstract)


In [None]:

def post_snippet(paper_id: str, text: str, text_type: str, timeout: float = 30.0):
    """POST one text snippet of a paper to ``{api_url}/api/document/snippet``.

    Snippets shorter than 24 characters are skipped, except titles, which are
    always sent.

    Args:
        paper_id: Identifier of the paper the snippet belongs to.
        text: The snippet text.
        text_type: Kind of snippet; callers in this file pass ``'title'``,
            ``'abstract'`` or ``'body_text'``.
        timeout: Seconds to wait for the API before abandoning the request.

    Returns:
        The ``requests`` response object, or ``None`` when the snippet was
        skipped or the request failed.
    """
    if text_type != 'title' and len(text) < 24:
        return None

    try:
        # Without a timeout, a stalled server would block this call (and the
        # worker thread running it) indefinitely.
        return requests.post(
            f"{api_url}/api/document/snippet",
            json={'paper_id': paper_id, 'text': text, 'text_type': text_type},
            timeout=timeout,
        )
    except Exception as e:
        # Best effort: report the failure and let the caller continue with
        # the next snippet instead of aborting the whole document.
        print('error when trying to post', paper_id)
        print(e)

    return None

def post_document(document: Document):
    """Post a document's title, abstract and body paragraphs to the API.

    The API is first asked whether the document exists. Only a 404 answer is
    treated as "not yet inserted"; any other status skips the document. If the
    check itself fails, the document is skipped as well, because posting
    without knowing whether it exists could insert duplicate snippets.

    Args:
        document: The document to post; its ``paper_id``, ``title``,
            ``abstract`` and ``body_text`` attributes are read.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    print(f'START {document.paper_id}')

    # check if document was already inserted
    try:
        # `params` lets requests URL-encode the id; the timeout keeps a
        # stalled server from blocking this worker forever.
        res = requests.get(f"{api_url}/api/document",
                           params={'id': document.paper_id},
                           timeout=30)
    except Exception as e:
        print('error when trying to check for document', document.paper_id)
        print(e)
        return

    if res.status_code != 404:
        print("skipping", document.paper_id)
        return

    post_snippet(document.paper_id, document.title, 'title')

    for paragraph in document.abstract:
        post_snippet(document.paper_id, paragraph, 'abstract')

    for paragraph in document.body_text:
        post_snippet(document.paper_id, paragraph, 'body_text')

    print(f'END {document.paper_id}')

def _load_document(json_path):
    """Parse one JSON file into a Document; return None if it has no 'paper_id'."""
    # JSON files are UTF-8; do not depend on the platform's default encoding.
    with open(json_path, encoding='utf-8') as json_file:
        json_data = json.load(json_file)

    if 'paper_id' not in json_data:
        return None

    document = Document(paper_id=json_data['paper_id'])

    metadata = json_data.get('metadata', {})
    if 'title' in metadata:
        document.title = metadata['title']

    document.abstract.extend(
        paragraph["text"] for paragraph in json_data.get('abstract', []))

    # NOTE: 'body_text' paragraphs are not loaded here, so only the title and
    # the abstract of each document end up being posted.
    return document


def _report_failures(finished):
    """Print the exception of every finished future that raised one."""
    for future in finished:
        error = future.exception()
        if error is not None:
            print('error while posting a document')
            print(error)


async def iterate_json_paths():
    """Load every path in ``all_json_paths`` and post it on a thread pool.

    At most ``limit`` submitted documents are tracked at once: when the set
    is full, the loop blocks until at least one of them finishes before the
    next file is read. Files without a ``paper_id`` are reported and skipped.
    Exceptions raised inside worker threads are printed instead of being
    silently dropped.

    The function is declared ``async`` so existing ``await
    iterate_json_paths()`` callers keep working, but it contains no await
    points and blocks while it runs.
    """
    limit = 32
    futures = set()

    with concurrent.futures.ThreadPoolExecutor(max_workers=None) as executor:
        for json_path in all_json_paths:
            if len(futures) >= limit:
                # Block until a slot frees up, keeping only the pending ones.
                completed, futures = concurrent.futures.wait(
                    futures, return_when=concurrent.futures.FIRST_COMPLETED)
                _report_failures(completed)

            current_doc = _load_document(json_path)
            if current_doc is None:
                print("Current document has no id!")
                continue

            print(f"SUBMITTING {current_doc.paper_id}")
            futures.add(executor.submit(post_document, current_doc))

        # Drain what is still running so late failures are reported too.
        completed, _ = concurrent.futures.wait(futures)
        _report_failures(completed)


In [None]:
# Run the whole ingestion over all_json_paths.
# NOTE(review): a top-level `await` presumably relies on the notebook
# kernel's running event loop — confirm before running as a plain script.
await iterate_json_paths()


SUBMITTING PMC8230242
START PMC8230242
SUBMITTING PMC7324312
START PMC7324312
SUBMITTING PMC7706756
START PMC7706756
SUBMITTING PMC8944676
START PMC8944676
SUBMITTING PMC8645973
START PMC8645973
SUBMITTING PMC8618802
START PMC8618802
SUBMITTING PMC9010243
START PMC9010243
SUBMITTING PMC3610377
START PMC3610377
SUBMITTING PMC8831017
START PMC8831017
SUBMITTING PMC9009978
START PMC9009978
END PMC8230242
END PMC8944676
SUBMITTING PMC545201
START PMC545201
SUBMITTING PMC8758950
START PMC8758950
END PMC7324312
SUBMITTING PMC3161824
START PMC3161824
SUBMITTING PMC8926542
START PMC8926542
SUBMITTING PMC7223598
START PMC7223598
END PMC7706756
SUBMITTING PMC8035790
START PMC8035790
END PMC8831017
END PMC8645973
END PMC3610377
SUBMITTING PMC8403254
START PMC8403254
END PMC9009978
SUBMITTING PMC7978470
START PMC7978470
SUBMITTING PMC7848879
START PMC7848879
SUBMITTING PMC8077985
START PMC8077985
SUBMITTING PMC7885137
END PMC9010243
START PMC7885137
END PMC8618802
SUBMITTING PMC8500192
END PMC8926