# Data Producer for Covitext API

## Setup

In [1]:
import requests
import glob
import json

# parameters
# data_path: base directory that is searched for the document JSON files
# api_url: base URL of the API that the snippets are posted to
data_path = './../data/'
api_url = 'http://localhost:8000'


### Get all JSON paths

In [2]:
# Recursively collect the paths of all JSON files below `<data_path>/documents`.
all_json_paths = glob.glob(f'{data_path}/documents/**/*.json', recursive=True)

# Number of JSON files found (displayed as the cell output).
len(all_json_paths)


716956

### Process and post document snippets

In [3]:
class Document:
    """Text content of a single paper, grouped by section.

    Attributes:
        paper_id: Identifier of the paper.
        title: Title of the paper; empty string until set.
        abstract: Paragraphs of the abstract, in order.
        body_text: Paragraphs of the body text, in order.
    """

    paper_id: str = ''
    title: str = ''
    # Annotation only: the lists are created per instance in __init__.
    abstract: list[str]
    body_text: list[str]

    def __init__(self, paper_id):
        """Create an empty document for the given ``paper_id``."""
        self.paper_id = paper_id
        self.title = ''
        # Fresh lists per instance. Class-level list defaults would be shared
        # by every Document, so appending to one document's paragraphs would
        # leak them into all others.
        self.abstract = []
        self.body_text = []

    def __repr__(self):
        return f"[{self.paper_id}] '{self.title}': {self.abstract}..."


In [4]:

def post_snippet(paper_id: str, text: str, text_type: str, timeout=None):
    """Post one text snippet of a paper to the API.

    Sends a JSON body with the keys ``paper_id``, ``text`` and ``text_type``
    to ``{api_url}/api/document/snippet``. Failures are reported on stdout
    and do not raise, so one bad snippet does not abort a bulk upload.

    Args:
        paper_id: Identifier of the paper the snippet belongs to.
        text: The snippet text.
        text_type: Section label of the snippet (the callers in this file
            pass 'title', 'abstract' or 'body_text').
        timeout: Optional timeout in seconds passed on to ``requests.post``.
            ``None`` (the default) waits indefinitely.
    """
    try:
        response = requests.post(
            f"{api_url}/api/document/snippet",
            json={'paper_id': paper_id, 'text': text, 'text_type': text_type},
            timeout=timeout,
        )
        # A 4xx/5xx reply does not raise on its own; turn it into an error so
        # rejected snippets are reported instead of being silently dropped.
        response.raise_for_status()
    except requests.RequestException as e:
        print('error when trying to post', paper_id)
        print(e)

def post_document(document: Document):
    """Post the title and every abstract/body paragraph of ``document``.

    The title is sent first, followed by the abstract paragraphs and then
    the body-text paragraphs, each as its own snippet. Finally the paper id
    is printed as a progress marker.
    """
    doc_id = document.paper_id
    post_snippet(doc_id, document.title, 'title')

    # Sections are posted in this fixed order: abstract first, then body text.
    sections = (
        ('abstract', document.abstract),
        ('body_text', document.body_text),
    )
    for text_type, paragraphs in sections:
        for paragraph in paragraphs:
            post_snippet(doc_id, paragraph, text_type)

    print(f'{doc_id}:')

def iterate_json_paths():
    """Parse every file in ``all_json_paths`` and post its snippets.

    For each JSON file a ``Document`` is built from the ``paper_id``,
    ``title``, ``abstract`` and ``body_text`` entries and handed to
    ``post_document``. Files without a ``paper_id`` are reported and skipped.
    """
    for json_path in all_json_paths:
        # Keep the file open only while parsing; posting happens afterwards.
        with open(json_path, encoding='utf-8') as json_file:
            json_data = json.load(json_file)

        if 'paper_id' not in json_data:
            print("Current document has no id!")
            continue

        current_doc = Document(paper_id=json_data['paper_id'])

        if 'title' in json_data:
            current_doc.title = json_data['title']

        # Assign new lists instead of appending to the existing attributes,
        # so paragraphs of one document can never end up in another one.
        current_doc.abstract = [
            paragraph["text"] for paragraph in json_data.get('abstract', [])
        ]
        current_doc.body_text = [
            paragraph["text"] for paragraph in json_data.get('body_text', [])
        ]

        post_document(current_doc)


In [5]:
# Process every collected JSON file and post its snippets to the API.
iterate_json_paths()

PMC8230242:
PMC7324312:
PMC7706756:
PMC8944676:
PMC8645973:
PMC8618802:
PMC9010243:
PMC3610377:
PMC8831017:
PMC9009978:
PMC545201:
PMC8758950:
PMC3161824:
PMC8926542:
PMC7223598:
PMC8035790:
PMC8403254:
PMC7978470:
PMC7848879:
PMC8077985:
PMC7885137:
PMC8500192:
PMC8330176:
PMC8237772:
