In [1]:
import json
import os
import bs4
from bs4 import BeautifulSoup
from glob import glob
import re
import pandas as pd

In [71]:
#print(extract_data(get_nxml_from_pmcid("PMC7581548"))[0])

In [76]:
def load_data(file):
    """Read a whole text file and return its contents as a single string.

    The file is decoded explicitly as UTF-8 instead of the platform's locale
    encoding: the extraction code compares document text against non-ASCII
    literals (e.g. the en dash in ``allowed_text_between_refs``), so the
    result must not depend on the machine the notebook runs on.

    Args:
        file: Path of the file to read.

    Returns:
        The full file contents as ``str``.
    """
    with open(file, 'r', encoding='utf-8') as f:
        return f.read()


# Text that may sit between two reference markers while they still count as
# one multi-citation, e.g. "1, 2" or "1–3".
allowed_text_between_refs = [',', ';', '-', '–', ' ']


def _parse_xref_tag(tag):
    """Decode an ``xref$<rid>$<text length>`` history entry.

    Returns ``(rid, length)`` or ``None`` when ``tag`` is not such an entry
    (``None``, an element name, ``'annotation'`` or stripped text).
    """
    if not isinstance(tag, str) or not tag.startswith('xref$'):
        return None
    rid, sep, length = tag[len('xref$'):].rpartition('$')
    if not sep or not length.isdigit():
        return None
    return rid, int(length)


def _is_xref_tag(tag):
    """Return True if ``tag`` was recorded for an <xref> that was not annotated."""
    return _parse_xref_tag(tag) is not None


def _is_ref_separator(tag):
    """Return True if ``tag`` is text allowed between two reference markers.

    Text nodes are recorded stripped, so whitespace-only text arrives here as
    ``''``; it is mapped back to ``' '`` so the space entry of
    ``allowed_text_between_refs`` can actually match.
    """
    return isinstance(tag, str) and (tag or ' ') in allowed_text_between_refs


def sentence_extract(p, full_text, annotation, markers, surrounding_markers):
    """Append the text of one element to ``full_text`` and annotate citations.

    Walks ``p.contents`` in order, building the paragraph text and appending
    brat-style lines (``T<n>\\t<tag> <start> <end>\\t<text>``) to
    ``annotation``. Offsets are character positions in ``full_text`` as it is
    at the time the annotation is created.

    Args:
        p: Element whose ``contents`` are processed (a paragraph, a ``sup``
            during recursion, or a whole section).
        full_text: Text extracted so far.
        annotation: List of annotation lines; extended in place.
        markers: Collection of ``rid`` values to annotate, or ``None`` to
            disable annotation.
        surrounding_markers: Collection of ``rid`` values; when two
            neighbouring <xref>s that are not in ``markers`` both have their
            rid in it, one ``MultiCitation`` spanning both is emitted.
            ``None`` disables this.

    Returns:
        ``(full_text, annotation)``; ``full_text`` ends with ``'\\n\\n'``.
    """
    p_text = ''
    # Sliding window over the last three processed nodes. Entries are None,
    # 'annotation', 'xref$<rid>$<len>', an element name, or stripped text.
    prev_3_tags = (None, None, None)
    # Raw length (after newline replacement) of the most recent text node,
    # needed to compute spans that reach back across a separator.
    last_text_len = 0
    for n in p.contents:
        has_annotation = False
        # .get() instead of ['rid'] so an <xref> without rid is treated as
        # plain text instead of raising KeyError.
        rid = n.get('rid') if n.name == 'xref' else None

        if n.name is None:
            text = n.replace('\n', ' ')
            p_text += text
            last_text_len = len(text)

        elif n.name == 'sup':
            full_text += p_text
            p_text = ''
            full_text, annotation = sentence_extract(n, full_text, annotation, markers, surrounding_markers)
            # drop the '\n\n' the recursive call appended
            full_text = full_text[:-2]

        elif n.name == 'xref' and markers is not None and rid is not None and rid in markers:
            xref_text = n.text.replace('\n', ' ')

            ann_tag = 'Citation'
            # If the previous node, or the one before it across an allowed
            # separator, is an xref, this citation is part of a multi-citation.
            if _is_xref_tag(prev_3_tags[2]) or \
                    (_is_xref_tag(prev_3_tags[1]) and _is_ref_separator(prev_3_tags[2])):
                ann_tag = 'MultiCitation'
            # to do: check for cases other than [',', ';', '-', '–']
            offset = len(full_text) + len(p_text)
            annotation.append(f'T{len(annotation)}\t{ann_tag} {offset} {offset + len(xref_text)}\t{xref_text}')
            has_annotation = True

            p_text += xref_text

        elif n.name == 'xref' and markers is not None and surrounding_markers is not None:
            xref_text = n.text.replace('\n', ' ')
            prev_id = None
            prev_len = sep_len = 0
            if _is_xref_tag(prev_3_tags[2]):
                # directly adjacent xrefs
                prev_id, prev_len = _parse_xref_tag(prev_3_tags[2])
            elif _is_xref_tag(prev_3_tags[1]) and _is_ref_separator(prev_3_tags[2]):
                # xref, separator text, xref: use the separator's real length
                # (", " is two characters) rather than assuming 1.
                prev_id, prev_len = _parse_xref_tag(prev_3_tags[1])
                sep_len = last_text_len

            if prev_id and rid is not None \
                    and prev_id in surrounding_markers and rid in surrounding_markers:
                span_len = prev_len + sep_len
                offset = len(full_text) + len(p_text)
                span_text = p_text[len(p_text) - span_len:] + xref_text
                annotation.append(
                    f'T{len(annotation)}\tMultiCitation {offset - span_len} {offset + len(xref_text)}\t{span_text}')
                has_annotation = True
            p_text += xref_text

        elif n.name in ['xref', 'sup', 'italic', 'named-content']:
            p_text += n.text.replace('\n', ' ')
        elif n.name == 'table-wrap':
            # label and caption of a table go straight into full_text
            label = n.label.text.replace('\n', ' ') if n.label else ''
            caption = n.caption.text.replace('\n', ' ') if n.caption else ''
            caption_text = label + " " + caption + ". "
            full_text += caption_text
        elif n.name == "title":
            p_text += n.get_text() + "\n\n"
        elif n.name == "ext-link":
            p_text += n.get_text()
        elif n.name == 'fig':
            # figures are skipped here and do not enter the tag history
            continue
        else:
            n_text = '. '.join([i.get_text() for i in n.findAll('p')])
            p_text += n_text + " "

        if has_annotation:
            new_tag = 'annotation'
        elif n.name == 'xref':
            new_tag = 'xref$' + (rid or '') + '$' + str(len(n.text.replace('\n', ' ')))
        elif n.name is None:
            new_tag = n.strip()
        else:
            new_tag = n.name
        prev_3_tags = (prev_3_tags[1], prev_3_tags[2], new_tag)

        # Retroactively upgrade the latest single Citation when an
        # un-annotated xref follows it, directly or across a separator.
        # xref entries look like 'xref$<rid>$<len>', so they are matched with
        # _is_xref_tag rather than by equality with 'xref'.
        if annotation and '\tCitation ' in annotation[-1]:
            first, second, third = prev_3_tags
            followed_by_xref = _is_xref_tag(third) and (
                second == 'annotation'
                or (first == 'annotation' and _is_ref_separator(second))
            )
            if followed_by_xref:
                annotation[-1] = annotation[-1].replace('\tCitation ', '\tMultiCitation ', 1)
    full_text += p_text + '\n\n'
    return full_text, annotation


def paragraph_extract(sec, full_text, annotation, markers, surrounding_markers, paragraph_markers, offset):
    """Extract every direct ``<p>`` child of ``sec`` as its own paragraph.

    Each paragraph is run through ``sentence_extract`` and then closed with a
    paragraph marker. When ``sec`` has no direct ``<p>`` child, all inputs are
    returned untouched.

    Returns:
        ``(full_text, annotation, paragraph_markers, offset)``.
    """
    if not sec.find('p', recursive=False):
        return full_text, annotation, paragraph_markers, offset

    for para in sec.findAll('p', recursive=False):
        full_text, annotation = sentence_extract(
            para, full_text, annotation, markers, surrounding_markers)
        full_text, offset, paragraph_markers = cleanup_text_and_add_paragraph_marker(
            full_text, offset, paragraph_markers)

    # Trailing blank line after the last paragraph; the offset is advanced
    # past it without recording a paragraph marker.
    full_text, offset, paragraph_markers = cleanup_text_and_add_paragraph_marker(
        full_text + '\n\n', offset, paragraph_markers, False)
    return full_text, annotation, paragraph_markers, offset


def _record_section_span(text_section, sec_name, pre_c, after_c, section_level, skip_duplicate=False):
    """Record the character span of a section in ``text_section`` (in place).

    ``text_section[sec_name]`` is a flat list of ``[start, end, level]``
    triples. A span that starts right after the last recorded span of the
    same name extends that span instead of adding a new triple.

    Args:
        text_section: Mapping of section name to flat triple list.
        sec_name: Section title text, ``''`` for untitled sections.
        pre_c: Length of the full text before the section was extracted.
        after_c: Length of the full text after the section was extracted.
        section_level: Level value stored with the span.
        skip_duplicate: When True, an exact repeat of the last recorded span
            is ignored.
    """
    end = after_c - 1
    spans = text_section.setdefault(sec_name, [])
    # Entries shorter than a triple (e.g. a [start, end] pair written by
    # another code path) cannot be merged with; a new triple is appended.
    if len(spans) >= 3:
        # Triple layout is [start, end, level]: the last end offset is at
        # [-2] and the last level at [-1].
        if spans[-2] == pre_c - 1:
            spans[-2] = end
            spans[-1] = section_level
            return
        if skip_duplicate and spans[-3] == pre_c and spans[-2] == end:
            return
    spans.extend([pre_c, end, section_level])


def iter_extract(sec, full_text, text_section, annotation, markers, surrounding_markers,  paragraph_markers, offset):
    """Recursively extract the text of a section and record its span.

    Three cases, decided by the direct children of ``sec``:

    * it has direct ``<sec>`` children: its title and direct paragraphs are
      extracted, then every subsection recursively; recorded with level 1;
    * it has neither direct ``<sec>`` nor direct ``<p>`` children: the whole
      element is passed to ``sentence_extract``; recorded with level 0;
    * otherwise (direct paragraphs only): title and paragraphs are extracted;
      recorded with level 0.

    The section name is ``sec.title.text`` (``''`` when there is no title).
    NOTE(review): ``sec.title`` looks like a descendant lookup, so an untitled
    section may pick up a nested title - confirm against bs4 behaviour.

    Returns:
        ``(full_text, text_section, annotation, paragraph_markers, offset)``.
    """
    pre_c = len(full_text)
    sec_name = sec.title.text if sec.title else ''

    if sec.find('sec', recursive=False):  # section with subsections
        if sec.title:
            full_text += sec.title.text + '\n\n'
            full_text, offset, paragraph_markers = cleanup_text_and_add_paragraph_marker(
                full_text, offset, paragraph_markers)
        full_text, annotation, paragraph_markers, offset = paragraph_extract(
            sec, full_text, annotation, markers, surrounding_markers, paragraph_markers, offset)
        full_text, offset, paragraph_markers = cleanup_text_and_add_paragraph_marker(
            full_text, offset, paragraph_markers)

        for sub_sec in sec.findAll('sec', recursive=False):
            full_text, text_section, annotation, paragraph_markers, offset = iter_extract(
                sub_sec, full_text, text_section, annotation, markers,
                surrounding_markers, paragraph_markers, offset)

        # This branch is only reached when at least one direct subsection
        # exists, so the recorded level is always 1.
        _record_section_span(text_section, sec_name, pre_c, len(full_text), 1, skip_duplicate=True)
        return full_text, text_section, annotation, paragraph_markers, offset

    if not sec.find('p', recursive=False):  # no subsections and no direct paragraphs
        if not sec:
            # Nothing to extract; still hand back the full state tuple so the
            # caller's unpacking does not fail on an implicit None.
            return full_text, text_section, annotation, paragraph_markers, offset
        full_text, annotation = sentence_extract(sec, full_text, annotation, markers, surrounding_markers)
        full_text += '\n\n'
        full_text, offset, paragraph_markers = cleanup_text_and_add_paragraph_marker(
            full_text, offset, paragraph_markers)
        _record_section_span(text_section, sec_name, pre_c, len(full_text), 0)
        return full_text, text_section, annotation, paragraph_markers, offset

    # leaf section: title followed by direct paragraphs
    if sec.title:
        full_text += sec.title.text.replace('\n', ' ') + '\n\n'
        full_text, offset, paragraph_markers = cleanup_text_and_add_paragraph_marker(
            full_text, offset, paragraph_markers)
    full_text, annotation, paragraph_markers, offset = paragraph_extract(
        sec, full_text, annotation, markers, surrounding_markers, paragraph_markers, offset)
    full_text, offset, paragraph_markers = cleanup_text_and_add_paragraph_marker(
        full_text, offset, paragraph_markers)
    _record_section_span(text_section, sec_name, pre_c, len(full_text), 0)
    return full_text, text_section, annotation, paragraph_markers, offset

In [78]:
# print(extract_data(get_nxml_from_pmcid("PMC7581548"))[0])

In [52]:
#######################################  CHANGE START #####################################
def figure_caption_extract(sec, full_text, annotation, offset, markers, surrounding_markers, paragraph_markers):
    """Append the label and caption paragraphs of every ``<fig>`` under ``sec``.

    For each figure the label text becomes one paragraph, and every direct
    ``<p>`` child of the caption is extracted with ``sentence_extract`` as a
    further paragraph.

    Returns:
        ``(full_text, annotation, offset, paragraph_markers)``.
    """
    for figure in sec.findAll('fig', recursive=True):
        label_node = figure.label
        label_text = label_node.text.replace('\n', ' ') if label_node else ''
        full_text = full_text + label_text + "\n\n"
        full_text, offset, paragraph_markers = cleanup_text_and_add_paragraph_marker(
            full_text, offset, paragraph_markers)

        caption_node = figure.caption
        if not caption_node:
            continue
        for para in caption_node.findAll('p', recursive=False):
            full_text, annotation = sentence_extract(
                para, full_text, annotation, markers, surrounding_markers)
            full_text, offset, paragraph_markers = cleanup_text_and_add_paragraph_marker(
                full_text, offset, paragraph_markers)

    return full_text, annotation, offset, paragraph_markers
#######################################  CHANGE END #######################################

def cleanup_text_and_add_paragraph_marker(full_text, current_offset, paragraph_markers, add_marker = True):
    """Collapse runs of blank lines and optionally record a paragraph span.

    Every run of three or more newlines in ``full_text`` is reduced to exactly
    two. If ``add_marker`` is true and the cleaned text is not exactly
    ``current_offset`` characters long, ``"<current_offset> <len(text)>"`` is
    appended to ``paragraph_markers`` (in place).

    Returns:
        ``(cleaned_text, len(cleaned_text), paragraph_markers)``.
    """
    cleaned = re.sub(r'\n{3,}', '\n\n', full_text)
    end = len(cleaned)
    if add_marker and current_offset != end:
        paragraph_markers.append(f"{current_offset} {end}")
    return cleaned, end, paragraph_markers

In [37]:
def _extract_child_nodes(nodes, full_text, text_section, annotation, markers,
                         surrounding_markers, paragraph_markers, offset):
    """Extract the direct children of an <abstract> or <body> element, in order.

    Elements go through ``iter_extract``; plain text nodes are appended as
    their own paragraph. Every other node type (comments, and string
    subclasses such as CDATA or processing instructions) is skipped, because
    ``iter_extract`` needs an element it can search with ``find``/``findAll``.
    """
    for node in nodes:
        if isinstance(node, bs4.element.Tag):
            full_text, text_section, annotation, paragraph_markers, offset = iter_extract(
                node, full_text, text_section, annotation, markers,
                surrounding_markers, paragraph_markers, offset)
        elif type(node) is bs4.element.NavigableString:
            full_text += node
            full_text, offset, paragraph_markers = cleanup_text_and_add_paragraph_marker(
                full_text, offset, paragraph_markers)
    return full_text, text_section, annotation, paragraph_markers, offset


def extract_data(file, num = None, markers = None, surrounding_markers=None):
    """Extract plain text, section spans and citation annotations from an NXML file.

    The text is assembled in this order: article title, abstract(s), body,
    then the label and caption of every figure in the article.

    Args:
        file: Path of the NXML file.
        num: Optional collection of PMIDs to skip.
        markers: ``rid`` values whose <xref>s are annotated; ``None`` disables
            annotation (passed through to ``sentence_extract``).
        surrounding_markers: Passed through to ``sentence_extract``.

    Returns:
        ``(full_text, pmid, text_section, annotation, paragraph_markers)``.
        ``pmid`` is ``None`` when the article has no ``pmid`` article-id.
        When the PMID is in ``num``, five ``False`` values are returned so the
        result can be unpacked the same way as a normal one.

    Raises:
        IndexError: if the file has no top-level <article> or no
            <article-meta> element.
    """
    html_doc = load_data(file)
    soup = BeautifulSoup(html_doc, 'html.parser').findAll('article', recursive=False)[0]

    # Keep only the main parts of the article. Iterate over a snapshot
    # because nodes are replaced while looping.
    for child in list(soup.children):
        if child.name not in ['front', 'body', 'back', 'floats-group']:
            child.replaceWith('')

    text_section = dict()
    article_ids = soup.findAll('article-meta')[0].findAll('article-id')
    # .get() and a default: an article-id without pub-id-type, or an article
    # without any pmid, must not abort the extraction.
    article_meta = next((i.text for i in article_ids if i.get('pub-id-type') == 'pmid'), None)
    if num and article_meta is not None and article_meta in num:
        return False, False, False, False, False

    full_text = ''
    annotation = []
    paragraph_markers = []

    # title section
    offset = 0
    title_groups = soup.findAll('title-group')
    if len(title_groups) == 1:
        title = title_groups[0].find('article-title').get_text(separator=' ').replace('\n', ' ')
        full_text += title + '\n'
    else:
        # NOTE(review): a title span is only recorded when no single title
        # was found (full_text is still empty here, so this stores [0, -1]).
        text_section['title'] = [0, len(full_text) - 1]

    # the title is its own paragraph
    full_text, offset, paragraph_markers = cleanup_text_and_add_paragraph_marker(
        full_text, offset, paragraph_markers)

    # abstract section
    abstracts = soup.findAll('abstract')
    if abstracts:
        for abstract in abstracts:
            if abstract.get('abstract-type') in ['toc', 'graphical', 'teaser', 'author-highlights']:
                continue
            if not abstract.find('title', recursive=False):
                full_text += '\nAbstract\n\n'
                full_text, offset, paragraph_markers = cleanup_text_and_add_paragraph_marker(
                    full_text, offset, paragraph_markers, False)
            full_text, text_section, annotation, paragraph_markers, offset = _extract_child_nodes(
                abstract.contents, full_text, text_section, annotation, markers,
                surrounding_markers, paragraph_markers, offset)
    elif 'title' in text_section:  # check these cases
        text_section['abstract'] = [text_section['title'][1], len(full_text) - 1]

    # body section
    bodies = soup.findAll('body')
    if len(bodies) > 0:
        full_text += '\n'
        full_text, offset, paragraph_markers = cleanup_text_and_add_paragraph_marker(
            full_text, offset, paragraph_markers, False)
        # NOTE(review): with more than one <body> nothing is extracted.
        if len(bodies) == 1:
            full_text, text_section, annotation, paragraph_markers, offset = _extract_child_nodes(
                bodies[0].contents, full_text, text_section, annotation, markers,
                surrounding_markers, paragraph_markers, offset)

    # figure labels and captions from the whole article go last
    full_text, annotation, offset, paragraph_markers = figure_caption_extract(
        soup, full_text, annotation, offset, markers, surrounding_markers, paragraph_markers)

    # TODO: back matter (<back>: footnotes, acknowledgements, notes,
    # sections, glossary) is not extracted.
    return full_text, article_meta, text_section, annotation, paragraph_markers

In [5]:
def generate_brat_data(input_file, output_filename, markers = None, surrounding_markers=None):
    """Run ``extract_data`` on one article and write the results to disk.

    Files written, all named after ``output_filename``:

    * ``<output_filename>.txt``       - the extracted full text
    * ``<output_filename>_par.csv``   - one "start end" paragraph span per line
    * ``<output_filename>_sec.json``  - the section-name to span mapping
    * ``<output_filename>.ann``       - annotation lines, only when
      ``markers`` is not ``None``

    Args:
        input_file: Path of the NXML file to process.
        output_filename: Output path without extension.
        markers: Passed to ``extract_data``.
        surrounding_markers: Passed to ``extract_data``.
    """
    full_text, _pmid, section_spans, annotation, paragraph_markers = extract_data(
        input_file, markers = markers, surrounding_markers=surrounding_markers)

    # Annotation and paragraph offsets are character positions in full_text,
    # so the text is written as UTF-8 with newlines left untranslated; the
    # platform default could change the encoding or turn '\n' into '\r\n'.
    with open(output_filename + ".txt", "w", encoding="utf-8", newline="\n") as o:
        o.write(full_text)

    with open(output_filename + "_par.csv", "w", encoding="utf-8") as o:
        o.write('\n'.join(paragraph_markers))

    with open(output_filename + "_sec.json", "w", encoding="utf-8") as o:
        json.dump(section_spans, o)

    if markers is not None:
        with open(output_filename + ".ann", "w", encoding="utf-8") as o:
            o.write('\n'.join(annotation))

In [6]:
import pandas as pd

# Citation-context table, one row per in-text citation. The driver loop
# further down filters it on `intxt_pmid` and `pmcid` and reads the
# `intxt_id` and `intxt_mark` columns.
citation_context = pd.read_csv("covid_context_subset.csv")

In [83]:
import json

# Mapping iterated by the driver loop as (ref_paper, citing_papers): the key
# is looked up in `pmid2pmcid`, and each entry of the list is compared with
# the `pmcid` column and passed to `get_nxml_from_pmcid`.
with open('sampled_res_30.json') as f:
    oa_paper_citations = json.load(f)

In [84]:
input_folder = "pmc-fulltext-nxml"
output_folder = "output30"
# exist_ok avoids the check-then-create race and makes the cell safe to re-run
os.makedirs(output_folder, exist_ok=True)

In [9]:
citation_context.head()

Unnamed: 0.1,Unnamed: 0,pmcid,pmid,location,IMRaD,sentence_id,total_sentences,intxt_id,intxt_pmid,intxt_mark,citation,progression
0,47218,PMC7236719,32840252,body,NoIMRaD,15,43,PMC7236719_sref21,32311448,21>|sref21|,In view of the previously mentioned (prelimina...,34.88
1,47222,PMC7236719,32840252,body,NoIMRaD,22,43,PMC7236719_sref21,32311448,21>|sref21|,Thromboprophylaxis. Recommendations on prophyl...,51.16
2,47230,PMC7236719,32840252,body,NoIMRaD,34,43,PMC7236719_sref21,32311448,21>|sref21|,This is based on the preference for parenteral...,79.07
3,47437,PMC7236820,32398297,body,NoIMRaD,9,53,PMC7236820_C9,32311448,9>|C9|,The cumulative incidence of venous thromboembo...,16.98
4,49287,PMC7239079,32511451,body,NoIMRaD,51,290,PMC7239079_R49,32291278,Kissler et al . 2020>|R49|,"More typical, however, is for R 0 to vary with...",17.59


In [10]:
from getnxml import get_nxml_from_pmcid

In [11]:
# PMID -> PMCID of the referenced papers handled by the driver loop.
pmid2pmcid = {
    "33930320": "PMC8078878",
    "32311448": "PMC7164881",
    "32731257": "PMC7581548",
    "32839612": "PMC7578095",
    "33378609": "PMC7787219",
    "32416070": "PMC7227586",
    "32445440": "PMC7262788",
    "32841599": "PMC7418704",
    "32366695": "PMC7199903",
    "32291278": "PMC7164482",
    "32540903": "PMC7299280",
}

In [16]:
# with open("sampled_res_30.json", 'w') as f:
#     json.dump(oa_paper_citations, f)

In [85]:
# For every referenced paper with at least 20 citing papers, write the brat
# files of the referenced paper itself (no annotations) and of each citing
# paper (annotated with the markers that cite the referenced paper).
for ref_paper, citing_papers in oa_paper_citations.items():
    if len(citing_papers) < 20:
        continue

    ref_paper_pmcid = pmid2pmcid[ref_paper]
    output_subfolder = os.path.join(output_folder, ref_paper_pmcid)
    os.makedirs(output_subfolder, exist_ok=True)

    path = get_nxml_from_pmcid(ref_paper_pmcid)
    generate_brat_data(path, os.path.join(output_subfolder, ref_paper_pmcid))

    for citing_paper in citing_papers:
        print(citing_paper)

        citing_paper_pmcid = citing_paper
        # `intxt_pmid` holds integers while the JSON keys are strings, so the
        # key is converted once and the same row selection feeds both lists.
        context_rows = citation_context[(citation_context['intxt_pmid'] == int(ref_paper)) &
                                        (citation_context['pmcid'] == citing_paper)]
        # intxt_id looks like "<pmcid>_<rid>"; split only on the first '_' so
        # rids that contain underscores are kept whole.
        markers = [m.split('_', 1)[1] for m in set(context_rows['intxt_id'])]
        # Passed as a list of values so membership tests look at the values.
        # NOTE(review): the values look like "21>|sref21|", not bare rids, so
        # they probably need parsing before they can match an xref rid.
        surrounding_markers = list(context_rows['intxt_mark'])

        citing_path = get_nxml_from_pmcid(citing_paper)
        generate_brat_data(citing_path,
                           os.path.join(output_subfolder, citing_paper_pmcid),
                           markers, surrounding_markers)
    


PMC9375576
PMC8434920
PMC9187102
PMC9621022
PMC9349458
PMC8284046
PMC8454213
PMC8649466
PMC9412348
PMC9317287
PMC8885466
PMC9587126
PMC8623766
PMC8473168
PMC9296083
PMC8428325
PMC8623604
PMC8563977
PMC9396110
PMC8992638
PMC8531069
PMC9016488
PMC8389568
PMC8879968
PMC8949184
PMC8700804
PMC8755369
PMC8586723
PMC9480732
PMC8875598
PMC8442752
PMC9520447
PMC7454798
PMC9214161
PMC7871721
PMC9225593
PMC8837491
PMC9044512
PMC8056322
PMC9412042
PMC8894812
PMC7331864
PMC7361754
PMC7485709
PMC7392617
PMC8621119
PMC7492826
PMC7430069
PMC8352775
PMC7733673
PMC7396456
PMC7938644
PMC7743107
PMC9104617
PMC7487272
PMC9043882
PMC7539230
PMC8064159
PMC7423832
PMC8024143
PMC9075903
PMC7928010
PMC8366622
PMC8366625
PMC8172228
PMC8451603
PMC8219099
PMC8428573
PMC9240059
PMC8288147
PMC8497464
PMC8539008
PMC8393506
PMC8325333
PMC7937514
PMC8748674
PMC8638770
PMC7808328
PMC8139703
PMC8687884
PMC9426593
PMC7834972
PMC8357629
PMC8223959
PMC8654298
PMC9119307
PMC8926328
PMC7969912
PMC9556142
PMC8144958
PMC8524637

In [1]:
# citation_context[(citation_context['intxt_pmid'] == 33378609) & 
#                                       (citation_context['pmcid'] == 'PMC8513403')]

In [None]:
# print(extract_data(get_nxml_from_pmcid("PMC7581548"))[0])

In [82]:
# output_subfolder = os.path.join('test')
# citing_path = get_nxml_from_pmcid("PMC7654894")
# generate_brat_data(os.path.join(citing_path), 
#            os.path.join(output_subfolder, "PMC7654894"), "PMC7654894_R23")

In [103]:
# t = {'': [139, 1466, 0, 1467, 1943, 0, 1944, 2490, 0, 2491, 3666, 0, 3667, 4951, 0, 4952, 6399, 0, 6400, 7919, 0, 7920, 8369, 0, 8370, 10261, 0, 74448, 74477, 0], 'Baseline: no interventions': [10271, 11946, 0], 'Minimum number of cases': [12546, 13598, 0], 'Minimum intervention': [13599, 14629, 0], 'Minimum use of non-quarantine interventions': [14630, 15301, 0], 'Minimum quarantine': [15302, 15943, 0], 'Optimal interventions for four objectives': [11947, 15943, 1], 'Results': [10262, 15943, 1], 'Discussion': [15944, 20784, 0], 'Uncontrolled university model': [20794, 47500, 0], 'Equilibrium points': [47548, 47795, 0], 'Basic reproduction number': [47796, 49582, 0], 'Proposition 1': [49603, 49716, 0], 'Proof': [49717, 52174, 0], 'Stability analysis': [49583, 52174, 1], 'Analysis of the uncontrolled university model': [47501, 52174, 1], 'Formulation of the controlled university model': [52208, 69927, 0], 'Formulation of the optimal control problem': [69928, 71354, 0], 'Implementation of four different scenarios': [71355, 74420, 0], 'Control of the university model': [52175, 74420, 1], 'Methods': [20785, 74420, 1], 'Supplementary Information': [74421, 74477, 1]}

In [105]:
# with open("test.json", 'w') as f:
#     json.dump(t, f)

In [2]:
# print(extract_data(get_nxml_from_pmcid("PMC7581548"))[0])