In [2]:
from gpt_xml_helpers import *

In [3]:
import io
import os
import time
import pandas as pd
import xml.etree.ElementTree as ET
from typing import Dict
from pathlib import Path

In [4]:
# Model queried for every review; "gpt-3.5-turbo-1106" is the alternative
# that was previously tried here.
MODEL_NAME = "gpt-4-1106-preview"
wrapper = GPTWrapper(MODEL_NAME)

In [5]:
# Both names refer to the same folder holding the sample XML files.
xml_folder = data_path = Path('sample_xmls')

In [6]:
# Recursively collect the path of every file under xml_folder whose name
# ends with '.xml'.
xml_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(xml_folder)
    for name in names
    if name.endswith('.xml')
]

In [8]:
def prompt_truncated_paper(wrapper, parsed_xml: Dict, max_tokens: int = 100000):
    """Build the data-sharing evaluation prompt for one parsed paper.

    The abstract, figure/table captions and main content are laid out as
    fenced sections, passed through ``truncate`` together with ``max_tokens``
    and ``wrapper``, and the result is embedded in the instruction text.

    Args:
        wrapper: Model wrapper, forwarded unchanged to ``truncate``.
        parsed_xml: Mapping providing the ``title``, ``abstract``,
            ``figure_and_table_captions`` and ``main_content`` entries.
        max_tokens: Limit forwarded to ``truncate``.

    Returns:
        The complete prompt string to send to the model.
    """
    paper_sections = f"""Abstract:
```
{parsed_xml['abstract']}
```

Figures/Tables Captions:
```
{parsed_xml['figure_and_table_captions']}
```

Main Content:
```
{parsed_xml['main_content']}
```"""
    paper_excerpt = truncate(paper_sections, max_tokens, wrapper)

    return f"""Your task now is to evaluate whether a scientific publication titled "{parsed_xml['title']}" includes information on how to obtain the data reported in the paper:

{paper_excerpt}


======
Your task:
Determine whether this paper reports shared data.

Start by "Transparency outline:".
And then answer:
"1. List the types of data reported in the paper"
"2. Species of organisms studied(e.g. human, mouse, rat, non-human primate, etc)"
"3. How to obtain raw data"
"4. Data sharing statement"
"5. Print a pipe separated csv with: "type_of_data" | "species" | "link_to_dataset" | "data_sharing_statement""

Start the csv with "csv" and finish it with "/csv".

"""

In [9]:
def step2_parse_xml(xml: str) -> Dict:
    """Parse an article's XML text and collect the pieces used for prompting.

    Args:
        xml: Complete XML document as a string.

    Returns:
        Dict with the keys ``title``, ``abstract``, ``introduction``,
        ``figure_and_table_captions``, ``section_titles`` and
        ``main_content``.
    """
    root = ET.parse(io.StringIO(xml)).getroot()

    parsed = {
        "title": get_article_title(root),
        "abstract": get_abstract(root),
        "introduction": get_section_text(root, section_title="Introduction"),
        "figure_and_table_captions": get_figure_and_table_captions(root),
    }

    # One entry per <sec> element anywhere in the document; a section
    # without a <title> child contributes an empty string.
    titles = []
    for section in root.findall(".//sec"):
        title_node = section.find("title")
        titles.append("" if title_node is None else title_node.text)
    parsed["section_titles"] = titles

    # Main content is extracted last, after the section titles.
    parsed["main_content"] = get_main_content(root)
    return parsed

def step3_get_lm_review(wrapper, parsed_xml: Dict, max_tokens: int = 15000) -> Dict:
    """Build the prompt for a parsed paper and query the model once.

    Args:
        wrapper: Model wrapper; passed to ``prompt_truncated_paper`` and used
            for ``send_query``.
        parsed_xml: Parsed paper as returned by ``step2_parse_xml``.
        max_tokens: Token budget passed to ``prompt_truncated_paper``.
            Defaults to 15000, the value that used to be hard-coded here, so
            existing two-argument calls behave as before.

    Returns:
        Dict with ``text_to_send`` (the prompt) and ``review_generated``
        (whatever ``wrapper.send_query`` returned for it).
    """
    text_to_send = prompt_truncated_paper(wrapper, parsed_xml, max_tokens)
    review_generated = wrapper.send_query(text_to_send, n_query=1)
    return {"text_to_send": text_to_send, "review_generated": review_generated}

In [12]:
# Load the rtransparent reference table. Later cells read its 'article',
# 'is_open_data' and 'open_data_statements' columns.
rtransparent_df = pd.read_csv("rtransparent_gpt_xmls.csv")

In [13]:
# Article ids to evaluate, in the order they appear in the reference table.
samples = rtransparent_df['article'].to_list()
samples

['PMC6467449',
 'PMC6505707',
 'PMC7132284',
 'PMC7933130',
 'PMC7938970',
 'PMC8443060',
 'PMC8795560',
 'PMC9216730',
 'PMC9279418',
 'PMC9369370']

In [14]:
# Keep only the XML files whose base name, minus the ".xml" extension, is one
# of the sampled article ids. Path.stem drops just the final suffix, whereas
# str.strip('.xml') removes any leading/trailing '.', 'x', 'm' or 'l'
# characters and would mangle names such as "max.xml".
xml_samples = [xml for xml in xml_files if Path(xml).stem in samples]

In [80]:
# Seconds to pause after each model query.
SECONDS_BETWEEN_QUERIES = 60

full_output = ""
full_output += wrapper.model_name
print(full_output)
for xml in xml_samples:
    # Path.stem removes only the ".xml" suffix; str.strip('.xml') strips a
    # character set and could also eat letters of the article id itself.
    article = Path(xml).stem

    # Read and parse inside the `with` so the file handle is released before
    # the slow model query and the pause that follows it.
    # NOTE(review): assumes the XML files are UTF-8 encoded - confirm.
    with open(xml, encoding="utf-8") as xml_f:
        parsed_xml = step2_parse_xml(xml_f.read())

    # Filter the reference table once per article instead of once per field.
    article_rows = rtransparent_df[rtransparent_df['article'] == article]

    current_output = ""
    current_output += article
    current_output += '\n**********************************\n'
    current_output += f"rtransparent is open data: {article_rows['is_open_data'].iloc[0]}\n"
    current_output += f"rtransparent data statement: {article_rows['open_data_statements'].iloc[0]}\n"
    current_output += '**********************************\n'

    generated = step3_get_lm_review(wrapper, parsed_xml)
    current_output += generated["review_generated"]
    current_output += '\n**********************************\n'
    print(current_output)

    time.sleep(SECONDS_BETWEEN_QUERIES)
    full_output += current_output

gpt-4-1106-preview
# tokens sent to GPT: 11449
PMC8795560
**********************************
rtransparent is open data: False
rtransparent data statement: nan
**********************************
Transparency outline:

1. List the types of data reported in the paper
The paper is discussing policy and conceptual frameworks regarding genomic data but does not reference specific datasets or raw data generated by the research itself.

2. Species of organisms studied(e.g. human, mouse, rat, non-human primate, etc)
The paper does not detail specific organisms studied; it speaks broadly about global biodiversity.

3. How to obtain raw data
The paper does not offer raw data, nor does it present a method for obtaining such data. It notes there is "no data underlying this work."

4. Data sharing statement
The paper states, "Data Availability: There are no data underlying this work."

5. Print a pipe separated csv with: "type_of_data" | "species" | "link_to_dataset" | "data_sharing_statement"
Since