### File Structure of the Dataset Folder
```
├── journal-meta/
│   ├── 0968-090X.csv         # TRC
│   ├── mini-dataset.csv      # A mini dataset for tutorial, extracted from TRC
│   ├── journal-meta-dataset.csv   # the combined dataset for all journals
│   ├── github_data.json      # the links for the GitHub repository
│   ├── url_data.json         # the URLs found in the data availability statements
│   └── ... (other journal CSV files)
├── journal-full-text/
│   ├── 0968-090X/
│   │   └── 10.1016_j.trc.2023.104311.xml
│   └── ... (other DOI folders)
```

In [1]:
import pandas as pd
import os
from tqdm import tqdm
from utils import *
# Machine-specific locations of the downloaded dataset; replace them with your own paths.
# full_data_folder: per-journal folders (named by ISSN) holding the full-text XML files.
# meta_data_folder: the metadata CSV/JSON files read and written by this notebook.
# result_data_folder: destination for generated results (e.g. the data-description JSON files).
full_data_folder = '/Users/junyi/Work/RR/rr-measure-dataset/journal-full-text'
meta_data_folder = '/Users/junyi/Work/RR/rr-measure-dataset/journal-meta'
result_data_folder = '/Users/junyi/Work/RR/rr-measure-dataset/journal-results'

In [2]:
# Load the combined metadata table and list the columns it contains.
meta_csv_path = os.path.join(meta_data_folder, 'full-meta-dataset.csv')
data = pd.read_csv(meta_csv_path)
data.columns

Index(['title', 'doi', 'volume', 'date', 'year', 'month', 'abstract', 'issn',
       'journal_name', 'unique_id', 'is_github', 'num_of_github_urls',
       'is_availablity_statement', 'is_data_mentioned_in_section_title',
       'is_experiment_mentioned_in_section_title',
       'is_link_in_avaiablity_statement',
       'num_of_links_in_avaiablity_statement'],
      dtype='object')

In [3]:
# Flag every paper that has a top-level section whose title contains
# "data availability" (case-insensitive).
# NOTE(review): the loaded CSV already lists a column spelled
# 'is_availablity_statement'; this cell writes a differently spelled column -
# confirm which name downstream code expects.
data['is_availability_statement'] = 0
for i in range(len(data)):
    # <full_data_folder>/<issn>/<unique_id>.xml
    xml_path = os.path.join(full_data_folder, data['issn'][i], data['unique_id'][i] + '.xml')
    parsed_sections = postprocess_sections(extract_sections_and_text_from_xml(xml_path))
    lowered_titles = [sec['title'].lower() for sec in parsed_sections]
    if any('data availability' in title for title in lowered_titles):
        data.loc[i, 'is_availability_statement'] = 1

In [16]:
import openai
import yaml
import json
# Load API key from config.yaml
from openai import OpenAI
# Read the OpenAI API key from a local YAML config file (the path is machine-specific)
# so the key is not hard-coded in the notebook.
with open("/Users/junyi/Work/RR/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)
# The key is set both on the module and on an explicit client object;
# analyze_with_openai issues its request through `client`.
openai.api_key = config["openai_api_key"]
client = OpenAI(api_key=config["openai_api_key"])
def analyze_with_openai(data_context):
    """Ask the chat model to describe and classify a paper's data source.

    The data-related text is sent as a single user message, followed by the
    definitions of "real-world" and "simulation" data. The request constrains
    the reply to the strict ``data_source_description`` JSON schema built below.

    Args:
        data_context: Text taken from the paper that talks about its data.

    Returns:
        The message content of the first completion choice (the request asks
        for a JSON document matching the schema).
    """
    definition_context = """
                        ---------------------------
                        Definition of data source:
                        Real-world data is the data is collected from the real-world, such as data from sensors, surveys, or other sources.
                        Simulation data is the data generated from simulation or synthetic data, even though the scenario is based on real-world.
                        """

    def boolean_field(description):
        return {"type": "boolean", "description": description}

    def string_field(description):
        return {"type": "string", "description": description}

    # Nested object with the finer-grained questions about size and collection.
    details_schema = {
        "type": "object",
        "properties": {
            "dataset_size_description": boolean_field(
                "Indicates whether there is a description of the dataset size."
            ),
            "data_collection_description": boolean_field(
                "Indicates whether there is a description of data collection."
            ),
            "size_decription_detail": string_field("Description of the dataset size."),
            "data_collection_detail": string_field("Description of the data collection."),
        },
        "required": [
            "dataset_size_description",
            "data_collection_description",
            "size_decription_detail",
            "data_collection_detail",
        ],
        "additionalProperties": False,
    }

    answer_schema = {
        "type": "object",
        "properties": {
            "source_description": string_field("Description of the data source."),
            "real_world": boolean_field(
                "Is the data collected from real-world based on the definition?"
            ),
            "simulation": boolean_field(
                "Is the data collected from simulation or synthetic data based on the definition?"
            ),
            "details": details_schema,
        },
        "required": ["source_description", "real_world", "simulation", "details"],
        "additionalProperties": False,
    }

    user_message = {"role": "user", "content": data_context + definition_context}
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[user_message],
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "data_source_description",
                "strict": True,
                "schema": answer_schema,
            },
        },
        temperature=0,
        max_completion_tokens=16383,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    return completion.choices[0].message.content

In [None]:
import os
from tqdm import tqdm
import json
import openai

# Columns filled from the model's structured answer; papers for which no data
# context is found (or whose analysis fails) keep these defaults.
data['source_description'] = ''
data['real_world'] = ''
data['simulation'] = ''
data['dataset_size_description'] = ''
data['data_collection_description'] = ''
data['is_data_mentioned'] = 0

# One JSON file per paper (context + model answer) is written here.
description_folder = os.path.join(result_data_folder, 'data-description')
os.makedirs(description_folder, exist_ok=True)


def _iter_titled_nodes(sections):
    """Yield each section, then its subsections and their subsubsections, in order."""
    for section in sections:
        yield section
        for subsection in section.get('subsections', []):
            yield subsection
            yield from subsection.get('subsubsections', [])


for i in tqdm(range(len(data))):
    data_context = ''  # abstract + text of every part whose title mentions "data"
    label = 0          # 1 when "data" appears in the abstract or in any title
    try:
        # <full_data_folder>/<issn>/<unique_id>.xml
        paper_path = os.path.join(full_data_folder, data['issn'][i], f"{data['unique_id'][i]}.xml")

        # A missing abstract must not prevent the sections from being scanned,
        # so it is only appended when present.
        abstract = extract_abstract_from_xml(paper_path)
        if abstract:
            data_context += abstract
            if "data" in abstract.lower():
                label = 1

        sections = extract_sections_and_text_from_xml(paper_path)
        for node in _iter_titled_nodes(postprocess_sections(sections)):
            if 'data' in node['title'].lower():
                # Empty/None text contributes nothing but still counts as a mention.
                data_context += node['text'] or ''
                label = 1
    except KeyError as e:
        print(f"Missing key in data: {e}")
    except FileNotFoundError as e:
        print(f"File not found: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")

    if not data_context:
        print("No data context found")
        continue

    # A single failed request or unparsable reply should not abort the whole
    # batch: report it and move on, leaving this row at its defaults.
    # TypeError covers a reply without content, ValueError covers invalid JSON.
    try:
        response = json.loads(analyze_with_openai(data_context))
    except (openai.OpenAIError, TypeError, ValueError) as e:
        print(f"OpenAI analysis failed for {data['unique_id'][i]}: {e}")
        continue

    data.loc[i, 'source_description'] = response['source_description']
    data.loc[i, 'real_world'] = response['real_world']
    data.loc[i, 'simulation'] = response['simulation']
    data.loc[i, 'dataset_size_description'] = response['details']['dataset_size_description']
    data.loc[i, 'data_collection_description'] = response['details']['data_collection_description']
    data.loc[i, 'is_data_mentioned'] = label

    # Keep the exact text that was sent next to the model's answer for auditing.
    data_description = {
        "data_context": data_context,
        "data_source_description": response
    }
    save_json(data_description, os.path.join(description_folder, f"{data['unique_id'][i]}.json"))

# The table is metadata, so it is stored with the other metadata files rather
# than inside the full-text XML folder.
data.to_csv(os.path.join(meta_data_folder, 'full-meta-dataset-data-descriptive.csv'), index=False)

In [20]:
# Store the table, including the data-description columns, in the metadata folder.
descriptive_csv_path = os.path.join(meta_data_folder, 'full-meta-dataset-data-descriptive.csv')
data.to_csv(descriptive_csv_path, index=False)

In [None]:
# Display the current value of data_context (the batch loop rebinds it for every
# paper, so this is presumably the text collected for the last one processed).
data_context

In [None]:
import re

# Flag papers whose section structure mentions "data" in a title, and collect
# the URLs found in the text of the matching subsections/subsubsections.
data['is_data_mentioned_in_section_title'] = 0
url_pattern = r'(https?://\S+|www\.\S+)'  # URL pattern
for i in range(len(data)):
    journal_path = os.path.join(full_data_folder, data['issn'][i])  # Path to the journal folder
    paper_path = os.path.join(journal_path, data['unique_id'][i] + '.xml')  # Path to the paper XML file
    sections = extract_sections_and_text_from_xml(paper_path)
    reorganized_sections = postprocess_sections(sections)
    label = 0
    url = []  # URLs seen in this paper's data-related subsections
    for section in reorganized_sections:
        # Top-level sections only count when their text also says "available".
        # NOTE(review): this is stricter than the nested levels below, where a
        # title match alone sets the label - confirm that this is intended.
        if 'data' in section['title'].lower():
            if section['text'] and 'available' in section['text'].lower():
                label = 1
        for subsection in section['subsections']:
            if 'data' in subsection['title'].lower():
                label = 1
                # Search the matching subsection's own text, not its parent's.
                if subsection['text']:
                    url.extend(re.findall(url_pattern, subsection['text']))
            for subsubsection in subsection['subsubsections']:
                if 'data' in subsubsection['title'].lower():
                    label = 1
                    if subsubsection['text']:
                        url.extend(re.findall(url_pattern, subsubsection['text']))
    data.loc[i, 'is_data_mentioned_in_section_title'] = label
# Share of papers flagged as mentioning data in a section title.
print(len(data[data['is_data_mentioned_in_section_title'] == 1])/(len(data)))
data.to_csv(os.path.join(meta_data_folder, 'full-meta-dataset.csv'), index=False)

In [None]:
# Flag every paper in which "experiment" occurs (case-insensitively) in the
# title of a section, subsection or subsubsection, then save the table.
data['is_experiment_mentioned_in_section_title'] = 0
for i in range(len(data)):
    # <full_data_folder>/<issn>/<unique_id>.xml
    xml_path = os.path.join(full_data_folder, data['issn'][i], data['unique_id'][i] + '.xml')
    parsed_sections = postprocess_sections(extract_sections_and_text_from_xml(xml_path))

    # Gather the lower-cased titles of all three nesting levels first.
    lowered_titles = []
    for sec in parsed_sections:
        lowered_titles.append(sec['title'].lower())
        for sub in sec['subsections']:
            lowered_titles.append(sub['title'].lower())
            for subsub in sub['subsubsections']:
                lowered_titles.append(subsub['title'].lower())

    mentioned = 1 if any('experiment' in title for title in lowered_titles) else 0
    data.loc[i, 'is_experiment_mentioned_in_section_title'] = mentioned
data.to_csv(os.path.join(meta_data_folder, 'full-meta-dataset.csv'), index=False)

In [None]:
import re

# For every paper, look at the sections titled "data availability", record
# whether they contain a link, count the distinct links, and export the links.
data['is_link_in_avaiablity_statement'] = 0
data['num_of_links_in_avaiablity_statement'] = 0
url_pattern = r'(https?://\S+|www\.\S+)'  # URL pattern
url_data = []  # one entry per availability section that contains a link
for i in range(len(data)):
    journal_path = os.path.join(full_data_folder, data['issn'][i])  # Path to the journal folder
    paper_path = os.path.join(journal_path, data['unique_id'][i] + '.xml')  # Path to the paper XML file
    sections = extract_sections_and_text_from_xml(paper_path)
    reorganized_sections = postprocess_sections(sections)
    for section in reorganized_sections:
        if 'data availability' not in section['title'].lower():
            continue
        # A section without text cannot contain a link; avoid `'http' in None`.
        section_text = section['text'] or ''
        if 'http' not in section_text:
            continue
        data.loc[i, 'is_link_in_avaiablity_statement'] = 1
        url = re.findall(url_pattern, section_text)
        # De-duplicate while keeping first-seen order so the saved JSON is stable.
        unique_url = list(dict.fromkeys(url))
        if url:
            print(url)
        url_data.append({
            'issn': data['issn'][i],
            'unique_id': data['unique_id'][i],
            'title': data['title'][i],
            'url': unique_url
        })
        data.loc[i, 'num_of_links_in_avaiablity_statement'] = len(unique_url)
data.to_csv(os.path.join(meta_data_folder, 'full-meta-dataset.csv'), index=False)
save_json(url_data, os.path.join(meta_data_folder, 'url_data.json'))

In [None]:
# Pull the message text out of a raw chat-completion response object.
# NOTE(review): the only top-level `response` assigned elsewhere in this notebook
# is the parsed dict from the batch loop, which has no `.choices`; this cell
# presumably dates from an earlier interactive API call - confirm or remove.
content = response.choices[0].message.content

In [None]:
# Display the raw response text (expected to be a JSON string) before it is parsed.
content

In [None]:
import json
# Parse the JSON string into a Python object.
json_content = json.loads(content)

In [None]:
# Look up the 'real_world' entry of the parsed response.
json_content['real_world']