### File Structure of the Dataset Folder
```
├── journal-meta/
│   ├── 0968-090X.csv         # TRC
│   ├── mini-dataset.csv      # A mini dataset for tutorial, extracted from TRC
│   ├── journal-meta-dataset.csv   # the combined dataset for all journals
│   ├── github_data.json      # the links for the GitHub repository
│   ├── url_data.json         # the links for the data availability URLs
│   └── ... (other journal CSV files)
├── journal-full-text/
│   ├── 0968-090X/
│   │   └── 10.1016_j.trc.2023.104311.xml
│   └── ... (other DOI folders)
```

In [None]:
import pandas as pd
import os
from tqdm import tqdm
from utils import *
# Dataset locations on the local machine -- edit both paths so that they point
# at your own download of the dataset.
#   meta_data_folder: the per-journal metadata CSV files
#   full_data_folder: one sub-folder per journal holding the full-text XML files
meta_data_folder = '/Users/junyi/Work/RR/rr-measure-dataset/journal-meta'
full_data_folder = '/Users/junyi/Work/RR/rr-measure-dataset/journal-full-text'

In [None]:
# Load the metadata table that the following cells annotate with new columns.
data = pd.read_csv(
    filepath_or_buffer=os.path.join(meta_data_folder, 'full-meta-dataset.csv'),
)
# Trailing expression: the notebook echoes the column index for a quick look.
data.columns

In [None]:
# Flag every paper whose parsed full text contains a section whose title
# includes "data availability" (case-insensitive substring match). Only the
# entries returned by postprocess_sections are inspected here; nested
# subsections are not visited.
data['is_availablity_statement'] = 0
# Tip: while debugging, replace the range below with a short one such as
# range(1000) to process only the first rows.
for row in range(len(data)):
    xml_file = os.path.join(
        full_data_folder,                  # root of the full-text corpus
        data['issn'][row],                 # one sub-folder per journal
        data['unique_id'][row] + '.xml',   # one XML file per paper
    )
    parsed = postprocess_sections(extract_sections_and_text_from_xml(xml_file))
    # Lower-case every title up front so each entry is touched exactly once.
    lowered_titles = [entry['title'].lower() for entry in parsed]
    if any('data availability' in title for title in lowered_titles):
        data.loc[row, 'is_availablity_statement'] = 1
# Write the new column back to the metadata CSV (the file is overwritten).
data.to_csv(os.path.join(meta_data_folder, 'full-meta-dataset.csv'), index=False)

In [None]:
# Flag every paper in which the word "data" appears (case-insensitively) in
# the title of any section, subsection or subsubsection.
data['is_data_mentioned_in_section_title'] = 0
# Tip: while debugging, replace the range below with a short one such as
# range(1000) to process only the first rows.
for row in range(len(data)):
    xml_file = os.path.join(
        full_data_folder,                  # root of the full-text corpus
        data['issn'][row],                 # one sub-folder per journal
        data['unique_id'][row] + '.xml',   # one XML file per paper
    )
    parsed = postprocess_sections(extract_sections_and_text_from_xml(xml_file))

    # Collect the lower-cased titles of all three nesting levels, walking the
    # tree depth-first: section, then its subsections, then their children.
    lowered_titles = []
    for sec in parsed:
        lowered_titles.append(sec['title'].lower())
        for sub in sec['subsections']:
            lowered_titles.append(sub['title'].lower())
            for subsub in sub['subsubsections']:
                lowered_titles.append(subsub['title'].lower())

    mentioned = any('data' in title for title in lowered_titles)
    data.loc[row, 'is_data_mentioned_in_section_title'] = 1 if mentioned else 0
# Write the new column back to the metadata CSV (the file is overwritten).
data.to_csv(os.path.join(meta_data_folder, 'full-meta-dataset.csv'), index=False)

In [None]:
# Flag every paper in which the word "experiment" appears (case-insensitively)
# in the title of any section, subsection or subsubsection.
data['is_experiment_mentioned_in_section_title'] = 0
# Tip: while debugging, replace the range below with a short one such as
# range(1000) to process only the first rows.
for row in range(len(data)):
    xml_file = os.path.join(
        full_data_folder,                  # root of the full-text corpus
        data['issn'][row],                 # one sub-folder per journal
        data['unique_id'][row] + '.xml',   # one XML file per paper
    )
    parsed = postprocess_sections(extract_sections_and_text_from_xml(xml_file))

    # Collect the lower-cased titles of all three nesting levels, walking the
    # tree depth-first: section, then its subsections, then their children.
    lowered_titles = []
    for sec in parsed:
        lowered_titles.append(sec['title'].lower())
        for sub in sec['subsections']:
            lowered_titles.append(sub['title'].lower())
            for subsub in sub['subsubsections']:
                lowered_titles.append(subsub['title'].lower())

    mentioned = any('experiment' in title for title in lowered_titles)
    data.loc[row, 'is_experiment_mentioned_in_section_title'] = 1 if mentioned else 0
# Write the new column back to the metadata CSV (the file is overwritten).
data.to_csv(os.path.join(meta_data_folder, 'full-meta-dataset.csv'), index=False)

In [None]:
# Record, for every paper, whether its "data availability" section contains a
# link and how many distinct links it contains. The links themselves are
# collected in url_data and saved as JSON next to the metadata CSV.
import re  # imported explicitly instead of relying on `from utils import *` to expose it

data['is_link_in_avaiablity_statement'] = 0
data['num_of_links_in_avaiablity_statement'] = 0
url_pattern = r'(https?://\S+|www\.\S+)'  # URL pattern
url_regex = re.compile(url_pattern)  # compiled once; reused for every paper
url_data = []
# Tip: while debugging, replace the range below with a short one such as
# range(1000) to process only the first rows.
for i in range(len(data)):
    journal_path = os.path.join(full_data_folder, data['issn'][i])  # Path to the journal folder
    paper_path = os.path.join(journal_path, data['unique_id'][i] + '.xml')  # Path to the paper file
    sections = extract_sections_and_text_from_xml(paper_path)
    reorganized_sections = postprocess_sections(sections)
    for section in reorganized_sections:
        if 'data availability' not in section['title'].lower():
            continue
        section_text = section['text']
        # Cheap pre-filter: only statements mentioning "http" are treated as
        # containing a link (bare "www." links without "http" are skipped).
        if 'http' not in section_text:
            continue
        data.loc[i, 'is_link_in_avaiablity_statement'] = 1
        url = url_regex.findall(section_text)
        # De-duplicate while keeping first-seen order, so the saved JSON is
        # identical from run to run (set() ordering of strings is not stable).
        unique_url = list(dict.fromkeys(url))
        if url:
            print(url)
        url_data.append({
            'issn': data['issn'][i],
            # Fixed: the column is 'unique_id'; 'unique_idf' raised KeyError.
            'unique_id': data['unique_id'][i],
            'title': data['title'][i],
            'url': unique_url,
        })
        data.loc[i, 'num_of_links_in_avaiablity_statement'] = len(unique_url)
# Write the new columns back to the metadata CSV (the file is overwritten)
# and store the extracted links alongside it.
data.to_csv(os.path.join(meta_data_folder, 'full-meta-dataset.csv'), index=False)
save_json(url_data, os.path.join(meta_data_folder, 'url_data.json'))