### File Structure of the Dataset Folder
```
├── journal-meta/
│   ├── 0968-090X.csv         # TRC
│   ├── mini-dataset.csv      # A mini dataset for tutorial, extracted from TRC
│   └── ... (other journal CSV files)
├── journal-full-text/
│   ├── 0968-090X/
│   │   └── 10.1016_j.trc.2023.104311.xml
│   └── ... (other DOI folders)
```

In [1]:
import pandas as pd
import os
import re
import xml.etree.ElementTree as ET
# here replace the path to the folder of your downloaded dataset
full_text_folder = '/Users/junyi/Work/RR/rr-measure-dataset/journal-full-text'
meta_folder = '/Users/junyi/Work/RR/rr-measure-dataset/journal-meta'
# List all files in the folder with csv
journals = [f for f in os.listdir(meta_folder) if f.endswith('.csv')]
journal_issn_list = [['TRA','0965-8564'],
                     ['TRB','0191-2615'],
                     ['TRC','0968-090X'],
                     ['TRD','1361-9209'],
                     ['TRE','1366-5545'],
                     ['TRF','1369-8478'],
                     ['TRIP','2590-1982'],
                     ['mini-dataset','0968-090X']]
journal_issn_df = pd.DataFrame(journal_issn_list, columns=['journal','issn'])

In [2]:
# This section demonstrates how to work with the dataset, 
# utilizing the dataframe's apply method for efficient iteration in a loop.
# An example here is to clean up the abstract.
def cleanup_abstract(abstract):
    """
    Cleans up an abstract string by standardizing spacing.

    Args:
        abstract (str): The abstract of a journal article, which may contain irregular spacing,
                        including multiple spaces, leading spaces, or trailing spaces.

    Returns:
        str: A cleaned string where all excessive spaces are replaced with a single space,
             and any leading or trailing spaces are removed. This is essential for preparing
             text data for further analysis or display, ensuring uniformity in the formatting
             of abstracts.

    Example:
        >>> cleanup_abstract("  This  is   an example   abstract.  ")
        'This is an example abstract.'
    """
    # Check if the input is a string
    if not isinstance(abstract, str):
        raise ValueError("Input must be a string.")
    
    return re.sub(r'\s+', ' ', abstract).strip()
for journal in journals:
    if journal == 'mini-dataset.csv': # to take the mini dataset for tutorial
        # connect the journal name with the issn from the journal_issn_df
        journal_issn = journal_issn_df[journal_issn_df['journal'] == journal.replace('.csv', '')]['issn'].values[0]
        journal_meta = pd.read_csv(os.path.join(meta_folder, journal))
        journal_meta['issn'] = journal_issn
        journal_meta['abstract'] = journal_meta['abstract'].apply(cleanup_abstract) # to clean up the abstract
        journal_meta.to_csv(os.path.join(meta_folder, journal), index=False) # at the end, save the cleaned dataset

In [128]:
# This section demostrates how to link the doi with the full text
def doi_to_unique_id(doi):
    """
    Converts a DOI to a unique identifier by replacing slashes with underscores.

    Args:
        doi (str): The DOI of a journal article.

    Returns:
        str: A unique identifier where slashes are replaced with underscores.

    Example:
        >>> doi_to_unique_id("10.1016/j.trc.2023.104311")
        "10.1016_j.trc.2023_104311"
    """
    return doi.replace('/', '_')

import xml.etree.ElementTree as ET

def extract_sections_and_text_from_xml(file_path):
    """
    Extracts sections and text from an XML file.
    
    Args:
        file_path (str): The path to the XML file.

    Returns:
        list: A list of dictionaries, each containing the label, title, text, subsections, and subsubsections of a section.
    
    Example:
        >>> extract_sections_and_text_from_xml('/path/to/file.xml')
        [{'label': '1', 'title': 'Introduction', 'text': 'This is the introduction...', 'subsections': []}]
    """
    # Parse the XML file
    tree = ET.parse(file_path)
    root = tree.getroot()

    # Namespace to handle XML namespaces
    namespaces = {
        'xocs': 'http://www.elsevier.com/xml/xocs/dtd',
        'ce': 'http://www.elsevier.com/xml/common/dtd',
        'ja': 'http://www.elsevier.com/xml/ja/dtd',
        'mml': 'http://www.w3.org/1998/Math/MathML'
    }

    # Extracting the sections using the item-toc element
    sections = []
    for item in root.findall('.//xocs:item-toc-entry', namespaces):
        section_title = item.find('xocs:item-toc-section-title', namespaces)
        section_label = item.find('xocs:item-toc-label', namespaces)
        section_text = []
        
        # Use the section label to find the corresponding section id in <ce:section>
        if section_label is not None and section_title is not None:
            label_text = section_label.text.strip()
            section_elem = root.find(f".//ce:section[ce:label='{label_text}']", namespaces)
            if section_elem is not None:
                # Get all text under the section element, including paragraphs and other texts
                section_text_parts = []
                subsections = []
                before_subsection_text = True

                # Iterate over all elements within the section
                for elem in section_elem:
                    # Check if this element is a subsection
                    if elem.tag == f"{{{namespaces['ce']}}}section":
                        # This is a subsection, process it
                        subsection_title_elem = elem.find(f"ce:section-title", namespaces)
                        if subsection_title_elem is not None:
                            subsection_title = subsection_title_elem.text
                            subsection_paragraphs = []
                            subsubsections = []
                            
                            for sub_elem in elem:
                                # If this is a paragraph, append text
                                if sub_elem.tag == f"{{{namespaces['ce']}}}para":
                                    paragraph_text = ''.join(sub_elem.itertext())
                                    subsection_paragraphs.append(paragraph_text)
                                
                                # If this is a sub-subsection, process it
                                elif sub_elem.tag == f"{{{namespaces['ce']}}}section":
                                    subsubsection_title_elem = sub_elem.find(f"ce:section-title", namespaces)
                                    if subsubsection_title_elem is not None:
                                        subsubsection_title = subsubsection_title_elem.text
                                        subsubsection_paragraphs = []
                                        for subsub_elem in sub_elem.findall('ce:para', namespaces=namespaces):
                                            paragraph_text = ''.join(subsub_elem.itertext())
                                            subsubsection_paragraphs.append(paragraph_text)
                                        subsubsection_text = ' '.join(subsubsection_paragraphs)
                                        subsubsections.append({
                                            "label": sub_elem.find(f"ce:label", namespaces).text if sub_elem.find(f"ce:label", namespaces) is not None else "",
                                            "title": subsubsection_title,
                                            "text": subsubsection_text
                                        })
                            
                            subsection_text = ' '.join(subsection_paragraphs)
                            subsections.append({
                                "label": elem.find(f"ce:label", namespaces).text if elem.find(f"ce:label", namespaces) is not None else "",
                                "title": subsection_title,
                                "text": subsection_text,
                                "subsubsections": subsubsections
                            })
                    else:
                        # Collect text before any subsection starts
                        if before_subsection_text and elem.tag == f"{{{namespaces['ce']}}}para":
                            paragraph_text = ''.join(elem.itertext())
                            section_text_parts.append(paragraph_text)

                section_text = ' '.join(section_text_parts)
                
                sections.append({
                    "label": section_label.text,
                    "title": section_title.text,
                    "text": section_text,
                    "subsections": subsections
                })

    return sections

# Function to postprocess sections, subsections, and subsubsections
def postprocess_sections(data):
    """
    Postprocesses sections, subsections, and subsubsections by removing duplicate labels and ensuring unique content.

    Args:
        data (list): A list of dictionaries, each containing the label, title, text, subsections, and subsubsections of a section.
    
    Returns:
        list: A list of dictionaries, each containing the label, title, text, subsections, and subsubsections of a section.

    Example:
        >>> reorganized_sections = postprocess_sections(sections)
        # Save the reorganized sections to a JSON file
        import json
        # Define the file path for the output
        output_file_path = '../example.json'
        
        # Open the file in write mode and dump the data
        with open(output_file_path, 'w') as file:
            json.dump(reorganized_sections, file, indent=4)  # Added indentation for better readability
        
        for section in reorganized_sections:
            print(section['label'], section['title'])
            for subsection in section['subsections']:
                print("    ", subsection['label'], subsection['title'])
                for subsubsection in subsection['subsubsections']:
                    print("        ", subsubsection['label'], subsubsection['title'])
        # if you want to see the original sections, subsections, and subsubsections, you can use the following code
        # for section in sections:
        #     print(section['label'], section['title'])
        #     for subsection in section['subsections']:
        #         print("    ", subsection['label'], subsection['title'])
        #         for subsubsection in subsection['subsubsections']:
        #             print("        ", subsubsection['label'], subsubsection['title'])
    """
    reorganized_data = []
    
    labels_to_remove = set()
    
    for section in data:
        # Skip if the section is marked for removal
        if section["label"] in labels_to_remove:
            continue
        
        new_section = {
            "label": section["label"],
            "title": section["title"],
            "text": section["text"],
            "subsections": []
        }
        
        # Iterate through subsections to reorganize them
        for subsection in data:
            # Check if the subsection label starts with the section label and follows the x.x format
            if subsection["label"].startswith(section["label"] + ".") and len(subsection["label"].split('.')) == 2:
                new_subsection = {
                    "label": subsection["label"],
                    "title": subsection["title"],
                    "text": subsection["text"],
                    "subsubsections": []
                }
                labels_to_remove.add(subsection["label"])
                
                # Iterate through subsubsections to reorganize them under the appropriate subsection
                for subsubsection in data:
                    if subsubsection["label"].startswith(new_subsection["label"] + "."):
                        new_subsubsection = {
                            "label": subsubsection["label"],
                            "title": subsubsection["title"],
                            "text": subsubsection["text"]
                        }
                        labels_to_remove.add(subsubsection["label"])
                        new_subsection["subsubsections"].append(new_subsubsection)
                
                # Add the subsection only if it is unique or has no subsubsections
                if new_subsection["subsubsections"]:
                    # If subsubsections exist, avoid duplicate content
                    new_subsection["text"] = ""
                new_section["subsections"].append(new_subsection)
        
        reorganized_data.append(new_section)
    
    return reorganized_data

# search the "github.com" across all the text in all the sections, subsections, and subsubsections
# and extract the full github url, like https://github.com/username/repository
def extract_github_urls(text):
    # Regular expression to match GitHub URLs exactly without trailing directories
    github_url_pattern = r"https?://github\.com/[\w-]+/[\w-]+(?!/\S)"
    
    # Find all matching GitHub URLs in the text
    github_urls = re.findall(github_url_pattern, text)
    
    return github_urls

In [138]:
for journal in journals:
    if journal == 'mini-dataset.csv': # to take the mini dataset for tutorial
        journal_issn = journal_issn_df[journal_issn_df['journal'] == journal.replace('.csv', '')]['issn'].values[0]
        journal_meta = pd.read_csv(os.path.join(meta_folder, journal))
        journal_meta['unique_id'] = journal_meta['doi'].apply(doi_to_unique_id) # to convert the doi to a unique id
        # here we use the for loop to help understand how it works, it can be done in one line of code later
        for i in range(len(journal_meta)):
            github_urls = []
            fulltext_path = os.path.join(full_text_folder, journal_issn, journal_meta.iloc[i]['unique_id'] + '.xml')
            sections = extract_sections_and_text_from_xml(fulltext_path)
            reorganized_sections = postprocess_sections(sections)
            for section in reorganized_sections:
                # add a preprcessing for the text here to make it more readable
                urls  = extract_github_urls(cleanup_abstract(section['text']))
                if urls:
                    github_urls.extend(urls)
                for subsection in section['subsections']:
                    urls = github_urls.extend(extract_github_urls(cleanup_abstract(subsection['text'])))
                    if urls:
                        github_urls.extend(urls)
                    for subsubsection in subsection['subsubsections']:
                        urls = extract_github_urls(cleanup_abstract(subsubsection['text']))
                        if urls:
                            github_urls.extend(urls)
            if github_urls:
                print(github_urls)
# write the reorganized sections to a json file
import json
with open('example.json', 'w') as file:
    json.dump(reorganized_sections, file, indent=4)
            

['https://github.com/pabloguarda/isuelogit', 'https://github.com/pabloguarda/pesuelogit']
['https://github.com/zhandongxu/GP_RTAP']
['https://github.com/tjzxh/EADC']
['https://github.com/LehmannJonas/2E-MT-VRP-PTW-Instances']
['https://github.com/LiBiyue/MAST-GNN']
['https://github.com/HDDL/DPRDDM']
['https://github.com/xinychen/transdim']


In [125]:
for section in reorganized_sections:
    print(section['label'], section['title'])
    # print(cleanup_abstract(section['text']))
    # print('--------------------------------')
    for subsection in section['subsections']:
        print("    ", subsection['label'], subsection['title'])
        # print(cleanup_abstract(subsection['text']))
        # print('--------------------------------')
        for subsubsection in subsection['subsubsections']:
            print("        ", subsubsection['label'], subsubsection['title'])
            # print(cleanup_abstract(subsubsection['text']))
            # print('--------------------------------')
            break

1 Introduction
2 Preliminaries
     2.1 Notations
     2.2 Problem definition
3 Methodology
     3.1 Model description
     3.2 Computing the variable 
     3.3 Computing the variable 
     3.4 Solution algorithm
4 Experiments
     4.1 Traffic data sets
     4.2 Baseline imputation models
     4.3 Imputation results
         4.3.1 Evaluation on PeMS-4W and PeMS-8W Data
5 Conclusion and future directions
Appendix A Computing sources


In [140]:
for journal in journals:
    if journal == '0968-090X.csv': # to take the trc dataset for test
        journal_issn = journal.replace('.csv', '')
        # journal_issn = journal_issn_df[journal_issn_df['journal'] == journal.replace('.csv', '')]['issn'].values[0]
        journal_meta = pd.read_csv(os.path.join(meta_folder, journal))
        journal_meta['unique_id'] = journal_meta['doi'].apply(doi_to_unique_id) # to convert the doi to a unique id
        # here we use the for loop to help understand how it works, it can be done in one line of code later
        count = 0
        for i in range(len(journal_meta)):
            github_urls = []
            fulltext_path = os.path.join(full_text_folder, journal_issn, journal_meta.iloc[i]['unique_id'] + '.xml')
            sections = extract_sections_and_text_from_xml(fulltext_path)
            reorganized_sections = postprocess_sections(sections)
            for section in reorganized_sections:
                # add a preprcessing for the text here to make it more readable
                urls  = extract_github_urls(cleanup_abstract(section['text']))
                if urls:
                    github_urls.extend(urls)
                for subsection in section['subsections']:
                    urls = github_urls.extend(extract_github_urls(cleanup_abstract(subsection['text'])))
                    if urls:
                        github_urls.extend(urls)
                    for subsubsection in subsection['subsubsections']:
                        urls = extract_github_urls(cleanup_abstract(subsubsection['text']))
                        if urls:
                            github_urls.extend(urls)
            if github_urls:
                count += 1
                print(github_urls)
        print(f'{count}/{len(journal_meta)}')    

['https://github.com/lijunsun/bgcp_imputation']
['https://github.com/bstabler/TransportationNetwork']
['https://github.com/davidrey123/Ridesharing']
['https://github.com/FwDeng/ODPFM']
['https://github.com/DanqingZ/CPS_TRC']
['https://github.com/Lemma1/Multimodal-DUE']
['https://github.com/sysuits/BATF']
['https://github.com/junzis/acsmc']
['https://github.com/stasmix/popsynth']
['https://github.com/mbattifarano/mac-data']
['https://github.com/ZiyuanGu/network-bi-partitioning']
['https://github.com/adfriedm/WorkFunctionAlgorithm']
['https://github.com/optimatorlab/mFSTSP']
['https://github.com/rahulnair23/transfor-2019']
['https://github.com/LiTrans/BSMD']
['https://github.com/cjsyzwsh/ASU-DNN']
['https://github.com/jsebanaz90/TRC_2019_867-SupplementaryMaterials']
['https://github.com/wenbo-purdue-git/isttt-23-taxi-system-modeling-']
['https://github.com/xchChen/CACSP_ADMM']
['https://github.com/junzis/openap', 'https://github.com/tudelft-cns-atm/bluesky']
['https://github.com/DrKeHan/