### File Structure of the Dataset Folder
```
├── journal-meta/
│   ├── 0968-090X.csv         # TRC
│   ├── mini-dataset.csv      # A mini dataset for tutorial, extracted from TRC
│   └── ... (other journal CSV files)
├── journal-full-text/
│   ├── 0968-090X/
│   │   └── 10.1016_j.trc.2023.104311.xml
│   └── ... (other DOI folders)
```

In [1]:
import pandas as pd
import os
from tqdm import tqdm
from utils import *
full_data_folder = '/Users/junyi/Work/RR/rr-measure-dataset/journal-full-text'
meta_data_folder = '/Users/junyi/Work/RR/rr-measure-dataset/journal-meta'

In [2]:
import pandas as pd
import os
import re
import xml.etree.ElementTree as ET

In [3]:
# This section demonstrates how to work with the dataset, 
# utilizing the dataframe's apply method for efficient iteration in a loop.
# An example here is to clean up the abstract.
def cleanup_abstract(abstract):
    """
    Cleans up an abstract string by standardizing spacing.

    Args:
        abstract (str): The abstract of a journal article, which may contain irregular spacing,
                        including multiple spaces, leading spaces, or trailing spaces.

    Returns:
        str: A cleaned string where all excessive spaces are replaced with a single space,
             and any leading or trailing spaces are removed. This is essential for preparing
             text data for further analysis or display, ensuring uniformity in the formatting
             of abstracts.

    Example:
        >>> cleanup_abstract("  This  is   an example   abstract.  ")
        'This is an example abstract.'
    """
    # Check if the input is a string
    if not isinstance(abstract, str):
        raise ValueError("Input must be a string.")
    
    return re.sub(r'\s+', ' ', abstract).strip()
'''
for journal in journals:
    if journal == 'mini-dataset.csv': # to take the mini dataset for tutorial
        # connect the journal name with the issn from the journal_issn_df
        journal_issn = journal_issn_df[journal_issn_df['journal'] == journal.replace('.csv', '')]['issn'].values[0]
        journal_meta = pd.read_csv(os.path.join(meta_folder, journal))
        journal_meta['issn'] = journal_issn
        journal_meta['abstract'] = journal_meta['abstract'].apply(cleanup_abstract) # to clean up the abstract
        journal_meta.to_csv(os.path.join(meta_folder, journal), index=False) # at the end, save the cleaned dataset
'''

"\nfor journal in journals:\n    if journal == 'mini-dataset.csv': # to take the mini dataset for tutorial\n        # connect the journal name with the issn from the journal_issn_df\n        journal_issn = journal_issn_df[journal_issn_df['journal'] == journal.replace('.csv', '')]['issn'].values[0]\n        journal_meta = pd.read_csv(os.path.join(meta_folder, journal))\n        journal_meta['issn'] = journal_issn\n        journal_meta['abstract'] = journal_meta['abstract'].apply(cleanup_abstract) # to clean up the abstract\n        journal_meta.to_csv(os.path.join(meta_folder, journal), index=False) # at the end, save the cleaned dataset\n"

In [6]:
# This section demostrates how to link the doi with the full text
def doi_to_unique_id(doi):
    """
    Converts a DOI to a unique identifier by replacing slashes with underscores.

    Args:
        doi (str): The DOI of a journal article.

    Returns:
        str: A unique identifier where slashes are replaced with underscores.

    Example:
        >>> doi_to_unique_id("10.1016/j.trc.2023.104311")
        "10.1016_j.trc.2023_104311"
    """
    return doi.replace('/', '_')

import xml.etree.ElementTree as ET

def extract_sections_and_text_from_xml(file_path):
    """
    Extracts sections and text from an XML file.
    
    Args:
        file_path (str): The path to the XML file.

    Returns:
        list: A list of dictionaries, each containing the label, title, text, subsections, and subsubsections of a section.
    
    Example:
        >>> extract_sections_and_text_from_xml('/path/to/file.xml')
        [{'label': '1', 'title': 'Introduction', 'text': 'This is the introduction...', 'subsections': []}]
    """
    # Parse the XML file
    tree = ET.parse(file_path)
    root = tree.getroot()

    # Namespace to handle XML namespaces
    namespaces = {
        'xocs': 'http://www.elsevier.com/xml/xocs/dtd',
        'ce': 'http://www.elsevier.com/xml/common/dtd',
        'ja': 'http://www.elsevier.com/xml/ja/dtd',
        'mml': 'http://www.w3.org/1998/Math/MathML'
    }

    # Extracting the sections using the item-toc element
    sections = []
    for item in root.findall('.//xocs:item-toc-entry', namespaces):
        section_title = item.find('xocs:item-toc-section-title', namespaces)
        section_label = item.find('xocs:item-toc-label', namespaces)
        section_text = []
        
        # Use the section label to find the corresponding section id in <ce:section>
        if section_label is not None and section_title is not None:
            label_text = section_label.text.strip()
            section_elem = root.find(f".//ce:section[ce:label='{label_text}']", namespaces)
            if section_elem is not None:
                # Get all text under the section element, including paragraphs and other texts
                section_text_parts = []
                subsections = []
                before_subsection_text = True

                # Iterate over all elements within the section
                for elem in section_elem:
                    # Check if this element is a subsection
                    if elem.tag == f"{{{namespaces['ce']}}}section":
                        # This is a subsection, process it
                        subsection_title_elem = elem.find(f"ce:section-title", namespaces)
                        if subsection_title_elem is not None:
                            subsection_title = subsection_title_elem.text
                            subsection_paragraphs = []
                            subsubsections = []
                            
                            for sub_elem in elem:
                                # If this is a paragraph, append text
                                if sub_elem.tag == f"{{{namespaces['ce']}}}para":
                                    paragraph_text = ''.join(sub_elem.itertext())
                                    subsection_paragraphs.append(paragraph_text)
                                
                                # If this is a sub-subsection, process it
                                elif sub_elem.tag == f"{{{namespaces['ce']}}}section":
                                    subsubsection_title_elem = sub_elem.find(f"ce:section-title", namespaces)
                                    if subsubsection_title_elem is not None:
                                        subsubsection_title = subsubsection_title_elem.text
                                        subsubsection_paragraphs = []
                                        for subsub_elem in sub_elem.findall('ce:para', namespaces=namespaces):
                                            paragraph_text = ''.join(subsub_elem.itertext())
                                            subsubsection_paragraphs.append(paragraph_text)
                                        subsubsection_text = ' '.join(subsubsection_paragraphs)
                                        subsubsections.append({
                                            "label": sub_elem.find(f"ce:label", namespaces).text if sub_elem.find(f"ce:label", namespaces) is not None else "",
                                            "title": subsubsection_title,
                                            "text": subsubsection_text
                                        })
                            
                            subsection_text = ' '.join(subsection_paragraphs)
                            subsections.append({
                                "label": elem.find(f"ce:label", namespaces).text if elem.find(f"ce:label", namespaces) is not None else "",
                                "title": subsection_title,
                                "text": subsection_text,
                                "subsubsections": subsubsections
                            })
                    else:
                        # Collect text before any subsection starts
                        if before_subsection_text and elem.tag == f"{{{namespaces['ce']}}}para":
                            paragraph_text = ''.join(elem.itertext())
                            section_text_parts.append(paragraph_text)

                section_text = ' '.join(section_text_parts)
                
                sections.append({
                    "label": section_label.text,
                    "title": section_title.text,
                    "text": section_text,
                    "subsections": subsections
                })

    return sections

# Function to postprocess sections, subsections, and subsubsections
def postprocess_sections(data):
    """
    Postprocesses sections, subsections, and subsubsections by removing duplicate labels and ensuring unique content.

    Args:
        data (list): A list of dictionaries, each containing the label, title, text, subsections, and subsubsections of a section.
    
    Returns:
        list: A list of dictionaries, each containing the label, title, text, subsections, and subsubsections of a section.

    Example:
        >>> reorganized_sections = postprocess_sections(sections)
        # Save the reorganized sections to a JSON file
        import json
        # Define the file path for the output
        output_file_path = '../example.json'
        
        # Open the file in write mode and dump the data
        with open(output_file_path, 'w') as file:
            json.dump(reorganized_sections, file, indent=4)  # Added indentation for better readability
        
        for section in reorganized_sections:
            print(section['label'], section['title'])
            for subsection in section['subsections']:
                print("    ", subsection['label'], subsection['title'])
                for subsubsection in subsection['subsubsections']:
                    print("        ", subsubsection['label'], subsubsection['title'])
        # if you want to see the original sections, subsections, and subsubsections, you can use the following code
        # for section in sections:
        #     print(section['label'], section['title'])
        #     for subsection in section['subsections']:
        #         print("    ", subsection['label'], subsection['title'])
        #         for subsubsection in subsection['subsubsections']:
        #             print("        ", subsubsection['label'], subsubsection['title'])
    """
    reorganized_data = []
    
    labels_to_remove = set()
    
    for section in data:
        # Skip if the section is marked for removal
        if section["label"] in labels_to_remove:
            continue
        
        new_section = {
            "label": section["label"],
            "title": section["title"],
            "text": section["text"],
            "subsections": []
        }
        
        # Iterate through subsections to reorganize them
        for subsection in data:
            # Check if the subsection label starts with the section label and follows the x.x format
            if subsection["label"].startswith(section["label"] + ".") and len(subsection["label"].split('.')) == 2:
                new_subsection = {
                    "label": subsection["label"],
                    "title": subsection["title"],
                    "text": subsection["text"],
                    "subsubsections": []
                }
                labels_to_remove.add(subsection["label"])
                
                # Iterate through subsubsections to reorganize them under the appropriate subsection
                for subsubsection in data:
                    if subsubsection["label"].startswith(new_subsection["label"] + "."):
                        new_subsubsection = {
                            "label": subsubsection["label"],
                            "title": subsubsection["title"],
                            "text": subsubsection["text"]
                        }
                        labels_to_remove.add(subsubsection["label"])
                        new_subsection["subsubsections"].append(new_subsubsection)
                
                # Add the subsection only if it is unique or has no subsubsections
                if new_subsection["subsubsections"]:
                    # If subsubsections exist, avoid duplicate content
                    new_subsection["text"] = ""
                new_section["subsections"].append(new_subsection)
        
        reorganized_data.append(new_section)
    
    return reorganized_data

# search the "github.com" across all the text in all the sections, subsections, and subsubsections
# and extract the full github url, like https://github.com/username/repository
def extract_github_urls(text):
    # Regular expression to match GitHub URLs exactly without trailing directories
    github_url_pattern = r"https?://github\.com/[\w-]+/[\w-]+(?!/\S)"
    
    # Find all matching GitHub URLs in the text
    github_urls = re.findall(github_url_pattern, text)
    
    return github_urls

In [7]:
for journal in journals:
    if journal == 'mini-dataset.csv': # to take the mini dataset for tutorial
        journal_issn = journal_issn_df[journal_issn_df['journal'] == journal.replace('.csv', '')]['issn'].values[0]
        journal_meta = pd.read_csv(os.path.join(meta_folder, journal))
        journal_meta['unique_id'] = journal_meta['doi'].apply(doi_to_unique_id) # to convert the doi to a unique id
        # here we use the for loop to help understand how it works, it can be done in one line of code later
        for i in range(len(journal_meta)):
            github_urls = []
            fulltext_path = os.path.join(full_text_folder, journal_issn, journal_meta.iloc[i]['unique_id'] + '.xml')
            sections = extract_sections_and_text_from_xml(fulltext_path)
            reorganized_sections = postprocess_sections(sections)
            for section in reorganized_sections:
                # add a preprcessing for the text here to make it more readable
                urls  = extract_github_urls(cleanup_abstract(section['text']))
                if urls:
                    github_urls.extend(urls)
                for subsection in section['subsections']:
                    urls = github_urls.extend(extract_github_urls(cleanup_abstract(subsection['text'])))
                    if urls:
                        github_urls.extend(urls)
                    for subsubsection in subsection['subsubsections']:
                        urls = extract_github_urls(cleanup_abstract(subsubsection['text']))
                        if urls:
                            github_urls.extend(urls)
            if github_urls:
                print(github_urls)
                
# write the reorganized sections to a json file
import json
with open('example.json', 'w') as file:
    json.dump(reorganized_sections, file, indent=4)
            

NameError: name 'reorganized_sections' is not defined

In [9]:
for section in reorganized_sections:
    print(section['label'], section['title'])
    # print(cleanup_abstract(section['text']))
    # print('--------------------------------')
    for subsection in section['subsections']:
        print("    ", subsection['label'], subsection['title'])
        # print(cleanup_abstract(subsection['text']))
        # print('--------------------------------')
        for subsubsection in subsection['subsubsections']:
            print("        ", subsubsection['label'], subsubsection['title'])
            # print(cleanup_abstract(subsubsection['text']))
            # print('--------------------------------')
            break

1 Introduction
2 Related studies
     2.1 Data-driven traffic prediction
     2.2 Model-based traffic prediction
     2.3 Hybrid traffic prediction
3 The proposed model
     3.1 Model framework
     3.2 Data generation module
     3.3 Event detection module
         3.3.1 Global standard error range
     3.4 Traffic prediction module
         3.4.1 Sub-module 1: Base predictor using historical data
4 Experiments
     4.1 Study area
     4.2 Dataset construction for traffic prediction
         4.2.1 Historical traffic data
     4.3 Prediction setting
         4.3.1 Base predictor of sub-module 1
5 Experimental results
     5.1 Evaluation measurements
     5.2 Experiment analysis
6 Conclusion


In [10]:
for journal in journals:
    if journal == '0968-090X.csv': # to take the trc dataset for test
        journal_issn = journal.replace('.csv', '')
        # journal_issn = journal_issn_df[journal_issn_df['journal'] == journal.replace('.csv', '')]['issn'].values[0]
        journal_meta = pd.read_csv(os.path.join(meta_folder, journal))
        journal_meta['unique_id'] = journal_meta['doi'].apply(doi_to_unique_id) # to convert the doi to a unique id
        # here we use the for loop to help understand how it works, it can be done in one line of code later
        count = 0
        for i in range(len(journal_meta)):
            github_urls = []
            fulltext_path = os.path.join(full_text_folder, journal_issn, journal_meta.iloc[i]['unique_id'] + '.xml')
            sections = extract_sections_and_text_from_xml(fulltext_path)
            reorganized_sections = postprocess_sections(sections)
            for section in reorganized_sections:
                # add a preprcessing for the text here to make it more readable
                urls  = extract_github_urls(cleanup_abstract(section['text']))
                if urls:
                    github_urls.extend(urls)
                for subsection in section['subsections']:
                    urls = github_urls.extend(extract_github_urls(cleanup_abstract(subsection['text'])))
                    if urls:
                        github_urls.extend(urls)
                    for subsubsection in subsection['subsubsections']:
                        urls = extract_github_urls(cleanup_abstract(subsubsection['text']))
                        if urls:
                            github_urls.extend(urls)
            if github_urls:
                count += 1
                print(github_urls)
        print(f'{count}/{len(journal_meta)}')    

In [None]:
import os
import re

import pandas as pd
import spacy

# Load spaCy model
nlp = spacy.load("en_core_web_sm", disable=["parser", "attribute_ruler"])


def doi_to_unique_id(doi):
    """Convert DOI to a unique identifier."""
    return doi.replace('/', '_')


# Function to classify data acquisition type
def classify_acquisition_type(text):
    patterns = {
        "simulated": r"(simulation|simulated|model-based|virtual|synthetic data|emulated|agent-based modeling|"
                     r"traffic simulation|transport simulation|micro-simulation|macro-simulation|meso-simulation|"
                     r"behavioral modeling|route choice modeling|demand forecasting simulation|synthetic scenarios|GAN-generated)",
        "generated": r"(generated|created|constructed|fabricated|synthesized|computer-generated|"
                     r"algorithm-generated|machine-generated|data augmentation|automatically produced|"
                     r"stochastic modeling|synthetic generation|trained on synthetic|augmented data)",
        "existing": r"(existing|archived|previously collected|third-party|external|publicly available|real-world|"
                    r"historical data|field data|empirical data|open datasets|government data|observational data|"
                    r"collected from sensors|IoT data|crowdsourced data|published data|survey results|"
                    r"remote sensing|satellite data|real-time data)"
    }
    for acquisition_type, pattern in patterns.items():
        if re.search(pattern, text, re.IGNORECASE):
            return acquisition_type
    return None



# Function to classify data access type
def classify_access_type(text):
    patterns = {
        "open source": r"(open\s*source|publicly\s*available|shared\s*freely|free\s*access|freely\s*accessible|"
                       r"unrestricted access|hosted on GitHub|open repository|downloadable datasets)",
        "proprietary": r"(proprietary|restricted|commercial|license required|not open|private|subscription|confidential|"
                       r"requires authorization|data agreement|exclusive access)",
        "third-party": r"(third-party|external source|external provider|sourced externally|purchased data|licensed from|"
                        r"collaborative data|shared by partners|data obtained from agencies|"
                        r"industry-provided data|crowdsourced)"
    }
    for access_type, pattern in patterns.items():
        if re.search(pattern, text, re.IGNORECASE):
            return access_type
    return None



# Function to classify data type
def classify_data_type(text):
    patterns = {
        "open source": r"(open\s*source|publicly\s*available|shared\s*freely|free\s*access|freely\s*accessible|"
                       r"open\s*data|free\s*to\s*use|government\s*open\s*data|unrestricted\s*access|"
                       r"hosted\s*on\s*platforms\s*like\s*GitHub|open\s*repository|"
                       r"downloadable\s*without\s*restrictions|free\s*and\s*open\s*datasets)",
        "proprietary": r"(proprietary|restricted|commercial|license\s*required|not\s*open|private|confidential|"
                       r"internal\s*use\s*only|data\s*for\s*purchase|requires\s*payment|subscription\s*required|"
                       r"restricted\s*access|non-disclosure\s*agreement|paid\s*dataset|for\s*institutional\s*use)",
        "third-party": r"(third-party|external\s*source|external\s*provider|sourced\s*externally|"
                        r"licensed\s*from\s*another\s*organization|collaborative\s*data|"
                        r"acquired\s*from\s*partners|data\s*from\s*vendors|industry-provided\s*data|"
                        r"data\s*obtained\s*from\s*other\s*agencies|crowdsourced\s*data|shared\s*by\s*research\s*partners)"
    }
    for data_type, pattern in patterns.items():
        if re.search(pattern, text, re.IGNORECASE):
            return data_type
    return None


# Function to classify license type
def classify_license_type(text):
    patterns = {
        "Creative Commons": r"(Creative\s*Commons|CC\s*BY|CC\s*BY-SA|CC\s*BY-NC|CC\s*BY-ND|CC\s*BY-NC-SA|"
                            r"CC\s*BY-NC-ND|Creative\s*Commons\s*Zero|CC0|CC\s*Public\s*Domain\s*Dedication)",
        "MIT": r"(MIT\s*license|Massachusetts\s*Institute\s*of\s*Technology\s*license)",
        "Apache": r"(Apache\s*license|Apache\s*2\.0|Apache\s*Software\s*License)",
        "GNU": r"(GNU\s*GPL|General\s*Public\s*License|LGPL|Affero\s*GPL|GNU\s*AGPL|GPLv[23]|GNU\s*Lesser\s*GPL)",
        "BSD": r"(BSD\s*license|Berkeley\s*Software\s*Distribution|BSD-2-Clause|BSD-3-Clause|BSD\s*Zero-Clause)",
        "public domain": r"(public\s*domain|no\s*restrictions|publicly\s*available|dedicated\s*to\s*the\s*public\s*domain|"
                          r"free\s*for\s*use|no\s*rights\s*reserved|unrestricted\s*use)",
        "proprietary": r"(proprietary|restricted|all\s*rights\s*reserved|exclusive\s*rights|not\s*for\s*redistribution|"
                       r"internal\s*use\s*only|closed-source|private\s*license|requires\s*authorization|limited\s*license)",
        "custom": r"(custom\s*license|tailored\s*license|unique\s*licensing\s*terms|organization-specific\s*license|"
                  r"bespoke\s*license\s*terms|institutional\s*license)"
    }
    for license_type, pattern in patterns.items():
        if re.search(pattern, text, re.IGNORECASE):
            return license_type
    return None



# Function to check relevant sections
def is_relevant_section(title):
    """
    Determine if a section is relevant based on its title.
    """
    if not title:  # Handle None or empty title
        return False

    relevant_keywords = [
        "abstract", "introduction", "background", "overview",
        "data", "dataset", "data source", "data description", "data collection",
        "method", "methodology", "approach", "experimental design", "procedures",
        "results", "findings", "outcomes", "analysis", "discussion",
        "license", "licensing", "copyright", "terms of use", "data policy",
        "simulation", "modeling", "synthetic data", "generated data",
        "reproducibility", "replication", "validation", "verification",
        "tools", "frameworks", "software", "code", "implementation",
        "availability", "resources", "access", "sharing", "open source",
        "conclusion", "future work", "limitations", "summary"
    ]
    return any(keyword in title.lower() for keyword in relevant_keywords)

def extract_dataset_links(text):
    """
    Extract dataset links from text using regex.

    Args:
        text (str): Text content to process.

    Returns:
        list: A list of unique dataset links.
    """
    dataset_link_pattern = r"(https?://(?:www\.)?(?:[\w-]+\.)+(?:com|org|edu|io|gov|net)/[^\s]+(?:\.csv|\.json|\.xlsx|\.txt|\.zip|\.tar|\.h5|))"
    dataset_links = re.findall(dataset_link_pattern, text, re.IGNORECASE)
    return list(set(dataset_links))  # Remove duplicates


def extract_dataset_names(text):
    """
    Extract dataset names from text using regex and spaCy NER.

    Args:
        text (str): Text content to process.

    Returns:
        list: A list of unique dataset names.
    """
    dataset_name_patterns = [
        r"dataset\s*(?:named|called|known as|referred to as|is titled|entitled|used in this study as|termed)\s*[:\-]?\s*\"?(.*?)\"?\.?",
        r"(?:named|titled|referred to as|termed|entitled|used as)\s*\"(.*?)\"",
        r"(?:the dataset|data)\s*(?:is|was|has been)\s*(?:called|named|referred to as|known as|entitled|termed)\s*\"?(.*?)\"?\.?",
        r"(?:dataset|data)\s*(?:from|of|for)\s*(.*?)\s*(?:was|is|were|has been|collected)",
        r"datasets?\s*(?:used|analyzed|provided|sourced from|developed)\s*(.*?)\."
    ]

    dataset_names = []
    for pattern in dataset_name_patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        dataset_names.extend(matches)

    # Use spaCy NER for additional extraction
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ in {"PRODUCT", "WORK_OF_ART", "ORG"} and "dataset" in ent.text.lower():
            dataset_names.append(ent.text.strip())

    # Remove duplicates and clean trailing punctuations
    return list(set([name.strip().rstrip(".") for name in dataset_names]))



# Main extraction function
def extract_data_details_with_spacy(text):
    """Extract data-related information from text."""
    details = {
        "acquisition_type": classify_acquisition_type(text),
        "access_type": classify_access_type(text),
        "data_type": classify_data_type(text),
        "license": classify_license_type(text),
    }

    # Regex for dataset link and name
    dataset_link_pattern = r"(https?://[^\s]+|www\.[^\s]+|ftp://[^\s]+|doi\.org/[^\s]+)"
    dataset_name_pattern = r"dataset\s*(?:named|called|known as|referred to as)\s*[:\-]?\s*\"?(.*?)\"?\."

    # Extract dataset link
    dataset_link_match = re.search(dataset_link_pattern, text, re.IGNORECASE)
    if dataset_link_match:
        details["dataset_link"] = dataset_link_match.group(1)

    # Extract dataset name
    dataset_name_match = re.search(dataset_name_pattern, text, re.IGNORECASE)
    if dataset_name_match:
        details["dataset_name"] = dataset_name_match.group(1)

    return details


# Process the mini-dataset
journals = ['mini-dataset.csv']  # Example journal
meta_folder = '../'
full_text_folder = '../journal-full-text'
journal_issn_df = pd.DataFrame({'journal': ['mini-dataset'], 'issn': ['0968-090X']})  # Example ISSN DataFrame

detailed_info = []

for journal in journals:
    if journal == 'mini-dataset.csv':  # Adjust for larger datasets
        journal_issn = journal_issn_df[journal_issn_df['journal'] == journal.replace('.csv', '')]['issn'].values[0]
        journal_meta = pd.read_csv(os.path.join(meta_folder, journal))
        journal_meta['unique_id'] = journal_meta['doi'].apply(doi_to_unique_id)

        for i in range(len(journal_meta)):
            fulltext_path = os.path.join(full_text_folder, journal_issn, journal_meta.iloc[i]['unique_id'] + '.xml')
            if not os.path.exists(fulltext_path):
                continue

            sections = extract_sections_and_text_from_xml(fulltext_path)
            reorganized_sections = postprocess_sections(sections)

            paper_info = {"doi": journal_meta.iloc[i]['doi']}
            for section in reorganized_sections:
                if is_relevant_section(section.get('title', '')):
                    data_details = extract_data_details_with_spacy(section['text'])
                    paper_info.update(data_details)

            detailed_info.append(paper_info)

# Save results
detailed_info_df = pd.DataFrame(detailed_info)
detailed_info_df.to_csv('data_details.csv', index=False)

# Summarize results
def summarize_results(df):
    summary = {}
    total_papers = len(df)
    for column in df.columns:
        if column == "doi":
            continue
        value_counts = df[column].value_counts()
        percentages = (value_counts / total_papers * 100).round(2)
        summary[column] = pd.DataFrame({"Count": value_counts, "Percentage (%)": percentages})
    return summary


summary_tables = summarize_results(detailed_info_df)



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\BELKESSA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\BELKESSA\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [45]:
detailed_info_df

Unnamed: 0,doi,acquisition_type,access_type,data_type,license,dataset_link,dataset_name
0,10.1016/j.trc.2023.104451,simulated,,,,,
1,10.1016/j.trc.2023.104349,simulated,,,,,
2,10.1016/j.trc.2023.104427,existing,,,,,
3,10.1016/j.trc.2023.104453,simulated,,,,,
4,10.1016/j.trc.2023.104459,generated,,,,,
...,...,...,...,...,...,...,...
95,10.1016/j.trc.2023.104458,,,,,,
96,10.1016/j.trc.2024.104499,,,,,,
97,10.1016/j.trc.2024.104496,,,,,https://doi.org/10.1016/j.trc.2024.104496.,
98,10.1016/j.trc.2024.104494,,,,,,


In [46]:
for column, summary_df in summary_tables.items():
    print(f"Summary for {column}:\n{summary_df}\n")

Summary for acquisition_type:
                  Count  Percentage (%)
acquisition_type                       
existing             29            29.0
simulated            21            21.0
generated            12            12.0

Summary for access_type:
             Count  Percentage (%)
access_type                       
proprietary     11            11.0
third-party      2             2.0
open source      1             1.0

Summary for data_type:
             Count  Percentage (%)
data_type                         
proprietary     11            11.0
open source      1             1.0
third-party      1             1.0

Summary for license:
               Count  Percentage (%)
license                             
proprietary        2             2.0
public domain      1             1.0

Summary for dataset_link:
                                                    Count  Percentage (%)
dataset_link                                                             
https://en.wikipedia.org/