### File Structure of the Dataset Folder
```
├── journal-meta/
│   ├── 0968-090X.csv         # TRC
│   ├── mini-dataset.csv      # A mini dataset for tutorial, extracted from TRC
│   └── ... (other journal CSV files)
├── journal-full-text/
│   ├── 0968-090X/
│   │   └── 10.1016_j.trc.2023.104311.xml
│   └── ... (other DOI folders)
```

In [11]:
import pandas as pd
import os
import re
import xml.etree.ElementTree as ET
# here replace the path to the folder of your downloaded dataset
full_text_folder = '/Users/ruth/Downloads/rr-measure-basic/RR-measure-dataset/journal-full-text'
meta_folder = '/Users/ruth/Downloads/rr-measure-basic/RR-measure-dataset/journal-meta'
# List all files in the folder with csv
journals = [f for f in os.listdir(meta_folder) if f.endswith('.csv')]
journal_issn_list = [['TRA','0965-8564'],
                     ['TRB','0191-2615'],
                     ['TRC','0968-090X'],
                     ['TRD','1361-9209'],
                     ['TRE','1366-5545'],
                     ['TRF','1369-8478'],
                     ['TRIP','2590-1982'],
                     ['mini-dataset','0968-090X']]
journal_issn_df = pd.DataFrame(journal_issn_list, columns=['journal','issn'])
print(journal_issn_df)

        journal       issn
0           TRA  0965-8564
1           TRB  0191-2615
2           TRC  0968-090X
3           TRD  1361-9209
4           TRE  1366-5545
5           TRF  1369-8478
6          TRIP  2590-1982
7  mini-dataset  0968-090X


In [2]:
# This section demonstrates how to work with the dataset, 
# utilizing the dataframe's apply method for efficient iteration in a loop.
# An example here is to clean up the abstract.
def cleanup_abstract(abstract):
    """
    Cleans up an abstract string by standardizing spacing.

    Args:
        abstract (str): The abstract of a journal article, which may contain irregular spacing,
                        including multiple spaces, leading spaces, or trailing spaces.

    Returns:
        str: A cleaned string where all excessive spaces are replaced with a single space,
             and any leading or trailing spaces are removed. This is essential for preparing
             text data for further analysis or display, ensuring uniformity in the formatting
             of abstracts.

    Example:
        >>> cleanup_abstract("  This  is   an example   abstract.  ")
        'This is an example abstract.'
    """
    # Check if the input is a string
    if not isinstance(abstract, str):
        raise ValueError("Input must be a string.")
    
    return re.sub(r'\s+', ' ', abstract).strip()

for journal in journals:
    if journal == 'mini-dataset.csv': # to take the mini dataset for tutorial
        # connect the journal name with the issn from the journal_issn_df
        journal_issn = journal_issn_df[journal_issn_df['journal'] == journal.replace('.csv', '')]['issn'].values[0]
        journal_meta = pd.read_csv(os.path.join(meta_folder, journal))
        journal_meta['issn'] = journal_issn
        journal_meta['abstract'] = journal_meta['abstract'].apply(cleanup_abstract) # to clean up the abstract
        journal_meta.to_csv(os.path.join(meta_folder, journal), index=False) # at the end, save the cleaned dataset

In [3]:
# This section demostrates how to link the doi with the full text
def doi_to_unique_id(doi):
    """
    Converts a DOI to a unique identifier by replacing slashes with underscores.

    Args:
        doi (str): The DOI of a journal article.

    Returns:
        str: A unique identifier where slashes are replaced with underscores.

    Example:
        >>> doi_to_unique_id("10.1016/j.trc.2023.104311")
        "10.1016_j.trc.2023_104311"
    """
    return doi.replace('/', '_')

def extract_sections_and_text_from_xml(file_path):
    """
    Extracts sections and text from an XML file.
    
    Args:
        file_path (str): The path to the XML file.

    Returns:
        list: A list of dictionaries, each containing the label, title, text, and subsections of a section.
    
    Example:
        >>> extract_sections_and_text_from_xml('/path/to/file.xml')
        [{'label': 'Introduction', 'title': 'Introduction', 'text': 'This is the introduction...', 'subsections': []}]
    """
    # Parse the XML file
    tree = ET.parse(file_path)
    root = tree.getroot()

    # Namespace to handle XML namespaces
    namespaces = {
        'xocs': 'http://www.elsevier.com/xml/xocs/dtd',
        'ce': 'http://www.elsevier.com/xml/common/dtd',
        'ja': 'http://www.elsevier.com/xml/ja/dtd',
        'mml': 'http://www.w3.org/1998/Math/MathML'
    }

    # Extracting the sections using the item-toc element
    sections = []
    for item in root.findall('.//xocs:item-toc-entry', namespaces):
        section_title = item.find('xocs:item-toc-section-title', namespaces)
        section_label = item.find('xocs:item-toc-label', namespaces)
        section_text = []
        
        # Use the section label to find the corresponding section id in <ce:section>
        if section_label is not None:
            label_text = section_label.text.strip()
            section_elem = root.find(f".//ce:section[ce:label='{label_text}']", namespaces)
            if section_elem is not None:
                # Get all text under the section element, including paragraphs and other texts
                section_text_parts = []
                subsections = []
                before_subsection_text = True

                # Iterate over all elements within the section
                for elem in section_elem:
                    # Check if this element is a subsection
                    if elem.tag == f"{{{namespaces['ce']}}}section":
                        # This is a subsection, process it
                        subsection_title_elem = elem.find(f".//ce:section-title", namespaces)
                        if subsection_title_elem is not None:
                            subsection_title = subsection_title_elem.text
                            subsection_paragraphs = []
                            for sub_elem in elem.findall('.//ce:para', namespaces=namespaces):
                                # Append text, taking care of <ce:cross-ref> tags and <mml:math> tags within paragraphs
                                paragraph_text = ''.join(sub_elem.itertext())
                                subsection_paragraphs.append(paragraph_text)
                            subsection_text = ' '.join(subsection_paragraphs)
                            subsections.append({
                                "title": subsection_title,
                                "text": subsection_text
                            })
                    else:
                        # Collect text before any subsection starts
                        if before_subsection_text and elem.tag == f"{{{namespaces['ce']}}}para":
                            # Append text, taking care of <ce:cross-ref> tags and <mml:math> tags within paragraphs
                            paragraph_text = ''.join(elem.itertext())
                            section_text_parts.append(paragraph_text)

                section_text = ' '.join(section_text_parts)
                
                sections.append({
                    "label": section_label.text,
                    "title": section_title.text,
                    "text": section_text,
                    "subsections": subsections
                })

    return sections

for journal in journals:
    if journal == 'mini-dataset.csv': # to take the mini dataset for tutorial
        journal_issn = journal_issn_df[journal_issn_df['journal'] == journal.replace('.csv', '')]['issn'].values[0]
        journal_meta = pd.read_csv(os.path.join(meta_folder, journal))
        journal_meta['unique_id'] = journal_meta['doi'].apply(doi_to_unique_id) # to convert the doi to a unique id
        # here we use the for loop to help understand how it works, it can be done in one line of code later
        for i in range(len(journal_meta)):
            fulltext_path = os.path.join(full_text_folder, journal_issn, journal_meta.iloc[i]['unique_id'] + '.xml')
            sections = extract_sections_and_text_from_xml(fulltext_path)
            break

In [None]:
sections

In [35]:
import requests
import os
from urllib.parse import urlparse

params = {
    'q': "",
    'cx': "d59d166aa06584204",
    'key': os.environ["API_SEARCH_KEY"]
}

def is_github_repo(url):
    """Check if a given URL is a GitHub repository."""
    parsed = urlparse(url)
    
    path_parts = parsed.path.strip('/').split('/')
    
    # <username>/<repository_name>
    if len(path_parts) == 2:
        return True
    return False

def get_github_repo_link(paper_title, first_author):
    # Form the search query
    query = f"{paper_title}, {first_author}"

    params["q"] = query
    
    # Google Custom Search API endpoint
    url = "https://www.googleapis.com/customsearch/v1"
    
    try:
        # Send the request
        response = requests.get(url, params=params)
        response.raise_for_status()
        results = response.json()
        
        # Extract items from the response
        for item in results.get('items', []):
            link = item.get('link', '')
            # Check if the link points to a GitHub repository
            if is_github_repo(link) and not link.endswith(('.md', '/issues', '/pulls')):
                print(link)
                return link
        return "Not found"
    
    except requests.exceptions.RequestException as e:
        print(e)
        return f"{e}"
    
print(journals)

valid_2590_1982 = [
    "Decentralized network level adaptive signal control by multi-agent deep reinforcement learning",
    "What passengers really want: Assessing the value of rail innovation to improve experiences",
    "A systematic overview of transportation equity in terms of accessibility, traffic emissions, and safety outcomes: From conventional to emerging technologies",
    "Where to improve cycling infrastructure? Assessing bicycle suitability and bikeability with open data in the city of Paris",
]
valid_0965_8564=["The contradictions of bike-share benefits, purposes and outcomes","Value of demand information in autonomous mobility-on-demand systems","Driving aggressiveness management policy to enhance the performance of mixed traffic conditions in automated driving environments","The importance of user perspective in the evolution of MaaS","Do new bike share stations increase member use: A quasi-experimental study","How just is transportation justice theory? The issues of paternalism and production","Identifying commonly used and potentially unsafe transit transfers with crowdsourcing","Special Issue “Walking and Cycling for better Transport, Health and the Environment”","Increasing cycling for transportation in Canadian communities: Understanding what works","Associations between individual characteristics, availability of bicycle infrastructure, and city-wide safety perceptions of bicycling: A cross-sectional survey of bicyclists in 6 Canadian and U.S. cities","Impacts of an active travel intervention with a cycling focus in a suburban context: One-year findings from an evaluation of London's in-progress mini-Hollands programme","Exploring parental perceptions about school travel and walking school buses: A thematic analysis approach","A pooled RP/SP mode, route and destination choice model to investigate mode and user-type effects in the value of travel time savings","Transport poverty and subjective wellbeing","Why do people take e-scooter trips? Insights on temporal and spatial usage patterns of detailed trip data","Scale effects in ridesplitting: A case study of the City of Chicago","I can board, but I'd rather wait: Active boarding delay choice behaviour analysis using smart card data in metro systems","Working from self-driving cars"]
valid_0191_2615=["An epidemiological diffusion framework for vehicular messaging in general transportation networks","Bayesian estimation of mixed multinomial logit models: Advances and simulation-based evaluations","A hierarchical approach for splitting truck platoons near network discontinuities","Path-based dynamic pricing for vehicle allocation in ridesharing systems with fully compliant drivers","Editorial","A functional form with a physical meaning for the macroscopic fundamental diagram","Statistical inference of travelers’ route choice preferences with system-level data","The integer programing extreme value (IPEV) model: An application for estimation of the leisure trip demand"]
valid_1366_5545=["Real-time demand forecasting for an urban delivery platform","A branch-price-and-cut algorithm for the vehicle routing problem with release and due dates","Relief and stimulus in a cross-sector multi-product scarce resource supply chain network","Deep attention models with dimension-reduction and gate mechanisms for solving practical time-dependent vehicle routing problems"]
valid_1369_8478=["The impact of a child bike seat and trailer on the objective overtaking behaviour of motorized vehicles passing cyclists"]
valid_0968_090X=["Data-driven activity scheduler for agent-based mobility models","An artificial neural network based approach to investigate travellers’ decision rules","WRAP: An open-source kinematic aircraft performance model","Modeling competing free-floating carsharing operators – A case study for Zurich, Switzerland","A Bayesian tensor decomposition approach for spatiotemporal traffic data imputation","Deep reinforcement learning enabled self-learning control for energy efficient driving","Graph Markov network for traffic forecasting with missing data","Link-based traffic state estimation and prediction for arterial networks using license-plate recognition data","GE-GAN: A novel deep learning framework for road traffic state estimation","Differential variable speed limits control for freeway recurrent bottlenecks via deep actor-critic algorithm","Safe, efficient, and comfortable velocity control based on reinforcement learning for autonomous driving","A nonconvex low-rank tensor completion model for spatiotemporal traffic data imputation","Macroscopic parking dynamics modeling and optimal real-time pricing considering cruising-for-parking","Deep neural networks for choice analysis: Extracting complete economic information for interpretation","Stacked bidirectional and unidirectional LSTM recurrent neural network for forecasting network-wide traffic state with missing values","Forecast network-wide traffic states for multiple steps ahead: A deep learning approach considering dynamic non-local spatial correlation and non-stationary temporal dependency","Microsimulation of energy and flow effects from optimal automated driving in mixed traffic","A customized deep learning approach to integrate network-scale online traffic data imputation and prediction","Truck body type classification using a deep representation learning ensemble on 3D point sets","DDP-GCN: Multi-graph convolutional network for spatiotemporal traffic forecasting","A context-aware pedestrian trajectory prediction framework for automated vehicles","Processing, assessing, and enhancing the Waymo autonomous vehicle open dataset for driving behavior research","Traffic congestion propagation inference using dynamic Bayesian graph convolution network","Study of automated shuttle interactions in city traffic using surrogate measures of safety","Gaussian process latent class choice models","Long-term 4D trajectory prediction using generative adversarial networks","Routing battery-constrained delivery drones in a depot network: A business model and its optimization–simulation assessment","A novel one-stage approach for pointwise transportation mode identification inspired by point cloud processing","Virtual track networks: A hierarchical modeling framework and open-source tools for simplified and efficient connected and automated mobility (CAM) system design based on general modeling network specification (GMNS)","Mind the gap: Modelling difference between censored and uncensored electric vehicle charging demand","A physics-informed Transformer model for vehicle trajectory prediction on highways","A novel spatio-temporal generative inference network for predicting the long-term highway traffic speed","Deep trip generation with graph neural networks for bike sharing system expansion","OASIS: Optimisation-based Activity Scheduling with Integrated Simultaneous choice dimensions","I-24 MOTION: An instrument for freeway traffic science","Leveraging ride-hailing services for social good: Fleet optimal routing and system optimal pricing","Inferring vehicle spacing in urban traffic from trajectory data","Copula-based transferable models for synthetic population generation","The role of individual compensation and acceptance decisions in crowdsourced delivery"]
valid_1361_9209=["Special issue on “Urbanization, transportation and air quality in developing countries”","Unifying access","A century of evolution of the accessibility concept","Machine learning approach to ship fuel consumption: A case of container vessel","Level of traffic stress-based classification: A clustering approach for Bogotá, Colombia","A cycling-focused accessibility tool to support regional bike network connectivity","Examining equity in accessibility to bike share: A balanced floating catchment area approach","The 20-minute city: An equity analysis of Liverpool City Region"]

good_articles = valid_2590_1982+valid_0965_8564+valid_0191_2615+valid_1366_5545+valid_1369_8478+valid_0968_090X+valid_1361_9209

print(len(valid_2590_1982) + len(valid_0965_8564) + len(valid_0191_2615) + len(valid_1366_5545) + len(valid_1369_8478) + len(valid_0968_090X) + len(valid_1361_9209))
print(len(good_articles))

import time


for journal in journals:
    if journal != "mini-dataset.csv":
        githubs = []
        # connect the journal name with the issn from the journal_issn_df
        print(journal.replace('.csv', ''))
        print(journal_issn_df)
        journal_issn = journal_issn_df[journal_issn_df['issn'] == journal.replace('.csv', '')]['issn'].values[0]
        # print(journal_issn_df, journal.replace('.csv', ''))
        # print(journal_issn_df[journal_issn_df['issn'] == journal.replace('.csv', '')]['issn'].values)
        journal_meta = pd.read_csv(os.path.join(meta_folder, journal))

        journal_meta['issn'] = journal_issn
            
        for index, row in journal_meta.iterrows():
            # print(row['title'])
            # githubs.append('"'+f'''{row["title"].replace('"', '')}'''+'"')
            # print('"'+f'''{row["title"].replace('"', '')}'''+'"')
            if f'''{row["title"].replace('"', '')}''' in good_articles:
                # print('"'+f'''{row["title"].replace('"', '')}'''+'"')
                githubs.append(get_github_repo_link('"'+f'''{row["title"].replace('"', '')}'''+'"', "")) 
                time.sleep(1)
            else:
                # print("not found ", '"'+f'''{row["title"].replace('"', '')}'''+'"')
                githubs.append("Not found")

        journal_meta["githubs-scraped"] = githubs

        journal_meta.to_csv(os.path.join(meta_folder, "github-scraped-"+journal), index=False) # at the end, save the cleaned dataset

['2590-1982.csv', 'mini-dataset.csv', '0965-8564.csv', '0191-2615.csv', '1366-5545.csv', '1369-8478.csv', '0968-090X.csv', '1361-9209.csv']
82
82
2590-1982
        journal       issn
0           TRA  0965-8564
1           TRB  0191-2615
2           TRC  0968-090X
3           TRD  1361-9209
4           TRE  1366-5545
5           TRF  1369-8478
6          TRIP  2590-1982
7  mini-dataset  0968-090X
429 Client Error: Too Many Requests for url: https://www.googleapis.com/customsearch/v1?q=%22Decentralized+network+level+adaptive+signal+control+by+multi-agent+deep+reinforcement+learning%22%2C+&cx=d59d166aa06584204&key=AIzaSyDBYqpe-DnMlUA8oVd_le_kXMQVLUxA3W8
429 Client Error: Too Many Requests for url: https://www.googleapis.com/customsearch/v1?q=%22What+passengers+really+want%3A+Assessing+the+value+of+rail+innovation+to+improve+experiences%22%2C+&cx=d59d166aa06584204&key=AIzaSyDBYqpe-DnMlUA8oVd_le_kXMQVLUxA3W8
429 Client Error: Too Many Requests for url: https://www.googleapis.com/customsearc

KeyboardInterrupt: 

In [30]:
valid_2590_1982 = [
    "Decentralized network level adaptive signal control by multi-agent deep reinforcement learning",
    "What passengers really want: Assessing the value of rail innovation to improve experiences",
    "A systematic overview of transportation equity in terms of accessibility, traffic emissions, and safety outcomes: From conventional to emerging technologies",
    "Where to improve cycling infrastructure? Assessing bicycle suitability and bikeability with open data in the city of Paris",
]
valid_0965_8564=["The contradictions of bike-share benefits, purposes and outcomes","Value of demand information in autonomous mobility-on-demand systems","Driving aggressiveness management policy to enhance the performance of mixed traffic conditions in automated driving environments","The importance of user perspective in the evolution of MaaS","Do new bike share stations increase member use: A quasi-experimental study","How just is transportation justice theory? The issues of paternalism and production","Identifying commonly used and potentially unsafe transit transfers with crowdsourcing","Special Issue “Walking and Cycling for better Transport, Health and the Environment”","Increasing cycling for transportation in Canadian communities: Understanding what works","Associations between individual characteristics, availability of bicycle infrastructure, and city-wide safety perceptions of bicycling: A cross-sectional survey of bicyclists in 6 Canadian and U.S. cities","Impacts of an active travel intervention with a cycling focus in a suburban context: One-year findings from an evaluation of London's in-progress mini-Hollands programme","Exploring parental perceptions about school travel and walking school buses: A thematic analysis approach","A pooled RP/SP mode, route and destination choice model to investigate mode and user-type effects in the value of travel time savings","Transport poverty and subjective wellbeing","Why do people take e-scooter trips? Insights on temporal and spatial usage patterns of detailed trip data","Scale effects in ridesplitting: A case study of the City of Chicago","I can board, but I'd rather wait: Active boarding delay choice behaviour analysis using smart card data in metro systems","Working from self-driving cars"]
valid_0191_2615=["An epidemiological diffusion framework for vehicular messaging in general transportation networks","Bayesian estimation of mixed multinomial logit models: Advances and simulation-based evaluations","A hierarchical approach for splitting truck platoons near network discontinuities","Path-based dynamic pricing for vehicle allocation in ridesharing systems with fully compliant drivers","Editorial","A functional form with a physical meaning for the macroscopic fundamental diagram","Statistical inference of travelers’ route choice preferences with system-level data","The integer programing extreme value (IPEV) model: An application for estimation of the leisure trip demand"]
valid_1366_5545=["Real-time demand forecasting for an urban delivery platform","A branch-price-and-cut algorithm for the vehicle routing problem with release and due dates","Relief and stimulus in a cross-sector multi-product scarce resource supply chain network","Deep attention models with dimension-reduction and gate mechanisms for solving practical time-dependent vehicle routing problems"]
valid_1369_8478=["The impact of a child bike seat and trailer on the objective overtaking behaviour of motorized vehicles passing cyclists"]
valid_0968_090X=["Data-driven activity scheduler for agent-based mobility models","An artificial neural network based approach to investigate travellers’ decision rules","WRAP: An open-source kinematic aircraft performance model","Modeling competing free-floating carsharing operators – A case study for Zurich, Switzerland","A Bayesian tensor decomposition approach for spatiotemporal traffic data imputation","Deep reinforcement learning enabled self-learning control for energy efficient driving","Graph Markov network for traffic forecasting with missing data","Link-based traffic state estimation and prediction for arterial networks using license-plate recognition data","GE-GAN: A novel deep learning framework for road traffic state estimation","Differential variable speed limits control for freeway recurrent bottlenecks via deep actor-critic algorithm","Safe, efficient, and comfortable velocity control based on reinforcement learning for autonomous driving","A nonconvex low-rank tensor completion model for spatiotemporal traffic data imputation","Macroscopic parking dynamics modeling and optimal real-time pricing considering cruising-for-parking","Deep neural networks for choice analysis: Extracting complete economic information for interpretation","Stacked bidirectional and unidirectional LSTM recurrent neural network for forecasting network-wide traffic state with missing values","Forecast network-wide traffic states for multiple steps ahead: A deep learning approach considering dynamic non-local spatial correlation and non-stationary temporal dependency","Microsimulation of energy and flow effects from optimal automated driving in mixed traffic","A customized deep learning approach to integrate network-scale online traffic data imputation and prediction","Truck body type classification using a deep representation learning ensemble on 3D point sets","DDP-GCN: Multi-graph convolutional network for spatiotemporal traffic forecasting","A context-aware pedestrian trajectory prediction framework for automated vehicles","Processing, assessing, and enhancing the Waymo autonomous vehicle open dataset for driving behavior research","Traffic congestion propagation inference using dynamic Bayesian graph convolution network","Study of automated shuttle interactions in city traffic using surrogate measures of safety","Gaussian process latent class choice models","Long-term 4D trajectory prediction using generative adversarial networks","Routing battery-constrained delivery drones in a depot network: A business model and its optimization–simulation assessment","A novel one-stage approach for pointwise transportation mode identification inspired by point cloud processing","Virtual track networks: A hierarchical modeling framework and open-source tools for simplified and efficient connected and automated mobility (CAM) system design based on general modeling network specification (GMNS)","Mind the gap: Modelling difference between censored and uncensored electric vehicle charging demand","A physics-informed Transformer model for vehicle trajectory prediction on highways","A novel spatio-temporal generative inference network for predicting the long-term highway traffic speed","Deep trip generation with graph neural networks for bike sharing system expansion","OASIS: Optimisation-based Activity Scheduling with Integrated Simultaneous choice dimensions","I-24 MOTION: An instrument for freeway traffic science","Leveraging ride-hailing services for social good: Fleet optimal routing and system optimal pricing","Inferring vehicle spacing in urban traffic from trajectory data","Copula-based transferable models for synthetic population generation","The role of individual compensation and acceptance decisions in crowdsourced delivery"]
valid_1361_9209=["Special issue on “Urbanization, transportation and air quality in developing countries”","Unifying access","A century of evolution of the accessibility concept","Machine learning approach to ship fuel consumption: A case of container vessel","Level of traffic stress-based classification: A clustering approach for Bogotá, Colombia","A cycling-focused accessibility tool to support regional bike network connectivity","Examining equity in accessibility to bike share: A balanced floating catchment area approach","The 20-minute city: An equity analysis of Liverpool City Region"]

good_articles = valid_2590_1982+valid_0965_8564+valid_0191_2615+valid_1366_5545+valid_1369_8478+valid_0968_090X+valid_1361_9209

print(len(valid_2590_1982) + len(valid_0965_8564) + len(valid_0191_2615) + len(valid_1366_5545) + len(valid_1369_8478) + len(valid_0968_090X) + len(valid_1361_9209))
print(len(good_articles))

82
82


In [17]:
get_github_repo_link("Estimating network flow and travel behavior using day-to-day system-level data: A computational graph approach", "")

Estimating network flow and travel behavior using day-to-day system-level data: A computational graph approach


'https://github.com/pabloguarda/pesuelogit'