### File Structure of the Dataset Folder
```
├── journal-meta/
│   ├── 0968-090X.csv         # TRC
│   ├── mini-dataset.csv      # A mini dataset for tutorial, extracted from TRC
│   ├── journal-meta-dataset.csv   # the combined dataset for all journals
│   ├── github_data.json      # the links for the GitHub repository
│   ├── url_data.json         # the links for the data avaiablity urls
│   └── ... (other journal CSV files)
├── journal-full-text/
│   ├── 0968-090X/
│   │   └── 10.1016_j.trc.2023.104311.xml
│   └── ... (other DOI folders)
```

In [1]:
import pandas as pd
import os
from tqdm import tqdm
from utils import *
# replace it with your downloaded folder path
full_data_folder = '/Users/junyi/Work/RR/rr-measure-dataset/journal-full-text'
meta_data_folder = '/Users/junyi/Work/RR/rr-measure-dataset/journal-meta'
result_data_folder = '/Users/junyi/Work/RR/rr-measure-dataset/journal-results'

In [2]:
data = pd.read_csv(os.path.join(meta_data_folder, 'full-meta-dataset.csv'))
data.columns

Index(['title', 'doi', 'volume', 'date', 'year', 'month', 'abstract', 'issn',
       'journal_name', 'unique_id', 'is_github', 'num_of_github_urls',
       'is_availablity_statement', 'is_data_mentioned_in_section_title',
       'is_experiment_mentioned_in_section_title',
       'is_link_in_avaiablity_statement',
       'num_of_links_in_avaiablity_statement'],
      dtype='object')

In [4]:
data['is_availability_statement'] = 0
for i in range(len(data)):
# # In case for the mini test to debug the code
# for i in tqdm(range(1000)):
    journal_path = os.path.join(full_data_folder, data['issn'][i])  # Path to the journal folder
    paper_path = os.path.join(journal_path, data['unique_id'][i] + '.xml')  # Path to the paper folder
    sections = extract_sections_and_text_from_xml(paper_path)
    reorganized_sections = postprocess_sections(sections)
    for section in reorganized_sections:
        section_title = section['title']
        if 'data availability' in section_title.lower():
            data.loc[i, 'is_availability_statement'] = 1

In [16]:
import openai
import yaml
import json
# Load API key from config.yaml
from openai import OpenAI
with open("/Users/junyi/Work/RR/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)
openai.api_key = config["openai_api_key"]
client = OpenAI(api_key=config["openai_api_key"])
def analyze_with_openai(data_context):
    definition_context = """
                        ---------------------------
                        Definition of data source:
                        Real-world data is the data is collected from the real-world, such as data from sensors, surveys, or other sources.
                        Simulation data is the data generated from simulation or synthetic data, even though the scenario is based on real-world.
                        """
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "user",
                "content": data_context + definition_context
            }
        ],
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "data_source_description",
                "strict": True,
                "schema": {
                    "type": "object",
                    "properties": {
                        "source_description": {
                            "type": "string",
                            "description": "Description of the data source."
                        },
                        "real_world": {
                            "type": "boolean",
                            "description": "Is the data collected from real-world based on the definition?"
                        },
                        "simulation": {
                            "type": "boolean",
                            "description": "Is the data collected from simulation or synthetic data based on the definition?"
                        },
                        "details": {
                            "type": "object",
                            "properties": {
                                "dataset_size_description": {
                                    "type": "boolean",
                                    "description": "Indicates whether there is a description of the dataset size."
                                },
                                "data_collection_description": {
                                    "type": "boolean",
                                    "description": "Indicates whether there is a description of data collection."
                                },
                                "size_decription_detail": {
                                    "type": "string",
                                    "description": "Description of the dataset size."
                                },
                                "data_collection_detail": {
                                    "type": "string",
                                    "description": "Description of the data collection."
                                }
                            },
                            "required": [
                                "dataset_size_description",
                                "data_collection_description",
                                "size_decription_detail",
                                "data_collection_detail"
                            ],
                            "additionalProperties": False
                        }
                    },
                    "required": [
                        "source_description",
                        "real_world",
                        "simulation",
                        "details"
                    ],
                    "additionalProperties": False
                }
            }
        },
        temperature=0,
        max_completion_tokens=16383,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    return response.choices[0].message.content

In [15]:
import os
from tqdm import tqdm
data['source_description'] = ''
data['real_world'] = ''
data['simulation'] = ''
data['dataset_size_description'] = ''
data['data_collection_description'] = ''
data['is_data_mentioned'] = 0
if not os.path.exists(result_data_folder + '/data-description'):
    os.makedirs(result_data_folder + '/data-description')
for i in tqdm(range(len(data))):
# for i in tqdm(range(10)):
    # Initialize data context
    data_context = ''
    label = 0
    try:
        # Construct paths for journal and paper
        journal_path = os.path.join(full_data_folder, data['issn'][i])
        paper_path = os.path.join(journal_path, f"{data['unique_id'][i]}.xml")
        
        # get the abstract from the xml
        abstract = extract_abstract_from_xml(paper_path)
        if abstract:
            # if "data" in abstract.lower():
            if "data" in abstract.lower():
                label = 1
        data_context += abstract
        # Extract and process sections from the paper XML
        sections = extract_sections_and_text_from_xml(paper_path)
        reorganized_sections = postprocess_sections(sections)

        # Traverse the sections and collect data-related text
        for section in reorganized_sections:
            if 'data' in section['title'].lower():
                data_context += section['text']
                label = 1
            for subsection in section.get('subsections', []):
                if 'data' in subsection['title'].lower():
                    data_context += subsection['text']
                    label = 1
                for subsubsection in subsection.get('subsubsections', []):
                    if 'data' in subsubsection['title'].lower():
                        data_context += subsubsection['text']
                        label = 1
    except KeyError as e:
        print(f"Missing key in data: {e}")
    except FileNotFoundError as e:
        print(f"File not found: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")
    if len(data_context)>0:
        response = analyze_with_openai(data_context)
        # print(data['title'][i])
        # print(response)
        response = json.loads(response)
        data.loc[i, 'source_description'] = response['source_description']
        data.loc[i, 'real_world'] = response['real_world']
        data.loc[i, 'simulation'] = response['simulation']
        data.loc[i, 'dataset_size_description'] = response['details']['dataset_size_description']
        data.loc[i, 'data_collection_description'] = response['details']['data_collection_description']
        data.loc[i, 'is_data_mentioned'] = label
        print(response['source_description'])
        # combine the response with the data context as json file
        data_description = {
            "data_context": data_context,
            "data_source_description": response
        }
        save_json(data_description, os.path.join(result_data_folder + '/data-description', f"{data['unique_id'][i]}.json"))
    else:
        print("No data context found")  
data.to_csv(os.path.join(full_data_folder, 'full-meta-dataset-data-descriptive.csv'), index=False)

  0%|          | 1/10990 [00:01<4:39:21,  1.53s/it]

The data source for this study consists of simulated traffic conditions that emulate real-world scenarios in a suburban traffic corridor, specifically designed to evaluate the performance of the proposed deep reinforcement learning algorithm for adaptive traffic signal control.


  0%|          | 2/10990 [00:02<4:29:36,  1.47s/it]

The data source is a longitudinal online survey conducted as part of the PASTA project, which aimed to understand the determinants of active mobility and its health impacts across seven European cities.


  0%|          | 3/10990 [00:04<4:26:09,  1.45s/it]

This study examines the critical components of China's aviation system, including airports, airlines, and air traffic management, through a review of academic literature, official reports, and news articles.


  0%|          | 4/10990 [00:06<4:38:24,  1.52s/it]

The data source for this research paper includes empirical data collected through face-to-face interviews, literature reviews, and secondary data analysis from various reports, articles, and statistics related to vocational education and training (VET) in the STEM workforce, particularly focusing on women's integration in the Japanese transport sector.


  0%|          | 5/10990 [00:07<4:39:53,  1.53s/it]

The data source for this study is derived from simulation outputs generated by the TSIS-CORSIM model, which simulates traffic conditions and control systems in urban areas. The data includes various measures of effectiveness (MOEs) related to traffic management parameters under different congestion scenarios.
No data context found


  0%|          | 7/10990 [00:08<3:11:52,  1.05s/it]

The data source consists of survey responses collected from train passengers in the UK, focusing on their preferences for technological innovations aimed at improving the rail journey experience.


  0%|          | 8/10990 [00:11<4:24:17,  1.44s/it]

The data source for this study includes a combination of real-world survey data, operational data from an e-hailing service provider, and simulated traffic network data. The primary dataset is the 2007 Origin Destination Survey (OD07) conducted by the São Paulo subway company, which provides detailed information on travel patterns in the São Paulo Metropolitan Region. Additionally, operational data from the e-hailing company 99 and traffic simulation data from TTC are utilized to analyze the impacts of e-hailing services.


  0%|          | 9/10990 [00:12<4:28:42,  1.47s/it]

The data source consists of recall accuracy scores and participant ratings collected from a study involving real-world driving experiences, where participants drove familiar routes and were later tested on their recall of details from those drives using various auditory and visual cues.


  0%|          | 10/10990 [00:14<4:21:33,  1.43s/it]

The data source consists of performance measures collected from various transit agencies across Michigan, assessing the quality of transit services provided in urban, semi-urban, and rural areas.


  0%|          | 11/10990 [00:15<4:03:37,  1.33s/it]

The data source consists of physiological measurements and subjective stress assessments collected from police officers performing traffic duties in real-world environments, specifically in Quebec City and Montreal, Canada.


  0%|          | 12/10990 [00:16<3:58:50,  1.31s/it]

The data source consists of empirical and theoretical insights drawn from various disciplines including education, organizational development, human resources, environmental sciences, and business strategy, focusing on learning processes that contribute to strategic capacity building in transportation planning.


  0%|          | 13/10990 [00:18<4:36:05,  1.51s/it]

The data source consists of real-world data collected from various sources to evaluate the effectiveness of the automated speed enforcement camera project along the Friendship Highway in Khon Kaen, Thailand. This includes vehicle speed measurements, accident records, and speed ticket issuance data.


  0%|          | 14/10990 [00:19<4:34:09,  1.50s/it]

The data source consists of life cycle assessment models developed to quantify the energy output and emissions of transportation alternatives in Qatar, specifically comparing private automobiles and the Doha Metro Project.


  0%|          | 15/10990 [00:21<4:32:28,  1.49s/it]

The data source consists of real-time incident reports from the Waze mobile navigation application, which allows users to report various roadway incidents such as crashes, disabled vehicles, and traffic conditions. This data was evaluated against ground truth evidence from traffic cameras to assess its accuracy.


  0%|          | 16/10990 [00:22<4:23:39,  1.44s/it]

The data source consists of panel data reflecting the development of the air logistics industry in the Beijing-Tianjin-Hebei region from 2005 to 2014, used to analyze the coordination degree of the industry.


  0%|          | 17/10990 [00:24<4:17:47,  1.41s/it]



  0%|          | 18/10990 [00:25<4:30:24,  1.48s/it]

The data source consists of qualitative data collected from interviews with practitioners involved in the planning and design of cycle highways across five European countries. This data provides insights into the definitions and conceptualizations of cycle highways and their impact on cycling experiences.


  0%|          | 19/10990 [00:27<4:38:57,  1.53s/it]

This study utilizes real-world data collected through remote-sensing, geographical information systems (GIS), field observations, and in-depth interviews to analyze the spatial distribution of informal e-bike taxi services in Shenzhen, China.


  0%|          | 20/10990 [00:28<4:25:21,  1.45s/it]

The data source consists of survey responses collected from individuals in the US regarding their behaviors and preferences related to crowd-shipping (CS) as driver-partners and requesters in the logistics market.


  0%|          | 21/10990 [00:29<4:11:33,  1.38s/it]

The data source for this study is based on an intercept survey conducted at Capital Bikeshare (CaBi) stations, collecting real-world user responses regarding their sensitivity to price changes and preferences for bikeshare services.


  0%|          | 22/10990 [00:32<4:58:05,  1.63s/it]

The dataset comprises public domain data on flight delays in Australia's domestic aviation market, covering 21 major routes linking state capitals from January 2004 to December 2015. It includes information from various airlines such as Qantas, Jetstar, and Virgin, focusing on the number of delayed flights relative to scheduled flights.


  0%|          | 23/10990 [00:33<4:53:01,  1.60s/it]

This study uses semi-structured interviews with experts from academia, industry, and government in the United States to explore the impacts of connected and automated vehicles (C/AVs) on active travel, specifically walking and cycling. The interviews aim to identify potential synergies and conflicts between C/AVs and active travel, as well as planning and policy priorities for integrating these technologies into transportation networks.


  0%|          | 24/10990 [00:34<4:36:14,  1.51s/it]

The data source for this paper includes real-world data on tax revenues from fuel and vehicle taxation in OECD countries, as well as data on transportation systems and the implications of ACES on public finance.


  0%|          | 25/10990 [00:35<4:12:46,  1.38s/it]

The data source consists of empirical traffic data collected over five years from loop detectors and Bluetooth sensors on an arterial road in Fort Myers, Florida, specifically during sporting events.


  0%|          | 26/10990 [00:36<3:52:48,  1.27s/it]

The data source consists of qualitative interviews conducted with city dwellers from Singapore and London to understand their transportation choices and experiences in relation to their respective city cultures and transport policies.


  0%|          | 27/10990 [00:38<4:03:40,  1.33s/it]

The data source consists of self-reported survey data collected from riders of various types of two-wheelers in Shanghai, including e-bikes, traditional bicycles, e-scooters, and motorized two-wheelers. The study focuses on the frequency of risky riding behaviors among these groups.


  0%|          | 28/10990 [00:39<4:06:17,  1.35s/it]

The data source consists of aggregate cycling data collected from the Strava app, which includes information on bicycle trips in Dresden, Germany. This data is used to analyze cycling behavior and route choice models.


  0%|          | 29/10990 [00:41<4:05:35,  1.34s/it]

The data source for this study includes qualitative interviews, office attendance data, and travel survey data collected by Transport for London (TfL) before and after the London 2012 Olympics, focusing on travel behaviour changes due to the TDM programme.


  0%|          | 30/10990 [00:42<4:12:01,  1.38s/it]

The data source consists of crash data obtained from the Critical Analysis Reporting Environment (CARE), developed by the Center for Advanced Public Safety (CAPS) at the University of Alabama. This dataset includes fatal and incapacitating injury crashes in Alabama from 2009 to 2013, focusing on driver characteristics and regional attributes to analyze crash outcomes.


  0%|          | 31/10990 [00:44<4:19:11,  1.42s/it]

The data source for this study consists of real-world traffic crash data collected from two and four-lane urban and rural highways, focusing on the effects of geometric, environmental, and demographic factors on crash occurrences.


  0%|          | 32/10990 [00:45<4:19:05,  1.42s/it]

The data source consists of real-world urbanization and motorization indicators collected from 287 Chinese prefectural cities over a period from 2001 to 2014, along with qualitative transportation policy documents from selected cities representing distinct clusters identified through time-series clustering analysis.


  0%|          | 33/10990 [00:46<4:03:14,  1.33s/it]

The data source for this study is derived from a controlled high-fidelity driving simulator experiment designed to replicate real driving conditions and assess driver responses to sudden unintended acceleration (SUA) events.


  0%|          | 34/10990 [00:50<6:40:19,  2.19s/it]

The data source consists of weather information obtained from the National Oceanic and Atmospheric Administration (NOAA) Rapid Refresh Products (RAP) weather files, which provide a three-dimensional grid of atmospheric conditions over the continental United States (CONUS). This data is used to analyze the potential for mitigating contrail formation by adjusting flight levels of aircraft in relation to ice super saturated (ISS) regions.


  0%|          | 35/10990 [00:52<6:31:37,  2.14s/it]

The data source consists of traffic crash reports collected from various traffic offices in the Kurdistan Region of Iraq (KRI), including reports from the General Directorate of Traffic, Forensic Medical Institute, and KRSO. These reports include information about victims, drivers, vehicles, crash locations, and conditions, but are noted to be rudimentary compared to developed countries.


  0%|          | 36/10990 [00:54<5:47:32,  1.90s/it]

The data source consists of a unique compilation of CEO profiles from the world's largest 100 airlines, including both primary data from interviews and secondary data from publicly available sources.


  0%|          | 37/10990 [00:55<5:16:08,  1.73s/it]

The data source consists of primary and secondary data collected through semi-structured interviews and literature review, focusing on the logistics environment and supply chain requirements for perishable agricultural goods in the Kyrgyz Republic.


  0%|          | 38/10990 [00:56<4:56:25,  1.62s/it]

The data source consists of visibility ratings collected from workers in a real-world setting, specifically from a health and safety training facility where participants evaluated visibility around construction equipment.


  0%|          | 39/10990 [00:57<4:14:03,  1.39s/it]

The data source is derived from a survey conducted with eighty-nine participants to assess their perceptions of vehicle designs as faces and their correlation with aggressive driving behaviors.


  0%|          | 40/10990 [00:59<4:08:51,  1.36s/it]

The data source consists of questionnaire survey responses collected from public transport users in Metro Manila, focusing on their perceived risks during flood events.


  0%|          | 41/10990 [01:00<4:10:51,  1.37s/it]

The data source consists of real-world data collected from case studies in Belgium, focusing on the operational factors and policy levers affecting consolidation-based freight transport systems, particularly in intermodal transport.


  0%|          | 42/10990 [01:01<4:08:27,  1.36s/it]

The data source consists of empirical observations of bus passenger boarding and alighting times collected from King County Metro in Seattle, Washington, using Automatic Passenger Counting (APC) and Automatic Vehicle Location (AVL) systems.


  0%|          | 43/10990 [01:03<4:01:07,  1.32s/it]

The data source consists of real-world data collected through extensive interviews and questionnaire surveys targeting three-wheeler drivers in three cities in Sri Lanka. This data aims to analyze the social capital factors affecting drivers' life satisfaction and the overall condition of the three-wheeler industry.
No data context found


  0%|          | 45/10990 [01:04<3:23:11,  1.11s/it]

The data source consists of real-world car sharing use data collected from 24Rent, a car sharing operator in Finland, along with urban form and mobility pattern monitoring data from the Finnish Environment Institute. This data includes the number of car sharing trips, their lengths, and demographic information related to urban zones.


  0%|          | 46/10990 [01:07<4:23:03,  1.44s/it]

The data source consists of archived Intelligent Transportation Systems (ITS) transit data, including Automatic Vehicle Location (AVL), Automatic Passenger Counters (APC), and Automatic Fare Card (AFC) data, as well as real-time subway and bus data from the Metropolitan Transportation Authority (MTA) in New York City, published in General Transit Feed Specifications (GTFS-R) and Service Interface for Real Time Information (SIRI) formats.


  0%|          | 47/10990 [01:08<4:21:47,  1.44s/it]

The data source consists of various quantitative recovery-focused civil infrastructure system (CIS) models that analyze population displacement due to natural disasters, incorporating both real-world data and simulation data to evaluate infrastructure interdependencies and recovery timelines.


  0%|          | 48/10990 [01:10<4:37:38,  1.52s/it]

The data source consists of primary data collected through surveys to evaluate pedestrian speed influenced by various factors such as age group, gender, group size, and trip purpose.


  0%|          | 49/10990 [01:11<4:18:22,  1.42s/it]

The data source consists of responses collected from high school students participating in an evaluation study of a serious game designed to improve knowledge, attitude, and willingness to use mobility on demand systems (MODS).


  0%|          | 50/10990 [01:13<4:30:05,  1.48s/it]

The Pikalert® system integrates real-time weather information and data from connected vehicles to enhance the safety and efficiency of surface transportation systems. It utilizes a combination of observational and model data, employing expert systems and machine learning algorithms for data processing and forecasting.


  0%|          | 51/10990 [01:14<4:15:28,  1.40s/it]

The data source consists of real estate market valuation data and Census data collected from Sydney, Australia, focusing on property prices in relation to the proximity of new transport infrastructure.


  0%|          | 52/10990 [01:15<4:06:36,  1.35s/it]

The data source for this study consists of experimental data collected from drivers interacting with different Human-Machine Interfaces (HMIs) designed to support cooperative driving scenarios. The data includes driver responses, performance metrics, and subjective assessments of the HMI effectiveness in facilitating cooperation during driving maneuvers.


  0%|          | 53/10990 [01:17<4:33:36,  1.50s/it]

The data source consists of both real-world incident data and simulation data derived from mechanical models and FEM simulations to assess aircraft part failures during hazardous conditions.


  0%|          | 54/10990 [01:18<4:14:42,  1.40s/it]

The data source for this study consists of peer-reviewed articles retrieved from major biomedical and health databases, specifically MEDLINE, EMBASE, and CINAHL, which provide real-world data on the outcomes of adult trauma patients following road traffic crashes.


  1%|          | 55/10990 [01:19<4:09:32,  1.37s/it]

The data source consists of real-world commuting data for employees of the Oak Ridge National Laboratory (ORNL), including employee residences by zip code, and associated geographic and transportation data used to analyze commuting patterns and develop a commuting program.


  1%|          | 56/10990 [01:21<4:33:17,  1.50s/it]

The data source consists of various geographic and environmental datasets used to analyze critical road segments in Volusia County, Florida, particularly in the context of post-disaster scenarios such as flooding from hurricanes. The datasets include road network shapefiles, land cover data, impervious surface data, and building footprint data, all of which are essential for understanding the infrastructure and its vulnerability during extreme weather events.


  1%|          | 57/10990 [01:23<4:59:41,  1.64s/it]

The data source for this study is derived from satellite images analyzed through object-based image analysis to identify zones with high potential for trip generation in João Pessoa, Brazil. This approach integrates remote sensing techniques to assess land use and land cover changes, which are then correlated with urban trip generation data.


  1%|          | 58/10990 [01:24<4:31:28,  1.49s/it]

The data source for this study consists of survey responses collected from airline passengers in the United States, focusing on their preferences between legacy and low-cost carriers.


  1%|          | 59/10990 [01:26<4:31:46,  1.49s/it]

The data source consists of survey responses and interviews conducted with management level employees in the Plaine Saint-Denis business district, focusing on their commuting patterns and scheduling choices.


  1%|          | 60/10990 [01:27<4:14:00,  1.39s/it]

The data source consists of simulation results derived from the microscopic simulation software INTEGRATION, which models traffic flow and vehicle behavior at isolated intersections under various conditions.


  1%|          | 61/10990 [01:28<4:13:33,  1.39s/it]

This study examines the impact of editorial patterns in traffic crash reporting on public perceptions of road safety, using an experimental design with human subjects.


  1%|          | 62/10990 [01:29<3:54:48,  1.29s/it]

The data source consists of experimental data collected from a driving simulator study involving participants' performance in driving tasks and their scores on various cognitive measures related to attention, inhibition, and working memory.


  1%|          | 63/10990 [01:34<7:02:41,  2.32s/it]

The data source consists of large-scale travel time data collected anonymously from various technological devices such as GPS, sensors, and smartphones. This data is used to analyze and develop link-level travel time measures for assessing the level of service (LOS) on urban road links in Charlotte, North Carolina.


  1%|          | 64/10990 [01:36<6:25:33,  2.12s/it]

The data source for this study is derived from real-world observations and analyses of the road transport system in Queensland, Australia, focusing on crash contributory factors and potential interventions for road safety.


  1%|          | 65/10990 [01:37<5:48:26,  1.91s/it]

The data source consists of GPS data collected from a fleet of class six delivery trucks operated by a major parcel delivery company in Columbus, Ohio. The data includes vehicle speed, local time and date, longitude and latitude, fuel consumption, coolant temperature, and engine speed, which were used to analyze delivery patterns and develop a freight delivery demand estimation model.


  1%|          | 66/10990 [01:39<5:14:45,  1.73s/it]

Data collected to understand the characteristics, costs, frequencies, and acceptability of rural transport services in Ghana, focusing on motorcycle taxis and public transport services.


  1%|          | 67/10990 [01:40<5:17:01,  1.74s/it]

The data source consists of real-time passenger information and communication data collected from the TravelBot system, which interacts with users on Twitter to provide updates on public transport disruptions and service changes.


  1%|          | 68/10990 [01:42<4:52:42,  1.61s/it]

This research utilizes real-world data collected from user-generated Yelp business reviews and parking supply data at the parcel level in the Phoenix, Arizona region to analyze customer sentiment towards parking and its association with business ratings.


  1%|          | 69/10990 [01:44<5:32:25,  1.83s/it]

The data source for this study includes qualitative and quantitative data collected from focus group meetings, telephone interviews with users, and chart audits of an existing adapted driver education program.


  1%|          | 70/10990 [01:45<5:02:18,  1.66s/it]

The data source consists of eye and head movement data collected from drivers in both a driving simulator and naturalistic driving conditions, using various eye-tracking and motion capture technologies.


  1%|          | 71/10990 [01:47<4:44:22,  1.56s/it]

The data source consists of numerical simulations and experimental data related to traffic flows and air quality in city tunnels, focusing on the interaction between vehicle emissions and airflow dynamics.


  1%|          | 72/10990 [01:48<4:40:42,  1.54s/it]

The data source consists of a systematic literature review of 111 interdisciplinary publications discussing the socio-technical aspects of civil drones for transportation purposes, focusing on barriers, problems, solutions, and benefits associated with their use.


  1%|          | 73/10990 [01:49<4:28:19,  1.47s/it]

The data source for this study is derived from workshops conducted with children and young people aged 8-18, utilizing Lego™ to capture their views and perceptions regarding Mobility as a Service (MaaS).


  1%|          | 74/10990 [01:51<4:18:00,  1.42s/it]

Data collected from surveys conducted among commuters in three urban cities in Metro Cebu, Philippines, focusing on their transport mode choices and the factors influencing these choices.


  1%|          | 75/10990 [01:52<4:13:03,  1.39s/it]

The data source consists of annual data on the quantity of crude oil, petroleum products, and dry cargo transported by sea, as well as world GDP and oil prices, sourced from UNCTAD and the World Bank.


  1%|          | 76/10990 [01:53<4:15:22,  1.40s/it]

The data source consists of real-world data collected from various European airlines regarding their implementation of Fatigue Risk Management Systems (FRMS), including operational metrics, crew duty times, rest durations, and safety performance indicators. This data is used to analyze the effectiveness and bureaucratic implications of FRMS in managing crew fatigue.


  1%|          | 77/10990 [01:55<4:39:25,  1.54s/it]

The data source consists of real-world evaluations of pavement conditions in the state of Paraná, Brazil, using the Pavement Condition Index (PCI) method. The study involved both objective and subjective assessments of pavement defects, which were analyzed and visualized using GIS software.


  1%|          | 78/10990 [01:57<4:39:11,  1.54s/it]

The data source consists of survey results from consumers and car sellers in The Netherlands regarding their knowledge and information about Advanced Driver Assistance Systems (ADAS).


  1%|          | 79/10990 [01:58<4:17:00,  1.41s/it]

The data source for this study consists of qualitative data collected from five real-world transport infrastructure planning projects in Sweden, focusing on collaborative planning and design work among professionals from different disciplines.


  1%|          | 80/10990 [01:59<4:01:17,  1.33s/it]

The data source consists of cognitive task performance and on-road driving test results from older adult participants, collected to evaluate driving safety and cognitive decline.


  1%|          | 81/10990 [02:00<3:50:24,  1.27s/it]

The data source consists of real-world data collected from a Brazilian city's transport system, including GHG emissions measurements and urban mobility project assessments.


  1%|          | 82/10990 [02:01<3:44:23,  1.23s/it]

The data source consists of real-world data collected from various countries in Asia, specifically Indonesia, Malaysia, and Vietnam, focusing on gasoline and diesel supply policies and fuel quality management.


  1%|          | 83/10990 [02:03<3:53:25,  1.28s/it]

The data source consists of over 1.6 million GPS coordinates collected from bikeshare users in Baltimore City over a four-month period, used to analyze bike route usage and identify potential new bike station locations.


  1%|          | 84/10990 [02:04<3:51:11,  1.27s/it]

This data source consists of original observational data collected on the parking practices and violations of e-scooters, bikes, and motor vehicles in various cities, focusing on their impact on sidewalk and roadway access.


  1%|          | 85/10990 [02:06<4:27:10,  1.47s/it]

The data source consists of real-world data collected from motor vehicle collision (MVC) incidents, focusing on the economic costs associated with injuries, compensation estimates, and various factors influencing the severity of collisions. This includes detailed statistics on occupant demographics, vehicle characteristics, environmental conditions, and crash types, as well as the expected compensation costs (ECCs) linked to injuries sustained in these incidents.


  1%|          | 86/10990 [02:07<4:13:36,  1.40s/it]

The data source consists of real-world observations and experiments conducted over 39 months to assess the nesting behavior of the mud-nesting keyhole wasp in aircraft pitot probes at Brisbane Airport, along with risk analysis and climate modeling.


  1%|          | 87/10990 [02:09<4:18:16,  1.42s/it]

This study reviews emerging data sources for monitoring non-motorized travel, categorizing them into mode-unspecified and mode-specified data, and discusses their applications, challenges, and potential for enhancing travel data accuracy and comprehensiveness.


  1%|          | 88/10990 [02:10<4:05:34,  1.35s/it]

Data collected from an intercept survey of travelers on a university campus to investigate pedestrian-cyclist interactions, safety perceptions, and experienced incidents.


  1%|          | 89/10990 [02:11<3:47:38,  1.25s/it]

The data source for this study includes physiological measurements (heart rate) and subjective evaluations (surveys) collected from real drivers during a driving test on different types of highway medians.


  1%|          | 90/10990 [02:13<4:08:37,  1.37s/it]

The data source consists of real-world transportation demand data collected from various publicly available sources, including flight data from airlines and traffic statistics from the California Department of Transportation. This data is used to estimate the number of passengers traveling between Los Angeles and San Francisco by air and road, which is critical for developing a simulation model for the Hyperloop system.


  1%|          | 91/10990 [02:14<4:06:52,  1.36s/it]

The data source consists of real-world driving data collected from vehicles equipped with the WICE system, which logs various parameters during naturalistic driving cycles to analyze the usage of Automated Driver Assistance Systems (ADAS) under different driving contexts.


  1%|          | 92/10990 [02:15<4:11:02,  1.38s/it]

The data source consists of survey responses collected from adolescents regarding their attitudes and intentions towards safe driving practices and the use of smartphone-based driving safety technology.


  1%|          | 93/10990 [02:16<4:02:12,  1.33s/it]

This paper reviews and synthesizes existing methodologies for assessing the equity of emerging transportation technologies, focusing on accessibility, traffic emissions, and safety outcomes.


  1%|          | 94/10990 [02:18<3:59:38,  1.32s/it]

The data source is the United Kingdom Time Use Survey, which collects information on how individuals allocate their time across various activities over a two-day period. It includes demographic data and subjective well-being ratings for different activities.


  1%|          | 95/10990 [02:19<3:51:35,  1.28s/it]

The data source consists of qualitative data collected from focus groups of vulnerable populations affected by California wildfires, exploring their perspectives on the sharing economy's role in evacuation scenarios.


  1%|          | 96/10990 [02:20<3:54:41,  1.29s/it]

The data source consists of experimental data collected from a driving simulator where participants made decisions under time pressure regarding which group of pedestrians to avoid. The study analyzed drivers' eye movements and ethical decision-making in simulated scenarios that mimic real-world driving situations.


  1%|          | 97/10990 [02:22<4:13:24,  1.40s/it]

The data source consists of a stated preference survey designed to explore individuals' attitudes and perceptions regarding the renting of personal vehicles in peer-to-peer carsharing. The survey collected socio-demographic information, travel behavior, and travel patterns from respondents.


  1%|          | 98/10990 [02:23<3:48:58,  1.26s/it]

The data source for this study is primarily simulation data generated from an agent-based model designed to assess vulnerabilities at airport security checkpoints. The model incorporates human behavior and decision-making processes of security operators, passengers, and potential attackers.


  1%|          | 99/10990 [02:24<3:39:38,  1.21s/it]

This study utilizes real-world data collected from surveys, interviews, and archival analysis of cities and transit agencies across Texas to examine the integration challenges between app-based, on-demand services and traditional fixed route transit services.


  1%|          | 100/10990 [02:25<3:57:08,  1.31s/it]

The data source includes real-world data collected from the automatic identification system (AIS) tracking the movement of ocean cruise ships globally, as well as data from the Centers for Disease Control and Prevention (CDC) regarding cruise ships affected by COVID-19.


  1%|          | 101/10990 [02:27<3:53:39,  1.29s/it]

The data source consists of origin and destination mobility surveys, high-resolution traffic assignment models, and emission models used to study air quality and CO2 emissions from on-road transportation in the Metropolitan Area of São Paulo (MASP).


In [9]:
data

Unnamed: 0,title,doi,volume,date,year,month,abstract,issn,journal_name,unique_id,...,is_availablity_statement,is_data_mentioned_in_section_title,is_experiment_mentioned_in_section_title,is_link_in_avaiablity_statement,num_of_links_in_avaiablity_statement,source_description,real_world,simulation,dataset_size_description,data_collection_description
0,Decentralized network level adaptive signal co...,10.1016/j.trip.2019.100020,1,2019-06-01,2019,6,Adaptive traffic signal control systems are de...,2590-1982,TRIP,10.1016_j.trip.2019.100020,...,0,0,1,0,0,The data source for this study is primarily si...,False,True,False,False
1,Physical activity of electric bicycle users co...,10.1016/j.trip.2019.100017,1,2019-06-01,2019,6,Physical activity has been widely associated w...,2590-1982,TRIP,10.1016_j.trip.2019.100017,...,0,0,0,0,0,The data source for this study is a longitudin...,True,False,True,True
2,Increasing civil aviation capacity in China re...,10.1016/j.trip.2019.100005,1,2019-06-01,2019,6,China is the world's second largest aviation m...,2590-1982,TRIP,10.1016_j.trip.2019.100005,...,0,0,0,0,0,The data source for this study includes academ...,True,False,False,True
3,Progress or regress on gender equality: The ca...,10.1016/j.trip.2019.100009,1,2019-06-01,2019,6,This paper examines the role of vocational edu...,2590-1982,TRIP,10.1016_j.trip.2019.100009,...,0,1,0,0,0,The data for this research paper was collected...,True,False,False,True
4,Multiobjective integrated signal-control syste...,10.1016/j.trip.2019.100011,1,2019-06-01,2019,6,"Parameters concerning real-time, advanced traf...",2590-1982,TRIP,10.1016_j.trip.2019.100011,...,0,1,1,0,0,The data used in this study is generated throu...,False,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10985,Energy savings and emissions reduction of BEVs...,10.1016/j.trd.2024.104403,136,2024-11-01,2024,11,Improving urban dwellers quality of life requi...,1361-9209,TRD,10.1016_j.trd.2024.104403,...,0,0,0,0,0,,,,,
10986,Effects of 1.5 °C global warming on pavement c...,10.1016/j.trd.2024.104393,136,2024-11-01,2024,11,This study compared 11 global climate models (...,1361-9209,TRD,10.1016_j.trd.2024.104393,...,1,0,0,0,0,,,,,
10987,Decarbonising transport: Can we rely on fuel t...,10.1016/j.trd.2024.104391,136,2024-11-01,2024,11,"Although not without criticism, carbon pricing...",1361-9209,TRD,10.1016_j.trd.2024.104391,...,1,1,0,0,0,,,,,
10988,Amphibian roadkill patterns in an Asian tropic...,10.1016/j.trd.2024.104396,136,2024-11-01,2024,11,Wildlife roadkills have emerged as one of the ...,1361-9209,TRD,10.1016_j.trd.2024.104396,...,1,1,0,0,0,,,,,


In [34]:
data_context

'This study used data from the longitudinal, online survey conducted as part of the European research project Physical Activity through Sustainable Transport Approaches (PASTA) (Dons et al., 2015; Gerike et al., 2016). PASTA aimed to strengthen the understanding of determinants of active mobility (Götschi et al., 2017) as well as its health impacts (Mueller et al., 2018) by integrating approaches from transport and health research. The survey took place from November 2014 to January 2017 in seven European cities: Antwerp (Belgium), Barcelona (Spain), London (United Kingdom), Örebro (Sweden), Rome (Italy), Vienna (Austria) and Zurich (Switzerland). Survey participants were recruited opportunistically on a rolling basis, applying a diverse set of approaches but following a common sampling strategy across cities (Gaupp-Berghausen and Raser, 2017). Active transport modes were intentionally oversampled to have sufficiently large sample sizes for different transport modes in each of the citi

In [12]:
data['is_data_mentioned_in_section_title'] = 0
url_pattern = r'(https?://\S+|www\.\S+)'  # URL pattern
for i in range(len(data)):
# # In case for the mini test to debug the code
# for i in tqdm(range(1000)):
    journal_path = os.path.join(full_data_folder, data['issn'][i])  # Path to the journal folder
    paper_path = os.path.join(journal_path, data['unique_id'][i] + '.xml')  # Path to the paper folder
    sections = extract_sections_and_text_from_xml(paper_path)
    reorganized_sections = postprocess_sections(sections)
    label = 0
    url = []
    for section in reorganized_sections:
        section_title = section['title']
        if 'data' in section_title.lower():
            if section['text']:
                if 'avaiable' in section['text'].lower():
                    label = 1
                    print(section['text'])
        for subsection in section['subsections']:
            subsection_title = subsection['title']
            if 'data' in subsection_title.lower():
                section_text = subsection['text']
                label = 1
                if section['text']:
                    url.extend(re.findall(url_pattern, section['text']))
            for subsubsection in subsection['subsubsections']:
                subsubsection_title = subsubsection['title']
                if 'data' in subsubsection_title.lower():
                    label = 1
                    if section['text']:
                        url.extend(re.findall(url_pattern, section['text']))
    data.loc[i, 'is_data_mentioned_in_section_title'] = label
    # if url:
    #     print(url)
print(len(data[data['is_data_mentioned_in_section_title'] == 1])/(len(data)))
data.to_csv(os.path.join(meta_data_folder, 'full-meta-dataset.csv'), index=False)

0.3935395814376706


In [None]:
data['is_experiment_mentioned_in_section_title'] = 0
for i in range(len(data)):
# # In case for the mini test to debug the code
# for i in tqdm(range(1000)):
    journal_path = os.path.join(full_data_folder, data['issn'][i])  # Path to the journal folder
    paper_path = os.path.join(journal_path, data['unique_id'][i] + '.xml')  # Path to the paper folder
    sections = extract_sections_and_text_from_xml(paper_path)
    reorganized_sections = postprocess_sections(sections)
    label = 0
    for section in reorganized_sections:
        section_title = section['title']
        if 'experiment' in section_title.lower():
            label = 1
        for subsection in section['subsections']:
            subsection_title = subsection['title']
            if 'experiment' in subsection_title.lower():
                label = 1
            for subsubsection in subsection['subsubsections']:
                subsubsection_title = subsubsection['title']
                if 'experiment' in subsubsection_title.lower():
                    label = 1
    data.loc[i, 'is_experiment_mentioned_in_section_title'] = label
data.to_csv(os.path.join(meta_data_folder, 'full-meta-dataset.csv'), index=False)

In [None]:
data['is_link_in_avaiablity_statement'] = 0
data['num_of_links_in_avaiablity_statement'] = 0
url_pattern = r'(https?://\S+|www\.\S+)'  # URL pattern
# # In case for the mini test to debug the code
url_data = []
for i in range(len(data)):
# for i in range(1000):
    journal_path = os.path.join(full_data_folder, data['issn'][i])  # Path to the journal folder
    paper_path = os.path.join(journal_path, data['unique_id'][i] + '.xml')  # Path to the paper folder
    sections = extract_sections_and_text_from_xml(paper_path)
    reorganized_sections = postprocess_sections(sections)
    for section in reorganized_sections:
        section_title = section['title']
        if 'data availability' in section_title.lower():
            if 'http' in section['text']:
                data.loc[i, 'is_link_in_avaiablity_statement'] = 1
                url = re.findall(url_pattern, section['text'])
                unique_url = list(set(url))
                if url:
                    print(url)
                url_data.append({
                'issn': data['issn'][i],
                'unique_id': data['unique_idf'][i],
                'title': data['title'][i],
                'url': unique_url
                })
                data.loc[i, 'num_of_links_in_avaiablity_statement'] = len(unique_url)
data.to_csv(os.path.join(meta_data_folder, 'full-meta-dataset.csv'), index=False)
save_json(url_data, os.path.join(meta_data_folder, 'url_data.json'))

In [40]:
content = response.choices[0].message.content

In [44]:
# transform a str to json
content

'{"real_world":true,"simulation":false,"details":{"dataset_size_description":true,"data_collection_description":true}}'

In [47]:
import json
json_content = json.loads(content)

In [50]:
json_content['real_world']

True