### File Structure of the Dataset Folder
```
├── journal-meta/
│   ├── 0968-090X.csv         # TRC
│   ├── mini-dataset.csv      # A mini dataset for tutorial, extracted from TRC
│   └── ... (other journal CSV files)
├── journal-full-text/
│   ├── 0968-090X/
│   │   └── 10.1016_j.trc.2023.104311.xml
│   └── ... (other journal folders, one per ISSN)
```

In [2]:
import pandas as pd
import os
from tqdm import tqdm
from utils import *
# Root of the rr-measure-dataset checkout. Defaults to the original author's
# location; set the RR_DATASET_ROOT environment variable to run the notebook
# on another machine without editing this cell.
dataset_root = os.environ.get('RR_DATASET_ROOT', '/Users/junyi/Work/RR/rr-measure-dataset')
# One sub-folder per journal ISSN, holding the full-text XML files.
full_data_folder = os.path.join(dataset_root, 'journal-full-text')
# Metadata CSV files (per journal, plus the combined full-meta-dataset.csv).
meta_data_folder = os.path.join(dataset_root, 'journal-meta')

In [3]:
# Load the combined metadata table and preview its first rows.
full_meta_csv = os.path.join(meta_data_folder, 'full-meta-dataset.csv')
data = pd.read_csv(full_meta_csv)
data.head()

Unnamed: 0,title,doi,volume,date,year,month,abstract,issn,journal_name,unique_id
0,Decentralized network level adaptive signal co...,10.1016/j.trip.2019.100020,1,2019-06-01,2019,6,Adaptive traffic signal control systems are de...,2590-1982,TRIP,10.1016_j.trip.2019.100020
1,Physical activity of electric bicycle users co...,10.1016/j.trip.2019.100017,1,2019-06-01,2019,6,Physical activity has been widely associated w...,2590-1982,TRIP,10.1016_j.trip.2019.100017
2,Increasing civil aviation capacity in China re...,10.1016/j.trip.2019.100005,1,2019-06-01,2019,6,China is the world's second largest aviation m...,2590-1982,TRIP,10.1016_j.trip.2019.100005
3,Progress or regress on gender equality: The ca...,10.1016/j.trip.2019.100009,1,2019-06-01,2019,6,This paper examines the role of vocational edu...,2590-1982,TRIP,10.1016_j.trip.2019.100009
4,Multiobjective integrated signal-control syste...,10.1016/j.trip.2019.100011,1,2019-06-01,2019,6,"Parameters concerning real-time, advanced traf...",2590-1982,TRIP,10.1016_j.trip.2019.100011


In [10]:
def _iter_section_texts(sections):
    """Yield the raw text of each section, then of its subsections and their
    subsubsections, in document (depth-first) order."""
    for section in sections:
        yield section['text']
        for subsection in section['subsections']:
            yield subsection['text']
            for subsubsection in subsection['subsubsections']:
                yield subsubsection['text']


# Per-article outputs: a 0/1 flag and the number of distinct GitHub URLs found.
data['is_github'] = 0
data['num_of_github_urls'] = 0
# One record per article that mentions GitHub; saved as github_data.json below.
github_data = []
# For a quick debug run, iterate over `tqdm(range(1000))` instead of the full table.
for i in range(len(data)):
    github_urls = []
    github_full_urls = []
    journal_path = os.path.join(full_data_folder, data['issn'][i])  # Path to the journal folder
    paper_path = os.path.join(journal_path, data['unique_id'][i] + '.xml')  # Path to the paper's XML file
    sections = extract_sections_and_text_from_xml(paper_path)
    reorganized_sections = postprocess_sections(sections)
    for raw_text in _iter_section_texts(reorganized_sections):
        # Clean each text once and feed the result to both extractors
        # (assumes cleanup() has no side effects — TODO confirm in utils).
        cleaned_text = cleanup(raw_text)
        urls = extract_github_urls(cleaned_text)
        full_urls = extract_full_github_urls(cleaned_text)
        if urls:
            github_urls.extend(urls)
        if full_urls:
            github_full_urls.extend(full_urls)
    if github_urls:
        # Mark as GitHub present
        data.loc[i, 'is_github'] = 1
        # Remove duplicate URLs and count unique URLs
        unique_github_urls = set(github_urls)
        print('GitHub URLs found:', unique_github_urls)
        unique_github_full_urls = set(github_full_urls)
        data.loc[i, 'num_of_github_urls'] = len(unique_github_urls)
        github_data.append({
            'issn': data['issn'][i],
            'unique_id': data['unique_id'][i],
            'title': data['title'][i],
            # Sorted so the saved JSON is identical across runs (set order is not stable).
            'github_urls': sorted(unique_github_urls),
            'github_full_urls': sorted(unique_github_full_urls)
        })
# Save the collected records next to the metadata CSV files.
save_json(github_data, os.path.join(meta_data_folder, 'github_data.json'))

GitHub URLs found: {'https://github.com/availabs/MTA_Subway_SIRI_Server'}
GitHub URLs found: {'https://github.com/numenta/NAB'}
GitHub URLs found: {'https://github.com/google-research/google-research'}
GitHub URLs found: {'https://github.com/CSSEGISandData/COVID-19'}
GitHub URLs found: {'https://github.com/gboeing/osmnx'}
GitHub URLs found: {'https://github.com/srezaei90/Park-and-Ride-Facility-Location-Optimization'}
GitHub URLs found: {'https://github.com/Jaiaid/TNDP_Evac_Heuristic', 'https://github.com/mahi045/JMetal4'}
GitHub URLs found: {'https://github.com/csipetas/MixedFleetsinFlexibleTransport'}
GitHub URLs found: {'https://github.com/Lilyhanig/transit_covid_precautions'}
GitHub URLs found: {'https://github.com/HRI-EU/e_adarp_material'}
GitHub URLs found: {'https://github.com/Moccino17/Transport_Mode_Sklearn'}
GitHub URLs found: {'https://github.com/maxime-gueriau/ITSC2020_CAV_impact'}
GitHub URLs found: {'https://github.com/valentijnstienen/PemPem-paper'}
GitHub URLs found: {'h

In [11]:
# Write the table (now carrying the is_github / num_of_github_urls columns)
# back to the same CSV file it was loaded from, without the index column.
annotated_meta_csv = os.path.join(meta_data_folder, 'full-meta-dataset.csv')
data.to_csv(annotated_meta_csv, index=False)

In [12]:
# Count, inside every journal, how many articles have is_github == 0 and == 1.
grouped_counts = (
    data.groupby('journal_name')['is_github']
    .value_counts()
    .reset_index(name='count')
)
# Express each count as a percentage of that journal's total number of articles.
journal_totals = grouped_counts.groupby('journal_name')['count'].transform('sum')
grouped_counts['percentage'] = (grouped_counts['count'] / journal_totals) * 100
# Show the rows for articles with a GitHub link, highest percentage first.
has_github = grouped_counts['is_github'] == 1
grouped_counts[has_github].sort_values('percentage', ascending=False).head(10)

Unnamed: 0,journal_name,is_github,count,percentage
5,TRC,1,167,8.146341
3,TRB,1,53,6.191589
9,TRE,1,36,2.223595
1,TRA,1,34,2.010645
7,TRD,1,30,1.436782
13,TRIP,1,14,1.215278
11,TRF,1,10,0.65189


In [None]:
# Tutorial walk-through on the mini dataset: print the GitHub URLs found in each article.
# NOTE(review): `journals`, `journal_issn_df`, `meta_folder` and `full_text_folder` are not
# defined in the visible cells (unless `from utils import *` provides them) — confirm
# before running on a fresh kernel.
for journal in journals:
    if journal != 'mini-dataset.csv':  # only the mini dataset is used for the tutorial
        continue
    journal_issn = journal_issn_df[journal_issn_df['journal'] == journal.replace('.csv', '')]['issn'].values[0]
    journal_meta = pd.read_csv(os.path.join(meta_folder, journal))
    journal_meta['unique_id'] = journal_meta['doi'].apply(doi_to_unique_id)  # convert the DOI to a unique id
    # An explicit nested loop is used here to show how the traversal works;
    # it can be condensed later.
    for i in range(len(journal_meta)):
        github_urls = []
        fulltext_path = os.path.join(full_text_folder, journal_issn, journal_meta.iloc[i]['unique_id'] + '.xml')
        sections = extract_sections_and_text_from_xml(fulltext_path)
        reorganized_sections = postprocess_sections(sections)
        for section in reorganized_sections:
            # Preprocess the text before extraction to make it more readable.
            urls = extract_github_urls(cleanup_abstract(section['text']))
            if urls:
                github_urls.extend(urls)
            for subsection in section['subsections']:
                # Extract first, then extend: list.extend() returns None, so its
                # result must not be assigned to `urls`.
                urls = extract_github_urls(cleanup_abstract(subsection['text']))
                if urls:
                    github_urls.extend(urls)
                for subsubsection in subsection['subsubsections']:
                    urls = extract_github_urls(cleanup_abstract(subsubsection['text']))
                    if urls:
                        github_urls.extend(urls)
        if github_urls:
            print(github_urls)

['https://github.com/pabloguarda/isuelogit', 'https://github.com/pabloguarda/pesuelogit']
['https://github.com/zhandongxu/GP_RTAP']
['https://github.com/ethz-coss/voting_traffic']
['https://github.com/tjzxh/EADC']
['https://github.com/LehmannJonas/2E-MT-VRP-PTW-Instances', 'https://github.com/LehmannJonas/2E-MT-VRP-PTW-Instances']
['https://github.com/LiBiyue/MAST-GNN']
['https://github.com/HDDL/DPRDDM']


In [13]:
import openai
import yaml
import json
# Read the local YAML configuration and register its OpenAI API key with the client library.
with open("/Users/junyi/Work/RR/config.yaml", "r") as config_stream:
    config = yaml.safe_load(config_stream)

openai.api_key = config["openai_api_key"]
def _strip_code_fence(content):
    """Return ``content`` without a surrounding Markdown code fence.

    Handles an opening fence with or without the ``json`` tag, whitespace
    around the fence, and a missing closing fence (e.g. a truncated reply).
    Unfenced text is only stripped of leading/trailing whitespace.
    """
    text = content.strip()
    if text.startswith("```"):
        text = text[3:]
        if text.startswith("json"):
            text = text[4:]
        # Only drop the closing marker when it is really there; slicing a fixed
        # number of characters would otherwise cut into the JSON itself.
        if text.endswith("```"):
            text = text[:-3]
    return text.strip()


def _request_structured_verdict(context_info, prompt, **request_options):
    """Send ``context_info + prompt`` to the chat-completions API and parse the reply as JSON.

    Args:
        context_info: Text placed before the prompt in the user message.
        prompt: Task instructions appended after ``context_info``.
        **request_options: Extra keyword arguments for the API call (model,
            response_format, sampling parameters, ...).

    Returns:
        The decoded JSON value (a dict when the model follows the requested
        format). On failure a dict with an "error" key is returned instead of
        raising; it also carries "rawResponse" when the reply was not valid JSON.
    """
    try:
        # NOTE(review): `openai.ChatCompletion` is the pre-1.0 interface of the
        # openai package; with a newer client this call would fail and be
        # reported through the "error" dict below — confirm the installed version.
        response = openai.ChatCompletion.create(
            messages=[
                {"role": "system", "content": "You are an expert researcher specializing in transportation systems and software tools."},
                {"role": "user", "content": (
                    context_info + prompt
                )},
            ],
            temperature=0,
            max_tokens=16383,
            **request_options,
        )

        # Extract the model's response and remove code block markers if present
        content = _strip_code_fence(response.choices[0].message.content)

        # Ensure the output is valid JSON
        try:
            return json.loads(content)
        except json.JSONDecodeError:
            # Handle cases where the output isn't valid JSON
            return {
                "error": "Invalid JSON format returned by the model, even after cleanup.",
                "rawResponse": content
            }

    except Exception as e:
        # Handle API call errors (deliberately best-effort: one failing call
        # must not abort a long batch run)
        return {
            "error": f"An error occurred: {str(e)}"
        }


def determine_repository_tool_openai(context_info):
    """
    Ask gpt-4o-mini whether the repository mentioned in ``context_info`` is an
    implementation of the article's method.

    The prompt asks for a JSON object with the keys "isImplementation"
    ("yes"/"no"), "reason" and "keyIndicators"; a Markdown code fence around
    the reply is removed before parsing.

    Args:
        context_info: Text surrounding the repository URL in the article.

    Returns:
        The parsed JSON reply as a Python dictionary, or a dictionary with an
        "error" key (and "rawResponse" for unparsable replies). Callers should
        check for "error" before reading "isImplementation".
    """
    prompt = """
    **Task**:
    Based on the description above, evaluate whether the provided code repository is an actual implementation of the method discussed in the article.

    **Guidelines**:

    - **Definition of Implementation**: The code is considered an implementation if it is written to test the method or to provide code for reproducing the results presented in the article or to share the code and data for the article.
    
    - **Non-Implementation**: Some repositories may be cited because they utilize existing tools or datasets rather than providing an implementation of the described method. If the GitHub link is just for an existing tool or dataset and not an implementation of the method, please respond with "no".

    - **Indicators of Implementation**:
    - Look for explicit statements such as "the code is available/provided/can be accessed at" or "the code is available at".
    - Such statements are good indicators that the repository contains an implementation of the method.

    Please respond with a structured output in the following format:

    ```json
    {
    "isImplementation": "yes" or "no",
    "reason": "[Provide a brief explanation here]",
    "keyIndicators": ["[Optional: List any key points or observations supporting your reasoning (direct quotes from the text)]"]
    }
    ```
    """
    return _request_structured_verdict(context_info, prompt, model="gpt-4o-mini")


def determine_repository_tool_openai_4o(context_info):
    """
    Ask gpt-4o whether the repository mentioned in ``context_info`` is an
    implementation of the article's method, using a strict JSON schema.

    The request passes a ``response_format`` schema requiring the keys
    "isImplementation" ("yes"/"no"), "reason" and "keyIndicators".

    Args:
        context_info: Text surrounding the repository URL in the article.

    Returns:
        The parsed JSON reply as a Python dictionary, or a dictionary with an
        "error" key (and "rawResponse" for unparsable replies). Callers should
        check for "error" before reading "isImplementation".
    """
    # NOTE(review): the prompt ends with an unmatched ``` fence, apparently left
    # over from the JSON example used in the gpt-4o-mini variant. It is kept
    # verbatim so that existing classification results stay reproducible.
    prompt = """
    **Task**:
    Based on the description above, evaluate whether the provided code repository is an actual implementation of the method discussed in the article.

    **Guidelines**:

    - **Definition of Implementation**: The code is considered an implementation if it is written to test the method or to provide code for reproducing the results presented in the article or to share the code and data for the article.
    
    - **Non-Implementation**: Some repositories may be cited because they utilize existing tools or datasets rather than providing an implementation of the described method. If the GitHub link is just for an existing tool or dataset and not an implementation of the method, please respond with "no".

    - **Indicators of Implementation**:
    - Look for explicit statements such as "the code is available/provided/can be accessed at" or "the code is available at".
    - Such statements are good indicators that the repository contains an implementation of the method.

    ```
    """
    # Schema enforced by the API for the structured reply.
    response_format = {
        "type": "json_schema",
        "json_schema": {
            "name": "implementation_schema",
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {
                    "isImplementation": {
                        "type": "string",
                        "description": "Indicates whether the implementation is present or not.",
                        "enum": [
                            "yes",
                            "no"
                        ]
                    },
                    "reason": {
                        "type": "string",
                        "description": "A brief explanation supporting the isImplementation value."
                    },
                    "keyIndicators": {
                        "type": "array",
                        "description": "A list of key points or observations supporting the reasoning.",
                        "items": {
                            "type": "string"
                        }
                    }
                },
                "required": [
                    "isImplementation",
                    "reason",
                    "keyIndicators"
                ],
                "additionalProperties": False
            }
        }
    }
    return _request_structured_verdict(
        context_info,
        prompt,
        model="gpt-4o",
        response_format=response_format,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
        

In [None]:
# TODO: unfinished cell — the loop body was never written. `pass` keeps the
# cell syntactically valid until the per-article logic is added.
for i in range(len(data)):
    pass

In [25]:
def _walk_text_nodes(sections):
    """Yield ``(depth, node)`` for every section (depth 0), subsection (1) and
    subsubsection (2) in document order; each node is indexed by its 'text' key."""
    for section in sections:
        yield 0, section
        for subsection in section['subsections']:
            yield 1, subsection
            for subsubsection in subsection['subsubsections']:
                yield 2, subsubsection


# Classify every GitHub URL found in the TRC articles and save one JSON file per article.
# NOTE(review): `journals`, `journal_issn_df`, `meta_folder`, `full_text_folder` and
# `save_folder` are not defined in the visible cells (unless `from utils import *`
# provides them) — confirm before running on a fresh kernel.
for journal in journals:
    if journal != '0968-090X.csv':  # only the TRC dataset is used for this test
        continue
    journal_issn = journal.replace('.csv', '')
    journal_name = journal_issn_df[journal_issn_df['issn'] == journal_issn]['journal'].values[0]
    print(journal_name)
    journal_meta = pd.read_csv(os.path.join(meta_folder, journal))
    journal_meta['unique_id'] = journal_meta['doi'].apply(doi_to_unique_id)  # convert the DOI to a unique id
    # JSON rendering of the metadata table
    journal_meta_json = journal_meta.to_json(orient='records', default_handler=str)
    count = 0  # number of articles with at least one GitHub URL
    for i in range(len(journal_meta)):
        github_urls = []
        unique_id = journal_meta.iloc[i]['unique_id']
        fulltext_path = os.path.join(full_text_folder, journal_issn, unique_id + '.xml')
        sections = extract_sections_and_text_from_xml(fulltext_path)
        reorganized_sections = postprocess_sections(sections)
        paragraphs = []
        for depth, node in _walk_text_nodes(reorganized_sections):
            # Extract first, then extend: list.extend() returns None, so its
            # result must not be assigned to `urls`.
            urls = extract_github_urls(cleanup_abstract(node['text']))
            if urls:
                github_urls.extend(urls)
                if depth == 0:
                    print(urls)
        if not github_urls:
            continue
        count += 1
        # One entry per (URL, text node containing it); dumped to JSON below.
        results = []
        # dict.fromkeys drops repeated URLs while keeping first-seen order, so the
        # same URL is not sent to the API again for the same text node.
        for url in dict.fromkeys(github_urls):
            for _, node in _walk_text_nodes(reorganized_sections):
                # NOTE(review): URLs were extracted from the cleaned text but are
                # matched against the raw text here — confirm both always agree.
                if url not in node['text']:
                    continue
                location_text = cleanup_abstract(node['text'])
                context_info = get_context_with_url(location_text, url, context_up_range=20, context_down_range=2)
                tool_or_not = determine_repository_tool_openai_4o(context_info)
                print(tool_or_not)
                # .get(): an error result carries no 'isImplementation' key.
                if tool_or_not.get('isImplementation') == 'yes':
                    print(url)
                results.append({
                    "url": url,
                    "context_identified": tool_or_not,
                    "context_info": context_info
                })
        # Save the results to a JSON file for postprocessing, creating the
        # per-journal folder if it does not exist yet.
        save_context_path = os.path.join(save_folder, f'{journal_name}/{unique_id}.json')
        os.makedirs(os.path.dirname(save_context_path), exist_ok=True)
        with open(save_context_path, 'w') as f:
            json.dump(results, f, indent=4)
    print(f'{journal_name}: {count}/{len(journal_meta)}')

TRC
{'isImplementation': 'yes', 'reason': 'The repository contains Matlab code specifically for conducting numerical experiments related to the performance of different models and data representations in a random missing scenario, as described in the article.', 'keyIndicators': ['The repository is explicitly mentioned in the context of conducting numerical experiments.', "The description indicates that the code is used to test different models and data representations, which aligns with the article's focus.", 'The repository is likely to contain scripts or functions that implement the methods discussed in the article.']}
https://github.com/lijunsun/bgcp_imputation
{'isImplementation': 'no', 'reason': 'The GitHub repository mentioned is a collection of transportation network datasets, including the Sioux Falls network, rather than an implementation of a specific method or algorithm discussed in the article.', 'keyIndicators': ["The repository is titled 'TransportationNetworks', indicati

In [60]:
# Display the current contents of `results` (the list of per-URL classification entries).
results

[{'url': 'https://github.com/DanqingZ/CPS_TRC',
  'context_identified': {'isImplementation': 'yes',
   'reason': 'The provided GitHub link contains code developed for the experiments described, specifically for generating synthetic populations and simulating social networks within the San Francisco Bay Area context as discussed in the article.',
   'keyIndicators': ['The code we developed in this work is available at https://github.com/DanqingZ/CPS_TRC.',
    'To sum up, in this experiment section, we perform an illustrative experiment by generating a synthetic connected population.']}}]

In [31]:
# Show the key indicators stored in the classification result currently held in `tool_or_not`.
tool_or_not['keyIndicators']

['The repository specifically mentions the BGCP model implementation.',
 'MCMC sampling algorithms are included in the code, as discussed in the experiments.',
 "The numerical experiments are structured to evaluate model performance, consistent with the article's exploration of different representations."]

In [32]:
# Display the metadata table currently held in `journal_meta`.
journal_meta

Unnamed: 0,title,doi,volume,date,year,month,abstract,unique_id
0,A copula-based estimation of distribution algo...,10.1016/j.trc.2018.12.008,98,2019-01-01,2019,1,\n The importance of calibrat...,10.1016_j.trc.2018.12.008
1,A dynamic two-dimensional (D2D) weight-based m...,10.1016/j.trc.2018.12.009,98,2019-01-01,2019,1,\n Existing map-Matching (MM)...,10.1016_j.trc.2018.12.009
2,Where shall we sync? Clustering passenger flow...,10.1016/j.trc.2018.12.013,98,2019-01-01,2019,1,\n Minimizing passenger trans...,10.1016_j.trc.2018.12.013
3,Machine learning approach to predict aircraft ...,10.1016/j.trc.2018.09.007,98,2019-01-01,2019,1,\n Reliable and predictable g...,10.1016_j.trc.2018.09.007
4,Data-driven activity scheduler for agent-based...,10.1016/j.trc.2018.12.002,98,2019-01-01,2019,1,\n Activity-based modelling i...,10.1016_j.trc.2018.12.002
...,...,...,...,...,...,...,...,...
2045,A novel framework of the alternating direction...,10.1016/j.trc.2024.104843,169,2024-12-01,2024,12,\n This paper proposes a nove...,10.1016_j.trc.2024.104843
2046,A time-embedded attention-based transformer fo...,10.1016/j.trc.2024.104831,169,2024-12-01,2024,12,\n The real-time crash likeli...,10.1016_j.trc.2024.104831
2047,Copula-based transferable models for synthetic...,10.1016/j.trc.2024.104830,169,2024-12-01,2024,12,\n Population synthesis invol...,10.1016_j.trc.2024.104830
2048,The role of individual compensation and accept...,10.1016/j.trc.2024.104834,169,2024-12-01,2024,12,"\n High demand, rising custom...",10.1016_j.trc.2024.104834
