In [1]:
import json
import os

import openai
import yaml
# Load the OpenAI API key from a YAML config file.
# The location can be overridden with the RR_CONFIG_PATH environment variable so the
# notebook is not tied to a single machine; the original absolute path stays the default.
CONFIG_PATH = os.environ.get("RR_CONFIG_PATH", "/Users/junyi/Work/RR/config.yaml")
with open(CONFIG_PATH, "r") as config_file:
    config = yaml.safe_load(config_file)

# Fail here with a readable message instead of at the first API call
# (an empty YAML file makes safe_load return None).
if not isinstance(config, dict) or not config.get("openai_api_key"):
    raise KeyError(f"'openai_api_key' is missing from {CONFIG_PATH}")

openai.api_key = config["openai_api_key"]
def determine_repository_tool_openai(context_info):
    """
    Ask the "gpt-4o-mini" chat model whether a cited repository implements the
    method described in an article, and return its answer as parsed JSON.

    ``context_info`` is concatenated in front of a fixed instruction prompt that asks
    the model for a JSON object with the keys "isImplementation", "reason" and
    "keyIndicators". The reply is unwrapped from an optional Markdown code fence
    and parsed with ``json.loads``.

    Args:
        context_info: Text describing where/how the repository is mentioned. It is
            joined to the prompt with ``+``, so it must be a string.

    Returns:
        The parsed JSON value on success (whatever the model produced; the prompt
        requests the three keys above). On failure, a dict with an "error" key:
        it also carries "rawResponse" when the reply was empty or not valid JSON.
        Any ``Exception`` raised while calling the API is caught and reported the
        same way, so callers must check for "error" before reading other keys.
    """
    prompt = """
    **Task**:
    Based on the description above, evaluate whether the provided code repository is an actual implementation of the method discussed in the article.

    **Guidelines**:

    - **Definition of Implementation**: The code is considered an implementation if it is written to test the method or to provide code for reproducing the results presented in the article or to share the code and data for the article.
    
    - **Non-Implementation**: Some repositories may be cited because they utilize existing tools or datasets rather than providing an implementation of the described method. If the GitHub link is just for an existing tool or dataset and not an implementation of the method, please respond with "no".

    - **Indicators of Implementation**:
    - Look for explicit statements such as "the code is available/provided/can be accessed at" or "the code is available at".
    - Such statements are good indicators that the repository contains an implementation of the method.

    Please respond with a structured output in the following format:

    ```json
    {
    "isImplementation": "yes" or "no",
    "reason": "[Provide a brief explanation here]",
    "keyIndicators": ["[Optional: List any key points or observations supporting your reasoning (direct quotes from the text)]"]
    }
    ```
    """
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are an expert researcher specializing in transportation systems and software tools."},
                {"role": "user", "content": (
                    context_info + prompt
                )}
            ],
            temperature=0,
            max_tokens=16383,
        )

        # Extract the model's response
        content = response.choices[0].message.content
        if content is None:
            # Nothing to parse; report it explicitly instead of failing on a str method.
            return {
                "error": "The model returned no message content.",
                "rawResponse": None
            }

        # Cleanup: remove Markdown code-fence markers if present. Work on a
        # whitespace-trimmed copy and only drop a closing fence that is really
        # there, so a trailing newline or a missing closing fence cannot chop
        # characters off the JSON itself.
        stripped = content.strip()
        if stripped.startswith("```"):
            stripped = stripped[3:]
            if stripped[:4].lower() == "json":  # optional language tag
                stripped = stripped[4:]
            if stripped.endswith("```"):
                stripped = stripped[:-3]
            content = stripped.strip()

        # Ensure the output is valid JSON
        try:
            structured_output = json.loads(content)
            return structured_output  # Return the parsed JSON value
        except json.JSONDecodeError:
            # Handle cases where the output isn't valid JSON
            return {
                "error": "Invalid JSON format returned by the model, even after cleanup.",
                "rawResponse": content
            }

    except Exception as e:
        # Handle API call errors
        return {
            "error": f"An error occurred: {str(e)}"
        }
    
def determine_repository_tool_openai_4o(context_info):
    """
    Ask the "gpt-4o" chat model whether a cited repository implements the method
    described in an article, requesting schema-constrained JSON output.

    ``context_info`` is concatenated in front of a fixed instruction prompt. The
    request passes a ``response_format`` of type "json_schema" (strict) describing an
    object with "isImplementation" ("yes"/"no"), "reason" (string) and
    "keyIndicators" (list of strings). The reply is still unwrapped from an optional
    Markdown code fence before being parsed with ``json.loads``.

    Args:
        context_info: Text describing where/how the repository is mentioned. It is
            joined to the prompt with ``+``, so it must be a string.

    Returns:
        The parsed JSON value on success. On failure, a dict with an "error" key:
        it also carries "rawResponse" when the reply was empty or not valid JSON.
        Any ``Exception`` raised while calling the API is caught and reported the
        same way, so callers must check for "error" before reading other keys.
    """
    # NOTE(review): the prompt ends with a lone "```" line, which looks like a leftover
    # from a removed JSON example. It is kept byte-identical so results stay comparable
    # with earlier runs; consider removing it.
    prompt = """
    **Task**:
    Based on the description above, evaluate whether the provided code repository is an actual implementation of the method discussed in the article.

    **Guidelines**:

    - **Definition of Implementation**: The code is considered an implementation if it is written to test the method or to provide code for reproducing the results presented in the article or to share the code and data for the article.
    
    - **Non-Implementation**: Some repositories may be cited because they utilize existing tools or datasets rather than providing an implementation of the described method. If the GitHub link is just for an existing tool or dataset and not an implementation of the method, please respond with "no".

    - **Indicators of Implementation**:
    - Look for explicit statements such as "the code is available/provided/can be accessed at" or "the code is available at".
    - Such statements are good indicators that the repository contains an implementation of the method.

    ```
    """
    # Schema the reply must follow (strict structured output).
    implementation_schema = {
        "type": "object",
        "properties": {
            "isImplementation": {
                "type": "string",
                "description": "Indicates whether the implementation is present or not.",
                "enum": [
                    "yes",
                    "no"
                ]
            },
            "reason": {
                "type": "string",
                "description": "A brief explanation supporting the isImplementation value."
            },
            "keyIndicators": {
                "type": "array",
                "description": "A list of key points or observations supporting the reasoning.",
                "items": {
                    "type": "string"
                }
            }
        },
        "required": [
            "isImplementation",
            "reason",
            "keyIndicators"
        ],
        "additionalProperties": False
    }
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are an expert researcher specializing in transportation systems and software tools."},
                {"role": "user", "content": (
                    context_info + prompt
                )},
            ],
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": "implementation_schema",
                    "strict": True,
                    "schema": implementation_schema
                }
            },
            temperature=0,    # to make the response more deterministic
            max_tokens=16383,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )

        # Extract the model's response
        content = response.choices[0].message.content
        if content is None:
            # Nothing to parse; report it explicitly instead of failing on a str method.
            return {
                "error": "The model returned no message content.",
                "rawResponse": None
            }

        # Cleanup: remove Markdown code-fence markers if present. Work on a
        # whitespace-trimmed copy and only drop a closing fence that is really
        # there, so a trailing newline or a missing closing fence cannot chop
        # characters off the JSON itself.
        stripped = content.strip()
        if stripped.startswith("```"):
            stripped = stripped[3:]
            if stripped[:4].lower() == "json":  # optional language tag
                stripped = stripped[4:]
            if stripped.endswith("```"):
                stripped = stripped[:-3]
            content = stripped.strip()

        # Ensure the output is valid JSON
        try:
            structured_output = json.loads(content)
            return structured_output  # Return the parsed JSON value
        except json.JSONDecodeError:
            # Handle cases where the output isn't valid JSON
            return {
                "error": "Invalid JSON format returned by the model, even after cleanup.",
                "rawResponse": content
            }

    except Exception as e:
        # Handle API call errors
        return {
            "error": f"An error occurred: {str(e)}"
        }

In [17]:
import os
import pandas as pd
# the utils.py file contains the reusable functions from the other notebooks
from utils import *
# Root of the dataset checkout. Override with the RR_DATASET_DIR environment variable to
# run on another machine; the default keeps the original location.
dataset_root = os.environ.get('RR_DATASET_DIR', '/Users/junyi/Work/RR/rr-measure-dataset')
full_data_folder = os.path.join(dataset_root, 'journal-full-text')
meta_data_folder = os.path.join(dataset_root, 'journal-meta')
result_data_folder = os.path.join(dataset_root, 'journal-results', 'GitHub-Implementation')
# Create the results folder if it does not exist yet; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(result_data_folder, exist_ok=True)
# Metadata table with one row per paper.
data = pd.read_csv(os.path.join(meta_data_folder, 'full-meta-dataset.csv'))
len(data)

10990

In [24]:
# Per-paper flags filled in by the loop below.
data['is_github'] = 0
data['num_of_github_urls'] = 0
# NOTE: the column name is misspelled ("implmentation"); it is kept as-is because the
# exported CSV may already be consumed under this name.
data['is_implmentation'] = 0


def _iter_text_nodes(sections):
    """Yield every section, subsection and subsubsection dict in document order."""
    for section in sections:
        yield section
        for subsection in section['subsections']:
            yield subsection
            for subsubsection in subsection['subsubsections']:
                yield subsubsection


# Collected GitHub metadata for every paper that mentions at least one GitHub URL;
# written to a single JSON file after the loop.
github_data = []
# For a quick smoke test, iterate over range(100) instead of the full table.
for i in range(len(data)):
    journal_path = os.path.join(full_data_folder, data['issn'][i])  # Path to the journal folder
    paper_path = os.path.join(journal_path, data['unique_id'][i] + '.xml')  # Path to the paper XML file
    sections = extract_sections_and_text_from_xml(paper_path)
    reorganized_sections = postprocess_sections(sections)
    # Flatten the three nesting levels once so both passes below walk the same nodes.
    text_nodes = list(_iter_text_nodes(reorganized_sections))

    # Pass 1: collect every GitHub URL mentioned anywhere in the paper.
    github_urls = []
    github_full_urls = []
    for node in text_nodes:
        cleaned_text = cleanup(node['text'])
        github_urls.extend(extract_github_urls(cleaned_text) or [])
        github_full_urls.extend(extract_full_github_urls(cleaned_text) or [])

    if not github_urls:
        continue

    # Mark as GitHub present
    data.loc[i, 'is_github'] = 1
    # Remove duplicate URLs and count unique URLs
    unique_github_urls = set(github_urls)
    print('GitHub URLs found:', unique_github_urls)
    unique_github_full_urls = set(github_full_urls)
    data.loc[i, 'num_of_github_urls'] = len(unique_github_urls)
    # Sorted so the saved files and the order of API calls are reproducible
    # (plain set iteration order varies between runs).
    github_data.append({
        'issn': data['issn'][i],
        'unique_id': data['unique_id'][i],
        'title': data['title'][i],
        'github_urls': sorted(unique_github_urls),
        'github_full_urls': sorted(unique_github_full_urls)
    })

    # Pass 2: classify each mention of each URL from its surrounding text.
    results = []
    for url in sorted(unique_github_urls):
        for node in text_nodes:
            # NOTE(review): URLs were extracted from cleanup(text) but are matched against
            # the raw text here; if cleanup() alters a URL the mention is skipped - confirm.
            if url not in node['text']:
                continue
            location_text = cleanup_abstract(node['text'])
            context_info = get_context_with_url(location_text, url, context_up_range=20, context_down_range=5)
            tool_or_not = determine_repository_tool_openai_4o(context_info)
            print(tool_or_not)
            results.append({
                "title": data['title'][i],
                "doi": data['doi'][i],
                "journal_name": data['journal_name'][i],
                "unique_id": data['unique_id'][i],
                "section": node['title'],
                "url": url,
                "context_identified": tool_or_not,
                "context_info": context_info
            })
            # The classifier returns {"error": ...} without "isImplementation" when the
            # API call or JSON parsing fails; .get() keeps one failure from aborting
            # the whole run with a KeyError (the error is still stored in results).
            if tool_or_not.get('isImplementation') == 'yes':
                data.loc[i, 'is_implmentation'] = 1
    if results:
        save_json(results, os.path.join(result_data_folder, data['unique_id'][i] + '_github_isImplememtaionResults.json'))
# save the json file github_data
save_json(github_data, os.path.join(meta_data_folder, 'github_data.json'))

GitHub URLs found: {'https://github.com/availabs/MTA_Subway_SIRI_Server'}
{'isImplementation': 'yes', 'reason': 'The description provides a detailed account of a system designed to translate GTFS-R messages into SIRI responses, and the GitHub repository link is provided as a source for the open-source code of this translation tool. This indicates that the repository contains the actual implementation of the described method.', 'keyIndicators': ['The description explicitly mentions that the open-source code for the translation tool is available at the provided GitHub link.', "The system's architecture and functionality are described in detail, suggesting that the repository contains the implementation of these components.", 'The repository is mentioned in the context of providing more information and access to the code, which aligns with the definition of an implementation.']}
GitHub URLs found: {'https://github.com/numenta/NAB'}
{'isImplementation': 'no', 'reason': 'The GitHub reposito

In [None]:
# Write the metadata table, including the columns added by the scan above,
# to a new CSV next to the input metadata (row index omitted).
output_csv_path = os.path.join(meta_data_folder, 'full-meta-dataset-github-implementation.csv')
data.to_csv(output_csv_path, index=False)