In [None]:
import openai
import yaml
import json
# Load the OpenAI API key from the local YAML config file.
# NOTE(review): the path is machine-specific; adjust it when running elsewhere.
# The encoding is pinned so decoding does not depend on the platform locale.
with open("/Users/junyi/Work/RR/config.yaml", "r", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

# Fail with an explicit message instead of an opaque TypeError (empty file,
# where safe_load returns None) or a bare KeyError (key absent).
if not isinstance(config, dict) or "openai_api_key" not in config:
    raise KeyError("'openai_api_key' is missing from config.yaml")

openai.api_key = config["openai_api_key"]
def determine_repository_tool_openai(context_info):
    """
    Ask ``gpt-4o-mini`` whether a cited repository implements the article's method.

    ``context_info`` is concatenated with a fixed instruction prompt that asks
    the model for a JSON verdict. The reply is stripped of an optional Markdown
    code fence and parsed with ``json.loads``.

    Args:
        context_info (str): Article text surrounding the repository URL. It is
            placed directly in front of the instruction prompt.

    Returns:
        The parsed JSON reply on success. The prompt requests the keys
        ``isImplementation``, ``reason`` and ``keyIndicators``, but they are
        not validated here. On failure a dict with an ``"error"`` key is
        returned instead: ``{"error": ..., "rawResponse": ...}`` when the reply
        is empty or not valid JSON, or ``{"error": ...}`` when the API call
        itself fails.
    """
    prompt = """
    **Task**:
    Based on the description above, evaluate whether the provided code repository is an actual implementation of the method discussed in the article.

    **Guidelines**:

    - **Definition of Implementation**: The code is considered an implementation if it is written to test the method or to provide code for reproducing the results presented in the article or to share the code and data for the article.
    
    - **Non-Implementation**: Some repositories may be cited because they utilize existing tools or datasets rather than providing an implementation of the described method. If the GitHub link is just for an existing tool or dataset and not an implementation of the method, please respond with "no".

    - **Indicators of Implementation**:
    - Look for explicit statements such as "the code is available/provided/can be accessed at" or "the code is available at".
    - Such statements are good indicators that the repository contains an implementation of the method.

    Please respond with a structured output in the following format:

    ```json
    {
    "isImplementation": "yes" or "no",
    "reason": "[Provide a brief explanation here]",
    "keyIndicators": ["[Optional: List any key points or observations supporting your reasoning (direct quotes from the text)]"]
    }
    ```
    """
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are an expert researcher specializing in transportation systems and software tools."},
                {"role": "user", "content": (
                    context_info + prompt
                )}
            ],
            temperature=0,
            max_tokens=16383,
        )
        content = response.choices[0].message.content
    except Exception as e:
        # Best-effort boundary: any API or response-shape failure is reported
        # to the caller as data rather than raised.
        return {
            "error": f"An error occurred: {str(e)}"
        }

    if content is None:
        return {
            "error": "The model returned no content.",
            "rawResponse": content
        }

    # Cleanup: remove a Markdown code fence if present. Surrounding whitespace
    # is stripped first and the closing fence is removed only when it is really
    # there, so a trailing newline or a missing closing fence cannot eat into
    # the JSON payload.
    content = content.strip()
    if content.startswith("```"):
        content = content[3:]
        if content.startswith("json"):
            content = content[4:]
        if content.endswith("```"):
            content = content[:-3]
        content = content.strip()

    try:
        return json.loads(content)
    except json.JSONDecodeError:
        # Hand the raw text back so the caller can inspect what the model said.
        return {
            "error": "Invalid JSON format returned by the model, even after cleanup.",
            "rawResponse": content
        }
    
def determine_repository_tool_openai_4o(context_info):
    """
    Ask ``gpt-4o`` whether a cited repository implements the article's method.

    ``context_info`` is concatenated with a fixed instruction prompt. The
    request carries a strict JSON-schema ``response_format`` whose object has
    the required keys ``isImplementation`` (``"yes"``/``"no"``), ``reason``
    and ``keyIndicators`` (list of strings). The reply is parsed with
    ``json.loads``; stripping of a Markdown code fence is kept as a defensive
    fallback.

    Args:
        context_info (str): Article text surrounding the repository URL. It is
            placed directly in front of the instruction prompt.

    Returns:
        The parsed JSON reply on success. On failure a dict with an ``"error"``
        key is returned instead: ``{"error": ..., "rawResponse": ...}`` when
        the reply is empty or not valid JSON, or ``{"error": ...}`` when the
        API call itself fails. Callers should therefore not assume that
        ``isImplementation`` is present.
    """
    # NOTE(review): the prompt ends with a stray, unclosed code fence that looks
    # like a leftover from the format section of the gpt-4o-mini prompt. It is
    # kept as-is so the text sent to the model does not change.
    prompt = """
    **Task**:
    Based on the description above, evaluate whether the provided code repository is an actual implementation of the method discussed in the article.

    **Guidelines**:

    - **Definition of Implementation**: The code is considered an implementation if it is written to test the method or to provide code for reproducing the results presented in the article or to share the code and data for the article.
    
    - **Non-Implementation**: Some repositories may be cited because they utilize existing tools or datasets rather than providing an implementation of the described method. If the GitHub link is just for an existing tool or dataset and not an implementation of the method, please respond with "no".

    - **Indicators of Implementation**:
    - Look for explicit statements such as "the code is available/provided/can be accessed at" or "the code is available at".
    - Such statements are good indicators that the repository contains an implementation of the method.

    ```
    """
    # Strict structured-output schema describing the expected verdict object.
    response_format = {
        "type": "json_schema",
        "json_schema": {
            "name": "implementation_schema",
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {
                    "isImplementation": {
                        "type": "string",
                        "description": "Indicates whether the implementation is present or not.",
                        "enum": [
                            "yes",
                            "no"
                        ]
                    },
                    "reason": {
                        "type": "string",
                        "description": "A brief explanation supporting the isImplementation value."
                    },
                    "keyIndicators": {
                        "type": "array",
                        "description": "A list of key points or observations supporting the reasoning.",
                        "items": {
                            "type": "string"
                        }
                    }
                },
                "required": [
                    "isImplementation",
                    "reason",
                    "keyIndicators"
                ],
                "additionalProperties": False
            }
        }
    }
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are an expert researcher specializing in transportation systems and software tools."},
                {"role": "user", "content": (
                    context_info + prompt
                )},
            ],
            response_format=response_format,
            temperature=0,
            max_tokens=16383,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        content = response.choices[0].message.content
    except Exception as e:
        # Best-effort boundary: any API or response-shape failure is reported
        # to the caller as data rather than raised.
        return {
            "error": f"An error occurred: {str(e)}"
        }

    if content is None:
        # Presumably happens when the model refuses to answer; confirm against
        # the API documentation.
        return {
            "error": "The model returned no content.",
            "rawResponse": content
        }

    # Cleanup: remove a Markdown code fence if present. Surrounding whitespace
    # is stripped first and the closing fence is removed only when it is really
    # there, so a trailing newline or a missing closing fence cannot eat into
    # the JSON payload.
    content = content.strip()
    if content.startswith("```"):
        content = content[3:]
        if content.startswith("json"):
            content = content[4:]
        if content.endswith("```"):
            content = content[:-3]
        content = content.strip()

    try:
        return json.loads(content)
    except json.JSONDecodeError:
        # Hand the raw text back so the caller can inspect what the model said.
        return {
            "error": "Invalid JSON format returned by the model, even after cleanup.",
            "rawResponse": content
        }
        

In [None]:
def _iter_text_nodes(sections):
    """Yield each section, then its subsections and their subsubsections, depth-first."""
    for section in sections:
        yield section
        for subsection in section['subsections']:
            yield subsection
            yield from subsection['subsubsections']


# For every article of the selected journal: collect the GitHub URLs mentioned
# in the full text, ask the model whether each mention refers to an
# implementation of the article's method, and store the verdicts as one JSON
# file per article.
for journal in journals:
    if journal != '0968-090X.csv':  # only the trc dataset is taken for this test
        continue
    journal_issn = journal.replace('.csv', '')
    journal_name = journal_issn_df[journal_issn_df['issn'] == journal_issn]['journal'].values[0]
    print(journal_name)
    journal_meta = pd.read_csv(os.path.join(meta_folder, journal))
    journal_meta['unique_id'] = journal_meta['doi'].apply(doi_to_unique_id)  # convert the doi to a unique id
    # JSON view of the metadata table (not used further in this cell).
    journal_meta_json = journal_meta.to_json(orient='records', default_handler=str)
    journal_save_folder = os.path.join(save_folder, journal_name)
    count = 0  # number of articles that mention at least one GitHub URL
    for unique_id in journal_meta['unique_id']:
        fulltext_path = os.path.join(full_text_folder, journal_issn, unique_id + '.xml')
        sections = extract_sections_and_text_from_xml(fulltext_path)
        reorganized_sections = postprocess_sections(sections)
        # Clean every section / subsection / subsubsection text once and reuse
        # it for both URL extraction and context lookup.
        cleaned_texts = [
            cleanup_abstract(node['text'])
            for node in _iter_text_nodes(reorganized_sections)
        ]

        github_urls = []
        for text in cleaned_texts:
            urls = extract_github_urls(text)
            if urls:
                print(urls)
                github_urls.extend(urls)
        if not github_urls:
            continue
        count += 1

        results = []
        # Drop repeated URLs (order preserved) so the same (url, text) pair is
        # not sent to the API, and stored, more than once.
        for url in dict.fromkeys(github_urls):
            for location_text in cleaned_texts:
                # The URLs were extracted from the cleaned text, so membership
                # is tested against that same text.
                # NOTE(review): assumes cleanup_abstract may alter the raw text
                # around a URL; confirm against its implementation.
                if url not in location_text:
                    continue
                context_info = get_context_with_url(location_text, url, context_up_range=20, context_down_range=2)
                tool_or_not = determine_repository_tool_openai_4o(context_info)
                print(tool_or_not)
                # .get(): the classifier returns an {"error": ...} dict without
                # 'isImplementation' when the API call or JSON parsing fails.
                if tool_or_not.get('isImplementation') == 'yes':
                    print(url)
                results.append({
                    "url": url,
                    "context_identified": tool_or_not,
                    "context_info": context_info
                })

        # Save the results to a json file for postprocessing; makedirs also
        # creates save_folder itself when it does not exist yet.
        os.makedirs(journal_save_folder, exist_ok=True)
        save_context_path = os.path.join(journal_save_folder, f'{unique_id}.json')
        with open(save_context_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=4)
    print(f'{journal_name}: {count}/{len(journal_meta)}')