# API Request

In [None]:
from openai import OpenAI
import time
def request_openai(client, prompt, model, max_retry=1, temperature=1., top_p=0.9, max_token=2048, batch_size=1):
    """Send a single-turn user prompt to a chat-completions client and return the reply text.

    Args:
        client: Object exposing ``chat.completions.create(...)``.
        prompt: Text sent as the only ``user`` message.
        model: Model name forwarded to the API.
        max_retry: Number of retries after the first failed attempt, so up to
            ``max_retry + 1`` requests are made in total.
        temperature: Sampling temperature forwarded to the API.
        top_p: Nucleus-sampling value forwarded to the API.
        max_token: Forwarded to the API as ``max_tokens``.
        batch_size: Forwarded to the API as ``n``. Only the first returned
            choice is used, whatever this value is.

    Returns:
        The content of the first choice as a string. An empty string is
        returned when every attempt raised, or when the reply has no content.
    """
    messages = [{"role": "user", "content": prompt}]
    retry = 0
    while True:
        try:
            completion = client.chat.completions.create(
                model=model,
                messages=messages,
                max_tokens=max_token,
                temperature=temperature,
                top_p=top_p,
                n=batch_size,
            )
            # Extract the reply text from the response; normalise a missing
            # content to '' so callers can always treat the result as a string.
            return completion.choices[0].message.content or ''
        except Exception as e:
            # Deliberate best-effort boundary: any failure is retried, and
            # the caller receives '' once the retry budget is exhausted.
            if retry >= max_retry:
                # Report the final failure instead of swallowing it silently.
                print(f"Request failed after {retry + 1} attempt(s), giving up: {e}", flush=True)
                return ''
            print(f"Exception occurred, wait 3s: {e}", flush=True)
            time.sleep(3)
            retry += 1



import os

model = "deepseek-coder"
# prompt = 'write me a quicksort algorithm in python.'

# SECURITY: credentials must not be committed to source. The key is read from
# the environment instead; the key that was previously hard-coded on this line
# should be treated as leaked and revoked.
_deepseek_api_key = os.environ.get("DEEPSEEK_API_KEY")
if not _deepseek_api_key:
    raise RuntimeError("Set the DEEPSEEK_API_KEY environment variable before creating the client.")
eval_client = OpenAI(base_url="https://api.deepseek.com", api_key=_deepseek_api_key)
# res = request_openai(eval_client, prompt, model, batch_size=3)
# print(res)

# SWE Bench Eval

In [None]:
from datasets import load_dataset
# Load the "train" split of SWE-bench. Loading by dataset name (kept below for
# reference) was found to be too slow, so a local directory is passed instead.
# NOTE(review): the path looks like a machine-specific Hugging Face cache
# snapshot -- confirm it exists on the machine running this cell.
# swe_bench_data = load_dataset("princeton-nlp/SWE-bench", split="train")
swe_bench_data = load_dataset("/home/llm/.cache/huggingface/datasets/princeton-nlp___swe-bench/default/0.0.0/f5351ee8c6663736817027db3ad03fe662cb5bb8", split="train")


# Parse Ground Truth Location

In [None]:
import re
from tqdm import tqdm

# Headers are anchored to the start of a line: inside a hunk every content
# line starts with ' ', '+' or '-', so patched text that itself contains
# "diff --git ..." or "@@ ... @@" can no longer be mistaken for a real header.
_FILE_HEADER_RE = re.compile(r'^diff --git a/(.+) b/(.+)', re.MULTILINE)
# In unified diffs the ",count" part of a range is omitted when it equals 1
# (e.g. "@@ -3 +3 @@"), so both counts are optional.
_HUNK_HEADER_RE = re.compile(
    r'^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@(?: (.*))?', re.MULTILINE
)
# Marker line git emits for a file without a trailing newline; not file content.
_NO_NEWLINE_MARKER = '\\ No newline at end of file'


def parse_git_patch(patch):
    """Split a git patch into one record per hunk.

    Args:
        patch: Text of a git patch containing ``diff --git a/... b/...``
            file headers followed by ``@@ -a,b +c,d @@`` hunks.

    Returns:
        A list of dicts in patch order, one per hunk, with the keys:
        ``original_file_path`` / ``new_file_path`` (from the file header),
        ``original_start_line`` / ``original_line_count`` /
        ``new_start_line`` / ``new_line_count`` (strings taken from the hunk
        header; an omitted count is reported as ``"1"``),
        ``hunk_context`` (text after the closing ``@@``, ``''`` if absent),
        ``original_hunk_lines`` / ``new_hunk_lines`` (hunk body as it reads
        before / after the change, diff markers removed) and
        ``hunk_content`` (raw hunk text including its header).
        An input with no file headers yields an empty list.
    """
    results = []

    file_matches = list(_FILE_HEADER_RE.finditer(patch))
    for f_idx, f_match in enumerate(file_matches):
        # A file section runs up to the next file header (or the end of the patch).
        file_end = file_matches[f_idx + 1].start() if f_idx + 1 < len(file_matches) else None
        file_section = patch[f_match.start():file_end]
        orig_file_path, new_file_path = f_match.groups()

        hunk_matches = list(_HUNK_HEADER_RE.finditer(file_section))
        for h_idx, h_match in enumerate(hunk_matches):
            # A hunk runs up to the next hunk header (or the end of the file section).
            hunk_end = hunk_matches[h_idx + 1].start() if h_idx + 1 < len(hunk_matches) else None
            orig_start, orig_count, new_start, new_count, context = h_match.groups()

            orig_hunk_lines, new_hunk_lines = [], []
            for line in file_section[h_match.end():hunk_end].split('\n'):
                # Skip empty strings (e.g. from the trailing newline) and git's marker line.
                if not line or line == _NO_NEWLINE_MARKER:
                    continue
                marker, text = line[0], line[1:]
                if marker == '-':
                    orig_hunk_lines.append(text)
                elif marker == '+':
                    new_hunk_lines.append(text)
                else:
                    # Context line: present on both sides. Drop the one-character
                    # ' ' marker, as is done for '+'/'-'; a line with no
                    # recognised marker is kept verbatim.
                    if marker != ' ':
                        text = line
                    orig_hunk_lines.append(text)
                    new_hunk_lines.append(text)

            results.append({
                "original_file_path": orig_file_path,
                "new_file_path": new_file_path,
                "original_start_line": orig_start,
                "new_start_line": new_start,
                "hunk_context": context or '',
                "original_hunk_lines": orig_hunk_lines,
                "new_hunk_lines": new_hunk_lines,
                "original_line_count": orig_count or '1',
                "new_line_count": new_count or '1',
                "hunk_content": file_section[h_match.start():hunk_end],
            })
    return results



# Count the benchmark entries whose patch yields no parsed hunks at all.
count = 0
for bench_data in tqdm(swe_bench_data):
    patch = bench_data['patch']
    parsed_patch_info = parse_git_patch(patch)
    mismatch = False
    # for info in parsed_patch_info:
    #     if not (abs(int(info["original_line_count"])-len(info["original_hunk_lines"])) <= 1
    #             and abs(int(info["new_line_count"])-len(info["new_hunk_lines"])) <=1):
    #         mismatch = True # This happens when the patch itself contains git header info
    if parsed_patch_info:
        continue
    count += 1
# Bare expression so the notebook cell displays the total.
count

# Locate File Structures

In [None]:
from get_repo_structure.get_repo_structure import (get_project_structure_from_scratch)
from agentless.util.preprocess_data import (show_project_structure)

# Work on the first benchmark entry: build its repository structure and parse
# its ground-truth patch into hunks.
bench_data = swe_bench_data[0]
d = get_project_structure_from_scratch(
    bench_data["repo"],
    bench_data["base_commit"],
    bench_data["instance_id"],
    "/dataset/zszeng/AgentlessOutputs/playground",
)
patch_info = parse_git_patch(bench_data['patch'])


In [None]:
# Prompt asking the model to pick the files to edit for an issue.
# Placeholders: {problem_statement} and {structure}.
obtain_relevant_files_prompt = """
Please look through the following GitHub problem description and Repository structure and provide a list of files that one would need to edit to fix the problem.

### GitHub Problem Description ###
{problem_statement}

###

### Repository Structure ###
{structure}

###

Based on the problem description and repo structure, please give a brief analysis on which set of files are necessary to edit, then retrieve the relevant directory structures and return at most 10 files in full path.
Following is the desired format:
### 
Analysis: [Insert a brief analysis on which set of files are necessary to edit based on the problem description and repo structure]
###
Relevant Directories: [Retrieve the **COMPLETE** relevant directories strcuture from the Repository Structure here based on your analysis. Make sure the retrieved directory IS NOT a sub directory but contains the root folder!]
###
Relevant File Paths: [Put the **FULL PATHS** of the files that are relevant to the problem here. Each path should be in a single line. Return the necessary files only but limit the maximum number of files to 10.]
###
"""

# Same prompt followed by a hint listing the ground-truth files.
# Built from the base prompt so the shared text cannot drift apart.
# Extra placeholder: {ground_truth_modified_files}.
obtain_relevant_files_prompt_with_hint = obtain_relevant_files_prompt + """
Hint: The followings are the ground truth files that need to be modified, please construct your formatted response based on this info:
{ground_truth_modified_files}
"""
# Distinct original-side file paths touched by the ground-truth patch.
ground_truth_modified_files = {info["original_file_path"] for info in patch_info}

# Fill the hint-free prompt with the issue text and the rendered repository tree.
message = obtain_relevant_files_prompt.format(
    problem_statement=bench_data["problem_statement"],
    structure=show_project_structure(d["structure"]).strip(),
)
message = message.strip()

# message = obtain_relevant_files_prompt_with_hint.format(
#             problem_statement=bench_data["problem_statement"],
#             structure=show_project_structure(d["structure"]).strip(),
#             ground_truth_modified_files="\n".join(ground_truth_modified_files)
# ).strip()


# Ask the model for the relevant files and show its raw answer.
res = request_openai(client=eval_client, prompt=message, model=model, temperature=0.5)
print(res)

In [None]:
# Print the rendered repository structure (stripped of surrounding whitespace)
# for manual inspection.
print(show_project_structure(d["structure"]).strip())
# ground_truth_modified_files

# Locate functions/classes/variables

In [None]:
from agentless.util.preprocess_data import (
    correct_file_paths,
    get_full_file_paths_and_classes_and_functions,
    get_repo_files
)
from agentless.util.compress_file import get_skeleton
# Wrapper that presents one file to the model as a fenced python block.
# Placeholders: {file_name} and {file_content}.
file_content_in_block_template = """
### File: {file_name} ###
```python
{file_content}
```
"""
# Prompt asking the model to name the classes, functions and global variables
# that need inspection or editing, given the issue text and file skeletons.
# Placeholders: {problem_statement} and {file_contents}.
obtain_relevant_functions_and_vars_from_compressed_files_prompt_more = """
Please look through the following GitHub Problem Description and the Skeleton of Relevant Files.
Identify all locations that need inspection or editing to fix the problem, including directly related areas as well as any potentially related global variables, functions, and classes.
For each location you provide, either give the name of the class, the name of a method in a class, the name of a function, or the name of a global variable.

### GitHub Problem Description ###
{problem_statement}

### Skeleton of Relevant Files ###
{file_contents}

###

Please provide the complete set of locations as either a class name, a function name, or a variable name.
Note that if you include a class, you do not need to list its specific methods.
You can include either the entire class or don't include the class name and instead include specific methods in the class.
### Examples:
```
full_path1/file1.py
function: my_function_1
class: MyClass1
function: MyClass2.my_method

full_path2/file2.py
variable: my_var
function: MyClass3.my_method

full_path3/file3.py
function: my_function_2
function: my_function_3
function: MyClass4.my_method_1
class: MyClass5
```

Return just the locations.
"""
# Take everything after the last "Relevant File Paths:" marker and keep one
# candidate path per line. Lines are stripped, and blank lines and the "###"
# separator requested by the prompt format are dropped so that they are not
# handed on as file paths.
# NOTE(review): an empty candidate presumably matches every file under
# partial-path matching in correct_file_paths -- confirm against that helper.
model_found_files = [
    path
    for path in (line.strip() for line in res.split("Relevant File Paths:")[-1].splitlines())
    if path and path != "###"
]
files, classes, functions = get_full_file_paths_and_classes_and_functions(d["structure"])

# sort based on order of appearance in model_found_files
found_files = correct_file_paths(model_found_files, files, True)

# Fetch the selected files and reduce each one to its skeleton before prompting.
file_contents = get_repo_files(d["structure"], found_files)
compressed_file_contents = {
    fn: get_skeleton(code) for fn, code in file_contents.items()
}
contents = [
    file_content_in_block_template.format(file_name=fn, file_content=code)
    for fn, code in compressed_file_contents.items()
]
# From here on file_contents is the concatenated prompt text, not the dict.
file_contents = "".join(contents)
template = (
    obtain_relevant_functions_and_vars_from_compressed_files_prompt_more
)
message = template.format(
    problem_statement=bench_data["problem_statement"], file_contents=file_contents
)
# temperature=0. is passed for this request.
res = request_openai(eval_client, message, model, temperature=0.)
print(res)

In [None]:
# Scratch check: sets compare equal regardless of element order.
{"3", "2", "1"} == {"2", "1", "3"}