
* https://github.com/haesleinhuepf/human-eval-bia/blob/main/demo/create_cases.ipynb
* https://github.com/haesleinhuepf/human-eval-bia/blob/main/demo/create_samples.ipynb

In order to make this notebook work, install human-eval-bia as explained [here](https://github.com/haesleinhuepf/human-eval-bia/tree/main?tab=readme-ov-file#installation):
```
git clone https://github.com/haesleinhuepf/human-eval-bia/
cd human-eval-bia
pip install -e .
pip install -r requirements.txt
``` 

In [1]:
directory = "data/"
num_samples_per_task = 10

In [2]:
import requests
import json
import pandas as pd
import time
import os
import openai
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from fine_tuning_utilities import load_jsonl_file, save_jsonl_file
from human_eval.data import write_jsonl, read_problems, extract_python


In [3]:
url = "https://raw.githubusercontent.com/haesleinhuepf/human-eval-bia/main/data/human-eval-bia.jsonl"

response = requests.get(url)
data = [json.loads(line) for line in response.text.splitlines()]
df = pd.DataFrame(data)

df.head(2)

Unnamed: 0,task_id,prompt,canonical_solution,entry_point,test
0,../test_cases/apply_otsu_threshold_and_count_p...,def apply_otsu_threshold_and_count_postiive_pi...,\n import skimage\n import numpy as np\n...,apply_otsu_threshold_and_count_postiive_pixels,def check(candidate):\n import numpy as np\...
1,../test_cases/binary_closing.ipynb,"def binary_closing(binary_image, radius:int=1)...",\n import numpy as np\n import skimage\n...,binary_closing,def check(candidate):\n import numpy as np\...


In [4]:
validation_data = load_jsonl_file("validation_data.jsonl")
prompts = [t["messages"][0]["content"] for t in validation_data]
prompts

['def apply_otsu_threshold_and_count_postiive_pixels(image):\n    """\n    Takes an image, applies Otsu\'s threshold method to it to create a binary image and \n    counts the positive pixels.\n    """',
 'def convex_hull_measure_area(point_cloud):\n    """\n    Take a 3D point_cloud, determines the convex hull around the points and returns the surface area of the convex hull.\n    """',
 'def measure_properties_of_regions(label_image, intensity_image):\n    """\n    Takes a label image and an intensity image, and returns pandas dataframe\n    with measurements for area, perimeter and mean_intensity.\n    """',
 'def detect_edges(image):\n    """\n    Applies an edge-detection filter to an image.\n    """',
 'def open_zarr(zarr_file_location):\n    """\n    Opens a zarr file and returns the array\n    """',
 'def workflow_segment_measure_umap(image):\n    """\n    This function takes a single channel intensity image, \n    segments objects with intensity above half the maximum intensit

In [5]:
training_df = df[df['prompt'].isin(prompts) == False]
validation_df = df[df['prompt'].isin(prompts)]
print(len(training_df), len(validation_df))


45 12


In [6]:
import os
os.makedirs(directory, exist_ok=True)

In [7]:
def setup_prompt(input_code):
    prompt = f"""Complete the following code. 
    First, write down a plan as comments how to solve the problem step-by-step.
    Then, import the python libraries you think you will use.
    Then, write the function you were asked for.
    Write python code only.
    Do NOT write any code for testing the function.
    Return the complete code including my code.

```python
{input_code}
```
    """
    return prompt

In [8]:
def convert_df_to_jsonl(df):
    return [
        {
            'task_id': row['task_id'],
            'prompt': row['prompt'],
            'canonical_solution': row['canonical_solution'],
            'entry_point': row['entry_point'],
            'test': row['test']
        }
        for _, row in df.iterrows()
    ]

save_jsonl_file(convert_df_to_jsonl(training_df), directory + "training_human-eval-bia.jsonl")
save_jsonl_file(convert_df_to_jsonl(validation_df), directory + "validation_human-eval-bia.jsonl")

## Sampling

In [9]:
use_reference = False
use_gpt_4o = False
use_gpt_4o_fine_tuned = True

In [10]:
for problem_file in ["training_human-eval-bia.jsonl"]: #, "validation_human-eval-bia.jsonl"]:
    
    problems = read_problems(directory + problem_file)

    code_generators = {}

    if use_reference:
        # actually not a model, but to the evaluation framework it appears like:
        model_reference = 'reference'
        problems_data = read_problems(directory + problem_file)
    
        def generate_reference(input_code):
            # This is a computationally wasteful solution, 
            # but like this it fits well in the framework
            for task_id, problem in problems_data.items():
                if problem['prompt'] == input_code:
                    return problem['canonical_solution']
        
        code_generators[model_reference] = generate_reference

    if use_gpt_4o:
        model_gpt_4o = "gpt-4o-2024-08-06"
        def generate_one_completion_gpt_4o(input_code):
            import openai
            client = openai.OpenAI()
            response = client.chat.completions.create(
                model=model_gpt_4o,
                messages=[{"role": "user", "content": setup_prompt(input_code)}],
            )
            return response.choices[0].message.content.strip()
            
        code_generators[model_gpt_4o] = generate_one_completion_gpt_4o

    if use_gpt_4o_fine_tuned:      
        model_gpt_4o_fine_tuned = "ft:gpt-4o-2024-08-06:leipzig-university::9ydjNWWH"
        def generate_one_completion_gpt_4o_fine_tuned(input_code):
            import openai
            client = openai.OpenAI()
            response = client.chat.completions.create(
                model=model_gpt_4o_fine_tuned,
                messages=[{"role": "user", "content": setup_prompt(input_code)}],
            )
            return response.choices[0].message.content.strip()
            
        code_generators[model_gpt_4o_fine_tuned] = generate_one_completion_gpt_4o_fine_tuned

    subset = problem_file.split("_")[0]
    
    for model_name, generate_one_completion in code_generators.items():
        samples = []
    
        for i in range(num_samples_per_task):
            for task_id in problems:
                print(model_name, task_id, i)
    
                response = generate_one_completion(problems[task_id]["prompt"])
                code = extract_python(response)
                
                samples.append(dict(task_id=task_id, completion=code, full_response=response))
        
        write_jsonl(f"{directory}samples_{subset}_{model_name}.jsonl".replace(":", "_"), samples)


ft:gpt-4o-2024-08-06:leipzig-university::9ydjNWWH ../test_cases/binary_closing.ipynb 0
ft:gpt-4o-2024-08-06:leipzig-university::9ydjNWWH ../test_cases/binary_skeleton.ipynb 0


InternalServerError: Error code: 500 - {'error': {'message': 'The server had an error while processing your request. Sorry about that!', 'type': 'server_error', 'param': None, 'code': None}}