In [1]:
!pip install transformers datasets evaluate

Collecting transformers
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transforme

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import evaluate

from edge_case_feedback import EdgeCaseFeedback

## ***Get Dataset - BigCodeBench***

In [None]:
# Load a small slice of BigCodeBench: the split expression "[:10]" keeps only
# the first ten rows of the "v0.1.0_hf" split.
dataset = load_dataset(
    path="bigcode/bigcodebench",
    split="v0.1.0_hf[:10]",
)

## ***Initialize Models***

In [None]:
# Display name -> Hugging Face checkpoint id.
# TODO: change the model here to check improvement.
model_names = dict(CodeLlama="codellama/CodeLlama-7b-hf")

# Populated by the loading cell, keyed by the same display names as `model_names`.
tokenizers, models = {}, {}

In [None]:
# Load the tokenizer and then the causal-LM weights for every registered checkpoint,
# storing both under the checkpoint's display name.
for name in model_names:
    model_id = model_names[name]
    tokenizers[name] = AutoTokenizer.from_pretrained(model_id)
    models[name] = AutoModelForCausalLM.from_pretrained(model_id)

## ***Feedback Model***

In [None]:
def generate_feedback(prompt, generated_code, solution, model=None):
    """Build a follow-up prompt that asks the model to handle edge cases.

    An ``EdgeCaseFeedback`` helper generates test cases for ``generated_code``,
    evaluates the code against them, and the ``'feedback'`` entry of that
    result is embedded in a new prompt together with the original ``prompt``.

    Args:
        prompt: The original task prompt.
        generated_code: The code produced for ``prompt`` in the current attempt.
        solution: Reference solution. Currently unused; kept so existing
            callers keep working.
        model: Model handed to ``EdgeCaseFeedback``. When omitted, the
            ``"CodeLlama"`` entry of the global ``models`` dict is used if it
            exists, otherwise the first loaded model. This keeps the function
            usable after the entries of ``model_names`` are changed.

    Returns:
        The enhanced prompt string.

    Raises:
        KeyError: If ``model`` is omitted and no models have been loaded.
    """
    if model is None:
        if "CodeLlama" in models:
            model = models["CodeLlama"]
        elif models:
            # Fall back to whichever model was loaded first instead of failing
            # on a hard-coded key.
            model = next(iter(models.values()))
        else:
            raise KeyError("no models are loaded; run the model-loading cell first")

    edge_feedback = EdgeCaseFeedback(model)
    test_cases = edge_feedback.generate_test_cases(prompt, generated_code)
    results = edge_feedback.evaluate_test_cases(generated_code, test_cases)

    # Create enhanced prompt with edge case feedback.
    # NOTE(review): the template keeps the source indentation inside the string;
    # left as-is so the prompt text sent to the model does not change.
    enhanced_prompt = f"""
    Original prompt: {prompt}
    
    Consider these edge cases and their requirements:
    {results['feedback']}
    
    Please regenerate the code to handle these edge cases correctly.
    """

    return enhanced_prompt

In [1]:
# BLEU metric shared by every generate_code call; created on first use so this
# cell does not depend on a metric object defined in a later cell.
_BLEU_METRIC = None


def _get_bleu_metric():
    """Return the shared BLEU metric, loading it with ``evaluate.load`` on first use."""
    global _BLEU_METRIC
    if _BLEU_METRIC is None:
        _BLEU_METRIC = evaluate.load("bleu")
    return _BLEU_METRIC


def generate_code(prompt, model, tokenizer, solution, max_length=1000, num_iterations=5):
    """Sample code repeatedly with feedback and return the best-scoring attempt.

    Each iteration samples one completion for the current prompt, scores it
    with BLEU against ``solution``, and (except after the final iteration)
    replaces the current prompt with the output of ``generate_feedback``.

    Args:
        prompt: The original task prompt; also passed to ``generate_feedback``
            on every iteration.
        model: Object exposing ``generate(**inputs, ...)``.
        tokenizer: Callable that encodes the prompt and exposes ``decode``.
        solution: Reference solution used for BLEU scoring.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        num_iterations: Number of generate/score rounds (default 5).

    Returns:
        The decoded text with the highest BLEU score. The first attempt is
        kept when no later attempt scores strictly higher, so the result is
        only ``None`` when ``num_iterations`` is 0.
    """
    metric = _get_bleu_metric()
    current_prompt = prompt
    best_score = None
    best_code = None

    for iteration in range(num_iterations):
        inputs = tokenizer(current_prompt, return_tensors="pt")
        output = model.generate(**inputs, max_length=max_length, temperature=0.7, top_p=0.9, do_sample=True)
        # NOTE(review): output[0] is decoded in full; for decoder-only models this
        # presumably includes the prompt tokens as well as the continuation.
        # Confirm that scoring prompt + continuation is intended.
        generated_code = tokenizer.decode(output[0], skip_special_tokens=True)

        # Calculate BLEU score
        bleu_score = metric.compute(predictions=[generated_code], references=[solution])["bleu"]

        # Always keep the first candidate: BLEU is frequently exactly 0.0, and
        # returning None would break the later corpus-level evaluation.
        if best_score is None or bleu_score > best_score:
            best_score = bleu_score
            best_code = generated_code

        # Generate feedback for the next iteration; skip it after the last
        # round because the resulting prompt would never be used.
        if iteration < num_iterations - 1:
            current_prompt = generate_feedback(prompt, generated_code, solution)

    return best_code

In [None]:
# One list of generated solutions per model, appended in dataset order so the
# lists line up with the reference solutions.
generated_codes = {name: [] for name in model_names}

for example in dataset:
    prompt = example["complete_prompt"]
    solution = example["canonical_solution"]
    # Show the prompt itself, not just the "Prompt:" label.
    print("Prompt:")
    print(prompt)
    for model_name in model_names:
        generated_code = generate_code(prompt, models[model_name], tokenizers[model_name], solution)
        generated_codes[model_name].append(generated_code)

## ***Evaluation***

In [None]:
# Corpus-level BLEU evaluation of every model's generations
# (BLEU here; CodeBLEU could be substituted if available).
bleu_metric = evaluate.load("bleu")

# Reference solutions, in the same order the generations were produced.
references = [row["canonical_solution"] for row in dataset]

# Score each model's generated code against the references.
evaluation_scores = {}
for model_name in generated_codes:
    codes = generated_codes[model_name]
    bleu_score = bleu_metric.compute(references=references, predictions=codes)
    evaluation_scores[model_name] = bleu_score["bleu"]
    print(f"{model_name} BLEU Score:", evaluation_scores[model_name])

# Final summary, one line per model with the score rounded to four decimals.
print("\n=== Evaluation Summary ===")
for model_name, score in evaluation_scores.items():
    print(f"{model_name} BLEU Score: {score:.4f}")