In [None]:
!pip install transformers datasets evaluate python-dotenv

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import evaluate

## ***Get Dataset - BigCodeBench***

In [None]:
# Only the first 10 rows of the "v0.1.0_hf" split are requested.
dataset = load_dataset(
    "bigcode/bigcodebench",
    split="v0.1.0_hf[:10]",
)

## ***Initialize Models***

In [None]:
# Display name -> Hugging Face Hub checkpoint id.
# TODO: change or add models here to check for improvement.
model_names = {"CodeLlama": "codellama/CodeLlama-7b-hf"}

# Filled in by the loading cell, keyed by the same display names.
tokenizers, models = {}, {}

In [None]:
# Load the tokenizer and the causal-LM weights for every registered model.
for name in model_names:
    model_id = model_names[name]
    tokenizers[name] = AutoTokenizer.from_pretrained(model_id)
    models[name] = AutoModelForCausalLM.from_pretrained(model_id)

## ***Feedback Model***

In [None]:
import openai
from dotenv import load_dotenv
import os

# Read the .env file, then hand the OpenAI key to the client library.
# The key is None when OPENAI_API_KEY is not set.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Maps the plural category keys of the evaluator's 'errors' dict to the
# singular issue types used as keys of the task table in generate_feedback.
_ISSUE_TYPE_BY_CATEGORY = {
    'Syntax Errors': 'Syntax Error',
    'Logical Errors': 'Logical Error',
    'Optimization Issues': 'Optimization Issue',
    'Test Case Failures': 'Test Case Failure',
}


def _describe_error(entry):
    """Render one error entry ({'type': ..., 'message': ...} or any object) as text."""
    if isinstance(entry, dict):
        return f"{entry.get('type', 'Error')}: {entry.get('message', '')}"
    return str(entry)


def _summarize_evaluation(evaluation_results):
    """Return (issue_type, feedback_details) for either supported result shape."""
    # Explicit 'type' / 'feedback' keys keep their original meaning.
    if 'type' in evaluation_results or 'feedback' in evaluation_results:
        return (
            evaluation_results.get('type', 'General Issue'),
            evaluation_results.get('feedback', 'No feedback provided.'),
        )

    # Otherwise read the 'errors' mapping produced by evaluate_code_generation
    # and report the first category that actually contains entries.
    errors = evaluation_results.get('errors')
    if isinstance(errors, dict):
        for category, entries in errors.items():
            if entries:
                issue_type = _ISSUE_TYPE_BY_CATEGORY.get(category, category)
                return issue_type, "\n".join(_describe_error(e) for e in entries)
    elif errors:
        # A flat list of error entries carries no category information.
        return 'General Issue', "\n".join(_describe_error(e) for e in errors)

    return 'General Issue', 'No feedback provided.'


def _request_refinement(messages):
    """Send ``messages`` to GPT-4 and return the text of the first choice."""
    if hasattr(openai, "OpenAI"):
        # openai>=1.0: ChatCompletion.create was removed; use a client object.
        client = openai.OpenAI(api_key=openai.api_key)
        response = client.chat.completions.create(model="gpt-4", messages=messages)
        return response.choices[0].message.content

    # openai<1.0: legacy module-level interface with dict-style responses.
    response = openai.ChatCompletion.create(model="gpt-4", messages=messages)
    return response["choices"][0]["message"]["content"]


def generate_feedback(initial_prompt, generated_code, evaluation_results):
    """
    Generate a refined prompt using GPT-4 based on evaluation feedback.

    Args:
        initial_prompt (str): The original user prompt.
        generated_code (str): The code generated by Code Llama or similar model.
        evaluation_results (dict): Feedback from the evaluator. Either a dict
            with explicit 'type' and 'feedback' keys, or the dict returned by
            evaluate_code_generation, whose 'errors' entry maps a category
            ('Syntax Errors', 'Logical Errors', 'Optimization Issues',
            'Test Case Failures') to a list of {'type', 'message'} dicts.

    Returns:
        str: The text returned by GPT-4. If the request fails for any reason
        the exception is not raised; a string starting with
        "Error during GPT-4 processing:" is returned instead, so callers
        should check for that prefix before reusing the value as a prompt.
    """
    # Determine the type of issue and feedback details
    issue_type, feedback_details = _summarize_evaluation(evaluation_results)

    # Map issue types to corresponding tasks
    issue_tasks = {
        "Syntax Error": "1. Refine the original prompt to avoid reserved keywords or invalid syntax.\n"
                        "2. Suggest corrections or modifications to the generated code to resolve syntax issues.",
        "Logical Error": "1. Refine the original prompt to ensure it explicitly includes all necessary requirements.\n"
                         "2. Suggest corrections or modifications to the generated code to address logical errors.",
        "Optimization Issue": "1. Refine the original prompt to emphasize optimization and efficiency.\n"
                              "2. Suggest corrections or modifications to the generated code to improve algorithm efficiency.",
        "Test Case Failure": "1. Refine the original prompt to include handling of edge cases.\n"
                             "2. Suggest corrections or modifications to the generated code to pass all test cases."
    }

    # Default task if issue type is not found
    task = issue_tasks.get(issue_type, "1. Refine the original prompt.\n2. Suggest improvements to the generated code.")

    # Construct GPT-4 input message
    system_message = "You are an expert in refining coding prompts and debugging issues."
    user_message = f"""
    Below is a scenario where the generated code has a {issue_type}. Help me refine the original prompt or suggest improvements.

    Original Prompt:
    {initial_prompt}

    Generated Code:
    {generated_code}

    Feedback from Evaluator:
    {feedback_details}

    Task for GPT-4:
    {task}
    """

    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message}
    ]

    # Call GPT-4 API; failures are reported in-band (see Returns above).
    try:
        return _request_refinement(messages)
    except Exception as e:
        return f"Error during GPT-4 processing: {e}"

In [None]:
#[IMPORTANT : Run evaluation block before running this cell]
def generate_score(prompt, model, tokenizer, data, max_length=1000, n=5):
    """
    Generate code for ``prompt``, score it, and refine the prompt up to ``n`` times.

    Each round samples a completion from ``model``, evaluates it with
    evaluate_code_generation against ``data['test']`` / ``data['libs']`` and,
    unless every test passed or the attempt budget is used up, asks
    generate_feedback for a refined prompt for the next round.

    Args:
        prompt (str): Initial prompt given to the model.
        model: Causal language model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        data (dict): Dataset row providing the 'test' and 'libs' entries.
        max_length (int): Passed to ``model.generate`` as ``max_length``.
        n (int): Maximum number of generate/evaluate rounds (at least 1).

    Returns:
        The 'pass_ratio' of the last round that was evaluated.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    current_prompt = prompt
    evaluation_result = None
    for attempt in range(n):
        inputs = tokenizer(current_prompt, return_tensors="pt")
        # Keep the input tensors on the same device as the model weights.
        device = getattr(model, "device", None)
        if device is not None and hasattr(inputs, "to"):
            inputs = inputs.to(device)

        output = model.generate(**inputs, max_length=max_length, temperature=0.7, top_p=0.9, do_sample=True)

        # generate() returns token ids that start with the prompt tokens; the
        # evaluator needs only the newly generated part, decoded to text.
        prompt_length = inputs["input_ids"].shape[-1]
        generated_code = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

        evaluation_result = evaluate_code_generation(generated_code, data['test'], data['libs'])

        # No refinement is needed after a perfect score or on the last attempt.
        if evaluation_result['pass_ratio'] == 1 or attempt == n - 1:
            break

        refined_prompt = generate_feedback(current_prompt, generated_code, evaluation_result)
        # generate_feedback reports API failures as a string with this prefix;
        # keep the previous prompt rather than prompting with an error message.
        if not refined_prompt.startswith("Error during GPT-4 processing:"):
            current_prompt = refined_prompt

    return evaluation_result['pass_ratio']

In [None]:
# One list of pass ratios per model, appended to in dataset order.
generated_scores = {name: [] for name in model_names}

for example in dataset:
    prompt = example["instruct_prompt"]
    print("Prompt:" + prompt)
    # generated_scores has exactly the keys of model_names, in the same order.
    for model_name, scores in generated_scores.items():
        generated_score = generate_score(prompt, models[model_name], tokenizers[model_name], example)
        scores.append(generated_score)

## ***Evaluation***

In [None]:
# # Evaluation Setup with BLEU (or CodeBLEU if available)
# bleu_metric = evaluate.load("bleu")

# # Prepare reference code for evaluation
# references = [example["canonical_solution"] for example in dataset]

# # Evaluate each model's generated code against the reference code
# evaluation_scores = {}
# for model_name, codes in generated_codes.items():
#     bleu_score = bleu_metric.compute(predictions=codes, references=references)
#     evaluation_scores[model_name] = bleu_score["bleu"]
#     print(f"{model_name} BLEU Score:", bleu_score["bleu"])

# # Print final evaluation summary
# print("\n=== Evaluation Summary ===")
# for model_name, score in evaluation_scores.items():
#     print(f"{model_name} BLEU Score: {score:.4f}")

In [None]:
import unittest
from unittest.mock import patch

def evaluate_code_generation(generated_code, test_cases_code, libraries):
    """
    Dynamically evaluates the generated code based on pass ratio and executability.

    Parameters:
    - generated_code: A string containing the code to be evaluated.
    - test_cases_code: A string containing the test cases to be executed.
    - libraries: A list of library names to import and make available in the execution context.

    Returns:
    - A dictionary containing 'pass_ratio', 'executability', and categorized errors.
    """
    # Prepare a local namespace for executing the generated code
    local_namespace = {}
    errors = {
        'Syntax Errors': [],
        'Logical Errors': [],
        'Optimization Issues': [],
        'Test Case Failures': [],
    }

    # Import libraries dynamically and add them to the namespace
    for lib in libraries:
        try:
            local_namespace[lib] = __import__(lib)
        except ImportError as e:
            errors['Syntax Errors'].append({'type': 'ImportError', 'message': f"Error importing {lib}: {e}"})
            return {
                'pass_ratio': 0,
                'executability': False,
                'errors': errors
            }

    try:
        # Execute the generated code in the provided namespace
        exec(generated_code, local_namespace, local_namespace)
        executability = True
    except SyntaxError as e:
        errors['Syntax Errors'].append({'type': 'SyntaxError', 'message': str(e)})
        executability = False
    except Exception as e:
        errors['Logical Errors'].append({'type': 'ExecutionError', 'message': str(e)})
        executability = False

    if not executability:
        return {
            'pass_ratio': 0,
            'executability': executability,
            'errors': errors
        }

    # Add task_func to local_namespace so it can be accessed by tests
    task_func = local_namespace.get('task_func')

    if not task_func:
        errors['Logical Errors'].append({'type': 'FunctionError', 'message': "task_func is not defined in the generated code."})
        return {
            'pass_ratio': 0,
            'executability': False,
            'errors': errors
        }

    # Dynamically create a TestCase class from the provided test cases code
    try:
        exec(test_cases_code, {'task_func': task_func, 'patch': patch}, local_namespace)
    except SyntaxError as e:
        errors['Syntax Errors'].append({'type': 'TestCaseSyntaxError', 'message': str(e)})
        return {
            'pass_ratio': 0,
            'executability': False,
            'errors': errors
        }
    except Exception as e:
        errors['Logical Errors'].append({'type': 'TestCaseError', 'message': str(e)})
        return {
            'pass_ratio': 0,
            'executability': False,
            'errors': errors
        }

    # Extract the TestCase class from the local namespace
    TestClass = local_namespace.get('TestCases')

    if not TestClass:
        errors['Logical Errors'].append({'type': 'TestCaseError', 'message': "TestCases class not found in provided test cases code."})
        return {
            'pass_ratio': 0,
            'executability': False,
            'errors': errors
        }

    # Define a custom test suite
    class CustomTestSuite(unittest.TestSuite):
        def run(self, result, debug=False):
            super().run(result, debug)
            return result

    # Run the tests using unittest framework
    suite = CustomTestSuite()
    suite.addTest(unittest.makeSuite(TestClass))

    runner = unittest.TextTestRunner()

    # Capture results
    result = runner.run(suite)

    # Calculate pass ratio
    pass_ratio = (result.testsRun - len(result.failures) - len(result.errors)) / result.testsRun if result.testsRun > 0 else 0

    # Categorize test-related errors
    for failure in result.failures:
        errors['Test Case Failures'].append({'type': 'TestFailure', 'message': str(failure)})
    for error in result.errors:
        errors['Test Case Failures'].append({'type': 'TestError', 'message': str(error)})

    # Clear the local namespace after execution
    local_namespace.clear()

    return {
        'pass_ratio': pass_ratio,
        'executability': len(result.errors) == 0 and len(result.failures) == 0,
        'errors': errors
    }


In [11]:
# Example usage with dynamic inputs and libraries list
generated_code = """
import itertools
from random import shuffle

def task_func(numbers=list(range(1, 11))):
    permutations = list(itertools.permutations(numbers))
    sum_diffs = 0

    for perm in permutations:
        perm = list(perm)
        shuffle(perm)
        diffs = [abs(perm[i] - perm[i+1]) for i in range(len(perm)-1)]
        sum_diffs += sum(diffs)

    avg_sum_diffs = sum_diffs / len(permutations) if permutations else 0

    return avg_sum_diffs
"""

test_cases_code = """
import unittest from unittest.mock import patch from random import seed, shuffle import itertools class TestCases(unittest.TestCase): def test_default_numbers(self): # Test with default number range (1 to 10) to check that the result is a positive float. result = task_func() self.assertIsInstance(result, float) self.assertGreater(result, 0) def test_custom_list(self): # Test with a custom list of small positive integers to ensure proper handling and positive result. result = task_func([1, 2, 3]) self.assertIsInstance(result, float) self.assertGreater(result, 0) def test_negative_numbers(self): # Test with negative numbers to verify the function handles and returns a positive result. result = task_func([-3, -2, -1]) self.assertIsInstance(result, float) self.assertGreater(result, 0) def test_single_element(self): # Test with a single element list to confirm the return is zero since no pairs exist. result = task_func([5]) self.assertIsInstance(result, float) self.assertEqual(result, 0) def test_empty_list(self): # Test with an empty list to ensure the function handles it gracefully and returns zero. result = task_func([]) self.assertIsInstance(result, float) self.assertEqual(result, 0) def test_identical_elements(self): # Test with a list of identical elements to confirm that differences are zero and the average is zero. result = task_func([2, 2, 2]) self.assertIsInstance(result, float) self.assertEqual(result, 0) def test_mixed_numbers(self): # Test with a list of mixed positive and negative numbers to check correct average of differences. 
result = task_func([-10, 10, -5]) self.assertIsInstance(result, float) self.assertGreater(result, 0) def test_specific_value_with_seed(self): # Set seed for reproducibility and check the computed value with patch('random.shuffle', side_effect=lambda x: seed(42) or shuffle(x)): result = task_func([1, 2, 3]) self.assertAlmostEqual(result, 2.5, delta=0.5) # This expected value should be calculated beforehand def test_large_list_with_seed(self): # Set seed and test with a larger list for specific computed value with patch('random.shuffle', side_effect=lambda x: seed(99) or shuffle(x)): result = task_func(list(range(1, 11))) self.assertAlmostEqual(result, 33.0, delta=0.5) # This expected value should be calculated beforehand def test_random_behavior(self): # Test to ensure different seeds produce different outputs, demonstrating randomness with patch('random.shuffle', side_effect=lambda x: seed(1) or shuffle(x)): result1 = task_func([1, 2, 3]) with patch('random.shuffle', side_effect=lambda x: seed(1) or shuffle(x)): result2 = task_func([1, 2, 4]) self.assertNotEqual(result1, result2)
"""

# List of libraries to import and use within exec()
libraries_to_import = ['random', 'itertools']

# Evaluate the generated code with dynamic inputs and specified libraries
evaluation_result = evaluate_code_generation(generated_code, test_cases_code, libraries_to_import)
print(evaluation_result)

{'pass_ratio': 0, 'executability': False, 'errors': [{'type': 'TestCaseError', 'message': 'invalid syntax (<string>, line 2)'}]}


In [None]:
#TODO : Implement a function to average the scores in each list of generated_scores; that average is the final score of the model