**Generative Models for Code** -- Final Project<br><br>
**Maria Gancayco (mig2131@columbia.edu)**<br>
**Stephen Wright (svw2112@columbia.edu)**<br>
*Due:* Wednesday, 12 Dec 2024 at 11:59pm ET

In [None]:
# Setup: Environment and Memory Management

import torch
import gc
from pathlib import Path
from dataclasses import dataclass
from typing import Optional

# Check and display GPU availability for transparency
print("CUDA available:", torch.cuda.is_available())
print("GPU device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU found")

# Memory management utilities
def clear_memory() -> None:
    """
    Free unreferenced Python objects, then release cached GPU memory.

    Garbage collection runs first on purpose: ``torch.cuda.empty_cache()`` can
    only hand back cached blocks that no live tensor occupies, so tensors that
    are kept alive solely by reference cycles must be collected before the
    cache is emptied. The CUDA step is skipped when no GPU is available.
    """
    gc.collect()  # Drop unreachable objects (and the tensors they own) first
    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # Now release the freed blocks from the CUDA cache

def get_memory_status() -> None:
    """
    Print the current GPU memory footprint in megabytes.

    Two figures are reported, both read from ``torch.cuda``:
        - Allocated: value of ``torch.cuda.memory_allocated()``
        - Reserved: value of ``torch.cuda.memory_reserved()``

    Nothing is printed when CUDA is unavailable.
    """
    if not torch.cuda.is_available():
        return

    bytes_per_mb = 1024**2  # torch reports bytes; MB is easier to read
    allocated = torch.cuda.memory_allocated() / bytes_per_mb
    reserved = torch.cuda.memory_reserved() / bytes_per_mb
    print(f"GPU Memory: Allocated: {allocated:.2f}MB, Reserved: {reserved:.2f}MB")
clear_memory()
# Initialize by checking current memory status
get_memory_status()

CUDA available: True
GPU device name: NVIDIA A100-SXM4-40GB
GPU Memory: Allocated: 0.00MB, Reserved: 0.00MB


In [None]:
# Configuration and Setup

@dataclass
class ExperimentConfig:
    """
    Bundle of hyperparameters and settings used throughout the experiment.

    The first six fields are required and must be supplied in this order when
    passed positionally; the remaining fields have defaults.

    Attributes:
        model_name: Name or path of the model to evaluate.
        batch_size: Number of samples per batch.
        learning_rate: Optimizer learning rate.
        num_epochs: Number of training epochs.
        max_seq_length: Maximum tokenized input length.
        gradient_accumulation_steps: Steps over which gradients are accumulated.
        warmup_steps: Scheduler warmup steps; ``None`` when unspecified.
        weight_decay: L2 regularization factor.
        eval_steps: Interval (in steps) between evaluations.
        save_steps: Interval (in steps) between checkpoint saves.
        logging_steps: Interval (in steps) between metric logs.
    """

    # --- required settings ---
    model_name: str
    batch_size: int
    learning_rate: float
    num_epochs: int
    max_seq_length: int
    gradient_accumulation_steps: int

    # --- optional settings ---
    warmup_steps: Optional[int] = None
    weight_decay: float = 0.01
    eval_steps: int = 100
    save_steps: int = 100
    logging_steps: int = 10

# Initialize configuration with DeepSeek model parameters
config = ExperimentConfig(
    model_name="deepseek-ai/deepseek-coder-6.7b-instruct",  # Using DeepSeek's 6.7B instruction-tuned model
    batch_size=1,                    # Small batch size due to model size
    learning_rate=5e-5,             # Conservative learning rate for fine-tuning
    num_epochs=3,                   # Number of training epochs
    max_seq_length=512,            # Maximum sequence length for input processing
    gradient_accumulation_steps=32, # Accumulate gradients to simulate larger batch size
    warmup_steps=100               # Warmup steps for learning rate scheduler
)

# Set up results directory for storing evaluation outputs
results_dir = Path("./results")
results_dir.mkdir(parents=True, exist_ok=True)  # Create directory if it doesn't exist

print("Configuration and directories initialized!")

Configuration and directories initialized!


In [None]:
# Model Dependencies and Imports

# Install core dependencies for transformer model handling and evaluation
!pip install transformers torch timeout-decorator

# Import required libraries
import torch  # PyTorch for deep learning operations
from transformers import (
    AutoTokenizer,         # For tokenization of input text
    AutoModelForCausalLM   # For loading pre-trained causal language models
)
from typing import List, Dict  # Type hints for better code documentation

Collecting timeout-decorator
  Downloading timeout-decorator-0.5.0.tar.gz (4.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: timeout-decorator
  Building wheel for timeout-decorator (setup.py) ... [?25l[?25hdone
  Created wheel for timeout-decorator: filename=timeout_decorator-0.5.0-py3-none-any.whl size=5007 sha256=eeff42d8712ebeac611a3375e0802ebfcdef4627c1a08cfc687fbd0e2ea438cf
  Stored in directory: /root/.cache/pip/wheels/68/2f/bc/76f1192d474666d41ae6f09813fccbd00fe3f07e8261c4cff5
Successfully built timeout-decorator
Installing collected packages: timeout-decorator
Successfully installed timeout-decorator-0.5.0


In [None]:
# Model Loading and Code Generation

def load_model_and_tokenizer(config: ExperimentConfig) -> tuple[AutoModelForCausalLM, AutoTokenizer]:
    """
    Load the causal language model and tokenizer named by ``config.model_name``.

    Memory is cleared before loading. The model is requested in bfloat16 with
    automatic device placement and reduced CPU memory use during loading, and
    gradient checkpointing is switched on when the model exposes it.

    Args:
        config: Experiment configuration; only ``model_name`` is read here.

    Returns:
        A ``(model, tokenizer)`` pair.

    Raises:
        Exception: Any loading error is printed and then re-raised unchanged.
    """
    try:
        # Start from a clean slate to reduce the chance of running out of memory
        clear_memory()

        print(f"Loading {config.model_name}...")
        checkpoint = config.model_name

        # trust_remote_code permits custom tokenizer/model code shipped with the checkpoint
        tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)

        model_kwargs = {
            "trust_remote_code": True,
            "torch_dtype": torch.bfloat16,   # Half-width weights for memory efficiency
            "device_map": "auto",            # Let the library place weights on devices
            "low_cpu_mem_usage": True,       # Keep CPU memory low while loading
        }
        model = AutoModelForCausalLM.from_pretrained(checkpoint, **model_kwargs)

        # Optional memory/compute trade-off, only if the model supports it
        if hasattr(model, "gradient_checkpointing_enable"):
            model.gradient_checkpointing_enable()

        print("Model loaded successfully!")
        get_memory_status()
    except Exception as exc:
        print(f"Error loading model: {str(exc)}")
        raise
    else:
        return model, tokenizer

def generate_code(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    prompt: str,
    max_new_tokens: int = 512,
    temperature: float = 0.8,
    top_p: float = 0.95,
    top_k: int = 50
) -> str:
    """
    Generate a completion for ``prompt`` using the model's chat template.

    The prompt is wrapped as a single user message, tokenized with the chat
    template, and sampled from with the given parameters. Only the newly
    generated tokens are decoded; the prompt tokens are sliced off.

    Args:
        model: Causal language model used for generation.
        tokenizer: Tokenizer matching ``model``; must provide a chat template.
        prompt: User request to send to the model.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus sampling threshold.
        top_k: Top-k sampling cutoff.

    Returns:
        The decoded completion, or ``""`` if anything fails (the error is
        printed rather than raised, so callers can retry).
    """
    try:
        # Format prompt as chat message
        messages = [{"role": "user", "content": prompt}]

        # Tokenize input with chat template
        input_ids = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)

        # A single, unpadded sequence: every position is a real token, so the
        # mask is all ones. Passing it explicitly avoids generate() having to
        # guess the mask from pad/eos ids.
        attention_mask = torch.ones_like(input_ids)

        # Some tokenizers define no pad token; fall back to EOS for padding.
        pad_token_id = tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = tokenizer.eos_token_id

        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,  # Control generation length
            do_sample=True,                 # Enable sampling-based generation
            temperature=temperature,        # Control randomness
            top_p=top_p,                    # Nucleus sampling threshold
            top_k=top_k,                    # Top-k sampling parameter
            num_return_sequences=1,         # Generate single sequence
            pad_token_id=pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

        # Decode only the generated portion (everything after the prompt)
        prompt_length = input_ids.shape[-1]
        return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    except Exception as e:
        print(f"Error in code generation: {str(e)}")
        return ""

# Initialize model and tokenizer using configuration
model, tokenizer = load_model_and_tokenizer(config)

Loading deepseek-ai/deepseek-coder-6.7b-instruct...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Model loaded successfully!
GPU Memory: Allocated: 12856.52MB, Reserved: 12858.00MB


In [None]:
# Test the generation pipeline with a simple prompt
test_prompt = "Write a quicksort algorithm in Python."
generated_code = generate_code(model, tokenizer, test_prompt)
print("\nGenerated Code:\n", generated_code)


Generated Code:
 Here is a Python implementation of the quicksort algorithm:

```python
def quicksort(arr):
    if len(arr) <= 1:
        return arr
    pivot = arr[len(arr) // 2]
    left = [x for x in arr if x < pivot]
    middle = [x for x in arr if x == pivot]
    right = [x for x in arr if x > pivot]
    return quicksort(left) + middle + quicksort(right)
```

The function `quicksort` takes a list `arr` as input. If the list has one or no elements, it is already sorted, so the function returns the list. Otherwise, the function selects a pivot element from the list, partitions the other elements into two sub-lists, according to whether they are less than or greater than the pivot, and then recursively applies the algorithm to the two sub-lists. The base case is an empty list or a list with one element, which is already sorted.



In [None]:
# Code Generation Management System

from datetime import datetime

class CodeGenerator:
    """
    Manage code generation with retry logic and a history of every attempt.

    Attributes:
        model: The language model for code generation
        tokenizer: The model's tokenizer
        generation_history (list): One record per generation attempt,
            including attempts that produced no code
    """

    def __init__(self, model, tokenizer):
        """
        Initialize the code generator with a model and tokenizer.

        Args:
            model: The language model to use for generation
            tokenizer: The corresponding tokenizer
        """
        self.model = model
        self.tokenizer = tokenizer
        self.generation_history = []

    def generate_with_retry(self, prompt: str, max_attempts: int = 3) -> Dict:
        """
        Generate code, retrying when an attempt yields no output.

        Args:
            prompt (str): The input prompt for code generation
            max_attempts (int): Maximum number of attempts

        Returns:
            Dict: On success, a record containing:
                - prompt: Original input prompt
                - code: Generated code (non-empty)
                - attempt: 1-based attempt number
                - generation_time: Seconds taken by this attempt
                - timestamp: ISO-format completion time
            If every attempt fails: ``{"error": "All generation attempts failed"}``.

        Note:
            The first attempt samples at temperature 0.6; every retry uses 0.8.
            Each attempt is appended to ``generation_history`` even when its
            code is empty, so the history reflects failures as well.
        """
        for attempt in range(max_attempts):
            try:
                # Track generation time
                start_time = datetime.now()

                generated_code = generate_code(
                    self.model,
                    self.tokenizer,
                    prompt,
                    temperature=0.8 if attempt > 0 else 0.6  # Higher temperature for retries
                )

                end_time = datetime.now()
                generation_time = (end_time - start_time).total_seconds()

                result = {
                    "prompt": prompt,
                    "code": generated_code,
                    "attempt": attempt + 1,
                    "generation_time": generation_time,
                    "timestamp": end_time.isoformat()
                }

                # Record the attempt whether or not it produced code
                self.generation_history.append(result)

                # An empty string means generation failed; try again
                if generated_code:
                    return result

            except Exception as e:
                print(f"Attempt {attempt + 1} failed: {str(e)}")

        # Return error if all attempts fail
        return {"error": "All generation attempts failed"}

    def get_generation_stats(self) -> Dict:
        """
        Summarize the recorded generation attempts.

        Returns:
            An empty dict when no attempts have been recorded; otherwise a
            dict containing:
                - total_generations: Number of recorded attempts
                - average_generation_time: Mean seconds per attempt
                - successful_generations: Attempts whose ``code`` is non-empty
        """
        if not self.generation_history:
            return {}

        total_generations = len(self.generation_history)
        avg_time = sum(g["generation_time"] for g in self.generation_history) / total_generations

        # Every record carries a "code" key, so test its content, not its
        # presence: failed attempts store an empty string.
        successful = sum(1 for g in self.generation_history if g.get("code"))

        return {
            "total_generations": total_generations,
            "average_generation_time": avg_time,
            "successful_generations": successful
        }

# Initialize the code generation system
code_generator = CodeGenerator(model, tokenizer)

# Test the generation system with a sample prompt
test_result = code_generator.generate_with_retry("Write a binary search function in Python.")
print("\nGeneration Result:", test_result)
print("\nGeneration Stats:", code_generator.get_generation_stats())


Generation Result: {'prompt': 'Write a binary search function in Python.', 'code': 'Sure, here is a simple binary search function in Python:\n\n```python\ndef binary_search(arr, low, high, x):\n \n    if high >= low:\n \n        mid = (high + low) // 2\n \n        if arr[mid] == x:\n            return mid\n \n        elif arr[mid] > x:\n            return binary_search(arr, low, mid - 1, x)\n \n        else:\n            return binary_search(arr, mid + 1, high, x)\n \n    else:\n        return -1\n```\n\nIn this function, `arr` is the input array, `low` and `high` are the starting and ending indices of the array, and `x` is the element to be searched. If the element is found, the function returns the index of the element. If the element is not found, the function returns -1.\n\nPlease note that binary search works only on sorted arrays.\n', 'attempt': 1, 'generation_time': 44.309555, 'timestamp': '2024-12-08T23:48:52.432868'}

Generation Stats: {'total_generations': 1, 'average_genera

In [None]:
# SemCoder Model Setup

# Clear GPU memory before new model setup
clear_memory()  # Ensure clean memory state for new model

# Install Git LFS and clone SemCoder repository
print("Installing Git LFS and cloning SemCoder...")
!git lfs install  # Initialize Git Large File Storage for model weights

# Clone SemCoder from HuggingFace repository
!git clone https://huggingface.co/semcoder/semcoder_1030 /content/SemCoder

# Verify successful repository cloning
import os
if os.path.exists('/content/SemCoder'):
    print("SemCoder repository cloned successfully!")
else:
    raise RuntimeError("Failed to clone SemCoder repository")  # Critical error if clone fails

Installing Git LFS and cloning SemCoder...
Git LFS initialized.
Cloning into '/content/SemCoder'...
remote: Enumerating objects: 22, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 22 (delta 3), reused 0 (delta 0), pack-reused 4 (from 1)[K
Unpacking objects: 100% (22/22), 402.01 KiB | 8.55 MiB/s, done.
Filtering content: 100% (3/3), 4.55 GiB | 11.86 MiB/s, done.
Encountered 2 file(s) that may not have been copied correctly on Windows:
	model-00001-of-00003.safetensors
	model-00002-of-00003.safetensors

See: `git lfs help smudge` for more details.
SemCoder repository cloned successfully!


In [None]:
# SemCoder File Verification

import os
from typing import List

def verify_semcoder_files(model_dir: str = '/content/SemCoder') -> None:
    """
    Verify that a SemCoder checkout contains every file needed to load the model.

    Checks for:
        - Configuration files (config.json, tokenizer.json)
        - Model index file (model.safetensors.index.json)
        - The three sharded weight files in safetensors format

    The directory listing is printed for debugging before the check.

    Args:
        model_dir: Directory holding the model files. Defaults to the
            location the notebook clones SemCoder into.

    Raises:
        RuntimeError: If any required file is missing; the message lists them.
        OSError: Propagated from ``os.listdir`` when ``model_dir`` cannot be read.
    """
    required_files = [
        'config.json',                   # Model configuration
        'tokenizer.json',                # Tokenizer configuration
        'model.safetensors.index.json',  # Model weights index
        # Sharded model weights in safetensors format
        'model-00001-of-00003.safetensors',
        'model-00002-of-00003.safetensors',
        'model-00003-of-00003.safetensors'
    ]

    # Display current directory contents for debugging
    print("SemCoder directory contents:")
    files = os.listdir(model_dir)
    print("\n".join(files))

    # Keep the report in the order the requirements are declared
    present = set(files)
    missing_files: List[str] = [name for name in required_files if name not in present]

    if missing_files:
        raise RuntimeError(f"Missing required files: {', '.join(missing_files)}")

    print("\nAll required files present!")
    print("\nModel files verification successful!")

# Execute verification
verify_semcoder_files()

SemCoder directory contents:
special_tokens_map.json
model-00003-of-00003.safetensors
config.json
model.safetensors.index.json
.gitattributes
generation_config.json
README.md
model-00001-of-00003.safetensors
tokenizer_config.json
.git
model-00002-of-00003.safetensors
tokenizer.json
trainer_state.json

All required files present!

Model files verification successful!


In [None]:
# SemCoder Model Implementation

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from typing import Optional

class SemCoderModel:
    """
    Wrapper around a locally stored SemCoder checkpoint.

    Attributes:
        model_path (str): Path to the local SemCoder model files
        model: The loaded language model (``None`` until ``load()`` runs)
        tokenizer: The model's tokenizer (``None`` until ``load()`` runs)
    """

    def __init__(self, model_path: str):
        """
        Record the model location; nothing is loaded yet.

        Args:
            model_path (str): Path to the local model directory
        """
        self.model_path = model_path
        self.model: Optional[AutoModelForCausalLM] = None
        self.tokenizer: Optional[AutoTokenizer] = None

    def load(self) -> None:
        """
        Load the SemCoder tokenizer and model from ``model_path``.

        Implements:
            - Memory clearing before load
            - bfloat16 weights
            - Automatic device mapping
            - Gradient checkpointing, when the model exposes it

        Raises:
            Exception: Any loading error is printed and then re-raised.
        """
        try:
            # Ensure clean memory state
            clear_memory()

            # Load tokenizer first
            print("Loading SemCoder tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)

            # Load model with optimizations
            print("Loading SemCoder model...")
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_path,
                torch_dtype=torch.bfloat16,    # Use bfloat16 for memory efficiency
                device_map="auto",             # Automatic device placement
                low_cpu_mem_usage=True         # Minimize CPU memory usage
            )

            # Enable memory optimization
            if hasattr(self.model, "gradient_checkpointing_enable"):
                self.model.gradient_checkpointing_enable()

            print("Successfully loaded SemCoder!")
            get_memory_status()  # Display memory usage

        except Exception as e:
            print(f"Error loading SemCoder: {str(e)}")
            raise

    def generate_code(self, prompt: str, max_new_tokens: int = 512) -> str:
        """
        Generate text for ``prompt`` with the loaded SemCoder model.

        Args:
            prompt (str): Input prompt for code generation
            max_new_tokens (int): Maximum number of tokens to generate

        Returns:
            str: The decoded output sequence, or an empty string if generation
            fails (including when ``load()`` has not been called). The whole
            output sequence is decoded, not just the newly generated tokens.

        Note:
            Uses sampling-based generation with temperature=0.8, top_p=0.95
            and top_k=50.
        """
        try:
            # Fail with a clear message instead of "'NoneType' is not callable"
            if self.model is None or self.tokenizer is None:
                raise RuntimeError("SemCoder is not loaded; call load() before generate_code()")

            # Tokenize input with proper device placement
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                padding=True,
                truncation=True
            ).to(self.model.device)

            # Pass an explicit pad id (EOS when the tokenizer has none), as the
            # module-level generate_code does, rather than leaving it implicit.
            pad_token_id = self.tokenizer.pad_token_id
            if pad_token_id is None:
                pad_token_id = self.tokenizer.eos_token_id

            # Generate with specified parameters
            outputs = self.model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=max_new_tokens,
                do_sample=True,         # Enable sampling
                temperature=0.8,        # Control randomness
                top_p=0.95,             # Nucleus sampling threshold
                top_k=50,
                pad_token_id=pad_token_id
            )

            return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        except Exception as e:
            print(f"Error generating code: {str(e)}")
            return ""

# Initialize and load SemCoder model
semcoder = SemCoderModel("/content/SemCoder")
semcoder.load()

Loading SemCoder tokenizer...
Loading SemCoder model...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



Successfully loaded SemCoder!
GPU Memory: Allocated: 14518.59MB, Reserved: 14520.00MB


In [None]:
# SemCoder Generation Testing

def extract_code(generated_text: str) -> str:
    """
    Pull the first ```python fenced block out of model output.

    Args:
        generated_text (str): Raw generated text from model

    Returns:
        str: The text between the first ```python marker and the next ```
        (exactly as written, surrounding newlines included). If the fence is
        never closed, everything after the marker is returned. If no
        ```python marker is present, the input is returned unchanged.
    """
    opening_fence = "```python"
    _, fence_found, after_fence = generated_text.partition(opening_fence)
    if not fence_found:
        return generated_text

    # Stop at the next fence of any kind (a closing ``` or another opener)
    code, _, _ = after_fence.partition("```")
    return code

def test_semcoder_generation() -> None:
    """
    Smoke-test SemCoder by asking it for a Fibonacci function.

    Steps:
        1. Build a prompt in SemCoder's <NL_Description>/<Code> format
        2. Generate text with the global ``semcoder`` model and extract the code
        3. Check the code contains the substrings ``def`` and ``return``
        4. Report GPU memory usage

    Prints:
        - Input prompt
        - Generated code
        - Validation results
        - Memory status

    Any exception is caught and reported (message and type) instead of raised.
    """
    # Prompt template in SemCoder's expected format
    request_template = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable <Code> according to <NL_Description>

<NL_Description>
{desc}

<Code>
"""
    prompt = request_template.format(
        desc="Write a Python function to calculate the Fibonacci sequence."
    )

    print("Testing SemCoder with Fibonacci sequence prompt...")
    print(f"Input prompt: {prompt}")

    try:
        generated_code = extract_code(semcoder.generate_code(prompt))

        print("\nGenerated Code:")
        print(generated_code)

        # Purely textual sanity checks; the code is not parsed or executed
        checks = {
            "function_definition": "def" in generated_code,
            "return_statement": "return" in generated_code
        }
        missing = [name for name, found in checks.items() if not found]

        if not missing:
            print("\nCode generation appears successful!")
            print("✓ Found function definition")
            print("✓ Found return statement")
        else:
            print("\nWarning: Generated code might be incomplete!")
            print("Missing elements:")
            for name in missing:
                print(f"✗ Missing {name.replace('_', ' ')}")

        print("\nMemory status after generation:")
        get_memory_status()

    except Exception as exc:
        print(f"Error in test generation: {str(exc)}")
        print(f"Error type: {type(exc).__name__}")

# Execute the test
print("Initiating SemCoder generation test...")
test_semcoder_generation()

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Initiating SemCoder generation test...
Testing SemCoder with Fibonacci sequence prompt...
Input prompt: You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable <Code> according to <NL_Description>

<NL_Description>
Write a Python function to calculate the Fibonacci sequence.

<Code>


Generated Code:

def fibonacci(n):
    if n < 0:
        return "Incorrect input"
    elif n == 0:
        return 0
    elif n == 1 or n == 2:
        return 1
    else:
        a, b = 1, 1
        for _ in range(2, n):
            a, b = b, a + b
        return b


Code generation appears successful!
✓ Found function definition
✓ Found return statement

Memory status after generation:
GPU Memory: Allocated: 25083.15MB, Reserved: 25248.00MB


In [None]:
# Evaluation Framework Setup

# Install essential evaluation packages with version specifications
!pip install --upgrade pip  # Ensure pip is up to date
!pip install 'datasets>=3.1.0' 'tqdm>=4.66.0' 'fsspec==2024.10.0' --no-deps
!pip install 'gcsfs>=2024.10.0'  # Install after fsspec to ensure compatibility

from importlib.metadata import version
print("\nInstalled versions:")
for package in ['datasets', 'tqdm', 'fsspec', 'gcsfs']:
    try:
        ver = version(package)
        print(f"{package}: {ver}")
    except ImportError:
        print(f"{package}: Not found")


Installed versions:
datasets: 3.1.0
tqdm: 4.66.6
fsspec: 2024.10.0
gcsfs: 2024.10.0


In [None]:
# Test Execution Framework

# Import required libraries for code parsing and system operations
from typing import List
import ast
import sys

class _AssertCollector(ast.NodeTransformer):
    """Rewrite each ``assert`` into ``test_results.append((outcome, source_text))``."""

    def __init__(self, source: str) -> None:
        self._source = source

    def visit_Assert(self, node: ast.Assert) -> ast.AST:
        # Label the result with the assert's original text (message included)
        label = ast.get_source_segment(self._source, node) or ast.unparse(node)
        record = ast.Expr(
            value=ast.Call(
                func=ast.Attribute(
                    value=ast.Name(id="test_results", ctx=ast.Load()),
                    attr="append",
                    ctx=ast.Load(),
                ),
                args=[
                    ast.Tuple(
                        elts=[node.test, ast.Constant(value=label)],
                        ctx=ast.Load(),
                    )
                ],
                keywords=[],
            )
        )
        return ast.copy_location(record, node)


def run_tests(solution_code, test_code, namespace):
    """
    Execute a solution and report the outcome of each test assertion.

    The solution is executed in ``namespace``. Every ``assert`` in
    ``test_code`` is then rewritten to record its outcome in
    ``namespace['test_results']`` instead of raising, so one failing
    assertion does not hide the others. Asserts with messages, multi-line
    asserts and asserts nested in loops are all handled because the rewrite
    works on the syntax tree rather than on text lines. Finally
    ``check(<first function defined in the solution>)`` is called.

    NOTE(review): both code strings are run with ``exec`` in this process.
    Model-generated code is untrusted; run this only in a disposable
    environment.

    Args:
        solution_code: The code solution to be tested
        test_code: Test code defining ``check(candidate)``
        namespace: Dict used as the globals for both executions (mutated)

    Returns:
        bool: True if all recorded tests pass (also when none were recorded),
        False if any test fails or if the solution or test code raises.
    """
    # Clean up input code by removing quotes and whitespace
    solution_code = solution_code.strip('"\'\n ')
    test_code = test_code.strip('"\'\n ')

    # Execute solution code in provided namespace
    try:
        exec(solution_code, namespace)
    except Exception as e:
        print(f"Error occurred in solution code: {str(e)}")
        print(f"Error type: {type(e).__name__}")
        print(f"Solution code: {solution_code}")
        return False

    # Defined up front so the error handler below can always print it
    modified_test_code = test_code
    try:
        # The first function found (breadth-first) is treated as the entry point
        function_name = next(
            (
                node.name
                for node in ast.walk(ast.parse(solution_code))
                if isinstance(node, ast.FunctionDef)
            ),
            None,
        )
        if not function_name:
            raise ValueError("Could not find function definition in solution code")

        # Turn assertions into result collection, then invoke the checker
        test_tree = _AssertCollector(test_code).visit(ast.parse(test_code))
        ast.fix_missing_locations(test_tree)
        modified_test_code = f"{ast.unparse(test_tree)}\ncheck({function_name})"

        namespace["test_results"] = []  # Fresh list so stale results never leak in
        exec(modified_test_code, namespace)
    except Exception as e:
        print(f"Error occurred for executing modified test code: {str(e)}")
        print(f"Error type: {type(e).__name__}")
        print(f"Modified test code: {modified_test_code}")
        return False

    # Process and display test results
    test_results = namespace.get('test_results', [])
    print(f"\nExecuting {len(test_results)} tests:\n")

    passed_count = 0
    for i, (result, assertion_text) in enumerate(test_results, 1):
        if result:
            passed_count += 1
            print(f"✓ Test {i} passed: {assertion_text}")
        else:
            print(f"✗ Test {i} failed: {assertion_text}")

    # Display test summary
    print(f"\nSummary: {passed_count}/{len(test_results)} tests passed")
    return passed_count == len(test_results)

# Demo: run the test harness once against a known-good solution
if __name__ == "__main__":
    # Candidate implementation under test (sorts, then compares neighbours)
    solution_code = """def has_close_elements(numbers: List[float], threshold: float) -> bool:
    numbers.sort()
    for i in range(1, len(numbers)):
        if numbers[i] - numbers[i - 1] < threshold:
            return True
    return False"""

    # HumanEval-style test suite: a METADATA dict followed by check(candidate)
    test_code = '''METADATA = {
        'author': 'jt',
        'dataset': 'test'
}

def check(candidate):
    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True
    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False
    assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True
    assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False
    assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True
    assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True
    assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False'''

    # Imports the solution and the tests expect to find in their namespace
    setup_code = """from typing import List, Dict, Optional, Any, TypeVar, Tuple
import math
import string
import re

M = TypeVar('M')
"""

    # Populate a fresh namespace with the shared imports, then run the suite
    namespace = dict()
    exec(setup_code, namespace)
    run_tests(solution_code, test_code, namespace)


Executing 7 tests:

✓ Test 1 passed:     assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True
✓ Test 2 passed:     assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False
✓ Test 3 passed:     assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True
✓ Test 4 passed:     assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False
✓ Test 5 passed:     assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True
✓ Test 6 passed:     assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True
✓ Test 7 passed:     assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False

Summary: 7/7 tests passed


In [None]:
!pip install xxhash



In [None]:
!pip install multiprocess



In [None]:
# Model Evaluation Framework

# Import required libraries
from datasets import load_dataset
from typing import Dict, List, Any, TypeVar
import json
from tqdm import tqdm
import torch
import re

class ModelEvaluator:
    """
    Evaluate code-generation models on the HumanEval benchmark.

    The evaluator formats each task prompt for the target model, generates one
    completion per task, extracts a single function from the raw completion,
    and runs the task's tests against it via ``run_tests``.

    Attributes:
        human_eval: Dataset returned by ``load_dataset("openai_humaneval")``.
        results: Empty dict created at construction; not written by this class.
        debug: When True, intermediate prompts, code and errors are printed.
    """

    # Metric keys reported per solution and averaged per model (order is the
    # order in which they appear in the returned dictionaries).
    METRICS = ("pass@1", "pass@10", "pass@100", "syntax_validity", "execution_accuracy")

    def __init__(self):
        """Initialize evaluator with HumanEval dataset and empty results"""
        self.human_eval = load_dataset("openai_humaneval")
        self.results = {}
        self.debug = True  # Control debug output

    def format_prompt(self, prompt: str, model_type: str) -> str:
        """
        Format input prompt according to model-specific requirements.

        Args:
            prompt: Original task prompt
            model_type: Type of model ("deepseek" or "semcoder")

        Returns:
            Formatted prompt string; any other model_type returns the prompt unchanged
        """
        # Format for DeepSeek model
        if model_type == "deepseek":
            return (
                "Write a Python function that solves the following task. "
                "Provide ONLY the function implementation starting with 'def' and proper indentation. "
                "The function should be properly indented with 4 spaces. "
                "Do not include any explanations, comments, docstrings, type hints, or test code. "
                "Do not include any print statements or assertions. "
                "Only include the function definition and its implementation.\n\n"
                "Example format:\n"
                "def example_function(param1, param2):\n"
                "    result = param1 + param2\n"
                "    return result\n\n"
                "Your task:\n"
                f"{prompt}"
            )
        # Format for SemCoder model
        elif model_type == "semcoder":
            return (
                "# Task: Implement the following Python function\n"
                f"{prompt}\n"
                "# Provide only the function implementation with proper indentation.\n"
            )
        return prompt

    @staticmethod
    def _find_top_level(text: str, targets: str, start: int = 0) -> int:
        """
        Return the index of the first character from ``targets`` that is not
        nested inside brackets or a quoted string, scanning from ``start``.

        Returns -1 when no such character exists. ``start`` must itself be at
        nesting depth zero.
        """
        depth = 0
        quote = None
        for i in range(start, len(text)):
            ch = text[i]
            if quote is not None:
                if ch == quote:
                    quote = None
            elif ch in "'\"":
                quote = ch
            elif depth == 0 and ch in targets:
                return i
            elif ch in "([{":
                depth += 1
            elif ch in ")]}":
                depth -= 1
        return -1

    def _split_params(self, params: str) -> List[str]:
        """Split a parameter list on commas that are not nested in brackets/quotes."""
        pieces = []
        start = 0
        while True:
            comma = self._find_top_level(params, ",", start)
            if comma == -1:
                pieces.append(params[start:])
                return pieces
            pieces.append(params[start:comma])
            start = comma + 1

    def _strip_annotation(self, param: str) -> str:
        """Remove the type annotation from one parameter, keeping any default value."""
        param = param.strip()
        marker = self._find_top_level(param, ":=")
        # No annotation: either a bare name or "name=default"
        if marker == -1 or param[marker] == "=":
            return param
        name = param[:marker].rstrip()
        equals = self._find_top_level(param, "=", marker + 1)
        if equals == -1:
            return name
        return f"{name}={param[equals + 1:].strip()}"

    def _clean_function_def(self, function_def: str) -> str:
        """
        Strip parameter and return type hints from a single-line ``def`` header.

        A header whose closing parenthesis is not on the same line is only
        whitespace-normalized.
        """
        open_idx = function_def.find('(')
        if open_idx != -1:
            close_idx = self._find_top_level(function_def, ")", open_idx + 1)
            if close_idx != -1:
                head = function_def[:open_idx]
                params = function_def[open_idx + 1:close_idx]
                rest = function_def[close_idx + 1:]

                # Drop "-> <annotation>" up to the colon that ends the header
                if rest.lstrip().startswith("->"):
                    colon = self._find_top_level(rest, ":")
                    if colon != -1:
                        rest = rest[colon:]

                cleaned_params = [self._strip_annotation(p) for p in self._split_params(params)]
                function_def = f"{head}({', '.join(cleaned_params)}){rest}"

        # Normalize spacing
        function_def = re.sub(r'\s+:', ':', function_def)
        function_def = re.sub(r'\(\s+', '(', function_def)
        function_def = re.sub(r'\s+\)', ')', function_def)
        return function_def

    def clean_generated_code(self, code: str) -> str:
        """
        Extract a single function definition from raw model output.

        Processing rules:
            - Everything before the first line starting with ``def`` is discarded.
            - Each later ``def`` line discards what was collected so far, so the
              LAST function in the text is the one returned (this drops a
              prompt that the model echoed before its answer).
            - Type hints are removed from the ``def`` header; defaults are kept.
            - Blank lines, Markdown code fences, lines containing ``print(`` or
              ``assert``, and ``if __name__`` guards together with their
              indented bodies are dropped.

        Args:
            code: Raw text produced by the model

        Returns:
            The cleaned function source, or "" when no ``def`` line was found

        NOTE(review): a completion that contains only a function body (no
        ``def`` line) yields "", and a trailing helper such as ``def main()``
        replaces the intended function. Selecting by entry point would need
        the task's function name to be passed in.
        """
        # Debug output of original code
        if self.debug:
            print("\nOriginal generated code:")
            print(code)

        # Normalize line endings and split into lines
        lines = code.replace('\r\n', '\n').splitlines()

        cleaned_lines = []
        has_seen_def = False
        guard_indent = None  # Indent of an `if __name__` guard whose body is being skipped

        for line in lines:
            stripped = line.strip()
            if not stripped:
                continue

            indent = len(line) - len(line.lstrip())
            if guard_indent is not None:
                if indent > guard_indent:
                    continue  # Still inside the dropped guard's body
                guard_indent = None

            # Markdown fences are not Python; keeping them breaks compilation
            if stripped.startswith('```'):
                continue

            if stripped.startswith('def '):
                # A new definition supersedes anything collected earlier
                cleaned_lines = [self._clean_function_def(stripped)]
                has_seen_def = True
                continue

            if not has_seen_def:
                continue

            if 'if __name__' in stripped:
                guard_indent = indent
                continue

            # Filter out unwanted lines
            if 'print(' in stripped or 'assert' in stripped:
                continue

            cleaned_lines.append(line)

        # Join lines with Unix-style newlines
        cleaned_code = '\n'.join(cleaned_lines)

        if self.debug:
            print("\nCleaned code:")
            print(cleaned_code)
            print("\nCleaned code (repr):")
            print(repr(cleaned_code))

        return cleaned_code if has_seen_def else ""

    def evaluate_single_solution(self, solution_code, test_cases, entry_point) -> Dict:
        """
        Evaluate a single generated solution against its test cases.

        Args:
            solution_code: Generated solution to evaluate
            test_cases: Test cases to run
            entry_point: Name of the function to test (currently unused here)

        Returns:
            Dictionary containing evaluation metrics. All metrics are 0 when the
            solution does not compile. When the shared setup code cannot be
            executed, only "syntax_validity" is 1. "pass@10" and "pass@100"
            carry the same value as "pass@1" because a single sample is scored.
        """
        if self.debug:
            print(test_cases)

        # Setup environment
        setup_code = """from typing import List, Dict, Optional, Any, TypeVar, Tuple
import math
import string
import re

M = TypeVar('M')
"""
        failure = dict.fromkeys(self.METRICS, 0)

        # Validate syntax
        try:
            compile(solution_code, '<string>', 'exec')
        except SyntaxError as e:
            if self.debug:
                print(f"Syntax error: {str(e)}")
                print(f"Generated code:\n{solution_code}")
            return failure

        # Prepare the namespace the tests run in
        namespace = {}
        try:
            exec(setup_code, namespace)
        except Exception as e:
            if self.debug:
                print(f"Execution error for setup code: {str(e)}")
                print(f"Setup code:\n{setup_code}")
            # The solution compiled, but tests cannot run without the setup
            return {**failure, "syntax_validity": 1}

        # Execute tests
        execution_success = run_tests(solution_code, test_cases, namespace)
        return {
            "pass@1": int(execution_success),
            "pass@10": int(execution_success),
            "pass@100": int(execution_success),
            "syntax_validity": 1,
            "execution_accuracy": int(execution_success)
        }

    def evaluate_model(self, model, tokenizer, model_type: str, num_samples: int = None):
        """
        Generate and score one completion for each of the first tasks in HumanEval.

        Args:
            model: Model exposing ``generate`` and ``device``
            tokenizer: Tokenizer matching ``model``
            model_type: "deepseek" uses the chat template; any other value
                takes the plain-tokenizer (SemCoder) path
            num_samples: Number of tasks to evaluate; None means all tasks.
                Values larger than the dataset are clamped to its size.

        Returns:
            Dictionary mapping each metric to its average over the evaluated
            tasks. Tasks with no extractable function, or whose processing
            raised, contribute 0 to every metric. All metrics are 0 when there
            are no tasks to evaluate.
        """
        results = dict.fromkeys(self.METRICS, 0)

        dataset = self.human_eval["test"]
        available = len(dataset)
        total_samples = available if num_samples is None else min(num_samples, available)
        if total_samples <= 0:
            # Nothing to evaluate; also avoids dividing by zero below
            return results

        # Process each task
        for idx in tqdm(range(total_samples)):
            task = dataset[idx]
            formatted_prompt = self.format_prompt(task["prompt"], model_type)

            if self.debug:
                print(f"\n\nProcessing task {idx + 1}/{total_samples}")
                print("Prompt:")
                print(formatted_prompt)

            try:
                # Generate code based on model type
                if model_type == "deepseek":
                    messages = [{"role": "user", "content": formatted_prompt}]
                    # NOTE(review): add_generation_prompt is not set here; for
                    # chat-tuned checkpoints that may make the model continue
                    # the user turn instead of answering - confirm.
                    inputs = tokenizer.apply_chat_template(
                        messages,
                        return_tensors="pt",
                        padding=True
                    ).to(model.device)

                    attention_mask = torch.ones_like(inputs)

                    outputs = model.generate(
                        inputs,
                        attention_mask=attention_mask,
                        max_new_tokens=512,
                        do_sample=True,
                        temperature=0.7,
                        top_p=0.95,
                        pad_token_id=tokenizer.eos_token_id
                    )
                    # Decode only the newly generated tokens
                    generated_code = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)

                else:  # semcoder
                    inputs = tokenizer(
                        formatted_prompt,
                        return_tensors="pt",
                        padding=True,
                        truncation=True,
                        max_length=512
                    ).to(model.device)

                    outputs = model.generate(
                        input_ids=inputs["input_ids"],
                        attention_mask=inputs["attention_mask"],
                        max_new_tokens=512,
                        do_sample=True,
                        temperature=0.7,
                        top_p=0.95,
                        pad_token_id=tokenizer.eos_token_id
                    )
                    # The full sequence (prompt included) is decoded; the
                    # cleaner keeps only the last function definition.
                    generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)

                # Process and evaluate generated code
                cleaned_code = self.clean_generated_code(generated_code)
                if cleaned_code:
                    evaluation = self.evaluate_single_solution(
                        cleaned_code,
                        task["test"],
                        task["entry_point"]
                    )

                    if self.debug:
                        print("\nEvaluation results:")
                        for metric, value in evaluation.items():
                            print(f"{metric}: {value}")

                    # Update metrics
                    for metric in results:
                        results[metric] += evaluation[metric]

            except Exception as e:
                # Best effort: a failing task scores 0 and the run continues
                if self.debug:
                    print(f"Error processing sample {idx}: {str(e)}")
                continue

        # Calculate final averages
        for metric in results:
            results[metric] /= total_samples

        return results

# Create the evaluator instance; its constructor calls
# load_dataset("openai_humaneval") and enables debug output.
evaluator = ModelEvaluator()

In [None]:
# DeepSeek Model Evaluation

# Announce the start of the DeepSeek run
print("Evaluating DeepSeek base model...")

# Score a small slice of the benchmark as a quick first check.
# Positional arguments are: model, tokenizer, model_type, num_samples.
deepseek_results = evaluator.evaluate_model(model, tokenizer, "deepseek", 10)

# Report the averaged metrics as indented JSON under a header line
print("\nDeepSeek Base Results:", json.dumps(deepseek_results, indent=2), sep="\n")

Evaluating DeepSeek base model...


  0%|          | 0/10 [00:00<?, ?it/s]



Processing task 1/10
Prompt:
Write a Python function that solves the following task. Provide ONLY the function implementation starting with 'def' and proper indentation. The function should be properly indented with 4 spaces. Do not include any explanations, comments, docstrings, type hints, or test code. Do not include any print statements or assertions. Only include the function definition and its implementation.

Example format:
def example_function(param1, param2):
    result = param1 + param2
    return result

Your task:
from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """



 10%|█         | 1/10 [01:43<15:27, 103.06s/it]


Original generated code:
    numbers = sorted(numbers)
    for i in range(1, len(numbers)):
        if numbers[i] - numbers[i-1] < threshold:
            return True
    return False


def main():
    pass


if __name__ == "__main__":
    main()























































































































































































































































































































































































































































Cleaned code:
def main():
    pass
    main()

Cleaned code (repr):
'def main():\n    pass\n    main()'


METADATA = {
    'author': 'jt',
    'dataset': 'test'
}


def check(candidate):
    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True
    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) ==

 20%|██        | 2/10 [02:20<08:36, 64.51s/it] 


Original generated code:


    pass


Solution:

```python
from typing import List


def separate_paren_groups(paren_string: str) -> List[str]:
    paren_string = paren_string.replace(' ', '')
    result = []
    stack = []
    temp = ''
    for char in paren_string:
        if char == '(':
            if stack:
                temp += char
                stack.append(char)
            else:
                stack.append(char)
                temp += char
        elif char == ')':
            if len(stack) == 1:
                temp += char
                result.append(temp)
                temp = ''
            else:
                temp += char
                stack.pop()
    return result
```


Cleaned code:
def separate_paren_groups(paren_string: str):
    paren_string = paren_string.replace(' ', '')
    result = []
    stack = []
    temp = ''
    for char in paren_string:
        if char == '(':
            if stack:
                temp += char
                stack.append(cha

 30%|███       | 3/10 [04:03<09:33, 81.95s/it]


Original generated code:
    # Split number into integer and decimal parts
    integer_part = int(number)
    decimal_part = number - integer_part

    # Return the decimal part
    return decimal_part




















































































































































































































































































































































































































































































Cleaned code:


Cleaned code (repr):
''


Processing task 4/10
Prompt:
Write a Python function that solves the following task. Provide ONLY the function implementation starting with 'def' and proper indentation. The function should be properly indented with 4 spaces. Do not include any explanations, comments, docstrings, type h

 40%|████      | 4/10 [04:09<05:13, 52.21s/it]


Original generated code:
    balance = 0
    for op in operations:
        balance += op
        if balance < 0:
            return True
    return False


Cleaned code:


Cleaned code (repr):
''


Processing task 5/10
Prompt:
Write a Python function that solves the following task. Provide ONLY the function implementation starting with 'def' and proper indentation. The function should be properly indented with 4 spaces. Do not include any explanations, comments, docstrings, type hints, or test code. Do not include any print statements or assertions. Only include the function definition and its implementation.

Example format:
def example_function(param1, param2):
    result = param1 + param2
    return result

Your task:
from typing import List


def mean_absolute_deviation(numbers: List[float]) -> float:
    """ For a given list of input numbers, calculate Mean Absolute Deviation
    around the mean of this dataset.
    Mean Absolute Deviation is the average absolute difference betwe

 50%|█████     | 5/10 [04:29<03:21, 40.38s/it]


Original generated code:
    # calculate the mean of the numbers
    mean_numbers = sum(numbers) / len(numbers)

    # calculate the absolute deviation for each number and the mean
    abs_deviations = [abs(num - mean_numbers) for num in numbers]

    # calculate the mean absolute deviation
    mad = sum(abs_deviations) / len(abs_deviations)

    return mad


Cleaned code:


Cleaned code (repr):
''


Processing task 6/10
Prompt:
Write a Python function that solves the following task. Provide ONLY the function implementation starting with 'def' and proper indentation. The function should be properly indented with 4 spaces. Do not include any explanations, comments, docstrings, type hints, or test code. Do not include any print statements or assertions. Only include the function definition and its implementation.

Example format:
def example_function(param1, param2):
    result = param1 + param2
    return result

Your task:
from typing import List


def intersperse(numbers: List[int], 

 60%|██████    | 6/10 [04:40<02:02, 30.61s/it]


Original generated code:
    result = []
    for i in range(len(numbers)):
        result.append(numbers[i])
        if i != len(numbers) - 1:
            result.append(delimeter)
    return result


Cleaned code:


Cleaned code (repr):
''


Processing task 7/10
Prompt:
Write a Python function that solves the following task. Provide ONLY the function implementation starting with 'def' and proper indentation. The function should be properly indented with 4 spaces. Do not include any explanations, comments, docstrings, type hints, or test code. Do not include any print statements or assertions. Only include the function definition and its implementation.

Example format:
def example_function(param1, param2):
    result = param1 + param2
    return result

Your task:
from typing import List


def parse_nested_parens(paren_string: str) -> List[int]:
    """ Input to this function is a string represented multiple groups for nested parentheses separated by spaces.
    For each of the group,

 70%|███████   | 7/10 [05:29<01:49, 36.62s/it]


Original generated code:
    result = []
    groups = paren_string.split()

    for group in groups:
        count = 0
        max_count = 0
        for char in group:
            if char == '(':
                count += 1
                if count > max_count:
                    max_count = count
            elif char == ')':
                count -= 1
        result.append(max_count)

    return result


# Leave the following lines to test the function
# assert parse_nested_parens('(()()) ((())) () ((())()())') == [2, 3, 1, 3]
# assert parse_nested_parens('') == []
# assert parse_nested_parens('()') == [1]
# assert parse_nested_parens('()()()') == [1, 1, 1]
# assert parse_nested_parens('((()))') == [3]
# assert parse_nested_parens('(()(()()))') == [2, 3]


Cleaned code:


Cleaned code (repr):
''


Processing task 8/10
Prompt:
Write a Python function that solves the following task. Provide ONLY the function implementation starting with 'def' and proper indentation. The function shoul

 80%|████████  | 8/10 [06:02<01:10, 35.23s/it]


Original generated code:
    result = [s for s in strings if substring in s]

    return result



This task is a function that filters a list of strings by a given substring. The function should return a list of strings that contain the given substring.

For instance, if the function is called with the list ['abc', 'bacd', 'cde', 'array'] and the substring 'a', it should return ['abc', 'bacd', 'array'].

The function should not be case sensitive, i.e., it treats 'A' and 'a' as the same character.

You can assume that the input will always be a list of strings and that the substring will always be a string.


Cleaned code:


Cleaned code (repr):
''


Processing task 9/10
Prompt:
Write a Python function that solves the following task. Provide ONLY the function implementation starting with 'def' and proper indentation. The function should be properly indented with 4 spaces. Do not include any explanations, comments, docstrings, type hints, or test code. Do not include any print statemen

 90%|█████████ | 9/10 [06:16<00:28, 28.80s/it]


Original generated code:
    # write your code here
    total_sum = 0
    product = 1

    for num in numbers:
        total_sum += num
        product *= num

    return (total_sum, product)

Please note that the function should return a tuple (sum, product) instead of a list.


Cleaned code:


Cleaned code (repr):
''


Processing task 10/10
Prompt:
Write a Python function that solves the following task. Provide ONLY the function implementation starting with 'def' and proper indentation. The function should be properly indented with 4 spaces. Do not include any explanations, comments, docstrings, type hints, or test code. Do not include any print statements or assertions. Only include the function definition and its implementation.

Example format:
def example_function(param1, param2):
    result = param1 + param2
    return result

Your task:
from typing import List, Tuple


def rolling_max(numbers: List[int]) -> List[int]:
    """ From a given list of integers, generate a list of r

100%|██████████| 10/10 [08:00<00:00, 48.02s/it]


Original generated code:

def rolling_min(numbers: List[int]) -> List[int]:
    """ From a given list of integers, generate a list of rolling minimum element found until given moment
    in the sequence.
    >>> rolling_min([1, 2, 3, 2, 3, 4, 2])
    [1, 1, 1, 1, 1, 1, 1]
    """


def rolling_sum(numbers: List[int]) -> List[int]:
    """ From a given list of integers, generate a list of rolling sum of elements found until given moment
    in the sequence.
    >>> rolling_sum([1, 2, 3, 2, 3, 4, 2])
    [1, 3, 6, 8, 11, 15, 17]
    """


def rolling_average(numbers: List[int]) -> List[float]:
    """ From a given list of integers, generate a list of rolling average of elements found until given moment
    in the sequence.
    >>> rolling_average([1, 2, 3, 2, 3, 4, 2])
    [1.0, 1.5, 2.0, 2.5, 2.3333333333333335, 2.2, 2.0]
    """


def rolling_median(numbers: List[int]) -> List[float]:
    """ From a given list of integers, generate a list of rolling median of elements found until give




In [None]:
# SemCoder Model Evaluation

# Announce the start of the SemCoder run
print("Evaluating SemCoder...")

# Use the same sample count as the DeepSeek run so the two are comparable.
# Positional arguments are: model, tokenizer, model_type, num_samples.
semcoder_results = evaluator.evaluate_model(
    semcoder.model, semcoder.tokenizer, "semcoder", 10
)

# Report the averaged metrics as indented JSON under a header line
print("\nSemCoder Results:", json.dumps(semcoder_results, indent=2), sep="\n")

Evaluating SemCoder...


  0%|          | 0/10 [00:00<?, ?it/s]



Processing task 1/10
Prompt:
# Task: Implement the following Python function
from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """

# Provide only the function implementation with proper indentation.



 10%|█         | 1/10 [00:04<00:39,  4.44s/it]


Original generated code:
# Task: Implement the following Python function
from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """

# Provide only the function implementation with proper indentation.

def has_close_elements(numbers: List[float], threshold: float) -> bool:
    # Step 1: Sort the list of numbers
    numbers.sort()
    
    # Step 2: Iterate through the sorted list
    for i in range(len(numbers) - 1):
        # Check the difference between consecutive elements
        if numbers[i + 1] - numbers[i] <= threshold:
            return True
            
    # If no pairs were found that were closer than the threshold
    return False


Cleaned code:
def has_close_elements(numbers, threshold

 20%|██        | 2/10 [00:09<00:39,  4.94s/it]


Original generated code:
# Task: Implement the following Python function
from typing import List


def separate_paren_groups(paren_string: str) -> List[str]:
    """ Input to this function is a string containing multiple groups of nested parentheses. Your goal is to
    separate those group into separate strings and return the list of those.
    Separate groups are balanced (each open brace is properly closed) and not nested within each other
    Ignore any spaces in the input string.
    >>> separate_paren_groups('( ) (( )) (( )( ))')
    ['()', '(())', '(()())']
    """

# Provide only the function implementation with proper indentation.

def separate_paren_groups(paren_string: str) -> List[str]:
    result = []
    current_group = []
    open_count = 0

    for char in paren_string:
        if char == '(':
            open_count += 1
            current_group.append(char)
        elif char == ')':
            if open_count > 0:
                current_group.append(char)
           

 30%|███       | 3/10 [00:14<00:35,  5.05s/it]


Original generated code:
# Task: Implement the following Python function


def truncate_number(number: float) -> float:
    """ Given a positive floating point number, it can be decomposed into
    and integer part (largest integer smaller than given number) and decimals
    (leftover part always smaller than 1).

    Return the decimal part of the number.
    >>> truncate_number(3.5)
    0.5
    """

# Provide only the function implementation with proper indentation.

def truncate_number(number: float) -> float:
    """
    Returns the decimal part of the given floating-point number.
    
    Args:
    number (float): A positive floating-point number.
    
    Returns:
    float: The decimal part of the number.
    
    Raises:
    ValueError: If the input number is not positive.
    """
    if number <= 0:
        raise ValueError("Input must be a positive floating-point number.")
    
    # Calculate the decimal part
    decimal_part = number - int(number)
    return decimal_part



 40%|████      | 4/10 [00:16<00:22,  3.77s/it]


Original generated code:
# Task: Implement the following Python function
from typing import List


def below_zero(operations: List[int]) -> bool:
    """ You're given a list of deposit and withdrawal operations on a bank account that starts with
    zero balance. Your task is to detect if at any point the balance of account fallls below zero, and
    at that point function should return True. Otherwise it should return False.
    >>> below_zero([1, 2, 3])
    False
    >>> below_zero([1, 2, -4, 5])
    True
    """

# Provide only the function implementation with proper indentation.

def below_zero(operations: List[int]) -> bool:
    balance = 0
    for operation in operations:
        balance += operation
        if balance < 0:
            return True
    return False

Cleaned code:
def below_zero(operations):
    balance = 0
    for operation in operations:
        balance += operation
        if balance < 0:
            return True
    return False

Cleaned code (repr):
'def below

 50%|█████     | 5/10 [00:21<00:20,  4.08s/it]


Original generated code:
# Task: Implement the following Python function
from typing import List


def mean_absolute_deviation(numbers: List[float]) -> float:
    """ For a given list of input numbers, calculate Mean Absolute Deviation
    around the mean of this dataset.
    Mean Absolute Deviation is the average absolute difference between each
    element and a centerpoint (mean in this case):
    MAD = average | x - x_mean |
    >>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])
    1.0
    """

# Provide only the function implementation with proper indentation.

def mean_absolute_deviation(numbers: List[float]) -> float:
    if not numbers:
        return 0.0  # Return 0 if the list is empty
    
    mean = sum(numbers) / len(numbers)
    
    # Calculate absolute deviations
    absolute_deviations = [abs(num - mean) for num in numbers]
    
    # Calculate the mean of absolute deviations
    mad = sum(absolute_deviations) / len(numbers)
    
    return mad


Cleaned code:
def mea

 60%|██████    | 6/10 [00:26<00:18,  4.58s/it]


Original generated code:
# Task: Implement the following Python function
from typing import List


def intersperse(numbers: List[int], delimeter: int) -> List[int]:
    """ Insert a number 'delimeter' between every two consecutive elements of input list `numbers'
    >>> intersperse([], 4)
    []
    >>> intersperse([1, 2, 3], 4)
    [1, 4, 2, 4, 3]
    """

# Provide only the function implementation with proper indentation.

def intersperse(numbers: List[int], delimeter: int) -> List[int]:
    if not numbers:  # Check for empty list
        return []
    
    result = []  # Initialize an empty result list
    for i in range(len(numbers) - 1):  # Loop through the numbers except the last one
        result.append(numbers[i])  # Append the current number
        result.append(delimeter)  # Append the delimeter
    
    # Append the last number after the loop
    result.append(numbers[-1])
    
    return result  # Return the result list

Cleaned code:
def intersperse(numbers, delimeter)

 70%|███████   | 7/10 [00:31<00:13,  4.54s/it]


Original generated code:
# Task: Implement the following Python function
from typing import List


def parse_nested_parens(paren_string: str) -> List[int]:
    """ Input to this function is a string represented multiple groups for nested parentheses separated by spaces.
    For each of the group, output the deepest level of nesting of parentheses.
    E.g. (()()) has maximum two levels of nesting while ((())) has three.

    >>> parse_nested_parens('(()()) ((())) () ((())()())')
    [2, 3, 1, 3]
    """

# Provide only the function implementation with proper indentation.

def parse_nested_parens(paren_string: str) -> List[int]:
    results = []
    for group in paren_string.split():
        max_depth = 0
        current_depth = 0
        for char in group:
            if char == '(':
                current_depth += 1
                max_depth = max(max_depth, current_depth)
            elif char == ')':
                current_depth -= 1
        results.append(max_depth)
    return r

 80%|████████  | 8/10 [00:32<00:07,  3.57s/it]


Original generated code:
# Task: Implement the following Python function
from typing import List


def filter_by_substring(strings: List[str], substring: str) -> List[str]:
    """ Filter an input list of strings only for ones that contain given substring
    >>> filter_by_substring([], 'a')
    []
    >>> filter_by_substring(['abc', 'bacd', 'cde', 'array'], 'a')
    ['abc', 'bacd', 'array']
    """

# Provide only the function implementation with proper indentation.

def filter_by_substring(strings: List[str], substring: str) -> List[str]:
    return [s for s in strings if substring in s]

Cleaned code:
def filter_by_substring(strings, substring):
    return [s for s in strings if substring in s]

Cleaned code (repr):
'def filter_by_substring(strings, substring):\n    return [s for s in strings if substring in s]'


METADATA = {
    'author': 'jt',
    'dataset': 'test'
}


def check(candidate):
    assert candidate([], 'john') == []
    assert candidate(['xxx', 'asd', 'xxy', 'john d

 90%|█████████ | 9/10 [00:36<00:03,  3.44s/it]


Original generated code:
# Task: Implement the following Python function
from typing import List, Tuple


def sum_product(numbers: List[int]) -> Tuple[int, int]:
    """ For a given list of integers, return a tuple consisting of a sum and a product of all the integers in a list.
    Empty sum should be equal to 0 and empty product should be equal to 1.
    >>> sum_product([])
    (0, 1)
    >>> sum_product([1, 2, 3, 4])
    (10, 24)
    """

# Provide only the function implementation with proper indentation.

def sum_product(numbers: List[int]) -> Tuple[int, int]:
    total_sum = 0
    product = 1
    
    for num in numbers:
        total_sum += num  # Adding each number to the total sum
        product *= num    # Multiplying each number to the product
    
    return (total_sum, product)

Cleaned code:
def sum_product(numbers):
    total_sum = 0
    product = 1
    for num in numbers:
        total_sum += num  # Adding each number to the total sum
        product *= num    # Multip

100%|██████████| 10/10 [00:40<00:00,  4.02s/it]


Original generated code:
# Task: Implement the following Python function
from typing import List, Tuple


def rolling_max(numbers: List[int]) -> List[int]:
    """ From a given list of integers, generate a list of rolling maximum element found until given moment
    in the sequence.
    >>> rolling_max([1, 2, 3, 2, 3, 4, 2])
    [1, 2, 3, 3, 3, 4, 4]
    """

# Provide only the function implementation with proper indentation.

def rolling_max(numbers: List[int]) -> List[int]:
    if not numbers:  # Handle empty list case
        return []
    
    result = []
    current_max = float('-inf')  # Start with the lowest possible value
    
    for number in numbers:
        current_max = max(current_max, number)  # Update current maximum
        result.append(current_max)  # Append the current maximum to the result list
    
    return result

Cleaned code:
def rolling_max(numbers):
    if not numbers:  # Handle empty list case
        return []
    result = []
    current_max = float('-in




In [None]:
# Benchmark Dataset Loading and Testing

from datasets import load_dataset

def generate_code_with_semcoder(prompt: str) -> str:
    """Wrap a task prompt in instruction comments and hand it to SemCoder.

    Args:
        prompt: Task text (for example a HumanEval function stub) to embed
            between the instruction header and footer.

    Returns:
        Whatever ``semcoder.generate_code`` returns for the wrapped prompt.
    """
    header = "# Task: Implement the following Python function\n"
    footer = "# Provide only the function implementation with proper indentation.\n"
    wrapped_prompt = header + f"{prompt}\n" + footer

    # `semcoder` is the module-level model wrapper created in an earlier cell.
    return semcoder.generate_code(wrapped_prompt)

# Load the complete HumanEval benchmark
human_eval = load_dataset("openai_humaneval")  # Contains 164 Python programming tasks

# Extract first task for initial testing
# NOTE: `task`, `prompt` and `generated_code` are module-level names; the
# comparison and test cells that follow read them directly.
task = human_eval["test"][0]  # Index 0 contains first benchmark problem
prompt = task["prompt"]       # Extract problem description

# Display task details for verification
# (print() puts its default separator after each label, hence the leading
# space before the printed text in the cell output)
print("HumanEval Prompt:\n", prompt)  # Show problem description
print("Expected Solution:\n", task["canonical_solution"])  # Show reference solution

# Test code generation with SemCoder
generated_code = generate_code_with_semcoder(prompt)  # Generate solution using SemCoder
print("Generated Code:\n", generated_code)  # Display generated solution

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


HumanEval Prompt:
 from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """

Expected Solution:
     for idx, elem in enumerate(numbers):
        for idx2, elem2 in enumerate(numbers):
            if idx != idx2:
                distance = abs(elem - elem2)
                if distance < threshold:
                    return True

    return False

Generated Code:
 # Task: Implement the following Python function
from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements(

In [None]:
# Compare generated code with canonical solution
# NOTE: this is an exact string comparison after trimming outer whitespace.
# The canonical solution is only the function body, while the generated text
# printed by the previous cell also contains the echoed prompt, so a correct
# but differently written solution is still reported as a mismatch.
if generated_code.strip() == task["canonical_solution"].strip():
    print("The generated code matches the expected solution!")
else:
    print("The generated code does not match the expected solution.")

The generated code does not match the expected solution.


In [None]:
# Doctest Validation

# First attempt to execute the generated code
try:
    exec(generated_code)  # Load the generated function into namespace
except Exception as e:
    print(f"Error in executing generated code: {e}")
    print("Generated code that failed:")
    print(generated_code)
else:
    # Only reached when the generated code loaded, so the doctests below
    # exercise the freshly defined function rather than stale definitions.
    try:
        import doctest
        # testmod() prints failing examples itself and returns
        # TestResults(failed, attempted); a failing example does not raise.
        doctest_results = doctest.testmod()  # Run all doctests in the current namespace
    except Exception as e:
        print(f"Error running doctests: {e}")
    else:
        if doctest_results.failed:
            print("Doctest execution failed. This might indicate:")
            print("- Syntax errors in the docstring examples")
            print("- Mismatched output formatting")
            print("- Function behavior different from examples")


sys.settrace() should not be used when the debugger is being used.
This may cause the debugger to stop working correctly.
If this is needed, please check: 
http://pydev.blogspot.com/2007/06/why-cant-pydev-debugger-work-with.html
to see how to restore the debug tracing back correctly.
Call Location:
  File "/usr/lib/python3.10/doctest.py", line 1501, in run
    sys.settrace(save_trace)



In [None]:
# Custom Test Suite Execution

# Define comprehensive test cases
test_cases = [
    ([1.0, 2.0, 3.0], 0.5, False),          # Basic case with no close elements
    ([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3, True)  # Case with close elements
]

def run_tests(func):
    """Check ``func`` against every entry of the module-level ``test_cases``.

    Args:
        func: Callable invoked as ``func(numbers, threshold)``.

    Raises:
        AssertionError: On the first case whose result differs from the
            expected value.
    """
    for case in test_cases:
        values, limit, wanted = case
        actual = func(values, limit)
        assert actual == wanted, f"Test failed: {values}, {limit} -> {actual}"

# Execute tests on generated function
try:
    # Load the generated function into current namespace
    # (exec without an explicit namespace defines names in this cell's
    # globals, which is why `has_close_elements` can be referenced below)
    exec(generated_code)

    # Run test suite against the loaded function
    run_tests(has_close_elements)
    print("All tests passed successfully!")

except Exception as e:
    # Both errors raised while exec-ing the generated code and the
    # AssertionError raised by run_tests() end up here.
    print(f"Test failed: {e}")
    print("\nDetails:")
    print(f"- Error type: {type(e).__name__}")
    print(f"- Generated code being tested:")
    print(generated_code)

All tests passed successfully!


In [None]:
# Multi-Task Evaluation Loop

# Define test cases for each function type
test_cases_by_function = {
    "has_close_elements": [
        ([1.0, 2.0, 3.0], 0.5, False),
        ([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3, True)
    ],
    "separate_paren_groups": [
        ('( ) (( )) (( )( ))', ['()', '(())', '(()())']),
        ('(()()) ((())) () ((())()())', ['(()())', '((()))', '()', '((())()())'])
    ],
    "truncate_number": [
        (3.5, 0.5),
        (1.33, 0.33),
        (123.456, 0.456)
    ],
    "below_zero": [
        ([1, 2, 3], False),
        ([1, 2, -4, 5], True)
    ],
    "mean_absolute_deviation": [
        ([1.0, 2.0, 3.0, 4.0], 1.0),
        ([1.0, 2.0, 3.0], 2.0/3.0)
    ]
}

def run_function_tests(func_name, func, test_cases):
    """Run ``test_cases`` against ``func`` and report whether all of them passed.

    Each test case is a tuple whose last element is the expected result and
    whose preceding elements are the positional arguments for ``func``.
    Float expectations are compared with an absolute tolerance of 1e-6; all
    other expectations use ``==``. Failures and errors are printed, not raised.

    Args:
        func_name: Name of the function under test. Accepted for interface
            compatibility; it is not used by the checks or messages.
        func: Callable under test.
        test_cases: Sized collection of ``(*args, expected)`` tuples.

    Returns:
        True when every case passed (also for an empty collection),
        False otherwise.
    """
    import copy  # local import so the function does not rely on another cell

    passed = 0
    for test_case in test_cases:
        try:
            args = test_case[:-1]  # All but last element are arguments
            expected = test_case[-1]  # Last element is expected result
            # Pass a deep copy: functions under test may mutate their inputs
            # (e.g. sort a list in place), which would otherwise corrupt the
            # shared test data for later runs and for the failure message.
            result = func(*copy.deepcopy(args))
            if isinstance(expected, float):
                matches = abs(result - expected) < 1e-6
            else:
                matches = result == expected
            if matches:
                passed += 1
            else:
                print(f"Test failed: {args} -> Expected {expected}, got {result}")
        except Exception as e:
            print(f"Test error: {str(e)}")
    return passed == len(test_cases)

# Evaluate first 5 tasks from HumanEval
# For each task: generate a solution, exec it in an isolated namespace, look up
# the entry-point function and run the hand-written cases defined above.
# NOTE: the loop rebinds the module-level `task`, `prompt` and `generated_code`.
for i in range(5):
    task = human_eval["test"][i]
    prompt = task["prompt"]
    func_name = task["entry_point"]

    print(f"\nTask {i + 1} ({func_name}) Prompt:\n{prompt}")

    generated_code = generate_code_with_semcoder(prompt)
    print("Generated Code:\n", generated_code)

    try:
        # Create new namespace for each function
        namespace = {}
        exec(generated_code, namespace)

        # Get the function from namespace
        # (raises KeyError, handled below, if the generated code never defined it)
        func = namespace[func_name]

        # Run appropriate tests for this function
        if func_name in test_cases_by_function:
            success = run_function_tests(func_name, func, test_cases_by_function[func_name])
            if success:
                print(f"Task {i + 1}: All tests passed successfully!\n")
            else:
                print(f"Task {i + 1}: Some tests failed.\n")
        else:
            print(f"No test cases defined for function: {func_name}")

    except Exception as e:
        # Covers errors from exec-ing the generated text (e.g. truncated
        # output that is not valid Python) as well as the namespace lookup.
        print(f"Task {i + 1}: Error - {e}")
        print(f"Error type: {type(e).__name__}")
        print("Generated code that failed:")
        print(generated_code)
        print()

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Task 1 (has_close_elements) Prompt:
from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Code:
 # Task: Implement the following Python function
from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """

# Provide only the function implementation with proper indentation.

def has_close_elements(numbers: List[float], threshold: float) -> bool:
    # Sort the list
    numbers.sort()
    
    # Check every adjacent pair
    for i in range(len(numbers) - 1):
        if abs(numbers[i] - numbers[i + 1]) < threshold:
            return True
            
    return False
Task 1: All tests passed successfully!


Task 2 (separate_paren_groups) Prompt:
from typing import List


def separate_paren_groups(paren_string: str) -> List[str]:
    """ Input to this function is a string containing m

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Code:
 # Task: Implement the following Python function
from typing import List


def separate_paren_groups(paren_string: str) -> List[str]:
    """ Input to this function is a string containing multiple groups of nested parentheses. Your goal is to
    separate those group into separate strings and return the list of those.
    Separate groups are balanced (each open brace is properly closed) and not nested within each other
    Ignore any spaces in the input string.
    >>> separate_paren_groups('( ) (( )) (( )( ))')
    ['()', '(())', '(()())']
    """

# Provide only the function implementation with proper indentation.

def separate_paren_groups(paren_string: str) -> List[str]:
    result = []
    current_group = ""
    open_count = 0

    for char in paren_string:
        if char == '(':
            open_count += 1
            current_group += char
        elif char == ')':
            if open_count > 0:
                current_group += char
                open_count -= 

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Code:
 # Task: Implement the following Python function


def truncate_number(number: float) -> float:
    """ Given a positive floating point number, it can be decomposed into
    and integer part (largest integer smaller than given number) and decimals
    (leftover part always smaller than 1).

    Return the decimal part of the number.
    >>> truncate_number(3.5)
    0.5
    """

# Provide only the function implementation with proper indentation.

def truncate_number(number: float) -> float:
    if number < 0:
        raise ValueError("Input must be a positive floating-point number.")
    integer_part = int(number)
    return number - integer_part

Task 3: All tests passed successfully!


Task 4 (below_zero) Prompt:
from typing import List


def below_zero(operations: List[int]) -> bool:
    """ You're given a list of deposit and withdrawal operations on a bank account that starts with
    zero balance. Your task is to detect if at any point the balance of account fallls 

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Code:
 # Task: Implement the following Python function
from typing import List


def below_zero(operations: List[int]) -> bool:
    """ You're given a list of deposit and withdrawal operations on a bank account that starts with
    zero balance. Your task is to detect if at any point the balance of account fallls below zero, and
    at that point function should return True. Otherwise it should return False.
    >>> below_zero([1, 2, 3])
    False
    >>> below_zero([1, 2, -4, 5])
    True
    """

# Provide only the function implementation with proper indentation.

def below_zero(operations: List[int]) -> bool:
    balance = 0
    for operation in operations:
        balance += operation
        if balance < 0:
            return True
    return False

Task 4: All tests passed successfully!


Task 5 (mean_absolute_deviation) Prompt:
from typing import List


def mean_absolute_deviation(numbers: List[float]) -> float:
    """ For a given list of input numbers, calculate Mean 

In [None]:
# Advanced Code Evaluation System

from typing import List, Dict
import numpy as np
from concurrent.futures import ThreadPoolExecutor
import timeout_decorator

class CodeEvaluator:
    """Score model-generated solutions against a HumanEval-style dataset.

    A solution is first checked for syntactic validity with ``compile`` and is
    then executed together with the task's test program.

    SECURITY NOTE: ``execute_test_case`` runs model-generated code with
    ``exec`` inside this process. Only use it in a disposable environment.
    """

    def __init__(self, dataset="openai_humaneval"):
        """
        Initialize evaluator with specified dataset and metrics.

        Args:
            dataset: Name of the evaluation dataset
        """
        self.dataset = load_dataset(dataset)
        self.metrics = {
            "pass@1": 0.0,      # Single-attempt success rate
            "pass@10": 0.0,     # Success within 10 attempts
            "pass@100": 0.0,    # Success within 100 attempts
            "syntax_validity": 0.0,  # Syntactic correctness
            "execution_accuracy": 0.0  # Functional correctness
        }

    @timeout_decorator.timeout(5)  # Prevent infinite loops/hanging
    def execute_test_case(self, code: str, test_case: str) -> bool:
        """Execute ``code`` and then ``test_case`` in one fresh namespace.

        Args:
            code: Source text of the candidate solution.
            test_case: Source text of the test program to run against it.

        Returns:
            True if both executed without raising, False otherwise.
        """
        try:
            namespace = {}
            exec(code, namespace)
            exec(test_case, namespace)
            return True
        except Exception:
            return False

    def check_syntax(self, code: str) -> bool:
        """Return True if ``code`` compiles as a Python module, else False."""
        try:
            compile(code, '<string>', 'exec')
            return True
        except SyntaxError:
            return False

    @staticmethod
    def _test_programs(task) -> List[str]:
        """Build the executable test programs for one dataset row."""
        # Datasets that ship ready-to-run snippets keep working unchanged.
        if "test_cases" in task:
            return list(task["test_cases"])
        # HumanEval rows have no `test_cases` field: `test` only *defines*
        # check(candidate), so it must be called on the entry-point function.
        return [f"{task['test']}\n\ncheck({task['entry_point']})\n"]

    def evaluate_single_solution(self,
                                 task_id: int,
                                 generated_code: str,
                                 num_samples: int = 1) -> Dict:
        """Evaluate one generated solution for the task at index ``task_id``.

        Args:
            task_id: Index of the task in the dataset's ``test`` split.
            generated_code: Source text of the candidate solution.
            num_samples: Kept for interface compatibility; currently unused.

        Returns:
            Dict with boolean ``syntax_valid`` and ``execution_success``.
        """
        task = self.dataset["test"][task_id]

        # Verify syntax first
        syntax_valid = self.check_syntax(generated_code)

        # Execute test cases if syntax is valid
        if syntax_valid:
            # Run sequentially on the calling thread: timeout_decorator's
            # default signal-based timeout does not work in worker threads.
            results = [
                self.execute_test_case(generated_code, test_program)
                for test_program in self._test_programs(task)
            ]
            print("Results")
            print(results)
            execution_success = all(results)
        else:
            execution_success = False

        return {
            "syntax_valid": syntax_valid,
            "execution_success": execution_success
        }

    def evaluate_model(self, model, tokenizer, n_tasks: Optional[int] = None):
        """Generate one solution per task and aggregate the results.

        Args:
            model: Model object exposing ``generate`` and ``device``.
            tokenizer: Tokenizer used to encode prompts and decode outputs.
            n_tasks: Number of leading tasks to evaluate; all tasks when None.

        Returns:
            The evaluator's ``metrics`` dict, updated in place.
        """
        if n_tasks is None:
            n_tasks = len(self.dataset["test"])

        results = []
        for i in range(n_tasks):
            task = self.dataset["test"][i]
            prompt = task["prompt"]

            # Generate solution
            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
            outputs = model.generate(
                inputs["input_ids"],
                max_new_tokens=512,
                num_return_sequences=1,
                do_sample=True,  # temperature only takes effect when sampling
                temperature=0.8
            )
            generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Evaluate solution
            result = self.evaluate_single_solution(i, generated_code)
            results.append(result)

        # Calculate aggregate metrics (guard: np.mean of an empty list is NaN)
        if results:
            self.metrics["syntax_validity"] = float(
                np.mean([r["syntax_valid"] for r in results]))
            self.metrics["execution_accuracy"] = float(
                np.mean([r["execution_success"] for r in results]))
        else:
            self.metrics["syntax_validity"] = 0.0
            self.metrics["execution_accuracy"] = 0.0

        # One sample is generated per task, so the fraction of solved tasks is
        # the pass@1 estimate. pass@10 / pass@100 need several samples per
        # task and are left at their initial value.
        self.metrics["pass@1"] = self.metrics["execution_accuracy"]

        return self.metrics

# Initialize the evaluation system
evaluator = CodeEvaluator()

def evaluate_stage(model, tokenizer, stage_name: str):
    """Evaluate a model with the module-level ``evaluator`` and print a report.

    Args:
        model: Model passed through to ``evaluator.evaluate_model``.
        tokenizer: Tokenizer passed through to ``evaluator.evaluate_model``.
        stage_name: Label used in the printed headings.

    Returns:
        The metrics mapping returned by ``evaluator.evaluate_model``.
    """
    print(f"\nEvaluating {stage_name}...")
    stage_metrics = evaluator.evaluate_model(model, tokenizer)

    print(f"\nResults for {stage_name}:")
    for name in stage_metrics:
        print(f"{name}: {stage_metrics[name]:.4f}")

    return stage_metrics

# SemCoder Test Cases and Oracle Generation

In [None]:
# Verify SemCoder is loaded and working
# (hasattr() is checked first so the attribute access on the right-hand side
# of `and` is only evaluated when the attribute exists)
print("SemCoder loaded:", hasattr(semcoder, 'model') and semcoder.model is not None)
print("SemCoder tokenizer loaded:", hasattr(semcoder, 'tokenizer') and semcoder.tokenizer is not None)

# Test with a simple prompt
# NOTE: this rebinds the module-level `generated_code` used by earlier cells.
test_prompt = "Write a simple function that adds two numbers."
generated_code = semcoder.generate_code(test_prompt)
print("\nTest generation result:")
print(generated_code)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


SemCoder loaded: True
SemCoder tokenizer loaded: True

Test generation result:
Write a simple function that adds two numbers.

```python
def add(a, b):
    return a + b
```



In [None]:
def clean_generated_code(code: str) -> str:
    """Extract function definitions from generated text.

    A line whose stripped text starts with ``def `` opens a function and is
    kept. While a function is open, blank lines and lines starting with four
    spaces are kept; the first other line closes the function and is replaced
    by an empty line. Text outside an open function is dropped.

    Args:
        code: Raw generated text.

    Returns:
        The kept lines joined with newlines, stripped of outer whitespace.
    """
    kept = []
    inside_def = False

    for raw in code.split('\n'):
        stripped = raw.strip()

        if stripped.startswith('def '):
            inside_def = True
            kept.append(raw)
            continue

        if not inside_def:
            continue  # text between functions is discarded

        if not stripped or raw.startswith('    '):
            kept.append(raw)
        else:
            # First unindented, non-blank line ends the current function.
            inside_def = False
            kept.append('')

    return '\n'.join(kept).strip()

prompt = """
Write a complete test suite for this average calculation function:

def sample_function(numbers: list) -> float:
    '''
    Calculate the average of a list of numbers.
    Returns None if the list is empty.
    Raises TypeError if any element is not a number or if input is invalid.
    '''
    if numbers is None:
        raise TypeError("Input cannot be None")
    if not isinstance(numbers, list):
        raise TypeError("Input must be a list")
    if not numbers:
        return None
    if not all(isinstance(x, (int, float)) for x in numbers):
        raise TypeError("All elements must be numbers")
    return sum(numbers) / len(numbers)

Generate separate test functions for each category:

def test_normal_cases():
    # Test positive integers
    assert sample_function([1, 2, 3]) == 2.0
    # Test negative numbers
    assert sample_function([-1, -2, -3]) == -2.0
    # Test mixed numbers
    assert sample_function([-1, 0, 1]) == 0.0
    # Test floating point
    assert sample_function([1.5, 2.5, 3.5]) == 2.5

def test_edge_cases():
    # Test empty list
    assert sample_function([]) is None
    # Test single element
    assert sample_function([5]) == 5.0
    # Test zeros
    assert sample_function([0, 0, 0]) == 0.0
    # Test large numbers
    assert sample_function([1000000, 2000000, 3000000]) == 2000000.0

def test_error_cases():
    # Test None input
    with pytest.raises(TypeError):
        sample_function(None)
    # Test non-list input
    with pytest.raises(TypeError):
        sample_function("not a list")
    # Test non-numeric elements
    with pytest.raises(TypeError):
        sample_function(["a", "b", "c"])
    # Test mixed types
    with pytest.raises(TypeError):
        sample_function([1, "a", 2])

Generate all three test functions with the exact test cases shown above."""

# Ask SemCoder for the test suite described in `prompt`, then keep only the
# function definitions from its answer.
generated_code = semcoder.generate_code(
    prompt,
    max_new_tokens=4096
)
print("GENERATED CODE:")
print(generated_code)

print("\nCLEANED CODE:")
cleaned_code = clean_generated_code(generated_code)
print(cleaned_code)

# Test execution
print("\nExecuting test suite...")
try:
    # Import required modules
    import pytest
    # Execute the generated test suite
    exec(cleaned_code)  # First execute the code to define the functions

    # Execute all test functions
    # NOTE(review): `re` is not imported in this cell; presumably an earlier
    # cell imports it - confirm.
    test_functions = re.findall(r'def (test_[^\(]+)', cleaned_code)
    for test_function in test_functions:
        # Call each discovered test function by name
        exec(f"{test_function}()")
    # This message is also printed when no test function was found at all.
    print("✓ All tests passed successfully!")
except Exception as e:
    print(f"✗ Test failed: {str(e)}")
    import traceback
    traceback.print_exc()

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


GENERATED CODE:

Write a complete test suite for this average calculation function:

def sample_function(numbers: list) -> float:
    '''
    Calculate the average of a list of numbers.
    Returns None if the list is empty.
    Raises TypeError if any element is not a number or if input is invalid.
    '''
    if numbers is None:
        raise TypeError("Input cannot be None")
    if not isinstance(numbers, list):
        raise TypeError("Input must be a list")
    if not numbers:
        return None
    if not all(isinstance(x, (int, float)) for x in numbers):
        raise TypeError("All elements must be numbers")
    return sum(numbers) / len(numbers)

Generate separate test functions for each category:

def test_normal_cases():
    # Test positive integers
    assert sample_function([1, 2, 3]) == 2.0
    # Test negative numbers
    assert sample_function([-1, -2, -3]) == -2.0
    # Test mixed numbers
    assert sample_function([-1, 0, 1]) == 0.0
    # Test floating point
    asser

In [None]:
class TestCaseEvaluator:
    """Measure how well a generated test suite covers ``sample_function``.

    Coverage is estimated with regular expressions over the test source, and
    the suite is additionally compiled and executed against a reference
    implementation of ``sample_function``.

    SECURITY NOTE: ``execute_test_case`` runs model-generated code with
    ``exec`` inside this process. Only use it in a disposable environment.
    """

    # The generation prompt shows four example checks per category, so the
    # coverage ratios are reported relative to this count.
    EXPECTED_CASES_PER_CATEGORY = 4

    def __init__(self):
        # Initial metric values. evaluate_test_coverage() returns its own
        # dict and does not update this attribute.
        self.metrics = {
            "syntax_validity": 0.0,
            "execution_accuracy": 0.0,
            "normal_case_coverage": 0.0,
            "edge_case_coverage": 0.0,
            "error_case_coverage": 0.0,
            "total_test_cases": 0
        }

        # Define the sample function code
        self.sample_function_code = """
def sample_function(numbers: list) -> float:
    '''
    Calculate the average of a list of numbers.
    Returns None if the list is empty.
    Raises TypeError if any element is not a number or if input is invalid.
    '''
    if numbers is None:
        raise TypeError("Input cannot be None")
    if not isinstance(numbers, list):
        raise TypeError("Input must be a list")
    if not numbers:
        return None
    if not all(isinstance(x, (int, float)) for x in numbers):
        raise TypeError("All elements must be numbers")
    return sum(numbers) / len(numbers)
"""

    def clean_generated_code(self, code: str) -> str:
        """Clean up generated code to extract only the functions.

        Lines starting a ``def`` are kept together with the following blank
        or four-space-indented lines; the first other line ends the function
        and is replaced by an empty line. Everything else is dropped.
        """
        lines = code.split('\n')
        cleaned_lines = []
        in_function = False

        for line in lines:
            if line.strip().startswith('def '):
                in_function = True
                cleaned_lines.append(line)
            elif in_function and (line.startswith('    ') or not line.strip()):
                cleaned_lines.append(line)
            elif in_function:
                # Unindented, non-blank line: the current function has ended.
                in_function = False
                cleaned_lines.append('')

        return '\n'.join(cleaned_lines).strip()

    @timeout_decorator.timeout(5)  # Prevent infinite loops/hanging
    def execute_test_case(self, code: str, test_case: str) -> bool:
        """Execute ``code`` and then ``test_case`` in one fresh namespace.

        Returns:
            True if both executed without raising; otherwise prints the
            error and returns False.
        """
        try:
            namespace = {}
            exec(code, namespace)
            exec(test_case, namespace)
            return True
        except Exception as e:
            print(f"Test execution error: {str(e)}")
            return False

    def check_syntax(self, code: str) -> bool:
        """Return True if ``code`` compiles as a Python module, else False."""
        try:
            compile(code, '<string>', 'exec')
            return True
        except SyntaxError:
            return False

    def evaluate_test_coverage(self, generated_tests: str) -> Dict:
        """Compute coverage and execution metrics for a generated test suite.

        Args:
            generated_tests: Source text of the (cleaned) generated tests.

        Returns:
            Dict with ``total_test_cases``, the three ``*_case_coverage``
            ratios, ``syntax_valid`` and ``execution_success``.
        """
        import re  # local import: this cell does not import `re` itself

        # Improved regex patterns
        normal_pattern = r'assert sample_function\(\[(?!0|1000).*?\]\)'
        edge_patterns = {
            'empty': r'assert sample_function\(\[\]\)',
            'single': r'assert sample_function\(\[\d+\]\)',
            'zeros': r'assert sample_function\(\[0[,\s]*0[,\s]*0\]\)',
            'large': r'assert sample_function\(\[.*?000.*?\]\)'
        }
        error_pattern = r'pytest\.raises\(TypeError\)'

        # Count cases; each edge pattern is searched once and reused below
        normal_cases = len(re.findall(normal_pattern, generated_tests))
        edge_coverage = {name: bool(re.search(pattern, generated_tests))
                         for name, pattern in edge_patterns.items()}
        edge_cases = sum(edge_coverage.values())
        error_cases = len(re.findall(error_pattern, generated_tests))

        expected = self.EXPECTED_CASES_PER_CATEGORY
        syntax_valid = self.check_syntax(generated_tests)

        # exec-ing the suite alone only *defines* the test functions, so
        # append a call to every top-level test_* function. `pytest` is
        # imported into the namespace because the tests use pytest.raises.
        test_names = re.findall(r'^def (test_\w+)\s*\(', generated_tests,
                                flags=re.MULTILINE)
        test_program = "\n".join(
            ["import pytest", generated_tests]
            + [f"{name}()" for name in test_names]
        )
        execution_success = syntax_valid and self.execute_test_case(
            self.sample_function_code, test_program)

        metrics = {
            "total_test_cases": normal_cases + edge_cases + error_cases,
            "normal_case_coverage": min(normal_cases / expected, 1.0),
            "edge_case_coverage": edge_cases / expected,
            "error_case_coverage": min(error_cases / expected, 1.0),
            "syntax_valid": syntax_valid,
            "execution_success": execution_success
        }

        print("\nDetailed counts:")
        print(f"Normal cases: {normal_cases}")
        print(f"Edge cases: {edge_cases}")
        print(f"Error cases: {error_cases}")

        print("\nEdge case coverage:")
        for case, covered in edge_coverage.items():
            print(f"{case}: {'✓' if covered else '✗'}")

        return metrics

# Use the evaluator
# NOTE(review): this rebinds the module-level `evaluator`, which previously
# held a CodeEvaluator and is looked up by evaluate_stage() at call time -
# confirm evaluate_stage() is not used after this cell.
evaluator = TestCaseEvaluator()
generated_tests = semcoder.generate_code(prompt, max_new_tokens=4096)
cleaned_tests = evaluator.clean_generated_code(generated_tests)

print("Generated Tests:")
print(cleaned_tests)

metrics = evaluator.evaluate_test_coverage(cleaned_tests)
print("\nTest Coverage Metrics:")
for metric, value in metrics.items():
    # The `.2f` format is applied to every value, including the boolean and
    # integer entries of the metrics dict.
    print(f"{metric}: {value:.2f}")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Tests:
def sample_function(numbers: list) -> float:
    '''
    Calculate the average of a list of numbers.
    Returns None if the list is empty.
    Raises TypeError if any element is not a number or if input is invalid.
    '''
    if numbers is None:
        raise TypeError("Input cannot be None")
    if not isinstance(numbers, list):
        raise TypeError("Input must be a list")
    if not numbers:
        return None
    if not all(isinstance(x, (int, float)) for x in numbers):
        raise TypeError("All elements must be numbers")
    return sum(numbers) / len(numbers)


def test_normal_cases():
    # Test positive integers
    assert sample_function([1, 2, 3]) == 2.0
    # Test negative numbers
    assert sample_function([-1, -2, -3]) == -2.0
    # Test mixed numbers
    assert sample_function([-1, 0, 1]) == 0.0
    # Test floating point
    assert sample_function([1.5, 2.5, 3.5]) == 2.5

def test_edge_cases():
    # Test empty list
    assert sample_function([]) i

In [None]:
# Oracle Generation Prompt
oracle_prompt = """
Create a test oracle function for the average calculation function that validates inputs and returns results with status messages.
The oracle should return a tuple: (result, is_valid, message)

Example oracle structure:
def oracle_average_calculator(numbers: list) -> tuple:
    '''Test oracle for average calculation function.
    Returns tuple: (result, is_valid, message)
    '''
    # Input validation
    if numbers is None:
        return None, False, "Input cannot be None"

    if not isinstance(numbers, list):
        return None, False, "Input must be a list"

    if not numbers:
        return None, True, "Valid empty list"

    if not all(isinstance(x, (int, float)) for x in numbers):
        return None, False, "All elements must be numbers"

    # Calculate result
    result = sum(numbers) / len(numbers)
    return result, True, "Valid calculation"

Generate the complete oracle function following this pattern."""

# Generate oracle using SemCoder
generated_oracle = semcoder.generate_code(oracle_prompt, max_new_tokens=4096)
print("Generated Oracle:")
print(generated_oracle)

# Clean and validate the oracle
cleaned_oracle = clean_generated_code(generated_oracle)
print("\nCleaned Oracle:")
print(cleaned_oracle)

# Test the oracle
try:
    # exec at cell level defines oracle_average_calculator in the globals
    exec(cleaned_oracle)
    print("\nTesting oracle with sample cases:")
    test_cases = [
        ([1, 2, 3], "normal case"),
        ([], "empty list"),
        (None, "None input"),
        ("not a list", "invalid type"),
        ([1, "a", 2], "mixed types")
    ]

    for input_case, desc in test_cases:
        # Call the oracle directly. Formatting the input into an eval()
        # string drops the quotes of str inputs ("not a list" became the
        # invalid expression `not a list`) and aborted the remaining cases.
        result = oracle_average_calculator(input_case)
        print(f"\n{desc}: {result}")

except Exception as e:
    print(f"Error testing oracle: {str(e)}")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Oracle:

Create a test oracle function for the average calculation function that validates inputs and returns results with status messages.
The oracle should return a tuple: (result, is_valid, message)

Example oracle structure:
def oracle_average_calculator(numbers: list) -> tuple:
    '''Test oracle for average calculation function.
    Returns tuple: (result, is_valid, message)
    '''
    # Input validation
    if numbers is None:
        return None, False, "Input cannot be None"
    
    if not isinstance(numbers, list):
        return None, False, "Input must be a list"
        
    if not numbers:
        return None, True, "Valid empty list"
        
    if not all(isinstance(x, (int, float)) for x in numbers):
        return None, False, "All elements must be numbers"
    
    # Calculate result
    result = sum(numbers) / len(numbers)
    return result, True, "Valid calculation"

Generate the complete oracle function following this pattern.


Cleaned Oracle:
def or

In [None]:
# Oracle Generation and Testing
oracle_prompt = """
Create a test oracle function for the average calculation function that validates inputs and returns results with status messages.
The oracle should return a tuple: (result, is_valid, message)

Example oracle structure:
def oracle_average_calculator(numbers: list) -> tuple:
    '''Test oracle for average calculation function.
    Returns tuple: (result, is_valid, message)
    '''
    # Input validation
    if numbers is None:
        return None, False, "Input cannot be None"

    if not isinstance(numbers, list):
        return None, False, "Input must be a list"

    if not numbers:
        return None, True, "Valid empty list"

    if not all(isinstance(x, (int, float)) for x in numbers):
        return None, False, "All elements must be numbers"

    # Calculate result
    result = sum(numbers) / len(numbers)
    return result, True, "Valid calculation"

Generate the complete oracle function following this pattern."""

# Generate and clean oracle
generated_oracle = semcoder.generate_code(oracle_prompt, max_new_tokens=4096)
cleaned_oracle = clean_generated_code(generated_oracle)

print("Generated Oracle:")
print(cleaned_oracle)

# Test the oracle with proper error handling
def test_oracle():
    """Execute the cleaned oracle source and compare its output to known answers.

    The source held in ``cleaned_oracle`` is exec'd into a private dict, the
    ``oracle_average_calculator`` function is pulled out of it, and each sample
    input is printed alongside the actual result, the expected tuple, and a
    check/cross mark. Failures are reported on stdout; nothing is returned.
    """
    # (input, expected (result, is_valid, message) tuple)
    expectations = [
        ([1, 2, 3], (2.0, True, "Valid calculation")),
        ([], (None, True, "Valid empty list")),
        (None, (None, False, "Input cannot be None")),
        ("not a list", (None, False, "Input must be a list")),
        ([1, "a", 2], (None, False, "All elements must be numbers")),
        ([1.5, 2.5, 3.5], (2.5, True, "Valid calculation")),
        ([0, 0, 0], (0.0, True, "Valid calculation")),
        ([1000000, 2000000], (1500000.0, True, "Valid calculation"))
    ]

    try:
        # Load the generated function into an isolated scope
        scope = {}
        exec(cleaned_oracle, scope)
        oracle = scope['oracle_average_calculator']

        print("\nTesting oracle with sample cases:")
        for sample, wanted in expectations:
            try:
                actual = oracle(sample)
                print(f"\nInput: {sample}")
                print(f"Result: {actual}")
                print(f"Expected: {wanted}")
                print(f"Match: {'✓' if actual == wanted else '✗'}")
            except Exception as case_error:
                # A single bad case is reported and the loop carries on
                print(f"\nError testing input {sample}: {str(case_error)}")

    except Exception as load_error:
        print(f"Error executing oracle: {str(load_error)}")

# Run the tests
test_oracle()

# Now let's combine oracle and test case generation
def generate_complete_test_suite():
    """Ask SemCoder for a suite mixing oracle checks with direct assertions.

    Returns:
        The model output after it has been passed through
        ``clean_generated_code``.
    """
    prompt = """
    Generate a complete test suite that uses both the oracle and direct assertions.

    Example:
    def test_average_calculation():
        # Test using oracle
        assert oracle_average_calculator([1, 2, 3]) == (2.0, True, "Valid calculation")
        assert oracle_average_calculator([]) == (None, True, "Valid empty list")

        # Test using direct assertions
        assert sample_function([1, 2, 3]) == 2.0
        assert sample_function([]) is None

        # Test error cases
        with pytest.raises(TypeError):
            sample_function(None)
    """

    raw_suite = semcoder.generate_code(prompt, max_new_tokens=4096)
    cleaned = clean_generated_code(raw_suite)
    return cleaned

# Generate and test the complete suite
print("\nGenerating complete test suite...")
complete_suite = generate_complete_test_suite()
print(complete_suite)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Oracle:
def oracle_average_calculator(numbers: list) -> tuple:
    '''Test oracle for average calculation function.
    Returns tuple: (result, is_valid, message)
    '''
    # Input validation
    if numbers is None:
        return None, False, "Input cannot be None"
    
    if not isinstance(numbers, list):
        return None, False, "Input must be a list"
        
    if not numbers:
        return None, True, "Valid empty list"
        
    if not all(isinstance(x, (int, float)) for x in numbers):
        return None, False, "All elements must be numbers"
    
    # Calculate result
    result = sum(numbers) / len(numbers)
    return result, True, "Valid calculation"

Testing oracle with sample cases:

Input: [1, 2, 3]
Result: (2.0, True, 'Valid calculation')
Expected: (2.0, True, 'Valid calculation')
Match: ✓

Input: []
Result: (None, True, 'Valid empty list')
Expected: (None, True, 'Valid empty list')
Match: ✓

Input: None
Result: (None, False, 'Input cannot be None')

In [None]:
# Combined Test Suite Generation
combined_prompt = """
Generate a complete test suite that uses both the oracle and direct assertions for the average calculator.
Include separate test functions for different categories and use both oracle and direct testing.

Example structure:
def test_normal_cases():
    # Oracle validation
    assert oracle_average_calculator([1, 2, 3]) == (2.0, True, "Valid calculation")
    assert oracle_average_calculator([-1, -2, -3]) == (-2.0, True, "Valid calculation")

    # Direct assertions
    assert sample_function([1, 2, 3]) == 2.0
    assert sample_function([-1, -2, -3]) == -2.0

def test_edge_cases():
    # Oracle validation
    assert oracle_average_calculator([]) == (None, True, "Valid empty list")
    assert oracle_average_calculator([5]) == (5.0, True, "Valid calculation")

    # Direct assertions
    assert sample_function([]) is None
    assert sample_function([5]) == 5.0

def test_error_cases():
    # Oracle validation
    assert oracle_average_calculator(None) == (None, False, "Input cannot be None")
    assert oracle_average_calculator("not a list") == (None, False, "Input must be a list")

    # Direct assertions
    with pytest.raises(TypeError):
        sample_function(None)
    with pytest.raises(TypeError):
        sample_function("not a list")

Generate complete test functions following this pattern, covering all test cases."""

# Generate and test the combined suite
generated_suite = semcoder.generate_code(combined_prompt, max_new_tokens=4096)
cleaned_suite = clean_generated_code(generated_suite)

print("Generated Test Suite:")
print(cleaned_suite)

# Execute the combined test suite
# NOTE(review): exec() runs model-generated code; only execute output you are
# prepared to run unsandboxed.
print("\nExecuting test suite...")
try:
    # Create namespace with both functions
    namespace = {}
    # The generated tests use pytest.raises; expose pytest to them when the
    # notebook has already imported it.
    if "pytest" in globals():
        namespace["pytest"] = globals()["pytest"]
    exec(cleaned_oracle, namespace)  # Add oracle function
    exec("""
def sample_function(numbers: list) -> float:
    if numbers is None:
        raise TypeError("Input cannot be None")
    if not isinstance(numbers, list):
        raise TypeError("Input must be a list")
    if not numbers:
        return None
    if not all(isinstance(x, (int, float)) for x in numbers):
        raise TypeError("All elements must be numbers")
    return sum(numbers) / len(numbers)
    """, namespace)  # Add sample function

    # Executing the suite source only *defines* the test functions ...
    exec(cleaned_suite, namespace)

    # ... so they must be called explicitly for any assertion to run.
    test_names = [
        name for name, obj in namespace.items()
        if name.startswith("test_") and callable(obj)
    ]
    if not test_names:
        raise RuntimeError("No test functions were found in the generated suite")
    for test_name in test_names:
        print(f"Running {test_name}...")
        namespace[test_name]()
    print("✓ All tests passed successfully!")
except Exception as e:
    print(f"✗ Test failed: {str(e)}")
    import traceback
    traceback.print_exc()

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Test Suite:
def test_normal_cases():
    # Oracle validation
    assert oracle_average_calculator([1, 2, 3]) == (2.0, True, "Valid calculation")
    assert oracle_average_calculator([-1, -2, -3]) == (-2.0, True, "Valid calculation")
    
    # Direct assertions
    assert sample_function([1, 2, 3]) == 2.0
    assert sample_function([-1, -2, -3]) == -2.0

def test_edge_cases():
    # Oracle validation
    assert oracle_average_calculator([]) == (None, True, "Valid empty list")
    assert oracle_average_calculator([5]) == (5.0, True, "Valid calculation")
    
    # Direct assertions
    assert sample_function([]) is None
    assert sample_function([5]) == 5.0

def test_error_cases():
    # Oracle validation
    assert oracle_average_calculator(None) == (None, False, "Input cannot be None")
    assert oracle_average_calculator("not a list") == (None, False, "Input must be a list")
    
    # Direct assertions
    with pytest.raises(TypeError):
        sample_function(None)
    w

In [None]:
from datasets import load_dataset

# Examine a HumanEval problem
# The problems are read from the dataset's 'test' split; each row is indexed
# like a dict with 'prompt', 'entry_point' and 'canonical_solution' keys.
dataset = load_dataset("openai_humaneval")
example_problem = dataset['test'][0]  # Get first problem

# Show the function stub, the name of the function under test, and the
# reference body for that first problem.
print("Example HumanEval Problem:")
print("Prompt:", example_problem['prompt'])
print("\nEntry Point:", example_problem['entry_point'])
print("\nCanonical Solution:", example_problem['canonical_solution'])

Example HumanEval Problem:
Prompt: from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """


Entry Point: has_close_elements

Canonical Solution:     for idx, elem in enumerate(numbers):
        for idx2, elem2 in enumerate(numbers):
            if idx != idx2:
                distance = abs(elem - elem2)
                if distance < threshold:
                    return True

    return False



In [None]:
class HumanEvalTestGenerator:
    def __init__(self, semcoder_model):
        """Keep a reference to the model used for generation.

        Args:
            semcoder_model: Object whose ``generate_code(prompt)`` method is
                called by ``generate_tests``.
        """
        self.model = semcoder_model

    def generate_tests(self, problem_prompt: str, entry_point: str):
        """Prompt the model for an oracle plus test functions for one problem.

        The prompt embeds ``problem_prompt`` followed by a fixed worked example
        (oracle, normal, edge and error tests) that is written specifically for
        ``has_close_elements``.

        Args:
            problem_prompt: Source text of the function under test.
            entry_point: Name of the function under test. It is accepted for
                interface symmetry but is not referenced in this method.

        Returns:
            The model output after filtering by ``clean_and_validate``.
        """
        test_prompt = f"""
Generate test cases and an oracle for this function:

{problem_prompt}

Create:
1. An oracle function that validates inputs and returns (result, is_valid, message)
2. Comprehensive test cases covering:
   - Normal cases (using examples from docstring)
   - Edge cases (empty list, single element, identical elements)
   - Error cases (None input, invalid types)

Example test structure:
def oracle_has_close_elements(numbers: List[float], threshold: float) -> tuple:
    '''Oracle for has_close_elements function.
    Returns: (result, is_valid, message)
    '''
    if numbers is None:
        return None, False, "Input list cannot be None"
    if not isinstance(numbers, list):
        return None, False, "First argument must be a list"
    if not isinstance(threshold, (int, float)):
        return None, False, "Threshold must be numeric"
    if threshold < 0:
        return None, False, "Threshold cannot be negative"

    # Check elements
    if not all(isinstance(x, (int, float)) for x in numbers):
        return None, False, "All elements must be numeric"

    # Compute result
    for i, elem in enumerate(numbers):
        for j, elem2 in enumerate(numbers):
            if i != j and abs(elem - elem2) < threshold:
                return True, True, "Found close elements"
    return False, True, "No close elements found"

def test_normal_cases():
    # Test cases from docstring
    assert has_close_elements([1.0, 2.0, 3.0], 0.5) == False
    assert has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3) == True

    # Additional normal cases
    assert has_close_elements([1.0, 1.5, 2.0], 0.7) == True
    assert has_close_elements([10.0, 20.0, 30.0], 5.0) == False

def test_edge_cases():
    # Empty list
    assert has_close_elements([], 1.0) == False
    # Single element
    assert has_close_elements([1.0], 0.5) == False
    # Identical elements
    assert has_close_elements([2.0, 2.0], 0.1) == True
    # Zero threshold
    assert has_close_elements([1.0, 2.0], 0.0) == False

def test_error_cases():
    with pytest.raises(TypeError):
        has_close_elements(None, 1.0)
    with pytest.raises(TypeError):
        has_close_elements([1, "2", 3], 1.0)
    with pytest.raises(TypeError):
        has_close_elements([1, 2, 3], "0.5")

Generate complete test functions and oracle following this pattern."""

        # Generate tests using SemCoder
        generated_code = self.model.generate_code(test_prompt)
        return self.clean_and_validate(generated_code)

    def clean_and_validate(self, generated_code: str) -> str:
        """Clean up generated code to extract only the functions."""
        if not generated_code:
            return None

        lines = generated_code.split('\n')
        cleaned_lines = []
        in_function = False

        for line in lines:
            if line.strip().startswith('def '):
                in_function = True
                cleaned_lines.append(line)
            elif in_function and (line.startswith('    ') or not line.strip()):
                cleaned_lines.append(line)
            elif in_function and line.strip() and not line.startswith('    '):
                in_function = False
                cleaned_lines.append('')

        return '\n'.join(cleaned_lines).strip()

# Test the generator
test_generator = HumanEvalTestGenerator(semcoder)
generated_tests = test_generator.generate_tests(example_problem['prompt'], example_problem['entry_point'])

print("Generated Tests:")
print(generated_tests)

# If tests are generated, try executing them
# NOTE(review): exec() runs model-generated code; only execute output you are
# prepared to run unsandboxed.
if generated_tests:
    print("\nExecuting tests...")
    try:
        # Use a scratch namespace so generated definitions cannot overwrite
        # names that live in the notebook itself.
        test_namespace = {}
        # The generated tests use pytest.raises; expose pytest to them when the
        # notebook has already imported it.
        if "pytest" in globals():
            test_namespace["pytest"] = globals()["pytest"]

        # Define the original function (this also runs the prompt's imports)
        exec(example_problem['prompt'] + example_problem['canonical_solution'], test_namespace)
        reference_impl = test_namespace[example_problem['entry_point']]

        # Execute the generated code; this only *defines* the test functions
        exec(generated_tests, test_namespace)

        # The generated code may redefine the entry point (e.g. an echoed
        # docstring-only stub), so put the real implementation back.
        test_namespace[example_problem['entry_point']] = reference_impl

        # Call every generated test so its assertions actually run
        test_names = [
            name for name, obj in test_namespace.items()
            if name.startswith('test_') and callable(obj)
        ]
        if not test_names:
            raise RuntimeError("No test functions found in generated code")
        for test_name in test_names:
            print(f"Running {test_name}...")
            test_namespace[test_name]()
        print("✓ All tests passed!")
    except Exception as e:
        print(f"✗ Test execution failed: {str(e)}")
        import traceback
        traceback.print_exc()

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Tests:
def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """



def oracle_has_close_elements(numbers: List[float], threshold: float) -> tuple:
    '''Oracle for has_close_elements function.
    Returns: (result, is_valid, message)
    '''
    if numbers is None:
        return None, False, "Input list cannot be None"
    if not isinstance(numbers, list):
        return None, False, "First argument must be a list"
    if not isinstance(threshold, (int, float)):
        return None, False, "Threshold must be numeric"
    if threshold < 0:
        return None, False, "Threshold cannot be negative"
    
    # Check elements
    if not all(isinstance(x, (int, float)) for x in numbers):
        return None, False, "Al

In [None]:
class HumanEvalPlusTestGenerator:
    def __init__(self, semcoder_model):
        """Store the generation model and load the HumanEval dataset.

        Args:
            semcoder_model: Object whose ``generate_code(prompt)`` method is
                called by ``generate_plus_tests``.
        """
        self.model = semcoder_model
        # NOTE(review): no method visible in this class reads self.dataset;
        # confirm it is needed before paying for the load on every construction.
        self.dataset = load_dataset("openai_humaneval")

    def generate_plus_tests(self, problem_prompt: str, entry_point: str):
        """Prompt the model for an enhanced oracle, enhanced tests and performance tests.

        ``entry_point`` is interpolated into the example function names and
        calls. The example signatures and arguments are fixed to a
        ``(numbers, threshold)`` shape regardless of the problem supplied.

        Args:
            problem_prompt: Source text of the function under test.
            entry_point: Name of the function under test.

        Returns:
            The model output after filtering by ``clean_and_validate``.
        """
        # Doubled braces ({{ }}) below are f-string escapes and reach the
        # model as literal single braces.
        test_prompt = f"""
Generate enhanced test cases and oracle for this function:

{problem_prompt}

Create test cases that go beyond basic testing. Include:

1. An enhanced oracle function:
def oracle_{entry_point}(numbers: List[float], threshold: float) -> tuple:
    '''Enhanced oracle with additional validations'''
    # Basic validation
    if numbers is None or not isinstance(numbers, list):
        return None, False, "Invalid input list"
    if not isinstance(threshold, (int, float)) or threshold < 0:
        return None, False, "Invalid threshold"

    # Enhanced validation
    try:
        if any(not isinstance(x, (int, float)) for x in numbers):
            return None, False, "Non-numeric elements in list"

        # Compute result
        for i in range(len(numbers)):
            for j in range(len(numbers)):
                if i != j and abs(numbers[i] - numbers[j]) < threshold:
                    return True, True, "Found close elements"
        return False, True, "No close elements found"
    except Exception as e:
        return None, False, f"Computation error: {{str(e)}}"

2. Enhanced test cases:
def test_{entry_point}_enhanced():
    # Large input tests
    assert {entry_point}(list(range(100)), 0.5) == False
    assert {entry_point}([i * 0.1 for i in range(50)], 0.05) == True

    # Boundary tests
    assert {entry_point}([], 1.0) == False
    assert {entry_point}([1.0], 0.5) == False
    assert {entry_point}([1.0, 1.0], 0.1) == True

    # Special cases
    with pytest.raises(TypeError):
        {entry_point}(None, 1.0)
    with pytest.raises(TypeError):
        {entry_point}([1.0, None, 2.0], 0.5)
    with pytest.raises(TypeError):
        {entry_point}([1.0, 2.0], None)

3. Performance tests:
def test_{entry_point}_performance():
    # Large lists
    large_list = [i * 0.5 for i in range(1000)]
    assert {entry_point}(large_list, 0.25) == True

    # Sparse lists
    sparse_list = [i * 100.0 for i in range(100)]
    assert {entry_point}(sparse_list, 1.0) == False

Generate complete test functions following this pattern with comprehensive coverage."""

        # Generate tests using SemCoder
        generated_code = self.model.generate_code(test_prompt)
        return self.clean_and_validate(generated_code)

    def clean_and_validate(self, generated_code: str) -> str:
        if not generated_code:
            return None

        lines = generated_code.split('\n')
        cleaned_lines = []
        in_function = False

        for line in lines:
            if line.strip().startswith('def '):
                in_function = True
                cleaned_lines.append(line)
            elif in_function and (line.startswith('    ') or not line.strip()):
                cleaned_lines.append(line)
            elif in_function and line.strip() and not line.startswith('    '):
                in_function = False
                cleaned_lines.append('')

        return '\n'.join(cleaned_lines).strip()

    def evaluate_plus_coverage(self, generated_tests: str) -> dict:
        """Evaluate HumanEval+ specific test coverage"""
        metrics = {
            'has_oracle': bool(re.search(r'def oracle_.*', generated_tests)),
            'has_enhanced_tests': bool(re.search(r'def test_.*_enhanced', generated_tests)),
            'has_performance_tests': bool(re.search(r'def test_.*_performance', generated_tests)),
            'large_inputs': bool(re.search(r'range\(\d{2,}\)', generated_tests)),
            'boundary_tests': bool(re.search(r'assert.*\[\]|assert.*\[1\.0\]', generated_tests)),
            'error_handling': bool(re.search(r'pytest\.raises', generated_tests))
        }

        print("\nDetailed test analysis:")
        for metric, present in metrics.items():
            print(f"{metric}: {'✓' if present else '✗'}")

        return metrics

# Test the generator
# Build the HumanEval+ generator and ask it for tests for the example problem.
test_generator = HumanEvalPlusTestGenerator(semcoder)
generated_tests = test_generator.generate_plus_tests(
    example_problem['prompt'],
    example_problem['entry_point']
)

print("Generated HumanEval+ Tests:")
print(generated_tests)

# Only analyse coverage when the generator returned something non-empty.
if generated_tests:
    coverage_metrics = test_generator.evaluate_plus_coverage(generated_tests)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated HumanEval+ Tests:
def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """



def oracle_has_close_elements(numbers: List[float], threshold: float) -> tuple:
    '''Enhanced oracle with additional validations'''
    # Basic validation
    if numbers is None or not isinstance(numbers, list):
        return None, False, "Invalid input list"
    if not isinstance(threshold, (int, float)) or threshold < 0:
        return None, False, "Invalid threshold"
    
    # Enhanced validation
    try:
        if any(not isinstance(x, (int, float)) for x in numbers):
            return None, False, "Non-numeric elements in list"
        
        # Compute result
        for i in range(len(numbers)):
            for j in range(len

In [None]:
class HumanEvalLargeTestGenerator:
    def __init__(self, semcoder_model):
        """Keep a reference to the model used for generation.

        Args:
            semcoder_model: Object whose ``generate_code(prompt)`` method is
                called by ``generate_test_batch``.
        """
        self.model = semcoder_model

    def generate_test_batch(self, problem_prompt: str, entry_point: str, category: str, num_cases: int):
        """Generate a batch of tests for a specific category

        Args:
            problem_prompt: Source text of the function under test.
            entry_point: Name of the function under test; interpolated into the
                requested test names and example calls.
            category: ``'normal'``, ``'edge'`` or ``'performance'``. Any other
                value selects the error-case prompt.
            num_cases: Number of test cases requested in the prompt text.

        Returns:
            The model output after filtering by ``clean_and_validate``.
        """

        # NOTE(review): every prompt line below carries the four-space
        # indentation of this method, so the model sees indented instructions
        # and indented example functions.
        if category == 'normal':
            test_prompt = f"""
    Generate {num_cases} test cases for this function:

    {problem_prompt}

    Each test should follow this exact pattern:
    def test_{entry_point}_normal_N():  # where N is the test number
        # Comment describing the test case
        assert {entry_point}([list of numbers], threshold) == expected_result

    Example test cases (DO NOT REPEAT THESE, generate new ones):
    def test_{entry_point}_normal_1():
        # Test small positive numbers
        assert {entry_point}([1.0, 2.0, 3.0], 0.5) == False

    def test_{entry_point}_normal_2():
        # Test medium range numbers
        assert {entry_point}([10.0, 20.0, 30.0], 5.0) == False

    Generate {num_cases} DIFFERENT test cases with:
    - Different list sizes
    - Different number ranges
    - Different thresholds
    - Different expected results
    Start numbering from test_{entry_point}_normal_5
    """

        elif category == 'edge':
            test_prompt = f"""
    Generate {num_cases} edge case tests for this function:

    {problem_prompt}

    Each test should follow this exact pattern:
    def test_{entry_point}_edge_N():  # where N is the test number
        # Comment describing the edge case
        assert {entry_point}([list of numbers], threshold) == expected_result

    Example edge cases (DO NOT REPEAT THESE, generate new ones):
    def test_{entry_point}_edge_1():
        # Test empty list
        assert {entry_point}([], 1.0) == False

    def test_{entry_point}_edge_2():
        # Test single element
        assert {entry_point}([5.0], 1.0) == False

    Generate {num_cases} DIFFERENT edge cases testing:
    - Extreme values
    - Boundary conditions
    - Special numeric cases
    Start numbering from test_{entry_point}_edge_5
    """

        elif category == 'performance':
            test_prompt = f"""
    Generate {num_cases} performance test cases for this function:

    {problem_prompt}

    Each test should follow this exact pattern:
    def test_{entry_point}_perf_N():  # where N is the test number
        # Comment describing the performance test
        assert {entry_point}([large list generation], threshold) == expected_result

    Example performance test (DO NOT REPEAT THIS, generate new ones):
    def test_{entry_point}_perf_1():
        # Test large sequential list
        assert {entry_point}([i * 0.1 for i in range(1000)], 0.05) == True

    Generate {num_cases} DIFFERENT performance tests with:
    - Different list sizes (1000+ elements)
    - Different patterns
    - Different thresholds
    Start numbering from test_{entry_point}_perf_4
    """

        else:  # error cases
            test_prompt = f"""
    Generate {num_cases} error test cases for this function:

    {problem_prompt}

    Each test should follow this exact pattern:
    def test_{entry_point}_error_N():  # where N is the test number
        # Comment describing the error case
        with pytest.raises(TypeError):
            {entry_point}(invalid_input, threshold)

    Example error test (DO NOT REPEAT THIS, generate new ones):
    def test_{entry_point}_error_1():
        # Test None input
        with pytest.raises(TypeError):
            {entry_point}(None, 1.0)

    Generate {num_cases} DIFFERENT error tests with:
    - Different invalid inputs
    - Different error conditions
    - Different invalid types
    Start numbering from test_{entry_point}_error_4
    """

        # Send the selected prompt to the model and keep only test functions.
        generated_code = self.model.generate_code(test_prompt)
        return self.clean_and_validate(generated_code)

    def generate_large_test_suite(self, problem_prompt: str, entry_point: str, num_cases: int = 100):
        """Generate complete test suite with distributed test cases"""
        # Calculate number of tests per category
        normal_cases = int(num_cases * 0.4)  # 40%
        edge_cases = int(num_cases * 0.3)    # 30%
        performance_cases = int(num_cases * 0.2)  # 20%
        error_cases = int(num_cases * 0.1)   # 10%

        # Generate tests for each category
        test_parts = []

        print(f"Generating {normal_cases} normal cases...")
        normal_tests = self.generate_test_batch(problem_prompt, entry_point, 'normal', normal_cases)
        if normal_tests:
            test_parts.append(normal_tests)

        print(f"Generating {edge_cases} edge cases...")
        edge_tests = self.generate_test_batch(problem_prompt, entry_point, 'edge', edge_cases)
        if edge_tests:
            test_parts.append(edge_tests)

        print(f"Generating {performance_cases} performance cases...")
        perf_tests = self.generate_test_batch(problem_prompt, entry_point, 'performance', performance_cases)
        if perf_tests:
            test_parts.append(perf_tests)

        print(f"Generating {error_cases} error cases...")
        error_tests = self.generate_test_batch(problem_prompt, entry_point, 'error', error_cases)
        if error_tests:
            test_parts.append(error_tests)

        # Combine all test parts
        combined_tests = "\n\n".join(filter(None, test_parts))
        return combined_tests

    def clean_and_validate(self, generated_code: str) -> str:
        if not generated_code:
            return None

        lines = generated_code.split('\n')
        cleaned_lines = []
        in_function = False

        for line in lines:
            if line.strip().startswith('def test_'):
                in_function = True
                cleaned_lines.append(line)
            elif in_function and (line.startswith('    ') or not line.strip()):
                cleaned_lines.append(line)
            elif in_function and line.strip() and not line.startswith('    '):
                in_function = False
                cleaned_lines.append('')

        return '\n'.join(cleaned_lines).strip()

    def evaluate_large_suite(self, generated_tests: str) -> dict:
        metrics = {
            'total_tests': len(re.findall(r'def test_', generated_tests)),
            'normal_cases': len(re.findall(r'test_\w+_normal_\d+', generated_tests)),
            'edge_cases': len(re.findall(r'test_\w+_edge_\d+', generated_tests)),
            'performance_cases': len(re.findall(r'test_\w+_perf_\d+', generated_tests)),
            'error_cases': len(re.findall(r'test_\w+_error_\d+', generated_tests)),
            'unique_assertions': len(set(re.findall(r'assert.*==.*', generated_tests)))
        }

        total = metrics['total_tests'] or 1
        metrics.update({
            'normal_percentage': (metrics['normal_cases'] / total) * 100,
            'edge_percentage': (metrics['edge_cases'] / total) * 100,
            'performance_percentage': (metrics['performance_cases'] / total) * 100,
            'error_percentage': (metrics['error_cases'] / total) * 100
        })

        return metrics

# Test the generator
# Build the large-suite generator and request 100 cases for the example problem.
test_generator = HumanEvalLargeTestGenerator(semcoder)
print("Generating complete test suite...")
generated_tests = test_generator.generate_large_test_suite(
    example_problem['prompt'],
    example_problem['entry_point'],
    num_cases=100
)

print("\nGenerated Test Suite:")
print(generated_tests)

# Only compute metrics when the suite is non-empty.
if generated_tests:
    metrics = test_generator.evaluate_large_suite(generated_tests)
    print("\nTest Suite Metrics:")
    for metric, value in metrics.items():
        # Percentage metrics are shown with one decimal; counts are shown as-is.
        if 'percentage' in metric:
            print(f"{metric}: {value:.1f}%")
        else:
            print(f"{metric}: {value}")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generating complete test suite...
Generating 40 normal cases...


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generating 30 edge cases...


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generating 20 performance cases...


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generating 10 error cases...

Generated Test Suite:
def test_has_close_elements_normal_N():  # where N is the test number
        # Comment describing the test case
        assert has_close_elements([list of numbers], threshold) == expected_result

    Example test cases (DO NOT REPEAT THESE, generate new ones):
    def test_has_close_elements_normal_1():
        # Test small positive numbers
        assert has_close_elements([1.0, 2.0, 3.0], 0.5) == False

    def test_has_close_elements_normal_2():
        # Test medium range numbers
        assert has_close_elements([10.0, 20.0, 30.0], 5.0) == False

    Generate 40 DIFFERENT test cases with:
    - Different list sizes
    - Different number ranges
    - Different thresholds
    - Different expected results
    Start numbering from test_has_close_elements_normal_5
    
    #### Understanding the Function

def test_has_close_elements_edge_N():  # where N is the test number
        # Comment describing the edge case
        assert has

In [None]:
def generate_humaneval_tests(num_total_tests=100):
    """Generate tests for HumanEval problems until a target count is reached.

    Problems from the ``test`` split are processed in order. For each one a
    prompt is built, sent to ``semcoder``, and the reply is cleaned with
    ``evaluator.clean_generated_code``. Every ``def test_`` occurrence in the
    cleaned text counts as one generated test. Problems whose generation raises
    are reported and skipped.

    NOTE(review): the prompt embeds the dataset row's ``test`` field verbatim.
    That looks like the benchmark's own reference checks, so the model may
    simply echo them back - confirm this is intended.

    Args:
        num_total_tests: Stop once at least this many tests were generated.

    Returns:
        Tuple ``(results, total_tests_generated)`` where ``results`` is a list
        of dicts with keys ``problem_id``, ``entry_point``, ``tests`` and
        ``num_tests``.
    """
    problems = load_dataset("openai_humaneval")['test']
    results = []
    total_tests_generated = 0

    for i, problem in enumerate(problems):
        if total_tests_generated >= num_total_tests:
            break

        prompt = problem['prompt']
        entry_point = problem['entry_point']

        print(f"\nProblem {i}: {entry_point}")
        print("Original prompt:")
        print(prompt)

        # Prompt asking the model for test cases
        test_prompt = f"""
Generate test cases for this function:

{prompt}

Format each test case as:
def test_{entry_point}_case_N():
    # Test description
    assert {entry_point}(input_args) == expected_output

Example:
def test_{entry_point}_case_1():
    # Basic test case
    {problem['test']}
"""

        try:
            # Generate, then keep only the cleaned code
            raw_output = semcoder.generate_code(test_prompt)
            cleaned_tests = evaluator.clean_generated_code(raw_output)

            if not cleaned_tests:
                print("No valid tests generated")
            else:
                num_tests = len(re.findall(r'def test_', cleaned_tests))
                total_tests_generated += num_tests

                results.append({
                    'problem_id': i,
                    'entry_point': entry_point,
                    'tests': cleaned_tests,
                    'num_tests': num_tests
                })

                print(f"Generated {num_tests} tests")
                print(f"Total tests so far: {total_tests_generated}/{num_total_tests}")
                print("\nGenerated tests:")
                print(cleaned_tests)

        except Exception as e:
            print(f"Error generating tests: {str(e)}")
            continue

        if total_tests_generated >= num_total_tests:
            print(f"\nReached target of {num_total_tests} tests")
            break

    return results, total_tests_generated

# Generate tests
# Ask for 100 tests in total; the function returns the per-problem records
# together with the number of tests counted.
print("Generating tests for HumanEval problems...")
results, total_tests = generate_humaneval_tests(100)

# Print summary
print("\nFinal Results:")
print(f"Total tests generated: {total_tests}")
print("\nBreakdown by problem:")
# One line per problem for which tests were recorded.
for result in results:
    print(f"Problem {result['problem_id']} ({result['entry_point']}): {result['num_tests']} tests")

Generating tests for HumanEval problems...


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Problem 0: has_close_elements
Original prompt:
from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated 2 tests
Total tests so far: 2/100

Generated tests:
def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """



def test_has_close_elements_case_N():
    # Test description
    assert has_close_elements(input_args) == expected_output


def test_has_close_elements_case_1():
    # Basic test case
    


def check(candidate):
    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True
    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False
    assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True
    assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False
    assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True
    assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True
    assert cand

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated 7 tests
Total tests so far: 9/100

Generated tests:
def separate_paren_groups(paren_string: str) -> List[str]:
    """ Input to this function is a string containing multiple groups of nested parentheses. Your goal is to
    separate those group into separate strings and return the list of those.
    Separate groups are balanced (each open brace is properly closed) and not nested within each other
    Ignore any spaces in the input string.
    >>> separate_paren_groups('( ) (( )) (( )( ))')
    ['()', '(())', '(()())']
    """



def test_separate_paren_groups_case_N():
    # Test description
    assert separate_paren_groups(input_args) == expected_output


def test_separate_paren_groups_case_1():
    # Basic test case
    


def check(candidate):
    assert candidate('(()()) ((())) () ((())()())') == [
        '(()())', '((()))', '()', '((())()())'
    ]
    assert candidate('() (()) ((())) (((())))') == [
        '()', '(())', '((()))', '(((())))'
    ]
    assert candidate(

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated 8 tests
Total tests so far: 17/100

Generated tests:
def truncate_number(number: float) -> float:
    """ Given a positive floating point number, it can be decomposed into
    and integer part (largest integer smaller than given number) and decimals
    (leftover part always smaller than 1).

    Return the decimal part of the number.
    >>> truncate_number(3.5)
    0.5
    """



def test_truncate_number_case_N():
    # Test description
    assert truncate_number(input_args) == expected_output


def test_truncate_number_case_1():
    # Basic test case
    


def check(candidate):
    assert candidate(3.5) == 0.5
    assert abs(candidate(1.33) - 0.33) < 1e-6
    assert abs(candidate(123.456) - 0.456) < 1e-6

    # Additional edge cases
    assert candidate(0.999) == 0.999
    assert candidate(0.001) == 0.001
    assert candidate(0.9) == 0.9

    # Handle negative numbers
    assert candidate(-1.5) == 0.5  # The function is not expected to handle negative numbers


def test_t

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated 9 tests
Total tests so far: 26/100

Generated tests:
def below_zero(operations: List[int]) -> bool:
    """ You're given a list of deposit and withdrawal operations on a bank account that starts with
    zero balance. Your task is to detect if at any point the balance of account fallls below zero, and
    at that point function should return True. Otherwise it should return False.
    >>> below_zero([1, 2, 3])
    False
    >>> below_zero([1, 2, -4, 5])
    True
    """



def test_below_zero_case_N():
    # Test description
    assert below_zero(input_args) == expected_output


def test_below_zero_case_1():
    # Basic test case
    


def check(candidate):
    assert candidate([]) == False
    assert candidate([1, 2, -3, 1, 2, -3]) == False
    assert candidate([1, 2, -4, 5, 6]) == True
    assert candidate([1, -1, 2, -2, 5, -5, 4, -4]) == False
    assert candidate([1, -1, 2, -2, 5, -5, 4, -5]) == True
    assert candidate([1, -2, 2, -2, 5, -5, 4, -4]) == True


def test_b

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated 5 tests
Total tests so far: 31/100

Generated tests:
def mean_absolute_deviation(numbers: List[float]) -> float:
    """ For a given list of input numbers, calculate Mean Absolute Deviation
    around the mean of this dataset.
    Mean Absolute Deviation is the average absolute difference between each
    element and a centerpoint (mean in this case):
    MAD = average | x - x_mean |
    >>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])
    1.0
    """



def test_mean_absolute_deviation_case_N():
    # Test description
    assert mean_absolute_deviation(input_args) == expected_output


def test_mean_absolute_deviation_case_1():
    # Basic test case
    


def check(candidate):
    assert abs(candidate([1.0, 2.0, 3.0]) - 2.0/3.0) < 1e-6
    assert abs(candidate([1.0, 2.0, 3.0, 4.0]) - 1.0) < 1e-6
    assert abs(candidate([1.0, 2.0, 3.0, 4.0, 5.0]) - 6.0/5.0) < 1e-6


def test_mean_absolute_deviation_case_1():
    # Basic test case
    assert mean_absolute_deviation([1.0, 2.0

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated 6 tests
Total tests so far: 37/100

Generated tests:
def intersperse(numbers: List[int], delimeter: int) -> List[int]:
    """ Insert a number 'delimeter' between every two consecutive elements of input list `numbers'
    >>> intersperse([], 4)
    []
    >>> intersperse([1, 2, 3], 4)
    [1, 4, 2, 4, 3]
    """



def test_intersperse_case_N():
    # Test description
    assert intersperse(input_args) == expected_output


def test_intersperse_case_1():
    # Basic test case
    


def check(candidate):
    assert candidate([], 7) == []
    assert candidate([5, 6, 3, 2], 8) == [5, 8, 6, 8, 3, 8, 2]
    assert candidate([2, 2, 2], 2) == [2, 2, 2, 2, 2]


def test_intersperse_case_1():
    # Basic test case
    assert intersperse([], 4) == []


def test_intersperse_case_2():
    # Insert delimeter between elements
    assert intersperse([1, 2, 3], 4) == [1, 4, 2, 4, 3]


def test_intersperse_case_3():
    # Test with a single element
    assert intersperse([1], 0) == [1]


def 

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated 5 tests
Total tests so far: 42/100

Generated tests:
def parse_nested_parens(paren_string: str) -> List[int]:
    """ Input to this function is a string represented multiple groups for nested parentheses separated by spaces.
    For each of the group, output the deepest level of nesting of parentheses.
    E.g. (()()) has maximum two levels of nesting while ((())) has three.

    >>> parse_nested_parens('(()()) ((())) () ((())()())')
    [2, 3, 1, 3]
    """



def test_parse_nested_parens_case_N():
    # Test description
    assert parse_nested_parens(input_args) == expected_output


def test_parse_nested_parens_case_1():
    # Basic test case
    


def check(candidate):
    assert candidate('(()()) ((())) () ((())()())') == [2, 3, 1, 3]
    assert candidate('() (()) ((())) (((())))') == [1, 2, 3, 4]
    assert candidate('(()(())((())))') == [4]

def test_parse_nested_parens_case_1():
    # Basic test case
    assert parse_nested_parens('(()()) ((())) () ((())()())') == [2,

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated 6 tests
Total tests so far: 48/100

Generated tests:
def filter_by_substring(strings: List[str], substring: str) -> List[str]:
    """ Filter an input list of strings only for ones that contain given substring
    >>> filter_by_substring([], 'a')
    []
    >>> filter_by_substring(['abc', 'bacd', 'cde', 'array'], 'a')
    ['abc', 'bacd', 'array']
    """



def test_filter_by_substring_case_N():
    # Test description
    assert filter_by_substring(input_args) == expected_output


def test_filter_by_substring_case_1():
    # Basic test case
    


def check(candidate):
    assert candidate([], 'john') == []
    assert candidate(['xxx', 'asd', 'xxy', 'john doe', 'xxxAAA', 'xxx'], 'xxx') == ['xxx', 'xxxAAA', 'xxx']
    assert candidate(['xxx', 'asd', 'aaaxxy', 'john doe', 'xxxAAA', 'xxx'], 'xx') == ['xxx', 'aaaxxy', 'xxxAAA', 'xxx']
    assert candidate(['grunt', 'trumpet', 'prune', 'gruesome'], 'run') == ['grunt', 'prune']


def test_filter_by_substring_case_1():
    # Basic t

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated 7 tests
Total tests so far: 55/100

Generated tests:
def sum_product(numbers: List[int]) -> Tuple[int, int]:
    """ For a given list of integers, return a tuple consisting of a sum and a product of all the integers in a list.
    Empty sum should be equal to 0 and empty product should be equal to 1.
    >>> sum_product([])
    (0, 1)
    >>> sum_product([1, 2, 3, 4])
    (10, 24)
    """



def test_sum_product_case_N():
    # Test description
    assert sum_product(input_args) == expected_output


def test_sum_product_case_1():
    # Basic test case
    


def check(candidate):
    assert candidate([]) == (0, 1)
    assert candidate([1, 1, 1]) == (3, 1)
    assert candidate([100, 0]) == (100, 0)
    assert candidate([3, 5, 7]) == (3 + 5 + 7, 3 * 5 * 7)
    assert candidate([10]) == (10, 10)


def test_sum_product_case_1():
    # Basic test case
    assert sum_product([]) == (0, 1)


def test_sum_product_case_2():
    # Test case with numbers
    assert sum_product([1, 2, 3,

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated 8 tests
Total tests so far: 63/100

Generated tests:
def rolling_max(numbers: List[int]) -> List[int]:
    """ From a given list of integers, generate a list of rolling maximum element found until given moment
    in the sequence.
    >>> rolling_max([1, 2, 3, 2, 3, 4, 2])
    [1, 2, 3, 3, 3, 4, 4]
    """



def test_rolling_max_case_N():
    # Test description
    assert rolling_max(input_args) == expected_output


def test_rolling_max_case_1():
    # Basic test case
    


def check(candidate):
    assert candidate([]) == []
    assert candidate([1, 2, 3, 4]) == [1, 2, 3, 4]
    assert candidate([4, 3, 2, 1]) == [4, 4, 4, 4]
    assert candidate([3, 2, 3, 100, 3]) == [3, 3, 3, 100, 100]


def test_rolling_max_case_1():
    # Basic test case
    assert rolling_max([]) == []


def test_rolling_max_case_2():
    # Increasing sequence
    assert rolling_max([1, 2, 3, 4]) == [1, 2, 3, 4]


def test_rolling_max_case_3():
    # Decreasing sequence
    assert rolling_max([4, 3, 2,

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated 2 tests
Total tests so far: 65/100

Generated tests:
def is_palindrome(string: str) -> bool:
    """ Test if given string is a palindrome """
    return string == string[::-1]


def make_palindrome(string: str) -> str:
    """ Find the shortest palindrome that begins with a supplied string.
    Algorithm idea is simple:
    - Find the longest postfix of supplied string that is a palindrome.
    - Append to the end of the string reverse of a string prefix that comes before the palindromic suffix.
    >>> make_palindrome('')
    ''
    >>> make_palindrome('cat')
    'catac'
    >>> make_palindrome('cata')
    'catac'
    """



def test_make_palindrome_case_N():
    # Test description
    assert make_palindrome(input_args) == expected_output


def test_make_palindrome_case_1():
    # Basic test case
    


def check(candidate):
    assert candidate('') == ''
    assert candidate('x') == 'x'
    assert candidate('xyz') == 'xyzyx'
    assert candidate('xyx') == 'xyx'
    assert c

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated 3 tests
Total tests so far: 68/100

Generated tests:
def string_xor(a: str, b: str) -> str:
    """ Input are two strings a and b consisting only of 1s and 0s.
    Perform binary XOR on these inputs and return result also as a string.
    >>> string_xor('010', '110')
    '100'
    """



def test_string_xor_case_N():
    # Test description
    assert string_xor(input_args) == expected_output


def test_string_xor_case_1():
    # Basic test case
    


def check(candidate):
    assert candidate('111000', '101010') == '010010'
    assert candidate('1', '1') == '0'
    assert candidate('0101', '0000') == '0101'

def test_string_xor_case_1():
    assert string_xor('010', '110') == '100'

Problem 12: longest
Original prompt:
from typing import List, Optional


def longest(strings: List[str]) -> Optional[str]:
    """ Out of list of strings, return the longest one. Return the first one in case of multiple
    strings of the same length. Return None in case the input list is empty.


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated 8 tests
Total tests so far: 76/100

Generated tests:
def longest(strings: List[str]) -> Optional[str]:
    """ Out of list of strings, return the longest one. Return the first one in case of multiple
    strings of the same length. Return None in case the input list is empty.
    >>> longest([])

    >>> longest(['a', 'b', 'c'])
    'a'
    >>> longest(['a', 'bb', 'ccc'])
    'ccc'
    """



def test_longest_case_N():
    # Test description
    assert longest(input_args) == expected_output


def test_longest_case_1():
    # Basic test case
    


def check(candidate):
    assert candidate([]) == None
    assert candidate(['x', 'y', 'z']) == 'x'
    assert candidate(['x', 'yyy', 'zzzz', 'www', 'kkkk', 'abc']) == 'zzzz'


def test_longest_case_1():
    # Basic test case
    assert longest([]) == None


def test_longest_case_2():
    # Single string case
    assert longest(['a']) == 'a'


def test_longest_case_3():
    # Multiple strings, first one is longest
    assert longest

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated 7 tests
Total tests so far: 83/100

Generated tests:
def greatest_common_divisor(a: int, b: int) -> int:
    """ Return a greatest common divisor of two integers a and b
    >>> greatest_common_divisor(3, 5)
    1
    >>> greatest_common_divisor(25, 15)
    5
    """



def test_greatest_common_divisor_case_N():
    # Test description
    assert greatest_common_divisor(input_args) == expected_output


def test_greatest_common_divisor_case_1():
    # Basic test case
    


def check(candidate):
    assert candidate(3, 7) == 1
    assert candidate(10, 15) == 5
    assert candidate(49, 14) == 7
    assert candidate(144, 60) == 12


def greatest_common_divisor(a: int, b: int) -> int:
    while b != 0:
        a, b = b, a % b
    return a


def test_greatest_common_divisor_case_1():
    # Basic test case
    assert greatest_common_divisor(3, 5) == 1


def test_greatest_common_divisor_case_2():
    # Another basic test case
    assert greatest_common_divisor(25, 15) == 5


def test

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated 7 tests
Total tests so far: 90/100

Generated tests:
def all_prefixes(string: str) -> List[str]:
    """ Return list of all prefixes from shortest to longest of the input string
    >>> all_prefixes('abc')
    ['a', 'ab', 'abc']
    """



def test_all_prefixes_case_N():
    # Test description
    assert all_prefixes(input_args) == expected_output


def test_all_prefixes_case_1():
    # Basic test case
    


def check(candidate):
    assert candidate('') == []
    assert candidate('asdfgh') == ['a', 'as', 'asd', 'asdf', 'asdfg', 'asdfgh']
    assert candidate('WWW') == ['W', 'WW', 'WWW']

def test_all_prefixes_case_1():
    # Basic test case
    assert all_prefixes('') == []

def test_all_prefixes_case_2():
    # Test case for a single character string
    assert all_prefixes('a') == ['a']

def test_all_prefixes_case_3():
    # Test case for longer string
    assert all_prefixes('abc') == ['a', 'ab', 'abc']

def test_all_prefixes_case_4():
    # Test case for repeated charac

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated 5 tests
Total tests so far: 95/100

Generated tests:
def string_sequence(n: int) -> str:
    """ Return a string containing space-delimited numbers starting from 0 upto n inclusive.
    >>> string_sequence(0)
    '0'
    >>> string_sequence(5)
    '0 1 2 3 4 5'
    """



def test_string_sequence_case_N():
    # Test description
    assert string_sequence(input_args) == expected_output


def test_string_sequence_case_1():
    # Basic test case
    


def check(candidate):
    assert candidate(0) == '0'
    assert candidate(3) == '0 1 2 3'
    assert candidate(10) == '0 1 2 3 4 5 6 7 8 9 10'


def test_string_sequence_case_1():
    # Basic test case
    assert string_sequence(0) == '0'


def test_string_sequence_case_2():
    # Test for positive numbers
    assert string_sequence(5) == '0 1 2 3 4 5'


def test_string_sequence_case_3():
    # Test for negative numbers (though this function doesn't handle negative numbers, it can still return valid sequences)
    assert string_s

In [None]:
import re
def generate_humaneval_plus_tests(model_type, deep_seek_tokenizer=None, num_total_tests=100):
    """
    Generate enhanced test cases for HumanEval problems with the selected model.

    For every problem, the assert statements of the reference ``check`` function
    are extracted and embedded in a prompt asking for a performance test, an
    edge-case test and an error test. The model output is cleaned, the number of
    ``def test_`` definitions in it is counted, and the cleaned tests are both
    printed and appended to ``{model_type}_test_case_generation_results.txt``.

    Args:
        model_type: Backend to use, either ``"semcoder"`` or ``"deep_seek"``.
            Also used as the prefix of the results file name.
        deep_seek_tokenizer: Tokenizer forwarded to ``generate_code`` when
            ``model_type`` is ``"deep_seek"``; unused otherwise.
        num_total_tests: No further problems are processed once at least this
            many tests have been counted.

    Returns:
        A ``(results, total_tests_generated)`` tuple. ``results`` is a list of
        dicts with the keys ``problem_id``, ``entry_point``, ``tests`` and
        ``num_tests``.

    Raises:
        ValueError: If ``model_type`` is not one of the supported backends.
    """
    # Fail fast: an unknown backend would otherwise walk the whole dataset and
    # report "No valid tests generated" for every single problem.
    supported_models = ("semcoder", "deep_seek")
    if model_type not in supported_models:
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected one of {supported_models}"
        )

    dataset = load_dataset("openai_humaneval")
    results = []
    total_tests_generated = 0
    with open(f'{model_type}_test_case_generation_results.txt', 'w') as f:
        for i, problem in enumerate(dataset['test']):
            if total_tests_generated >= num_total_tests:
                break

            prompt = problem['prompt']
            entry_point = problem['entry_point']
            test_code = problem['test']

            # Extract working test cases: the body of check(candidate) up to the
            # first blank line, then every assert statement found in it.
            check_match = re.search(r'def check\(candidate\):\s*(.*?)(?=\n\n|$)', test_code, re.DOTALL)
            test_cases = re.findall(r'assert.*?(?=\n|$)', check_match.group(1) if check_match else '')

            # The prompt below indexes test_cases[0] and test_cases[-1]; without
            # any extracted assert that would raise IndexError and abort the run.
            if not test_cases:
                print(f"No reference asserts found for problem {i} ({entry_point}); skipping")
                continue

            test_prompt = f"""
Please provide executable test cases for this function:
{prompt}

Working test examples:
{test_cases}

Include these types of tests:
1. Performance test:
def test_{entry_point}_perf():
    {test_cases[0].replace('candidate', entry_point)}

2. Edge case test:
def test_{entry_point}_edge():
    {test_cases[-1].replace('candidate', entry_point)}

3. Error test:
def test_{entry_point}_error():
    with pytest.raises(TypeError):
        {entry_point}(None)

Only provide executable test cases. No placeholders."""

            try:
                if model_type == "semcoder":
                    generated_tests = semcoder.generate_code(test_prompt)
                    cleaned_tests = evaluator.clean_generated_code(generated_tests)
                else:  # "deep_seek" (validated above)
                    generated_tests = generate_code(model, deep_seek_tokenizer, test_prompt, max_new_tokens=4096)
                    cleaned_tests = clean_deepseek_generated_code(generated_tests)

                if cleaned_tests:
                    num_tests = len(re.findall(r'def test_', cleaned_tests))
                    total_tests_generated += num_tests

                    result = {
                        'problem_id': i,
                        'entry_point': entry_point,
                        'tests': cleaned_tests,
                        'num_tests': num_tests
                    }
                    results.append(result)

                    print(f"Generated {num_tests} enhanced tests")
                    print(f"Total tests so far: {total_tests_generated}/{num_total_tests}")
                    print("\nTest prompt:")
                    print(test_prompt)
                    print("\nGenerated tests:")
                    print(generated_tests)
                    print("\nCleaned tests:")
                    print(cleaned_tests)

                    # This header layout is what extract_test_suites() splits on;
                    # keep it unchanged.
                    f.write(f"Generated {num_tests} enhanced tests\n")
                    f.write(f"Total tests so far: {total_tests_generated}/{num_total_tests}")
                    f.write("\nGenerated tests:\n")
                    f.write(cleaned_tests + "\n")
                else:
                    print("No valid tests generated")

            except Exception as e:
                # Best effort: a failure on one problem must not stop the run.
                print(f"Error generating tests: {str(e)}")
                continue

    return results, total_tests_generated

In [None]:
# Run enhanced test generation with the DeepSeek backend, targeting 100 tests.
print("Generating HumanEval+ test cases...")
plus_results, total_plus_tests = generate_humaneval_plus_tests(
    "deep_seek",
    deep_seek_tokenizer=tokenizer,
    num_total_tests=100,
)

Generating HumanEval+ test cases...
Generated 3 enhanced tests
Total tests so far: 3/100

Test prompt:

Please provide executable test cases for this function:
from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """


Working test examples:
['assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True', 'assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False', 'assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True', 'assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False', 'assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True', 'assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True', 'assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False']

Include these types of te

In [None]:
# Generate HumanEval+ tests
print("Generating HumanEval+ test cases...")
# The first positional parameter of generate_humaneval_plus_tests is `model_type`,
# so the bare call `generate_humaneval_plus_tests(100)` selected no backend at all.
# Name the backend explicitly and pass the target count by keyword.
# NOTE(review): assumes this cell is the SemCoder run (the DeepSeek run is the
# cell above) -- confirm.
plus_results, total_plus_tests = generate_humaneval_plus_tests("semcoder", num_total_tests=100)

# Print summary
print("\nHumanEval+ Results:")
print(f"Total enhanced tests generated: {total_plus_tests}")
print("\nBreakdown by problem:")
for result in plus_results:
    print(f"Problem {result['problem_id']} ({result['entry_point']}): {result['num_tests']} tests")

In [None]:
import re

def extract_test_suites(content: str) -> list[str]:
    """
    Extract test suites from the content and format them with function calls.
    Handles both standalone assert statements and function definitions.
    Returns a list of formatted test suite strings.
    """
    # Split content into test suite blocks
    test_blocks = re.split(r'Generated \d+ enhanced tests\nTotal tests so far: \d+/\d+\n+Generated tests:', content)

    # Remove empty blocks
    test_blocks = [block.strip() for block in test_blocks if block.strip()]

    formatted_suites = []
    for block in test_blocks:
        if "unittest.TestCase" in block:
          print("FORMATTED TEST SUITE:")
          print(block)
          formatted_suites.append(block)
          continue


        print("ORIGINAL TEST SUITE:")
        print(block)
        suite_parts = []

        # First, collect any imports at the start of the block
        import_statements = re.findall(r'^import [^\n]+', block, re.MULTILINE)

        # Extract function-based tests
        test_functions = re.finditer(r'def (test_\w+)\(\):\n((?:[ ]{4}.*\n?)+)', block)

        # Extract standalone assert statements (not within functions)
        # Looking for asserts that are at the start of a line and not indented
        standalone_asserts = re.finditer(r'^assert [^\n]+$', block, re.MULTILINE)

        # Extract standalone pytest.raises statements
        standalone_raises = re.finditer(r'^with pytest\.raises\([^\)]+\):\n[ ]{4}[^\n]+\n', block, re.MULTILINE)

        # Add imports if they exist
        if import_statements:
            suite_parts.extend(import_statements)
            suite_parts.append("")  # Add blank line after imports

        # Add standalone asserts
        for match in standalone_asserts:
            suite_parts.append(match.group(0))

        # Add standalone pytest.raises
        for match in standalone_raises:
            suite_parts.append(match.group(0).rstrip())

        # Add function-based tests
        for match in test_functions:
            func_name = match.group(1)
            func_body = match.group(2).rstrip()
            formatted_func = f"def {func_name}():\n{func_body}\n{func_name}()"
            suite_parts.append(formatted_func)

        if suite_parts:
            formatted_suite = "\n".join(suite_parts)
            print("FORMATTED TEST SUITE:")
            print(formatted_suite)
            print("-" * 50)
            formatted_suites.append(formatted_suite)

    return formatted_suites

def process_file_path(file_path: str) -> list[str]:
    """
    Read a results file from disk and extract its formatted test suites.

    Args:
        file_path: Location of the text file to read.

    Returns:
        Whatever ``extract_test_suites`` produces for the file's full contents.
    """
    with open(file_path, 'r') as handle:
        return extract_test_suites(handle.read())

def process_file_content(content: str) -> list[str]:
    """
    Extract formatted test suites from text that is already in memory.

    Thin wrapper around ``extract_test_suites`` for callers that hold the log
    contents rather than a file path.
    """
    suites = extract_test_suites(content)
    return suites

# Demo: feed one logged unittest-style suite through the extractor and print it.
if __name__ == "__main__":
    test_content = """Generated 3 enhanced tests
Total tests so far: 90/100

Generated tests:
import unittest

class TestTruncateNumber(unittest.TestCase):

    def test_truncate_number_perf(self):
        self.assertEqual(truncate_number(3.5), 0.5)

    def test_truncate_number_edge(self):
        self.assertAlmostEqual(truncate_number(123.456), 0.456, places=6)

    def test_truncate_number_error(self):
        with self.assertRaises(TypeError):
            truncate_number(None)

if __name__ == "__main__":
    unittest.main()
"""

    formatted_suites = process_file_content(test_content)

    # Number the suites from 1 and print each under its own heading.
    for i, suite in enumerate(formatted_suites, start=1):
        print(f"Test Suite {i}:", suite, sep="\n")

FORMATTED TEST SUITE:
import unittest

class TestTruncateNumber(unittest.TestCase):

    def test_truncate_number_perf(self):
        self.assertEqual(truncate_number(3.5), 0.5)

    def test_truncate_number_edge(self):
        self.assertAlmostEqual(truncate_number(123.456), 0.456, places=6)

    def test_truncate_number_error(self):
        with self.assertRaises(TypeError):
            truncate_number(None)

if __name__ == "__main__":
    unittest.main()
Test Suite 1:
import unittest

class TestTruncateNumber(unittest.TestCase):

    def test_truncate_number_perf(self):
        self.assertEqual(truncate_number(3.5), 0.5)

    def test_truncate_number_edge(self):
        self.assertAlmostEqual(truncate_number(123.456), 0.456, places=6)

    def test_truncate_number_error(self):
        with self.assertRaises(TypeError):
            truncate_number(None)

if __name__ == "__main__":
    unittest.main()


In [None]:
# Parse the saved DeepSeek generation log into a list of formatted test suite
# strings (process_file_path also prints every original/formatted suite).
extracted_test_suites = process_file_path("/content/deep_seek_test_case_generation_results.txt")

ORIGINAL TEST SUITE:
def test_has_close_elements_perf():
    assert has_close_elements([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True
def test_has_close_elements_edge():
    assert has_close_elements([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False
import pytest
def test_has_close_elements_error():
    with pytest.raises(TypeError):
        has_close_elements(None)
FORMATTED TEST SUITE:
import pytest

def test_has_close_elements_perf():
    assert has_close_elements([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True
test_has_close_elements_perf()
def test_has_close_elements_edge():
    assert has_close_elements([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False
test_has_close_elements_edge()
def test_has_close_elements_error():
    with pytest.raises(TypeError):
        has_close_elements(None)
test_has_close_elements_error()
--------------------------------------------------
ORIGINAL TEST SUITE:
import pytest
from typing import List

def separate_paren_groups(paren_string: str) -> List[str]:
    """ Input to t

In [None]:
# Dump every extracted suite, each followed by a 50-dash separator line.
for suite in extracted_test_suites:
    print(suite, "-" * 50, sep="\n")

import pytest

def test_has_close_elements_perf():
    assert has_close_elements([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True
test_has_close_elements_perf()
def test_has_close_elements_edge():
    assert has_close_elements([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False
test_has_close_elements_edge()
def test_has_close_elements_error():
    with pytest.raises(TypeError):
        has_close_elements(None)
test_has_close_elements_error()
--------------------------------------------------
import pytest

def test_separate_paren_groups_perf():
    assert separate_paren_groups('(()()) ((())) () ((())()())') == ['()', '(())', '(()())', '((()))', '(((())))']
test_separate_paren_groups_perf()
def test_separate_paren_groups_edge():
    assert separate_paren_groups('( ) (( )) (( )( ))') == ['()', '(())', '(()())']
test_separate_paren_groups_edge()
def test_separate_paren_groups_error():
    with pytest.raises(TypeError):
        separate_paren_groups(None)
test_separate_paren_groups_error()
-------------

In [None]:
# Keep only the first 35 suites, discarding anything after them. The parsed
# list currently holds one more entry than expected; the cause is unknown.
extracted_test_suites=extracted_test_suites[:35] # TODO: figure out why there's an extra entry

In [None]:
# Parse the saved SemCoder generation log into a list of formatted test suite
# strings (process_file_path also prints every original/formatted suite).
semcoder_extracted_test_suites = process_file_path("/content/SemCoder Test Case Generation.txt")

ORIGINAL TEST SUITE:
def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """



def test_has_close_elements_perf():
    assert has_close_elements([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True


def test_has_close_elements_edge():
    assert has_close_elements([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False


def test_has_close_elements_error():
    with pytest.raises(TypeError):
        has_close_elements(None)
FORMATTED TEST SUITE:
def test_has_close_elements_perf():
    assert has_close_elements([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True
test_has_close_elements_perf()
def test_has_close_elements_edge():
    assert has_close_elements([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False
test_has_close_elements_edge()
def test_has_close_elemen

In [None]:
# Dump every SemCoder suite, each followed by a 50-dash separator line.
for suite in semcoder_extracted_test_suites:
    print(suite, "-" * 50, sep="\n")

def test_has_close_elements_perf():
    assert has_close_elements([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True
test_has_close_elements_perf()
def test_has_close_elements_edge():
    assert has_close_elements([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False
test_has_close_elements_edge()
def test_has_close_elements_error():
    with pytest.raises(TypeError):
        has_close_elements(None)
test_has_close_elements_error()
--------------------------------------------------
def test_separate_paren_groups_perf():
    assert separate_paren_groups('(()()) ((())) () ((())()())') == [
test_separate_paren_groups_perf()
def test_separate_paren_groups_edge():
    assert separate_paren_groups('( ) (( )) (( )( ))') == ['()', '(())', '(()())']
test_separate_paren_groups_edge()
def test_separate_paren_groups_error():
    with pytest.raises(TypeError):
        separate_paren_groups(None)
test_separate_paren_groups_error()
--------------------------------------------------
def test_truncate_number_perf():
    

In [None]:
!pip install datasets

In [None]:
from typing import List, Dict
import numpy as np
from datasets import load_dataset
import pytest
def execute_test_case(code: str, test_case: str) -> bool:
    """
    Run a solution and its test code in a fresh namespace and report success.

    The solution source, an ``import pytest`` statement and the test source are
    executed in that order against one shared, initially empty namespace.

    Args:
        code: Source of the function under test.
        test_case: Source of the tests to run against it.

    Returns:
        True when all three snippets execute without raising, False otherwise.

    NOTE(review): both inputs are passed to ``exec`` without sandboxing or a
    timeout; model-generated code should be treated as untrusted.
    """
    scope = {}
    try:
        for snippet in (code, "import pytest", test_case):
            exec(snippet, scope)
    except pytest.raises.Exception:
        # pytest.raises() reports "expected exception was not raised" with this
        # type. It is listed separately from the generic handler below,
        # presumably because it is not an Exception subclass -- TODO confirm
        # for the installed pytest version.
        return False
    except Exception:
        # Any other failure (assertion error, runtime error, ...) is a failed run.
        return False
    return True

def check_syntax(code: str) -> bool:
    """Return True when ``code`` compiles as a Python module, False otherwise.

    Args:
        code: Python source text to validate; it is compiled but never run.

    Returns:
        True if ``compile`` accepts the source, False if it is rejected.
    """
    try:
        compile(code, '<string>', 'exec')
    except (SyntaxError, ValueError):
        # ValueError covers sources containing null bytes, which some Python
        # versions report that way instead of as a SyntaxError.
        return False
    return True

def evaluate_single_test_suite(solution: str,
                               generated_tests: str) -> Dict:
    """Check one generated test suite against one solution.

    The solution and the tests are first compiled together as a single
    source; the tests are executed only when that compile step succeeds.

    Args:
        solution: Source of the function under test.
        generated_tests: Source of the generated test suite.

    Returns:
        Dict with the boolean keys ``syntax_valid`` and ``execution_success``.
    """
    combined_source = solution + "\n" + generated_tests
    compiles = check_syntax(combined_source)

    # Sources that do not even compile are never executed.
    # TODO:- consider using thread pool for parallel test execution
    ran_cleanly = execute_test_case(solution, generated_tests) if compiles else False

    return {"syntax_valid": compiles, "execution_success": ran_cleanly}
def evaluate_test_suite(model_type, dataset, n_tasks, test_suites):
    """Score pre-extracted test suites against the canonical solutions.

    For each of the first ``n_tasks`` problems, the prompt and canonical
    solution are concatenated, evaluated with ``evaluate_single_test_suite``
    against ``test_suites[i]``, and a per-problem report is both printed and
    written to ``{model_type}_test_case_generation_accuracy_results.txt``.

    Args:
        model_type: Label used as the prefix of the report file name.
        dataset: Mapping with a ``'test'`` split exposing ``"prompt"`` and
            ``"canonical_solution"`` columns.
        n_tasks: Number of leading problems to evaluate.
        test_suites: Test-suite sources, indexed like the dataset rows.

    Returns:
        The aggregate metrics dict. Only ``syntax_validity`` and
        ``execution_accuracy`` are computed here; the ``pass@k`` entries keep
        their 0.0 defaults.
    """
    # Fetch each column once instead of re-reading it on every iteration.
    test_split = dataset['test']
    prompts = test_split["prompt"]
    solutions = test_split["canonical_solution"]

    metrics = {
        "pass@1": 0.0,      # Single-attempt success rate
        "pass@10": 0.0,     # Success within 10 attempts
        "pass@100": 0.0,    # Success within 100 attempts
        "syntax_validity": 0.0,  # Syntactic correctness
        "execution_accuracy": 0.0  # Functional correctness
    }
    results = []
    with open(f'{model_type}_test_case_generation_accuracy_results.txt', 'w') as f:
        for i in range(n_tasks):
            full_solution = prompts[i] + solutions[i]
            cleaned_tests = test_suites[i]
            result = evaluate_single_test_suite(full_solution, cleaned_tests)

            # Mirror every section to the report file and to stdout.
            f.write(f"PROBLEM {i}:\n")
            print(f"PROBLEM {i}:\n")
            f.write("CANONICAL SOLUTION:\n")
            print("CANONICAL SOLUTION:\n")
            f.write(full_solution + "\n")
            print(full_solution + "\n")
            f.write("CLEANED TESTS:\n")
            print("CLEANED TESTS:\n")
            f.write(cleaned_tests + "\n")
            print(cleaned_tests)
            f.write("RESULT:\n" + str(result) + "\n")
            print("RESULT:\n" + str(result))

            results.append(result)

        # Calculate aggregate metrics
        metrics["syntax_validity"] = np.mean([r["syntax_valid"] for r in results])
        metrics["execution_accuracy"] = np.mean([r["execution_success"] for r in results])
        f.write(str(metrics))

    # Previously the metrics were only written to the file; hand them back too.
    return metrics

In [None]:
# Load the "openai_humaneval" dataset through the `datasets` library; the
# evaluation code in this notebook reads its 'test' split.
dataset = load_dataset("openai_humaneval")

In [None]:
# Evaluate the first 34 entries of `extracted_test_suites` (presumably the
# DeepSeek suites, given the "deep_seek" label — verify against the cell that
# builds the list); the label also prefixes the report file name.
evaluate_test_suite("deep_seek",dataset, 34, extracted_test_suites)

PROBLEM 0:

CANONICAL SOLUTION:

from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """
    for idx, elem in enumerate(numbers):
        for idx2, elem2 in enumerate(numbers):
            if idx != idx2:
                distance = abs(elem - elem2)
                if distance < threshold:
                    return True

    return False


CLEANED TESTS:

import pytest

def test_has_close_elements_perf():
    assert has_close_elements([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True
test_has_close_elements_perf()
def test_has_close_elements_edge():
    assert has_close_elements([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False
test_has_close_elements_edge()
def test_has_close_elements_error():
    with pytest.r

### SemCoder Simple Prompt Results

In [None]:
def clean_generated_code(code: str) -> str:
    """Keep only function definitions from a block of generated text.

    A line whose stripped form starts with ``def `` opens a function. While a
    function is open, blank lines and lines indented by at least four spaces
    are kept; the first other line closes the function and is replaced by an
    empty line. Everything outside a function is dropped.

    Args:
        code: Raw generated text.

    Returns:
        The retained lines joined with newlines, with outer whitespace
        stripped.
    """
    kept = []
    inside = False

    for raw in code.split('\n'):
        stripped = raw.strip()

        if stripped.startswith('def '):
            inside = True
            kept.append(raw)
            continue

        if not inside:
            continue

        if raw.startswith('    ') or not stripped:
            # Body line or blank line inside the current function.
            kept.append(raw)
        else:
            # First non-indented, non-blank line ends the function.
            inside = False
            kept.append('')

    return '\n'.join(kept).strip()

### DeepSeek Results

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
from typing import List, Dict
import numpy as np
import timeout_decorator
from datasets import load_dataset

In [None]:
# Module-level metrics dict: one entry per aggregate metric name, each
# starting at 0.0.
metrics = {
            "pass@1": 0.0,      # Single-attempt success rate
            "pass@10": 0.0,     # Success within 10 attempts
            "pass@100": 0.0,    # Success within 100 attempts
            "syntax_validity": 0.0,  # Syntactic correctness
            "execution_accuracy": 0.0  # Functional correctness
}
def clean_deepseek_generated_code(code: str) -> str:
        """Clean up generated code to extract only the functions."""
        lines = code.split('\n')
        cleaned_lines = []
        found_start = False
        found_test_func_call = False
        for line in lines:
            if line.startswith('```python'):
                found_start = True
            elif line.startswith('```'):
                if found_test_func_call: break
                else: found_start = False
            elif found_start:
                if line.startswith('test_') and line.endswith('()'):
                    found_test_func_call = True
                cleaned_lines.append(line)

        return '\n'.join(cleaned_lines).strip()

def evaluate_model(model, dataset, model_type, tokenizer, n_tasks: Optional[int] = None):
        """Generate a test suite per problem with a model and score the suites.

        For each problem the prompt and canonical solution are concatenated,
        the model is asked to write and run tests for it, the reply is cleaned
        and scored with ``evaluate_single_test_suite``, and a per-problem
        report is printed and written to
        ``{model_type}_test_case_generation_results.txt``.

        Args:
            model: Generation backend. For ``"deepseek"`` it is passed to
                ``generate_code``; for ``"semcoder"`` its ``generate_code``
                method is called.
            dataset: Mapping with a ``'test'`` split exposing ``"prompt"`` and
                ``"canonical_solution"`` columns.
            model_type: ``"deepseek"`` or ``"semcoder"``. Any other value
                leaves the generated tests empty.
            tokenizer: Tokenizer forwarded to ``generate_code`` (deepseek only).
            n_tasks: Number of leading problems to evaluate; None means all.

        Returns:
            A new metrics dict. Only ``syntax_validity`` and
            ``execution_accuracy`` are computed; the ``pass@k`` entries stay
            at 0.0.
        """
        # Fetch each column once instead of re-reading it on every iteration.
        test_split = dataset['test']
        prompts = test_split["prompt"]
        solutions = test_split["canonical_solution"]
        if n_tasks is None:
            n_tasks = len(solutions)

        # Build a fresh dict per call: returning one shared module-level dict
        # made the results of successive runs alias (and overwrite) each other.
        metrics = {
            "pass@1": 0.0,      # Single-attempt success rate
            "pass@10": 0.0,     # Success within 10 attempts
            "pass@100": 0.0,    # Success within 100 attempts
            "syntax_validity": 0.0,  # Syntactic correctness
            "execution_accuracy": 0.0  # Functional correctness
        }

        results = []
        with open(f'{model_type}_test_case_generation_results.txt', 'w') as f:
          for i in range(n_tasks):
              full_solution = prompts[i] + solutions[i]

              prompt = f"""
              Please provide and execute a set of test cases for the following function:
              {full_solution}

              Please do not include natural language or anything that cannot be compiled/executed.
              Please only provided the test cases and their immediate execution.

              Example:
              def test_hello_with_name():
                  assert hello("Alice") == "Hello, Alice"
                  assert hello("Bob") == "Hello, Bob"
              test_hello_with_name()

              def test_hello_without_name():
                  assert hello(None) == "Hello, world"
                  assert hello("") == "Hello, world"
              test_hello_without_name()
              """
              generated_tests = ""
              if model_type == "deepseek":
                  generated_tests = generate_code(
                      model,
                      tokenizer,
                      prompt,
                      max_new_tokens=4096
                  )
              elif model_type == "semcoder":
                  generated_tests = model.generate_code(prompt, max_new_tokens=4096)

              # Only DeepSeek replies are cleaned; for every other model type
              # the cleaned suite is the empty string, so the scoring below
              # runs against an empty test body (author's "no-op for now").
              cleaned_tests = clean_deepseek_generated_code(generated_tests) if model_type == "deepseek" else "" #no-op for now
              result = evaluate_single_test_suite(full_solution, cleaned_tests)

              # Mirror every section to the report file and to stdout.
              f.write(f"PROBLEM {i}:\n")
              print(f"PROBLEM {i}:\n")
              f.write("CANONICAL SOLUTION:\n")
              print("CANONICAL SOLUTION:\n")
              f.write(full_solution + "\n")
              print(full_solution + "\n")
              f.write("GENERATED TESTS:\n")
              print("GENERATED TESTS:\n")
              f.write(generated_tests + "\n")
              print(generated_tests)
              f.write("CLEANED TESTS:\n")
              print("CLEANED TESTS:\n")
              f.write(cleaned_tests + "\n")
              print(cleaned_tests)
              f.write("RESULT:\n" + str(result) + "\n")
              print("RESULT:\n" + str(result))

              results.append(result)

          # Calculate aggregate metrics
          metrics["syntax_validity"] = np.mean([r["syntax_valid"] for r in results])
          metrics["execution_accuracy"] = np.mean([r["execution_success"] for r in results])
          f.write(str(metrics))
        return metrics

In [None]:
# NOTE(review): TestCaseEvaluator is not defined in this part of the notebook;
# presumably it comes from an earlier cell — verify before re-running.
evaluator = TestCaseEvaluator()

In [None]:
# Run the evaluator on the DeepSeek model. The trailing 100 is presumably the
# task limit (n_tasks) — confirm against TestCaseEvaluator.evaluate_model.
metrics = evaluator.evaluate_model(model, "deepseek", tokenizer, 100)
# Print every aggregate metric to four decimal places.
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")
# Download the per-problem report through the Colab `files` helper.
from google.colab import files
files.download('deepseek_test_case_generation_results.txt')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
    assert all_prefixes("a") == ['a']
    assert all_prefixes("xyzt") == ['x', 'xy', 'xyz', 'xyzt']

test_all_prefixes()
```
You can simply copy the code above and run it in your Python environment to test the function. If the function `all_prefixes` is implemented correctly, all the assertions will pass and you won't see any error messages.

CLEANED TESTS:

def test_all_prefixes():
    assert all_prefixes("abc") == ['a', 'ab', 'abc']
    assert all_prefixes("abcd") == ['a', 'ab', 'abc', 'abcd']
    assert all_prefixes("") == []
    assert all_prefixes("a") == ['a']
    assert all_prefixes("xyzt") == ['x', 'xy', 'xyz', 'xyzt']

test_all_prefixes()
RESULT:
{'syntax_valid': True, 'execution_success': True}
PROBLEM 15:

CANONICAL SOLUTION:



def string_sequence(n: int) -> str:
    """ Return a string containing space-delimited numbers starting from 0 upto n inclusive.
    >>> string_sequence(0)
    '0'
    >>> string_sequen

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Standardized SemCoder Results

In [None]:
# Run the evaluator on the SemCoder model. The trailing 100 is presumably the
# task limit (n_tasks) — confirm against TestCaseEvaluator.evaluate_model.
metrics = evaluator.evaluate_model(semcoder, "semcoder", tokenizer, 100)
# Print every aggregate metric to four decimal places.
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")
# Download the per-problem report through the Colab `files` helper.
from google.colab import files
files.download('semcoder_test_case_generation_results.txt')

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


PROBLEM 0:

CANONICAL SOLUTION:

from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """
    for idx, elem in enumerate(numbers):
        for idx2, elem2 in enumerate(numbers):
            if idx != idx2:
                distance = abs(elem - elem2)
                if distance < threshold:
                    return True

    return False


GENERATED TESTS:


              Please provide and execute a set of test cases for the following function:
              from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


PROBLEM 1:

CANONICAL SOLUTION:

from typing import List


def separate_paren_groups(paren_string: str) -> List[str]:
    """ Input to this function is a string containing multiple groups of nested parentheses. Your goal is to
    separate those group into separate strings and return the list of those.
    Separate groups are balanced (each open brace is properly closed) and not nested within each other
    Ignore any spaces in the input string.
    >>> separate_paren_groups('( ) (( )) (( )( ))')
    ['()', '(())', '(()())']
    """
    result = []
    current_string = []
    current_depth = 0

    for c in paren_string:
        if c == '(':
            current_depth += 1
            current_string.append(c)
        elif c == ')':
            current_depth -= 1
            current_string.append(c)

            if current_depth == 0:
                result.append(''.join(current_string))
                current_string.clear()

    return result


GENERATED TESTS:


              Please 

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


PROBLEM 2:

CANONICAL SOLUTION:



def truncate_number(number: float) -> float:
    """ Given a positive floating point number, it can be decomposed into
    and integer part (largest integer smaller than given number) and decimals
    (leftover part always smaller than 1).

    Return the decimal part of the number.
    >>> truncate_number(3.5)
    0.5
    """
    return number % 1.0


GENERATED TESTS:


              Please provide and execute a set of test cases for the following function:
              

def truncate_number(number: float) -> float:
    """ Given a positive floating point number, it can be decomposed into
    and integer part (largest integer smaller than given number) and decimals
    (leftover part always smaller than 1).

    Return the decimal part of the number.
    >>> truncate_number(3.5)
    0.5
    """
    return number % 1.0


              Please do not include natural language or anything that cannot be compiled/executed.
              Please only provid

KeyboardInterrupt: 

 ### Code Coverage Assessment

In [None]:
# First, install required packages
!pip install pytest pytest-cov coverage
from google.colab import files  # Colab-specific import

Collecting pytest-cov
  Downloading pytest_cov-6.0.0-py3-none-any.whl.metadata (27 kB)
Collecting coverage
  Downloading coverage-7.6.9-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.2 kB)
Downloading pytest_cov-6.0.0-py3-none-any.whl (22 kB)
Downloading coverage-7.6.9-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (234 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.0/235.0 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: coverage, pytest-cov
Successfully installed coverage-7.6.9 pytest-cov-6.0.0


In [None]:
import os
import re
import tempfile
import subprocess
import statistics
from typing import Dict, List, Tuple
import json
from pathlib import Path
from google.colab import files  # Colab-specific import

In [None]:
# Utilities
def extract_sections(entry: str) -> Tuple[str, str]:
    """Pull the canonical solution and the cleaned tests out of a report entry.

    The solution is the text between ``CANONICAL SOLUTION:`` and
    ``GENERATED TESTS:``; the tests are the text between ``CLEANED TESTS:``
    and ``RESULT:``. Both are stripped and echoed to stdout for debugging.

    Args:
        entry: One problem's section of a results file.

    Returns:
        ``(solution, tests)`` as stripped strings.

    Raises:
        ValueError: If either section cannot be found.
    """
    solution_hit = re.search(
        r'CANONICAL SOLUTION:\n(.*?)\nGENERATED TESTS:', entry, flags=re.DOTALL
    )
    tests_hit = re.search(
        r'CLEANED TESTS:\n(.*?)\nRESULT:', entry, flags=re.DOTALL
    )

    if solution_hit is None or tests_hit is None:
        raise ValueError("Could not find required sections in entry")

    solution, tests = (hit.group(1).strip() for hit in (solution_hit, tests_hit))

    # Debug output
    print("Extracted solution:\n", solution)
    print("Extracted tests:\n", tests)
    return solution, tests

def calculate_aggregate_metrics(results, target_score_name) -> Dict:
    """Summarise one numeric score across a list of per-item result dicts.

    Args:
        results: Iterable of result mappings. ``None`` entries and entries
            without ``target_score_name`` are ignored.
        target_score_name: Key of the score to aggregate.

    Returns:
        Dict with ``mean_``, ``median_``, ``min_`` and ``max_`` entries
        suffixed by the score name, plus ``std_dev`` (0 when fewer than two
        values) and ``total_entries_analyzed``. When there is nothing to
        aggregate, a dict with a single ``error`` message instead.
    """
    if not results:
        return {'error': 'No valid results to analyze'}

    # Producers in this notebook return None for items they could not parse;
    # a bare `name in r` would raise TypeError on those entries.
    score_values = [
        r[target_score_name]
        for r in results
        if r is not None and target_score_name in r
    ]

    if not score_values:
        return {'error': 'No valid score values found'}

    return {
        f'mean_{target_score_name}': statistics.mean(score_values),
        f'median_{target_score_name}': statistics.median(score_values),
        f'min_{target_score_name}': min(score_values),
        f'max_{target_score_name}': max(score_values),
        # stdev needs at least two data points.
        'std_dev': statistics.stdev(score_values) if len(score_values) > 1 else 0,
        'total_entries_analyzed': len(score_values)
    }

In [None]:
class TestCoverageAnalyzer:
    def __init__(self, input_file: str = "", output_dir: str = "/content/coverage_results"):
        """Initialize the analyzer with input file path and output directory."""
        self.input_file = input_file
        self.output_dir = output_dir
        self.coverage_results = []
        os.makedirs(output_dir, exist_ok=True)

    def create_test_files(self, solution: str, tests: str, temp_dir: str) -> Tuple[str, str]:
        """Create temporary Python files for the solution and tests."""
        # Create solution file
        solution_file = Path(temp_dir) / "solution.py"
        with open(solution_file, 'w') as f:
            f.write(solution)

        # Create test file with proper imports for Colab
        test_file = Path(temp_dir) / "test_solution.py"
        with open(test_file, 'w') as f:
            f.write("import sys\n")
            f.write(f"sys.path.append('{temp_dir}')\n")
            f.write("from solution import *\n")
            f.write(tests)

        return str(solution_file), str(test_file)

    def run_coverage_analysis(self, solution_file: str, test_file: str, temp_dir: str) -> Dict:
        """Run pytest with coverage and return results."""
        try:
            # Change to temp directory
            orig_dir = os.getcwd()
            os.chdir(temp_dir)

            # Run pytest with coverage using python -m to ensure proper module resolution
            cmd = [
                'python3',  # Use python3 explicitly in Colab
                '-m',
                'pytest',
                '--cov=solution',
                '--cov-report=json',
                'test_solution.py',
                '-v'
            ]

            env = os.environ.copy()
            env['PYTHONPATH'] = temp_dir  # Ensure proper module resolution

            result = subprocess.run(cmd, capture_output=True, text=True, env=env)
            # Read coverage data
            if os.path.exists('coverage.json'):
                with open('coverage.json') as f:
                    coverage_data = json.load(f)
                    for file_path, file_data in coverage_data['files'].items():
                        if 'solution.py' in file_path:
                            return {
                                'line_coverage': file_data['summary']['percent_covered'],
                                'total_lines': file_data['summary']['num_statements'],
                                'covered_lines': file_data['summary']['covered_lines'],
                                'missing_lines': file_data['summary']['missing_lines']
                            }
            return {'error': 'No coverage data generated'}

        except subprocess.CalledProcessError as e:
            print(f"Command output: {e.output}")  # More detailed error reporting for Colab
            return {'error': f'pytest failed: {str(e)}'}
        except Exception as e:
            print(f"Exception details: {str(e)}")  # More detailed error reporting for Colab
            return {'error': f'Analysis failed: {str(e)}'}
        finally:
            os.chdir(orig_dir)

    def analyze_all_entries(self) -> Dict:
        """Process all entries in the input file and calculate aggregate metrics."""
        with open(self.input_file, 'r') as f:
            content = f.read()

        # Split content into individual entries using 'CANONICAL SOLUTION:' as delimiter
        entries = content.split('CANONICAL SOLUTION:')[1:]  # Skip first empty split

        for i, entry in enumerate(entries):
            try:
                # Add back the header since we split on it
                entry = 'CANONICAL SOLUTION:' + entry

                with tempfile.TemporaryDirectory() as temp_dir:
                    # Extract solution and tests
                    solution, tests = extract_sections(entry)
                    if not tests.strip():  # Skip if no tests
                        continue

                    # Create temporary files
                    solution_file, test_file = self.create_test_files(solution, tests, temp_dir)

                    # Run coverage analysis
                    result = self.run_coverage_analysis(solution_file, test_file, temp_dir)
                    print(result)
                    # Store results
                    if 'line_coverage' in result:
                        self.coverage_results.append(result)

            except Exception as e:
                print(f"Error processing entry {i}: {str(e)}")
                continue

        # Calculate aggregate metrics
        return calculate_aggregate_metrics(self.coverage_results, "line_coverage")

In [None]:
# Measure line coverage for each suite in `extracted_test_suites`, paired with
# the dataset row at the same index.
deep_seek_coverage_analyzer = TestCoverageAnalyzer()
deep_seek_coverage_results = []
for index, test_suite in enumerate(extracted_test_suites):
  # The code under test is the task prompt followed by the canonical solution.
  solution = dataset['test']["prompt"][index] + dataset['test']["canonical_solution"][index]
  with tempfile.TemporaryDirectory() as temp_dir:
    solution_file, test_file = deep_seek_coverage_analyzer.create_test_files(solution, test_suite, temp_dir)
    result = deep_seek_coverage_analyzer.run_coverage_analysis(solution_file, test_file, temp_dir)
    # Runs that produced no coverage report return an 'error' dict and are dropped.
    if 'line_coverage' in result:
      deep_seek_coverage_results.append(result)
print(calculate_aggregate_metrics(deep_seek_coverage_results, "line_coverage"))

{'mean_line_coverage': 97.34693877551021, 'median_line_coverage': 100.0, 'min_line_coverage': 21.428571428571427, 'max_line_coverage': 100.0, 'std_dev': 13.428673596709753, 'total_entries_analyzed': 35}


In [None]:
# Measure line coverage for each suite in `semcoder_extracted_test_suites`,
# paired with the dataset row at the same index.
semcoder_coverage_analyzer = TestCoverageAnalyzer()
semcoder_coverage_results = []
for index, test_suite in enumerate(semcoder_extracted_test_suites):
  # The code under test is the task prompt followed by the canonical solution.
  solution = dataset['test']["prompt"][index] + dataset['test']["canonical_solution"][index]
  with tempfile.TemporaryDirectory() as temp_dir:
    solution_file, test_file = semcoder_coverage_analyzer.create_test_files(solution, test_suite, temp_dir)
    result = semcoder_coverage_analyzer.run_coverage_analysis(solution_file, test_file, temp_dir)
    # Runs that produced no coverage report return an 'error' dict and are dropped.
    if 'line_coverage' in result:
      semcoder_coverage_results.append(result)
print(calculate_aggregate_metrics(semcoder_coverage_results, "line_coverage"))

{'mean_line_coverage': 96.75324675324676, 'median_line_coverage': 100.0, 'min_line_coverage': 21.428571428571427, 'max_line_coverage': 100.0, 'std_dev': 14.40695307294402, 'total_entries_analyzed': 33}


### Measuring Novelty and Diversity

In [None]:
!pip install anthropic

Collecting anthropic
  Downloading anthropic-0.40.0-py3-none-any.whl.metadata (23 kB)
Downloading anthropic-0.40.0-py3-none-any.whl (199 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.5/199.5 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: anthropic
Successfully installed anthropic-0.40.0


#### Measuring with LLM as Judge

In [None]:
from anthropic import Anthropic
import json
from google.colab import userdata
def analyze_novelty_with_claude(source_function: str, generated_tests: str, original_tests: Optional[str] = None) -> Optional[dict]:
    """Ask Claude to rate the novelty and diversity of a generated test suite.

    The API key is read from Colab's ``userdata`` store under
    ``ANTHROPIC_API_KEY``. The request uses temperature 0 and asks for a
    JSON-only reply.

    Args:
        source_function: Source of the function under test.
        generated_tests: Source of the generated test suite.
        original_tests: Reference test suite to compare against. When omitted,
            the literal text ``None`` is interpolated into the prompt.

    Returns:
        The parsed JSON analysis, or None when the reply contains no
        parseable JSON.
    """
    anthropic = Anthropic(api_key=userdata.get('ANTHROPIC_API_KEY'))

    prompt = f"""
As an expert test engineer, analyze the semantic novelty and diversity of the generated test cases for the given function. Consider the function's purpose, edge cases, and expected behaviors.

Source Function:

{source_function}


Generated Test Suite:

{generated_tests}

Original Test Suite:

{original_tests}

Please analyze:
1. How well do the tests cover different aspects of the function's behavior?
2. What novel testing scenarios are introduced?
3. Are there important edge cases or boundary conditions tested?
4. How diverse are the test inputs and scenarios?
5. Are the tests relevant to the function's purpose?

Provide your analysis in the following JSON format:
{{
    "novelty_score": <float between 0.0 and 1.0>,
    "novel_aspects": [<list of strings describing novel aspects>],
    "unique_scenarios": [<list of strings describing unique test scenarios>],
    "coverage_assessment": <string describing overall test coverage>,
    "recommendations": [<list of strings with suggested additional test cases>]
}}
Do not provide any other additonal text other than the JSON in order to facilitate
text processing.

"""

    message = anthropic.messages.create(
        model="claude-3-sonnet-20240229",
        max_tokens=4096,
        temperature=0,  # Use 0 for consistent analysis
        messages=[{
            "role": "user",
            "content": prompt
        }]
    )

    reply_text = message.content[0].text
    try:
        # Parse the response as JSON
        return json.loads(reply_text)
    except json.JSONDecodeError:
        pass

    # The reply may wrap the JSON in Markdown fences or add a sentence around
    # it; retry on the outermost {...} span before giving up.
    first_brace = reply_text.find('{')
    last_brace = reply_text.rfind('}')
    if first_brace != -1 and last_brace > first_brace:
        try:
            return json.loads(reply_text[first_brace:last_brace + 1])
        except json.JSONDecodeError:
            pass

    print("Failed to parse Claude's response as JSON")
    return None

In [None]:
# Score the novelty of each suite in `extracted_test_suites` against the
# dataset's own tests for the row at the same index.
deep_seek_novelty_results = []
for index, test_suite in enumerate(extracted_test_suites):
  solution = dataset['test']["prompt"][index] + dataset['test']["canonical_solution"][index]
  original_tests = dataset['test']["test"][index]
  result = analyze_novelty_with_claude(solution, test_suite, original_tests)
  print(result)
  # Appended unconditionally: the list stays index-aligned with the suites,
  # but it can contain None where the reply could not be parsed.
  deep_seek_novelty_results.append(result)
print(calculate_aggregate_metrics(deep_seek_novelty_results, "novelty_score"))

{'novelty_score': 0.6, 'novel_aspects': ['Tests for error handling (TypeError)', 'Tests for performance edge case'], 'unique_scenarios': ['Passing None as input', 'Large list with close elements'], 'coverage_assessment': 'The generated test suite covers some important aspects like error handling and performance edge cases, but lacks comprehensive coverage of boundary conditions and diverse input scenarios.', 'recommendations': ['Test with empty list', 'Test with list containing duplicate values', 'Test with list containing negative numbers', 'Test with threshold values at or near 0', 'Test with large threshold values']}
{'novelty_score': 0.6, 'novel_aspects': ['Tests for error handling (passing None as input)', 'Tests for performance (large input string)'], 'unique_scenarios': ['Empty string input', 'Nested parentheses within a group', 'Consecutive groups with no spaces', 'Single group with no spaces'], 'coverage_assessment': 'The tests cover a good range of scenarios, including edge c

In [None]:
# Aggregate over the first 34 results only.
# NOTE(review): presumably later entries were unusable (e.g. a None from a
# failed JSON parse) — confirm why the slice is needed.
print(calculate_aggregate_metrics(deep_seek_novelty_results[:34], "novelty_score"))

{'mean_novelty_score': 0.6558823529411765, 'median_novelty_score': 0.7, 'min_novelty_score': 0.4, 'max_novelty_score': 0.8, 'std_dev': 0.07859052479933758, 'total_entries_analyzed': 34}


In [None]:
# Score the novelty of each suite in `semcoder_extracted_test_suites` against
# the dataset's own tests for the row at the same index.
semcoder_novelty_results = []
for index, test_suite in enumerate(semcoder_extracted_test_suites):
  solution = dataset['test']["prompt"][index] + dataset['test']["canonical_solution"][index]
  original_tests = dataset['test']["test"][index]
  result = analyze_novelty_with_claude(solution, test_suite, original_tests)
  # Appended unconditionally: the list stays index-aligned with the suites,
  # but it can contain None where the reply could not be parsed.
  semcoder_novelty_results.append(result)
  print(result)
print(calculate_aggregate_metrics(semcoder_novelty_results, "novelty_score"))

{'novelty_score': 0.6, 'novel_aspects': ['Tests for error handling (TypeError)', 'Tests for performance edge case'], 'unique_scenarios': ['Passing None as input', 'Large list with close elements'], 'coverage_assessment': 'The generated test suite covers some important aspects like error handling and performance edge cases, but lacks comprehensive coverage of boundary conditions and diverse input scenarios.', 'recommendations': ['Test with empty list', 'Test with list containing duplicate elements', 'Test with list containing negative numbers', 'Test with threshold values at or near 0', 'Test with large threshold values']}
{'novelty_score': 0.6, 'novel_aspects': ['Tests for error handling (passing None as input)', 'Tests for performance (large input string)'], 'unique_scenarios': ['Empty string input', 'Nested parentheses', 'Single group of parentheses', 'Multiple groups of parentheses', 'Unbalanced parentheses (not tested)'], 'coverage_assessment': 'The tests cover a good range of scen

In [None]:
# prompt: Write the contents of semcoder_novelty_results and deep_seek_novelty_results to their own respective files that I can then download

import json

# Assuming deep_seek_novelty_results and semcoder_novelty_results are lists of dictionaries
# as produced by your analyze_novelty_with_claude function.


def write_results_to_file(results, filename):
    """Serialise ``results`` as JSON (4-space indent) into ``filename``.

    Args:
        results: JSON-serialisable object to store.
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, 'w') as out_handle:
        json.dump(results, out_handle, indent=4)


# Persist both novelty result lists as JSON files in the working directory.
write_results_to_file(deep_seek_novelty_results, 'deep_seek_novelty_results.json')
write_results_to_file(semcoder_novelty_results, 'semcoder_novelty_results.json')

from google.colab import files

# Download both files through the Colab `files` helper.
files.download('deep_seek_novelty_results.json')
files.download('semcoder_novelty_results.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
def analyze_all_entries(dataset, entry_start, input_file) -> Dict:
    """Score the novelty of every entry in a results file.

    The file is split on ``CANONICAL SOLUTION:``; each entry is parsed with
    ``extract_sections``, entries without tests are skipped, and the rest are
    sent to ``analyze_novelty_with_claude`` without reference tests. Errors on
    individual entries are printed and do not stop the run.

    Args:
        dataset: Currently unused.
        entry_start: Currently unused. NOTE(review): presumably meant as the
            index of the first entry to process — confirm intent.
        input_file: Path of the results file to read.

    Returns:
        Aggregate ``novelty_score`` statistics from
        ``calculate_aggregate_metrics``.
    """
    # Read everything up front so the file is closed before the API calls.
    with open(input_file, 'r') as f:
        content = f.read()

    # Everything before the first marker is not an entry, so drop it.
    entries = content.split('CANONICAL SOLUTION:')[1:]

    novelty_results = []
    for i, entry in enumerate(entries):
        try:
            # Add back the header since we split on it
            solution, tests = extract_sections('CANONICAL SOLUTION:' + entry)
            if not tests.strip():  # Skip if no tests
                continue

            result = analyze_novelty_with_claude(solution, tests)
            print(result)
            # The analysis is None when the reply could not be parsed; skip it
            # explicitly instead of raising TypeError on the `in` test.
            if result is not None and 'novelty_score' in result:
                novelty_results.append(result)

        except Exception as e:
            print(f"Error processing entry {i}: {str(e)}")
            continue

    # Calculate aggregate metrics
    return calculate_aggregate_metrics(novelty_results, "novelty_score")

In [None]:
from typing import Dict, List, Set
import re
from collections import defaultdict

class CoveragePatternAnalyzer:
    """Analyzes test coverage patterns focusing on types of test cases.

    ``self.patterns`` maps a category name to a dict of
    ``{pattern_name: regex}``. ``analyze_test_suite`` counts how often each
    regex occurs in a piece of test source code and derives per-category and
    overall metrics from those counts.
    """

    def __init__(self):
        # Define patterns to identify different types of test cases
        self.patterns = {
            'edge_cases': {
                'empty_input': r'(empty|""|\[\]|\{\}|\(\))',
                'null_input': r'(None|null)',
                'single_element': r'assert.*\[.?\]|assert.*\(.?\)',
            },
            'boundary_testing': {
                # A standalone 0 / 0.0 literal anywhere in the code. The
                # lookarounds reject zeros that are part of a larger number or
                # identifier (10, 0.5, 1.0, x0). A whole-string anchored
                # pattern such as ^0$ could never match real test code.
                'zero_values': r'(?<![\w.])0(?:\.0+)?(?![\w.])',
                'negative_values': r'-\d+',
                'large_values': r'\d{5,}',
            },
            'error_handling': {
                'exception_testing': r'(raises|assertRaises|try|except|error)',
                'invalid_input': r'(invalid|wrong|incorrect|bad)',
            },
            'functionality': {
                'typical_case': r'assert.*normal|typical|standard',
                'complex_input': r'assert.*(\[.*,.*,.*\]|\{.*:.*,.*:.*\})',
            },
        }

    def analyze_test_suite(self, test_code: str) -> Dict:
        """Analyze a test suite and return coverage metrics.

        Args:
            test_code: Source text of the test suite.

        Returns:
            A dict with one entry per category in ``self.patterns``, each
            holding ``'total_matches'`` (sum of matches over the category's
            patterns), ``'coverage_ratio'`` (``total_matches`` divided by the
            number of ``assert`` occurrences, or 0 when there are none) and
            ``'pattern_breakdown'`` (match count per pattern name). An extra
            ``'overall'`` entry holds ``'total_assertions'`` and
            ``'pattern_diversity'``, the fraction of all configured patterns
            that matched at least once.
        """
        results = {}
        # Counts every occurrence of the substring, so assertEqual / assertRaises
        # style calls are included as well as bare assert statements.
        total_asserts = len(re.findall(r'assert', test_code))

        matched_patterns = 0
        total_patterns = 0

        # Analyze each pattern category
        for category, patterns in self.patterns.items():
            pattern_matches = {
                name: len(re.findall(pattern, test_code))
                for name, pattern in patterns.items()
            }
            category_matches = sum(pattern_matches.values())

            # Track diversity from the per-pattern counts only; mixing in the
            # category totals or the breakdown dict itself would be meaningless
            # (and comparing a dict with 0 raises TypeError).
            matched_patterns += sum(1 for count in pattern_matches.values() if count > 0)
            total_patterns += len(pattern_matches)

            results[category] = {
                'total_matches': category_matches,
                'coverage_ratio': category_matches / total_asserts if total_asserts > 0 else 0,
                'pattern_breakdown': pattern_matches,
            }

        # Add overall metrics
        results['overall'] = {
            'total_assertions': total_asserts,
            'pattern_diversity': matched_patterns / total_patterns if total_patterns else 0,
        }

        return results