# In [8]:
import json
import re
from openai import OpenAI
from typing import Dict, Any, Optional

class LLMClient:
    """Thin wrapper around the OpenAI chat-completions client.

    Holds a configured ``OpenAI`` client plus a default model name so callers
    only need to supply the messages for each request.
    """

    def __init__(self,
                 api_key: str,
                 base_url: str = "https://api.openai.com/v1",
                 model: str = "gpt-3.5-turbo"):
        """
        Initialize LLM client

        Args:
            api_key: OpenAI API key
            base_url: API base URL
            model: Default model to use when ``chat`` is called without one
        """
        self.client = OpenAI(api_key=api_key, base_url=base_url)
        self.default_model = model

    def chat(self,
             messages: list,
             model: Optional[str] = None,
             temperature: float = 0.7,
             max_tokens: Optional[int] = None,
             top_p: float = 1.0,
             frequency_penalty: float = 0.0,
             presence_penalty: float = 0.0,
             stop: Optional[list] = None,
             stream: bool = False) -> Optional[Any]:
        """
        Send chat request

        Args:
            messages: List of messages
            model: Model to use, if None uses default model
            temperature: Temperature parameter (0.0-2.0)
            max_tokens: Maximum number of tokens; omitted from the request
                when None
            top_p: Top_p parameter
            frequency_penalty: Frequency penalty
            presence_penalty: Presence penalty
            stop: List of stop words; omitted from the request when None
            stream: Whether to stream output

        Returns:
            The content of the first choice's message when ``stream`` is
            False; the object returned by the client unchanged when
            ``stream`` is True (the caller iterates it); ``None`` when the
            request raises (the error is printed, not re-raised).
        """
        request = {
            "model": model or self.default_model,
            "messages": messages,
            "temperature": temperature,
            "top_p": top_p,
            "frequency_penalty": frequency_penalty,
            "presence_penalty": presence_penalty,
            "stream": stream,
        }
        # Leave unset options out of the request instead of sending explicit
        # nulls, so the backend's own defaults apply.
        if max_tokens is not None:
            request["max_tokens"] = max_tokens
        if stop is not None:
            request["stop"] = stop

        try:
            response = self.client.chat.completions.create(**request)
            if stream:
                # Hand the streaming response back untouched.
                return response
            return response.choices[0].message.content
        except Exception as e:
            # Deliberate best-effort boundary: callers test for None rather
            # than handling client exceptions themselves.
            print(f"API call error: {e}")
            return None


def _is_score(value: Any) -> bool:
    """Return True for real numbers, excluding bools (which are ints in Python)."""
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def _extract_json_object(text: str) -> Optional[Dict[str, Any]]:
    """Return the first JSON object embedded in *text*, or None if there is none.

    Every ``{`` is tried as a potential start of an object, so surrounding
    prose, markdown code fences, arbitrarily deep nesting and braces inside
    JSON strings are all tolerated.
    """
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        start = text.find("{", start + 1)
    return None


def _summarize_scores(parsed: Dict[str, Any]) -> Dict[str, Any]:
    """Reduce a parsed evaluation to its ``total_score`` and ``breakdown``.

    When the total is missing, zero or not a number, it is recomputed as the
    sum of the numeric entries of a non-empty breakdown dict.
    """
    total_score = parsed.get("total_score", 0)
    breakdown = parsed.get("breakdown", {})

    needs_total = not _is_score(total_score) or total_score == 0
    if needs_total and isinstance(breakdown, dict) and breakdown:
        total_score = sum(v for v in breakdown.values() if _is_score(v))

    return {"total_score": total_score, "breakdown": breakdown}


def CodeEval(task: str, response: str, llm_client: "LLMClient") -> Dict[str, Any]:
    """
    Bioinformatics code evaluation function

    Builds a rubric prompt from the task and the R code, sends it through
    ``llm_client.chat`` and parses the JSON scores out of the reply.

    Args:
        task: Task description to evaluate
        response: R code response to evaluate
        llm_client: LLM client instance; only its ``chat(messages=...,
            temperature=...)`` method is used

    Returns:
        Dict: ``{"total_score": ..., "breakdown": ...}`` on success, or
        ``{"error": <message>}`` when the LLM call fails, the reply contains
        no JSON object, or any other exception occurs. This function does not
        raise.
    """

    # Evaluation system prompt
    system_prompt = """You are an expert bioinformatics code reviewer with extensive experience in computational biology, omics, and biostatistics. Your task is to evaluate bioinformatics code solutions objectively and provide constructive feedback. Focus on scientific correctness, practical utility, and professional standards in bioinformatics."""

    # Evaluation prompt template ({{ }} are literal braces for str.format)
    evaluation_prompt_template = """## Evaluation Instructions

You will be provided with:
1. A bioinformatics task/question
2. R Code that attempts to solve this task

Evaluate the R code using the framework below. This is a positive-scoring system - award points for good practices, do not subtract.

## Scoring Framework (100 points total)

### Part 1: Core Scientific Validity (45 points)

#### 1.1 Problem Solving (20 points)
- [ ] Code addresses the biological question (+10)
- [ ] Solution approach is scientifically sound (+10)

#### 1.2 Technical Implementation (25 points)
- [ ] Appropriate methods for data type (+15)
  - RNA-seq: DESeq2/edgeR/limma or proper transformation
  - Genomics: GenomicRanges or appropriate coordinate handling
  - Clustering: suitable algorithm for data characteristics
  - General: method matches the biological question
- [ ] Complete analysis workflow (+10)
  - All necessary steps present
  - Logical flow from input to conclusion

### Part 2: Technical Quality (30 points)

#### 2.1 Data Handling (15 points)
- [ ] Appropriate preprocessing/normalization (+8)
  - Log transformation for expression data
  - Scaling for multi-omics integration
  - Batch effect consideration
- [ ] Data quality control (+7)
  - Checks for NA/missing values
  - Dimension validation
  - Sample/feature filtering

#### 2.2 Statistical Rigor (15 points)
- [ ] Multiple testing correction when needed (+8)
  - FDR/BH/Bonferroni for multiple comparisons
  - **If not applicable (e.g., clustering only): award 8 points**
- [ ] Appropriate statistical methods (+7)
  - Correct test for data distribution
  - Proper handling of paired/grouped data
  - **If only visualization/processing: award 7 points**

### Part 3: Professional Excellence (25 points)

#### 3.1 Domain Knowledge & Interpretation (10 points)
- [ ] Shows understanding of biological context (+5)
  - Explains why methods suit the biological data
  - Mentions relevant biological considerations
- [ ] Results are biologically interpretable (+5)
  - Output can be understood by biologists
  - Includes relevant biological annotation

#### 3.2 Robustness & Completeness (10 points)
- [ ] Error handling or input validation (+5)
  - Try-catch blocks or if-statements for edge cases
  - Informative messages or warnings
- [ ] Analysis validation or quality checks (+5)
  - Parameter optimization (e.g., choosing k for clustering)
  - Stability/reproducibility considerations (e.g., set.seed)

#### 3.3 Documentation & Usability (5 points)
- [ ] Clear workflow and outputs (+3)
  - Can follow the analysis logic
  - Results are saved or displayed
- [ ] Adequate documentation (+2)
  - Key steps explained
  - Parameters justified

## Evaluation Guidelines

1. **Focus on scientific merit** - Correct biology > elegant code
2. **Recognize good practices** - Award points for any professional elements
3. **Consider context** - Simple working solutions can score well if scientifically sound

## Task to evaluate:
{task}

## R Code to evaluate:
{response}

## Output Format

Return ONLY the JSON below with no additional text:

```json
{{
  "total_score": <integer 0-100>,
  "breakdown": {{
    "problem_solving": <integer 0-20>,
    "technical_implementation": <integer 0-25>,
    "data_handling": <integer 0-15>,
    "statistical_rigor": <integer 0-15>,
    "domain_knowledge": <integer 0-10>,
    "robustness": <integer 0-10>,
    "documentation": <integer 0-5>
  }}
}}
```"""

    try:
        # Build evaluation prompt
        eval_prompt = evaluation_prompt_template.format(
            task=task,
            response=response
        )

        # Build messages
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": eval_prompt}
        ]

        # Call LLM for evaluation; a low temperature keeps scoring stable
        evaluation_content = llm_client.chat(
            messages=messages,
            temperature=0.2
        )

        if evaluation_content is None:
            return {"error": "LLM API call failed"}

        # The prompt asks for a fenced ```json block, so the reply is rarely
        # bare JSON; locate the object wherever it sits in the text.
        parsed = _extract_json_object(evaluation_content)
        if parsed is None:
            print(f"JSON parsing failed: {evaluation_content}")
            return {"error": "JSON parsing failed"}

        return _summarize_scores(parsed)

    except Exception as e:
        # Top-level boundary: callers get an error dict instead of a raise.
        print(f"Error during evaluation: {e}")
        return {"error": str(e)}