In [51]:
# Install required packages for multi-repository evaluation
!pip install transformers accelerate evaluate rouge-score nltk datasets torch pandas numpy tqdm -q
print("‚úÖ All packages installed for multi-repository evaluation")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


‚úÖ All packages installed for multi-repository evaluation


In [52]:
# Import all necessary libraries for multi-repository evaluation
import json
import glob
import os
import time
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import warnings
warnings.filterwarnings('ignore')

# Check GPU availability
print("üîß Environment Setup Complete!")
print(f"Python version: {torch.__version__}")
print(f"PyTorch CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name()}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# Set up paths for CodeRepoQA dataset
CODEREPOQA_BASE_PATH = "/kaggle/input/coderepoqa"
print(f"üìÇ CodeRepoQA base path: {CODEREPOQA_BASE_PATH}")

# Initialize variables that will be set by repository detection
repositories = {}
all_json_files = []

print("‚úÖ Libraries imported and environment configured for multi-repository evaluation")

üîß Environment Setup Complete!
Python version: 2.6.0+cu124
PyTorch CUDA available: True
CUDA device: Tesla T4
GPU memory: 15.8 GB
üìÇ CodeRepoQA base path: /kaggle/input/coderepoqa
‚úÖ Libraries imported and environment configured for multi-repository evaluation


In [53]:
def create_test_samples_full_dataset(json_files, max_files=None, min_comment_length=10):
    """
    Process ALL available data to create test samples
    Following paper methodology: use historical dialogue as input, 
    last maintainer response as ground truth
    """
    samples = []
    skipped_stats = {
        'no_comments': 0,
        'no_maintainer_responses': 0,
        'short_comments': 0,
        'processing_errors': 0
    }
    issues_processed = 0
    
    files_to_process = json_files[:max_files] if max_files else json_files
    
    print(f"üîÑ Processing {len(files_to_process):,} issues to create test samples...")
    
    for file_path in tqdm(files_to_process, desc="Creating test samples"):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            
            issues_processed += 1
            
            # Build conversation chronologically
            conversation = []
            
            # 1. Initial issue (always include)
            issue_title = data.get('title', '').strip()
            issue_body = data.get('body', '').strip()
            initial_content = f"Title: {issue_title}\n\nBody: {issue_body}"
            
            conversation.append({
                'speaker': 'user',
                'content': initial_content,
                'role': data.get('author_association', 'NONE'),
                'timestamp': data.get('created_at', '')
            })
            
            # 2. Get and process comments
            comments = data.get('comments_details', [])
            
            if not comments:
                skipped_stats['no_comments'] += 1
                continue
            
            # 3. Add all valid comments to conversation
            for comment in comments:
                body = comment.get('body', '').strip()
                role = comment.get('author_association', 'NONE')
                
                # Skip very short or empty comments
                if not body or len(body) < min_comment_length:
                    skipped_stats['short_comments'] += 1
                    continue
                
                conversation.append({
                    'speaker': 'maintainer' if role in ['MEMBER', 'CONTRIBUTOR', 'OWNER', 'COLLABORATOR'] 
                              else 'user',
                    'content': body,
                    'role': role,
                    'timestamp': comment.get('created_at', '')
                })
            
            # 4. Find all maintainer response positions
            maintainer_indices = [
                i for i, turn in enumerate(conversation) 
                if turn['speaker'] == 'maintainer'
            ]
            
            if not maintainer_indices:
                skipped_stats['no_maintainer_responses'] += 1
                continue
            
            # 5. Create one test sample per maintainer response
            for maintainer_idx in maintainer_indices:
                context_turns = conversation[:maintainer_idx]
                ground_truth = conversation[maintainer_idx]['content']
                
                # Skip if context is too short (need at least initial issue)
                if len(context_turns) < 1:
                    continue
                
                samples.append({
                    'issue_number': data.get('number', 'unknown'),
                    'context': context_turns,
                    'ground_truth': ground_truth,
                    'turn_number': maintainer_indices.index(maintainer_idx) + 1,
                    'total_maintainer_turns': len(maintainer_indices),
                    'total_conversation_turns': len(conversation),
                    'maintainer_role': conversation[maintainer_idx]['role'],
                    'context_char_length': sum(len(turn['content']) for turn in context_turns),
                    'ground_truth_length': len(ground_truth),
                    'file_path': file_path
                })
                
        except Exception as e:
            skipped_stats['processing_errors'] += 1
            if skipped_stats['processing_errors'] <= 5:  # Show first 5 errors
                print(f"\n‚ö†Ô∏è  Error processing {Path(file_path).name}: {str(e)[:100]}...")
            continue
    
    # Print comprehensive processing summary
    print(f"\n=== SAMPLE CREATION SUMMARY ===")
    print(f"üìÅ Issues processed: {issues_processed:,}")
    print(f"‚úÖ Test samples created: {len(samples):,}")
    print(f"üìä Average samples per valid issue: {len(samples)/(issues_processed-sum(skipped_stats.values())):.2f}")
    
    print(f"\nüö´ Skipped Issues Breakdown:")
    print(f"   No comments: {skipped_stats['no_comments']:,}")
    print(f"   No maintainer responses: {skipped_stats['no_maintainer_responses']:,}")
    print(f"   Short comments filtered: {skipped_stats['short_comments']:,}")
    print(f"   Processing errors: {skipped_stats['processing_errors']:,}")
    print(f"   Total skipped: {sum(skipped_stats.values()):,}")
    
    if len(samples) > 0:
        # Additional statistics
        context_lengths = [s['context_char_length'] for s in samples]
        gt_lengths = [s['ground_truth_length'] for s in samples]
        
        print(f"\nüìè Sample Characteristics:")
        print(f"   Context length - Mean: {np.mean(context_lengths):.0f}, Median: {np.median(context_lengths):.0f}")
        print(f"   Ground truth length - Mean: {np.mean(gt_lengths):.0f}, Median: {np.median(gt_lengths):.0f}")
        print(f"   Samples per issue range: 1-{max([s['total_maintainer_turns'] for s in samples])}")
    
    return samples, skipped_stats

print("‚úÖ Sample creation function loaded successfully!")
print("üìù This function will be used by the multi-repository evaluation")
print("üí° For multi-repository evaluation, samples are loaded per repository dynamically")

‚úÖ Sample creation function loaded successfully!
üìù This function will be used by the multi-repository evaluation
üí° For multi-repository evaluation, samples are loaded per repository dynamically


In [54]:
# Dataset Processing Status Check
print("üìä Multi-Repository Dataset Processing Status")
print("=" * 50)

# Check if repositories have been detected
if repositories:
    print(f"‚úÖ Repositories detected: {len(repositories)}")
    total_files = sum(repo_info['file_count'] for repo_info in repositories.values())
    print(f"‚úÖ Total JSON files found: {total_files:,}")
    
    # Show repository summary
    print(f"\nüìÅ Repository Summary:")
    for repo_name, repo_info in list(repositories.items())[:5]:  # Show first 5
        print(f"   ‚Ä¢ {repo_name}: {repo_info['file_count']} files")
    
    if len(repositories) > 5:
        print(f"   ... and {len(repositories) - 5} more repositories")
        
    print(f"\nüöÄ Ready for multi-repository evaluation!")
    print(f"üí° Use the multi-repository evaluation functions to process all repositories")
    print(f"   Examples:")
    print(f"   ‚Ä¢ run_complete_multi_repository_evaluation() for full evaluation")
    print(f"   ‚Ä¢ Individual repository processing using detected repository info")
    
else:
    print("‚ö†Ô∏è  No repositories detected yet.")
    print("   Please run the repository detection cell first.")
    print("   The multi-repository evaluation will load samples dynamically per repository.")

print(f"\nüí° Note: This notebook uses dynamic sample loading per repository")
print(f"   instead of loading all samples at once, which is more memory efficient")
print(f"   for multi-repository evaluation.")

üìä Multi-Repository Dataset Processing Status
‚ö†Ô∏è  No repositories detected yet.
   Please run the repository detection cell first.
   The multi-repository evaluation will load samples dynamically per repository.

üí° Note: This notebook uses dynamic sample loading per repository
   instead of loading all samples at once, which is more memory efficient
   for multi-repository evaluation.


In [55]:
# Import and setup evaluation metrics
from evaluate import load
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Download required NLTK data
try:
    nltk.download('punkt', quiet=True)
    print("‚úÖ NLTK data downloaded")
except:
    print("‚ö†Ô∏è NLTK download failed, but may already be available")

# Load ROUGE metric
rouge = load('rouge')
print("‚úÖ ROUGE metric loaded")

def calculate_all_metrics(prediction, ground_truth):
    """
    Calculate all metrics from the CodeRepoQA paper:
    - BLEU Score (with smoothing)
    - ROUGE-L (Longest Common Subsequence)
    - ROUGE-1 (Unigram overlap)
    - Edit Similarity (Normalized Levenshtein distance)
    """
    
    # Clean and validate inputs
    if not prediction or not ground_truth:
        return {'bleu': 0.0, 'rouge_l': 0.0, 'rouge_1': 0.0, 'edit_similarity': 0.0}
    
    prediction = str(prediction).strip()
    ground_truth = str(ground_truth).strip()
    
    if not prediction or not ground_truth:
        return {'bleu': 0.0, 'rouge_l': 0.0, 'rouge_1': 0.0, 'edit_similarity': 0.0}
    
    # 1. BLEU Score (with smoothing function like the paper)
    try:
        # Tokenize for BLEU calculation
        prediction_tokens = prediction.split()
        ground_truth_tokens = [ground_truth.split()]  # BLEU expects list of reference lists
        
        # Use smoothing function to handle edge cases
        smoothie = SmoothingFunction().method4
        bleu_score = sentence_bleu(ground_truth_tokens, prediction_tokens, smoothing_function=smoothie)
    except:
        bleu_score = 0.0
    
    # 2. ROUGE Scores (L and 1)
    try:
        rouge_scores = rouge.compute(predictions=[prediction], references=[ground_truth])
        rouge_l_score = rouge_scores['rougeL']
        rouge_1_score = rouge_scores['rouge1']
    except:
        rouge_l_score = 0.0
        rouge_1_score = 0.0
    
    # 3. Edit Similarity (Normalized Levenshtein Distance)
    try:
        edit_similarity = calculate_edit_similarity(prediction, ground_truth)
    except:
        edit_similarity = 0.0
    
    return {
        'bleu': float(bleu_score),
        'rouge_l': float(rouge_l_score),
        'rouge_1': float(rouge_1_score),
        'edit_similarity': float(edit_similarity)
    }

def calculate_edit_similarity(prediction, ground_truth):
    """
    Calculate normalized edit similarity (1 - normalized Levenshtein distance)
    Following the paper's implementation
    """
    def levenshtein_distance(s1, s2):
        """Calculate Levenshtein distance between two strings"""
        if len(s1) < len(s2):
            return levenshtein_distance(s2, s1)
        
        if len(s2) == 0:
            return len(s1)
        
        previous_row = list(range(len(s2) + 1))
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row
        
        return previous_row[-1]
    
    # Calculate Levenshtein distance
    distance = levenshtein_distance(prediction, ground_truth)
    
    # Normalize by maximum possible distance (length of longer string)
    max_len = max(len(prediction), len(ground_truth))
    if max_len == 0:
        return 1.0  # Both strings are empty
    
    # Convert distance to similarity (1 - normalized distance)
    normalized_distance = distance / max_len
    similarity = 1.0 - normalized_distance
    
    return max(0.0, similarity)  # Ensure non-negative

def format_conversation_context(context_turns):
    """Format conversation context for model input"""
    formatted_context = []
    
    for turn in context_turns:
        speaker = turn['speaker']
        content = turn['content']
        role = turn.get('role', 'USER')
        
        # Format each turn clearly
        if speaker == 'user':
            formatted_context.append(f"USER: {content}")
        else:
            formatted_context.append(f"MAINTAINER ({role}): {content}")
    
    return "\n\n".join(formatted_context)

print("‚úÖ All evaluation metrics implemented successfully!")
print("üìä Available metrics: BLEU, ROUGE-L, ROUGE-1, Edit Similarity")

‚úÖ NLTK data downloaded
‚úÖ ROUGE metric loaded
‚úÖ All evaluation metrics implemented successfully!
üìä Available metrics: BLEU, ROUGE-L, ROUGE-1, Edit Similarity


In [56]:
# Model configurations - matching paper's evaluated models
AVAILABLE_MODELS = {
    "deepseek-coder-1.3b": {
        "name": "deepseek-ai/deepseek-coder-6.7b-instruct",
        "paper_name": "DeepSeek-Coder-1.3B",
        "description": "Code-focused model, similar to paper's DSC-6.7B"
    },
    "codeqwen-7b": {
        "name": "Qwen/CodeQwen1.5-7B-Chat",
        "paper_name": "CodeQwen-7B", 
        "description": "Qwen's code model, similar to paper's CQ-7B"
    },
    "codellama-7b": {
        "name": "codellama/CodeLlama-7b-Instruct-hf",
        "paper_name": "CodeLlama-7B",
        "description": "Meta's code-specialized model"
    },
    "mistral-7b": {
        "name": "mistralai/Mistral-7B-Instruct-v0.2",
        "paper_name": "Mistral-7B",
        "description": "General purpose instruction-tuned model"
    }
}

def load_model_with_config(model_name, max_memory_gb=None):
    """
    Load model and tokenizer with optimized configuration for evaluation
    """
    print(f"üîÑ Loading {model_name}...")
    start_time = time.time()
    
    try:
        # Configure tokenizer
        tokenizer = AutoTokenizer.from_pretrained(
            model_name, 
            trust_remote_code=True,
            padding_side='left'
        )
        
        # Set pad token if not exists
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        
        # Configure model loading parameters
        model_kwargs = {
            "trust_remote_code": True,
            "low_cpu_mem_usage": True,
        }
        
        # Set appropriate dtype and device mapping
        if torch.cuda.is_available():
            model_kwargs.update({
                "torch_dtype": torch.float16,
                "device_map": "auto"
            })
            
            # Handle memory constraints if specified
            if max_memory_gb:
                max_memory = {0: f"{max_memory_gb}GB"}
                model_kwargs["max_memory"] = max_memory
        else:
            model_kwargs["torch_dtype"] = torch.float32
        
        # Load model
        model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)
        
        load_time = time.time() - start_time
        
        # Print model info
        print(f"‚úÖ Model loaded successfully in {load_time:.1f}s")
        print(f"   Device: {next(model.parameters()).device}")
        print(f"   dtype: {next(model.parameters()).dtype}")
        
        if torch.cuda.is_available():
            memory_allocated = torch.cuda.memory_allocated() / 1e9
            print(f"   GPU memory allocated: {memory_allocated:.1f} GB")
        
        return model, tokenizer
    
    except Exception as e:
        print(f"‚ùå Error loading {model_name}: {e}")
        return None, None

def clear_model_memory(model):
    """Clear model from memory to free up GPU resources"""
    if model is not None:
        del model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

# Display available models
print("ü§ñ Available Models for Evaluation:")
print("=" * 60)
for key, config in AVAILABLE_MODELS.items():
    print(f"üì¶ {key}:")
    print(f"   HuggingFace: {config['name']}")
    print(f"   Paper name: {config['paper_name']}")
    print(f"   Description: {config['description']}")
    print()

print("üí° To evaluate a specific model, use:")
print("   model, tokenizer = load_model_with_config(AVAILABLE_MODELS['deepseek-coder-6.7b']['name'])")
print("\n‚ö†Ô∏è  Note: Models require significant GPU memory. Load one at a time for evaluation.")

ü§ñ Available Models for Evaluation:
üì¶ deepseek-coder-1.3b:
   HuggingFace: deepseek-ai/deepseek-coder-6.7b-instruct
   Paper name: DeepSeek-Coder-1.3B
   Description: Code-focused model, similar to paper's DSC-6.7B

üì¶ codeqwen-7b:
   HuggingFace: Qwen/CodeQwen1.5-7B-Chat
   Paper name: CodeQwen-7B
   Description: Qwen's code model, similar to paper's CQ-7B

üì¶ codellama-7b:
   HuggingFace: codellama/CodeLlama-7b-Instruct-hf
   Paper name: CodeLlama-7B
   Description: Meta's code-specialized model

üì¶ mistral-7b:
   HuggingFace: mistralai/Mistral-7B-Instruct-v0.2
   Paper name: Mistral-7B
   Description: General purpose instruction-tuned model

üí° To evaluate a specific model, use:
   model, tokenizer = load_model_with_config(AVAILABLE_MODELS['deepseek-coder-6.7b']['name'])

‚ö†Ô∏è  Note: Models require significant GPU memory. Load one at a time for evaluation.


In [57]:
def evaluate_model_comprehensive(model, tokenizer, samples, model_name, 
                                max_samples=None, generation_config=None, 
                                save_results=True):
    """
    Comprehensive model evaluation on samples with detailed analysis
    """
    
    # Determine sample size
    if max_samples and max_samples < len(samples):
        # Use stratified sampling to get representative samples
        sample_indices = np.linspace(0, len(samples)-1, max_samples, dtype=int)
        eval_samples = [samples[i] for i in sample_indices]
        print(f"üìä Evaluating on {max_samples:,} stratified samples (from {len(samples):,} total)")
    else:
        eval_samples = samples
        print(f"üìä Evaluating on all {len(eval_samples):,} samples")
    
    results = []
    failed_generations = 0
    start_time = time.time()
    
    print(f"üöÄ Starting evaluation of {model_name}...")
    
    # Generate responses
    responses = batch_generate_responses(
        model, tokenizer, eval_samples, 
        batch_size=1, generation_config=generation_config
    )
    
    # Calculate metrics for each response
    print("üìè Calculating evaluation metrics...")
    for i, (sample, response) in enumerate(tqdm(zip(eval_samples, responses), 
                                                desc="Computing metrics", 
                                                total=len(eval_samples))):
        
        if not response:  # Handle failed generations
            failed_generations += 1
            metrics = {'bleu': 0.0, 'rouge_l': 0.0, 'rouge_1': 0.0, 'edit_similarity': 0.0}
        else:
            metrics = calculate_all_metrics(response, sample['ground_truth'])
        
        # Store comprehensive results
        result = {
            'model_name': model_name,
            'sample_idx': i,
            'issue_number': sample['issue_number'],
            'turn_number': sample['turn_number'],
            'total_turns': sample['total_conversation_turns'],
            'context_length': len(format_conversation_context(sample['context'])),
            'context_turns': len(sample['context']),
            'ground_truth_length': len(sample['ground_truth']),
            'response_length': len(response),
            'maintainer_role': sample['maintainer_role'],
            'prediction': response,
            'ground_truth': sample['ground_truth'],
            **metrics
        }
        
        results.append(result)
    
    # Create results DataFrame
    results_df = pd.DataFrame(results)
    
    # Calculate summary statistics
    eval_time = time.time() - start_time
    avg_metrics = results_df[['bleu', 'rouge_l', 'rouge_1', 'edit_similarity']].mean()
    
    # Print evaluation summary
    print(f"\n{'='*60}")
    print(f"üéØ EVALUATION COMPLETE: {model_name}")
    print(f"{'='*60}")
    print(f"‚è±Ô∏è  Total evaluation time: {eval_time/60:.1f} minutes")
    print(f"üìä Samples evaluated: {len(results_df):,}")
    print(f"‚ùå Failed generations: {failed_generations}")
    print(f"‚úÖ Success rate: {(len(results_df)-failed_generations)/len(results_df)*100:.1f}%")
    
    print(f"\nüìà Average Scores:")
    for metric, score in avg_metrics.items():
        print(f"   {metric.upper():15}: {score:.4f}")
    
    overall_avg = avg_metrics.mean()
    print(f"   {'OVERALL AVG':15}: {overall_avg:.4f}")
    
    # Save results if requested
    if save_results:
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        filename = f"evaluation_results_{model_name.replace('/', '_')}_{timestamp}.csv"
        results_df.to_csv(filename, index=False)
        print(f"üíæ Results saved to: {filename}")
    
    return results_df, avg_metrics

def compare_with_paper_results(results_df, model_name):
    """Compare evaluation results with paper benchmarks"""
    
    # Paper results from Table 3 (CodeRepoQA paper)
    paper_benchmarks = {
        'DeepSeek-Coder-1.3B': {'BLEU': 0.1110, 'Rouge-L': 0.1350, 'Rouge-1': 0.2215, 'Edit Similarity': 0.1689},
        'CodeQwen-7B': {'BLEU': 0.1188, 'Rouge-L': 0.1392, 'Rouge-1': 0.2264, 'Edit Similarity': 0.1803},
        'CodeLlama-7B': {'BLEU': 0.1035, 'Rouge-L': 0.1298, 'Rouge-1': 0.2134, 'Edit Similarity': 0.1598},
        'GPT-4': {'BLEU': 0.1179, 'Rouge-L': 0.1330, 'Rouge-1': 0.2315, 'Edit Similarity': 0.1715}
    }
    
    # Calculate our results
    our_results = {
        'BLEU': results_df['bleu'].mean(),
        'Rouge-L': results_df['rouge_l'].mean(),
        'Rouge-1': results_df['rouge_1'].mean(),
        'Edit Similarity': results_df['edit_similarity'].mean()
    }
    
    print(f"\nüìä COMPARISON WITH PAPER RESULTS")
    print(f"{'='*50}")
    
    # Find closest paper model for comparison
    paper_model = None
    if 'deepseek' in model_name.lower():
        paper_model = 'DeepSeek-Coder-1.3B'
    elif 'qwen' in model_name.lower():
        paper_model = 'CodeQwen-7B'
    elif 'codellama' in model_name.lower():
        paper_model = 'CodeLlama-7B'
    else:
        paper_model = 'GPT-4'  # Default comparison
    
    # Create comparison DataFrame
    comparison_data = {
        'Metric': list(our_results.keys()),
        f'Our Results ({model_name})': list(our_results.values()),
        f'Paper ({paper_model})': list(paper_benchmarks[paper_model].values())
    }
    
    comparison_df = pd.DataFrame(comparison_data)
    comparison_df['Difference'] = comparison_df[f'Our Results ({model_name})'] - comparison_df[f'Paper ({paper_model})']
    comparison_df['% Difference'] = (comparison_df['Difference'] / comparison_df[f'Paper ({paper_model})']) * 100
    
    print(comparison_df.round(4))
    
    print(f"\nüìù Notes:")
    print(f"   ‚Ä¢ Paper evaluated on 585,687 samples across multiple repositories")
    print(f"   ‚Ä¢ Our evaluation: {len(results_df):,} samples from AutoGPT repository only")
    print(f"   ‚Ä¢ Differences expected due to sample size and repository specificity")
    
    return comparison_df

# Evaluation configuration
EVALUATION_CONFIG = {
    'full_dataset': {
        'max_samples': None,  # Use all samples
        'generation_config': {'max_new_tokens': 512, 'temperature': 0.7}
    },
    'large_sample': {
        'max_samples': 1000,  # 1K samples for faster evaluation
        'generation_config': {'max_new_tokens': 512, 'temperature': 0.7}
    },
    'medium_sample': {
        'max_samples': 500,   # 500 samples
        'generation_config': {'max_new_tokens': 512, 'temperature': 0.7}
    },
    'small_sample': {
        'max_samples': 100,   # 100 samples for quick testing
        'generation_config': {'max_new_tokens': 256, 'temperature': 0.7}
    }
}

print("üîß Evaluation pipeline ready!")
print(f"üìã Available configurations: {list(EVALUATION_CONFIG.keys())}")
print("\nüí° Usage example:")
print("   # Load model")
print("   model, tokenizer = load_model_with_config('deepseek-ai/deepseek-coder-6.7b-instruct')")
print("   # Run evaluation")
print("   results_df, metrics = evaluate_model_comprehensive(")
print("       model, tokenizer, all_test_samples, 'deepseek-coder-6.7b',")
print("       **EVALUATION_CONFIG['medium_sample'])")
print("   # Compare with paper")
print("   comparison = compare_with_paper_results(results_df, 'deepseek-coder-6.7b')")

üîß Evaluation pipeline ready!
üìã Available configurations: ['full_dataset', 'large_sample', 'medium_sample', 'small_sample']

üí° Usage example:
   # Load model
   model, tokenizer = load_model_with_config('deepseek-ai/deepseek-coder-6.7b-instruct')
   # Run evaluation
   results_df, metrics = evaluate_model_comprehensive(
       model, tokenizer, all_test_samples, 'deepseek-coder-6.7b',
       **EVALUATION_CONFIG['medium_sample'])
   # Compare with paper
   comparison = compare_with_paper_results(results_df, 'deepseek-coder-6.7b')


In [35]:
def analyze_results_comprehensive(results_df, model_name):
    """Comprehensive analysis of evaluation results"""
    
    print(f"üîç COMPREHENSIVE RESULTS ANALYSIS: {model_name}")
    print(f"{'='*60}")
    
    # Basic statistics
    print(f"üìä Dataset Statistics:")
    print(f"   Total samples: {len(results_df):,}")
    print(f"   Unique issues: {results_df['issue_number'].nunique():,}")
    print(f"   Turn distribution: {results_df['turn_number'].value_counts().sort_index().to_dict()}")
    
    # Performance by metrics
    print(f"\nüìà Performance Metrics:")
    metrics = ['bleu', 'rouge_l', 'rouge_1', 'edit_similarity']
    for metric in metrics:
        mean_score = results_df[metric].mean()
        std_score = results_df[metric].std()
        median_score = results_df[metric].median()
        print(f"   {metric.upper():15}: Mean={mean_score:.4f}, Std={std_score:.4f}, Median={median_score:.4f}")
    
    # Performance by context length
    print(f"\nüìè Performance by Context Length:")
    results_df['context_length_bin'] = pd.qcut(
        results_df['context_length'], 
        q=5, 
        labels=['Very Short', 'Short', 'Medium', 'Long', 'Very Long']
    )
    
    length_analysis = results_df.groupby('context_length_bin')[metrics].mean()
    print(length_analysis.round(4))
    
    # Find best performing context length
    length_avg = length_analysis.mean(axis=1)
    best_length = length_avg.idxmax()
    print(f"\n‚ú® Best performing context length: {best_length} (avg score: {length_avg[best_length]:.4f})")
    
    # Performance by turn number
    print(f"\nüîÑ Performance by Turn Number:")
    turn_analysis = results_df.groupby('turn_number')[metrics].mean()
    print(turn_analysis.round(4))
    
    # Performance by maintainer role
    print(f"\nüë• Performance by Maintainer Role:")
    role_analysis = results_df.groupby('maintainer_role')[metrics].mean()
    print(role_analysis.round(4))
    
    # Identify best and worst predictions
    results_df['avg_score'] = results_df[metrics].mean(axis=1)
    best_idx = results_df['avg_score'].idxmax()
    worst_idx = results_df['avg_score'].idxmin()
    
    print(f"\nüèÜ Best Prediction (Score: {results_df.loc[best_idx, 'avg_score']:.4f}):")
    print(f"   Issue #{results_df.loc[best_idx, 'issue_number']}, Turn {results_df.loc[best_idx, 'turn_number']}")
    print(f"   Prediction: {results_df.loc[best_idx, 'prediction'][:200]}...")
    print(f"   Ground Truth: {results_df.loc[best_idx, 'ground_truth'][:200]}...")
    
    print(f"\nüîª Worst Prediction (Score: {results_df.loc[worst_idx, 'avg_score']:.4f}):")
    print(f"   Issue #{results_df.loc[worst_idx, 'issue_number']}, Turn {results_df.loc[worst_idx, 'turn_number']}")
    print(f"   Prediction: {results_df.loc[worst_idx, 'prediction'][:200]}...")
    print(f"   Ground Truth: {results_df.loc[worst_idx, 'ground_truth'][:200]}...")
    
    return {
        'length_analysis': length_analysis,
        'turn_analysis': turn_analysis,
        'role_analysis': role_analysis,
        'best_sample': results_df.loc[best_idx],
        'worst_sample': results_df.loc[worst_idx]
    }

def create_performance_visualizations(results_df, model_name):
    """Create visualizations for performance analysis"""
    
    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
        
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        fig.suptitle(f'Performance Analysis: {model_name}', fontsize=16)
        
        # 1. Metric distribution
        metrics = ['bleu', 'rouge_l', 'rouge_1', 'edit_similarity']
        results_df[metrics].boxplot(ax=axes[0,0])
        axes[0,0].set_title('Metric Distributions')
        axes[0,0].set_ylabel('Score')
        
        # 2. Performance by context length
        if 'context_length_bin' in results_df.columns:
            length_means = results_df.groupby('context_length_bin')[metrics].mean()
            length_means.plot(kind='bar', ax=axes[0,1])
            axes[0,1].set_title('Performance by Context Length')
            axes[0,1].set_ylabel('Average Score')
            axes[0,1].tick_params(axis='x', rotation=45)
        
        # 3. Score correlation heatmap
        correlation_matrix = results_df[metrics].corr()
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=axes[1,0])
        axes[1,0].set_title('Metric Correlations')
        
        # 4. Performance by turn number
        turn_means = results_df.groupby('turn_number')[metrics].mean()
        turn_means.plot(ax=axes[1,1])
        axes[1,1].set_title('Performance by Turn Number')
        axes[1,1].set_xlabel('Turn Number')
        axes[1,1].set_ylabel('Average Score')
        
        plt.tight_layout()
        plt.show()
        
        return fig
    
    except ImportError:
        print("üìä Matplotlib/Seaborn not available for visualizations")
        return None

def generate_evaluation_report(results_df, model_name, analysis_results, comparison_df=None):
    """Generate a comprehensive evaluation report"""
    
    report = f"""
# CodeRepoQA Evaluation Report
## Model: {model_name}
### Generated: {time.strftime('%Y-%m-%d %H:%M:%S')}

---

## Executive Summary

**Model Performance:**
- Total samples evaluated: {len(results_df):,}
- Average BLEU score: {results_df['bleu'].mean():.4f}
- Average ROUGE-L score: {results_df['rouge_l'].mean():.4f}
- Average ROUGE-1 score: {results_df['rouge_1'].mean():.4f}
- Average Edit Similarity: {results_df['edit_similarity'].mean():.4f}
- **Overall Average Score: {results_df[['bleu', 'rouge_l', 'rouge_1', 'edit_similarity']].mean().mean():.4f}**

## Key Findings

### Performance by Context Length
{analysis_results['length_analysis'].round(4).to_string()}

**Best performing context length:** {analysis_results['length_analysis'].mean(axis=1).idxmax()}

### Performance by Turn Number
{analysis_results['turn_analysis'].round(4).to_string()}

### Performance by Maintainer Role
{analysis_results['role_analysis'].round(4).to_string()}

## Sample Analysis

### Best Performing Sample
- **Issue:** #{analysis_results['best_sample']['issue_number']}
- **Turn:** {analysis_results['best_sample']['turn_number']}
- **Score:** {analysis_results['best_sample'][['bleu', 'rouge_l', 'rouge_1', 'edit_similarity']].mean():.4f}
- **Context Length:** {analysis_results['best_sample']['context_length']} chars

### Worst Performing Sample
- **Issue:** #{analysis_results['worst_sample']['issue_number']}
- **Turn:** {analysis_results['worst_sample']['turn_number']}
- **Score:** {analysis_results['worst_sample'][['bleu', 'rouge_l', 'rouge_1', 'edit_similarity']].mean():.4f}
- **Context Length:** {analysis_results['worst_sample']['context_length']} chars

---

*Report generated by CodeRepoQA Evaluation Pipeline*
"""
    
    # Save report
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    report_filename = f"evaluation_report_{model_name.replace('/', '_')}_{timestamp}.md"
    
    with open(report_filename, 'w', encoding='utf-8') as f:
        f.write(report)
    
    print(f"üìÑ Evaluation report saved to: {report_filename}")
    return report_filename

print("üìä Results analysis functions ready!")
print("üí° Usage:")
print("   analysis = analyze_results_comprehensive(results_df, model_name)")
print("   fig = create_performance_visualizations(results_df, model_name)")
print("   report = generate_evaluation_report(results_df, model_name, analysis)")

üìä Results analysis functions ready!
üí° Usage:
   analysis = analyze_results_comprehensive(results_df, model_name)
   fig = create_performance_visualizations(results_df, model_name)
   report = generate_evaluation_report(results_df, model_name, analysis)


In [58]:
def analyze_context_length_performance(results_df, model_name):
    """
    Detailed analysis of performance by context length
    Validates paper's claim: "Medium-length contexts are more conducive to LLMs' performance"
    """
    
    print(f"üìè CONTEXT LENGTH ANALYSIS: {model_name}")
    print(f"{'='*60}")
    
    # Create detailed length bins
    context_lengths = results_df['context_length']
    
    # Create multiple binning strategies
    binning_strategies = {
        'quintiles': pd.qcut(context_lengths, q=5, labels=['Very Short', 'Short', 'Medium', 'Long', 'Very Long']),
        'fixed_ranges': pd.cut(context_lengths, 
                              bins=[0, 500, 1500, 3000, 5000, float('inf')],
                              labels=['<500', '500-1500', '1500-3000', '3000-5000', '5000+']),
        'paper_inspired': pd.cut(context_lengths,
                               bins=[0, 1000, 2500, 4000, float('inf')],
                               labels=['Short', 'Medium-Short', 'Medium-Long', 'Long'])
    }
    
    metrics = ['bleu', 'rouge_l', 'rouge_1', 'edit_similarity']
    
    for strategy_name, bins in binning_strategies.items():
        print(f"\nüìä {strategy_name.upper()} Analysis:")
        
        # Add bins to dataframe
        temp_df = results_df.copy()
        temp_df['length_bin'] = bins
        
        # Calculate performance by bin
        bin_analysis = temp_df.groupby('length_bin')[metrics].agg(['mean', 'std', 'count'])
        
        # Show results
        print(f"\nPerformance by {strategy_name}:")
        for metric in metrics:
            print(f"\n{metric.upper()}:")
            for bin_name in bin_analysis.index:
                mean_score = bin_analysis.loc[bin_name, (metric, 'mean')]
                std_score = bin_analysis.loc[bin_name, (metric, 'std')]
                count = bin_analysis.loc[bin_name, (metric, 'count')]
                print(f"  {bin_name:12}: {mean_score:.4f} ¬± {std_score:.4f} (n={count})")
        
        # Find best performing bin for this strategy
        overall_performance = temp_df.groupby('length_bin')[metrics].mean()
        avg_performance = overall_performance.mean(axis=1)
        best_bin = avg_performance.idxmax()
        best_score = avg_performance[best_bin]
        
        print(f"\n‚ú® Best performing bin ({strategy_name}): {best_bin} (avg: {best_score:.4f})")
        
        # Statistical significance testing (if scipy available)
        try:
            from scipy import stats
            
            # Compare best bin with others
            best_bin_data = temp_df[temp_df['length_bin'] == best_bin][metrics].mean(axis=1)
            other_bins_data = temp_df[temp_df['length_bin'] != best_bin][metrics].mean(axis=1)
            
            t_stat, p_value = stats.ttest_ind(best_bin_data, other_bins_data)
            significance = "***" if p_value < 0.001 else "**" if p_value < 0.01 else "*" if p_value < 0.05 else "ns"
            
            print(f"üìà Statistical significance: t={t_stat:.3f}, p={p_value:.4f} {significance}")
            
        except ImportError:
            print("üìä Statistical testing requires scipy (not available)")
    
    # Paper claim verification
    print(f"\nüîç PAPER CLAIM VERIFICATION")
    print(f"{'='*40}")
    print(f"Paper claim: 'Medium-length contexts are more conducive to LLMs' performance'")
    
    # Use paper-inspired binning for verification
    temp_df = results_df.copy()
    temp_df['length_bin'] = binning_strategies['paper_inspired']
    avg_performance = temp_df.groupby('length_bin')[metrics].mean().mean(axis=1)
    
    print(f"\nOur findings:")
    for bin_name, score in avg_performance.items():
        marker = "üëë" if score == avg_performance.max() else "  "
        print(f"{marker} {bin_name:12}: {score:.4f}")
    
    best_category = avg_performance.idxmax()
    is_medium = 'medium' in best_category.lower()
    
    print(f"\nüéØ Conclusion: ", end="")
    if is_medium:
        print(f"‚úÖ CONFIRMED - '{best_category}' contexts perform best")
    else:
        print(f"‚ùì PARTIALLY CONFIRMED - '{best_category}' contexts perform best (not medium)")
    
    return {
        'binning_strategies': binning_strategies,
        'best_performing_bins': {name: temp_df.groupby('length_bin')[metrics].mean().mean(axis=1).idxmax() 
                               for name, bins in binning_strategies.items()},
        'paper_claim_verified': is_medium
    }

def detailed_context_analysis(results_df):
    """Additional detailed context analysis"""
    
    print(f"\nüîé DETAILED CONTEXT CHARACTERISTICS")
    print(f"{'='*50}")
    
    # Context length statistics
    context_stats = results_df['context_length'].describe()
    print(f"üìä Context Length Statistics:")
    for stat, value in context_stats.items():
        print(f"   {stat:8}: {value:8.1f}")
    
    # Correlation between context length and performance
    metrics = ['bleu', 'rouge_l', 'rouge_1', 'edit_similarity']
    
    print(f"\nüîó Correlation between Context Length and Performance:")
    for metric in metrics:
        correlation = results_df['context_length'].corr(results_df[metric])
        direction = "positive" if correlation > 0 else "negative"
        strength = "strong" if abs(correlation) > 0.5 else "moderate" if abs(correlation) > 0.3 else "weak"
        print(f"   {metric.upper():15}: {correlation:6.3f} ({strength} {direction})")
    
    # Context turns vs performance
    print(f"\nüîÑ Context Turns vs Performance:")
    turn_performance = results_df.groupby('context_turns')[metrics].mean()
    print(turn_performance.round(4))
    
    return {
        'context_stats': context_stats,
        'length_correlations': {metric: results_df['context_length'].corr(results_df[metric]) for metric in metrics},
        'turn_performance': turn_performance
    }

print("üìè Context length analysis functions ready!")
print("üí° Usage:")
print("   context_analysis = analyze_context_length_performance(results_df, model_name)")
print("   detailed_analysis = detailed_context_analysis(results_df)")

üìè Context length analysis functions ready!
üí° Usage:
   context_analysis = analyze_context_length_performance(results_df, model_name)
   detailed_analysis = detailed_context_analysis(results_df)


In [59]:
# COMPREHENSIVE FULL DATASET EVALUATION
# This section runs evaluation on ALL available data

def run_full_evaluation_suite():
    """
    Run comprehensive evaluation on the complete dataset
    """
    
    if 'all_test_samples' not in locals() or len(all_test_samples) == 0:
        print("‚ùå No test samples available! Run the data processing section first.")
        return
    
    print("üöÄ STARTING COMPREHENSIVE FULL DATASET EVALUATION")
    print("=" * 60)
    print(f"üìä Total samples to evaluate: {len(all_test_samples):,}")
    print(f"üìÅ Source files: {len(json_files):,} JSON files")
    print(f"‚è±Ô∏è  Estimated time: {len(all_test_samples) * 0.5 / 60:.1f} minutes per model")
    
    # Results storage
    all_model_results = {}
    evaluation_summary = []
    
    # Models to evaluate (modify based on available resources)
    models_to_evaluate = [
        "deepseek-coder-6.7b",  # Primary model from paper
        # "codeqwen-7b",        # Uncomment if you have sufficient GPU memory
        # "codellama-7b",       # Uncomment if you have sufficient GPU memory
        # "mistral-7b"          # Uncomment if you have sufficient GPU memory
    ]
    
    print(f"ü§ñ Models to evaluate: {models_to_evaluate}")
    print(f"\n‚ö†Ô∏è  Note: Evaluating one model at a time to manage memory")
    
    for model_key in models_to_evaluate:
        print(f"\n{'='*60}")
        print(f"üîÑ EVALUATING MODEL: {model_key}")
        print(f"{'='*60}")
        
        model_config = AVAILABLE_MODELS[model_key]
        model_name = model_config["name"]
        
        try:
            # Load model
            print(f"üì• Loading {model_name}...")
            model, tokenizer = load_model_with_config(model_name)
            
            if model is None:
                print(f"‚ùå Failed to load {model_key}, skipping...")
                continue
            
            # Run evaluation with different sample sizes
            evaluation_configs = {
                'full_dataset': {'max_samples': None, 'description': 'Complete dataset'},
                # 'large_sample': {'max_samples': 2000, 'description': '2K samples'},
                # 'medium_sample': {'max_samples': 1000, 'description': '1K samples'}
            }
            
            for config_name, config in evaluation_configs.items():
                print(f"\nüéØ Running {config['description']} evaluation...")
                
                # Run evaluation
                results_df, avg_metrics = evaluate_model_comprehensive(
                    model, tokenizer, all_test_samples, 
                    model_key,
                    max_samples=config['max_samples'],
                    generation_config=EVALUATION_CONFIG['full_dataset']['generation_config'],
                    save_results=True
                )
                
                # Store results
                result_key = f"{model_key}_{config_name}"
                all_model_results[result_key] = {
                    'results_df': results_df,
                    'avg_metrics': avg_metrics,
                    'model_name': model_key,
                    'config_name': config_name,
                    'sample_size': len(results_df)
                }
                
                # Add to summary
                evaluation_summary.append({
                    'model': model_key,
                    'config': config_name,
                    'samples': len(results_df),
                    'bleu': avg_metrics['bleu'],
                    'rouge_l': avg_metrics['rouge_l'],
                    'rouge_1': avg_metrics['rouge_1'],
                    'edit_similarity': avg_metrics['edit_similarity'],
                    'overall_avg': avg_metrics.mean()
                })
                
                # Run detailed analysis
                print(f"\nüìä Running detailed analysis...")
                analysis_results = analyze_results_comprehensive(results_df, model_key)
                
                # Context length analysis
                context_analysis = analyze_context_length_performance(results_df, model_key)
                
                # Compare with paper
                comparison_df = compare_with_paper_results(results_df, model_key)
                
                # Generate report
                report_file = generate_evaluation_report(
                    results_df, model_key, analysis_results, comparison_df
                )
                
                print(f"‚úÖ {config['description']} evaluation complete!")
                
                # Memory cleanup between configurations
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
            
            # Clear model from memory before loading next one
            print(f"üßπ Clearing {model_key} from memory...")
            clear_model_memory(model)
            
        except Exception as e:
            print(f"‚ùå Error evaluating {model_key}: {e}")
            continue
    
    # Final summary
    if evaluation_summary:
        print(f"\n{'='*80}")
        print("üéâ FULL EVALUATION COMPLETE - FINAL SUMMARY")
        print(f"{'='*80}")
        
        summary_df = pd.DataFrame(evaluation_summary)
        print(summary_df.round(4))
        
        # Save final summary
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        summary_filename = f"full_evaluation_summary_{timestamp}.csv"
        summary_df.to_csv(summary_filename, index=False)
        print(f"üíæ Final summary saved to: {summary_filename}")
        
        # Best performing model
        best_model = summary_df.loc[summary_df['overall_avg'].idxmax()]
        print(f"\nüèÜ Best Performing Model:")
        print(f"   Model: {best_model['model']}")
        print(f"   Configuration: {best_model['config']}")
        print(f"   Overall Score: {best_model['overall_avg']:.4f}")
        print(f"   Sample Size: {best_model['samples']:,}")
        
    else:
        print("‚ùå No evaluations completed successfully")
    
    return all_model_results, evaluation_summary

# EXECUTION CELL - RUN FULL EVALUATION
print("üî• READY TO RUN FULL DATASET EVALUATION")
print("=" * 50)
print("‚ö° This will evaluate models on the complete dataset!")
print(f"üìä Sample count: {len(all_test_samples) if 'all_test_samples' in locals() else 'Not loaded'}")
print("‚è±Ô∏è  Estimated time: Several hours for complete evaluation")
print("üíæ Results will be saved automatically")
print()
print("üöÄ To start evaluation, run:")
print("   all_results, eval_summary = run_full_evaluation_suite()")
print()
print("‚ö†Ô∏è  Make sure you have:")
print("   ‚Ä¢ Sufficient GPU memory (8GB+ recommended)")
print("   ‚Ä¢ Stable internet connection for model downloads") 
print("   ‚Ä¢ Enough disk space for results (1GB+)")

# Uncomment the line below to start evaluation automatically
# all_results, eval_summary = run_full_evaluation_suite()

üî• READY TO RUN FULL DATASET EVALUATION
‚ö° This will evaluate models on the complete dataset!
üìä Sample count: Not loaded
‚è±Ô∏è  Estimated time: Several hours for complete evaluation
üíæ Results will be saved automatically

üöÄ To start evaluation, run:
   all_results, eval_summary = run_full_evaluation_suite()

‚ö†Ô∏è  Make sure you have:
   ‚Ä¢ Sufficient GPU memory (8GB+ recommended)
   ‚Ä¢ Stable internet connection for model downloads
   ‚Ä¢ Enough disk space for results (1GB+)


In [60]:
# FAST Multi-Repository Detection for CodeRepoQA Structure
def detect_repositories_fast():
    """
    Fast detection based on the actual structure:
    RepoName/cloudide/workspace/QA_data/RepoName/*.json
    """
    print("üöÄ FAST Multi-Repository Detection for CodeRepoQA")
    print("=" * 60)
    
    base_path = "/kaggle/input/coderepoqa"
    
    if not os.path.exists(base_path):
        print(f"‚ùå Base path not found: {base_path}")
        return {}
    
    repositories = {}
    
    # Get all repository directories
    repo_dirs = [d for d in os.listdir(base_path) 
                 if os.path.isdir(os.path.join(base_path, d))]
    
    print(f"üìÇ Found {len(repo_dirs)} potential repositories")
    
    for repo_name in sorted(repo_dirs):
        repo_path = os.path.join(base_path, repo_name)
        
        # Check for the expected structure: cloudide/workspace/QA_data/RepoName/
        qa_data_path = os.path.join(repo_path, "cloudide", "workspace", "QA_data")
        
        if os.path.exists(qa_data_path):
            # Look for the nested folder named after the repository
            nested_repo_path = os.path.join(qa_data_path, repo_name)
            
            if os.path.exists(nested_repo_path):
                # Count JSON files in the nested repository folder
                json_files = []
                try:
                    for file in os.listdir(nested_repo_path):
                        if file.endswith('.json'):
                            json_files.append(os.path.join(nested_repo_path, file))
                    
                    if json_files:
                        repositories[repo_name] = {
                            'path': repo_path,
                            'qa_data_path': qa_data_path,
                            'nested_repo_path': nested_repo_path,
                            'json_files': json_files,
                            'file_count': len(json_files)
                        }
                        print(f"‚úÖ {repo_name}: {len(json_files)} JSON files")
                    else:
                        print(f"‚ö†Ô∏è  {repo_name}: Nested folder exists but no JSON files found")
                except Exception as e:
                    print(f"‚ùå {repo_name}: Error reading nested folder - {e}")
            else:
                # Fallback: check if JSON files are directly in QA_data
                json_files = []
                try:
                    for file in os.listdir(qa_data_path):
                        if file.endswith('.json'):
                            json_files.append(os.path.join(qa_data_path, file))
                    
                    if json_files:
                        repositories[repo_name] = {
                            'path': repo_path,
                            'qa_data_path': qa_data_path,
                            'nested_repo_path': qa_data_path,  # Same as qa_data_path in this case
                            'json_files': json_files,
                            'file_count': len(json_files)
                        }
                        print(f"‚úÖ {repo_name}: {len(json_files)} JSON files (direct in QA_data)")
                    else:
                        print(f"‚ùå {repo_name}: No nested folder '{repo_name}' found in QA_data")
                except Exception as e:
                    print(f"‚ùå {repo_name}: Error reading QA_data - {e}")
        else:
            print(f"‚ùå {repo_name}: No QA_data structure found")
    
    print(f"\nüéØ DETECTION COMPLETE")
    print(f"üìä Successfully detected: {len(repositories)} repositories")
    print(f"üìÅ Total JSON files: {sum(repo['file_count'] for repo in repositories.values())}")
    
    return repositories

# Test the fast detection
print("Testing fast repository detection...")
detected_repos = detect_repositories_fast()

if detected_repos:
    print(f"\nüéâ SUCCESS! Found {len(detected_repos)} repositories:")
    for repo_name, info in list(detected_repos.items())[:5]:  # Show first 5
        print(f"   üìÅ {repo_name}: {info['file_count']} files")
    if len(detected_repos) > 5:
        print(f"   ... and {len(detected_repos) - 5} more repositories")
else:
    print("\n‚ùå No repositories detected")

Testing fast repository detection...
üöÄ FAST Multi-Repository Detection for CodeRepoQA
üìÇ Found 28 potential repositories
‚úÖ AutoGPT: 2229 JSON files
‚úÖ Pillow: 2976 JSON files
‚úÖ PyMySQL: 660 JSON files
‚úÖ TypeScript: 33607 JSON files
‚úÖ angular: 25902 JSON files
‚úÖ ansible: 31399 JSON files
‚úÖ core: 50540 JSON files
‚úÖ dubbo: 6934 JSON files
‚úÖ fastapi: 3415 JSON files
‚úÖ guava: 3342 JSON files
‚úÖ kubernetes: 44567 JSON files
‚úÖ moby: 21607 JSON files
‚úÖ nest: 5254 JSON files
‚úÖ nltk: 1775 JSON files
‚úÖ node: 17004 JSON files
‚úÖ numpy: 12076 JSON files
‚úÖ pandas: 25055 JSON files
‚úÖ plotly.py: 2829 JSON files
‚úÖ py-tree-sitter: 155 JSON files
‚úÖ pytorch: 42408 JSON files
‚úÖ rich: 1287 JSON files
‚úÖ scipy: 9775 JSON files
‚úÖ spring-framework: 24516 JSON files
‚úÖ terraform: 20090 JSON files
‚úÖ transformers: 15052 JSON files
‚úÖ typeorm: 7828 JSON files
‚úÖ vscode: 148293 JSON files
‚úÖ vue: 9744 JSON files

üéØ DETECTION COMPLETE
üìä Successfully detected

In [61]:
def run_complete_multi_repository_evaluation(
    repositories, 
    models_to_test=None, 
    samples_per_repo=50,
    max_repos=None,
    save_results=True
):
    """
    Complete multi-repository evaluation exactly like the original CodeRepoQA paper.
    
    Args:
        repositories: Dictionary from detect_repositories_fast()
        models_to_test: List of model names to test (default: all 4 models)
        samples_per_repo: Number of samples to evaluate per repository
        max_repos: Maximum number of repositories to test (for quick testing)
        save_results: Whether to save detailed results to files
    """
    
    if models_to_test is None:
        models_to_test = [
            "deepseek-ai/deepseek-coder-1.3b-instruct",
            "Qwen/CodeQwen1.5-7B-Chat", 
            "codellama/CodeLlama-7b-Instruct-hf",
            "mistralai/Mistral-7B-Instruct-v0.1"
        ]
    
    print("üöÄ COMPLETE MULTI-REPOSITORY EVALUATION")
    print("=" * 70)
    print(f"üìä Repositories: {len(repositories)}")
    print(f"ü§ñ Models: {len(models_to_test)}")
    print(f"üìù Samples per repo: {samples_per_repo}")
    print(f"üéØ Total evaluations: {len(repositories) * len(models_to_test) * samples_per_repo}")
    print("=" * 70)
    
    # Repository list (limit if specified)
    repo_list = list(repositories.keys())
    if max_repos:
        repo_list = repo_list[:max_repos]
        print(f"‚ö° Quick test mode: Testing only first {max_repos} repositories")
    
    # Results storage
    all_results = {}
    repository_summaries = {}
    
    # Progress tracking
    total_combinations = len(repo_list) * len(models_to_test)
    current_combination = 0
    
    for repo_name in repo_list:
        print(f"\nüîç REPOSITORY: {repo_name}")
        print(f"üìÅ Files: {repositories[repo_name]['file_count']} JSON files")
        print("-" * 50)
        
        # Load repository data
        try:
            repo_samples = load_repository_samples(repo_name, repositories[repo_name], limit=samples_per_repo)
            if not repo_samples:
                print(f"‚ùå No samples found for {repo_name}")
                continue
                
            print(f"‚úÖ Loaded {len(repo_samples)} samples from {repo_name}")
        except Exception as e:
            print(f"‚ùå Error loading {repo_name}: {str(e)}")
            continue
        
        repository_results = {}
        
        for model_name in models_to_test:
            current_combination += 1
            model_short = model_name.split('/')[-1]
            
            print(f"\nü§ñ Model {current_combination}/{total_combinations}: {model_short}")
            print(f"üìä Repository: {repo_name}")
            
            try:
                # Load model and tokenizer
                model, tokenizer = load_model_with_config(model_name)
                
                # Run evaluation using the existing function - FIXED PARAMETER NAME
                results_df, avg_metrics = evaluate_model_comprehensive(
                    model=model,
                    tokenizer=tokenizer, 
                    samples=repo_samples,
                    model_name=model_short,
                    max_samples=samples_per_repo,  # FIXED: was num_samples, should be max_samples
                    generation_config={'max_new_tokens': 512, 'temperature': 0.7},
                    save_results=False  # We'll save at the end
                )
                
                # Store results
                repository_results[model_name] = {
                    'results_df': results_df,
                    'aggregate_scores': {
                        'bleu': avg_metrics['bleu'],
                        'rouge_l': avg_metrics['rouge_l'],
                        'rouge_1': avg_metrics['rouge_1'],
                        'edit_similarity': avg_metrics['edit_similarity']
                    },
                    'sample_count': len(results_df),
                    'success_rate': (len(results_df) - (results_df[['bleu', 'rouge_l', 'rouge_1', 'edit_similarity']] == 0).all(axis=1).sum()) / len(results_df)
                }
                
                # Print summary
                print(f"   ‚úÖ BLEU: {avg_metrics['bleu']:.3f}")
                print(f"   ‚úÖ ROUGE-L: {avg_metrics['rouge_l']:.3f}")
                print(f"   ‚úÖ Edit Sim: {avg_metrics['edit_similarity']:.3f}")
                
                # Clean up GPU memory
                del model, tokenizer
                torch.cuda.empty_cache() if torch.cuda.is_available() else None
                    
            except Exception as e:
                print(f"   ‚ùå Error: {str(e)}")
                repository_results[model_name] = None
        
        all_results[repo_name] = repository_results
        
        # Calculate repository summary
        repo_summary = calculate_repository_summary(repository_results)
        repository_summaries[repo_name] = repo_summary
        
        print(f"\nüìà REPOSITORY SUMMARY: {repo_name}")
        if repo_summary:
            print(f"   üèÜ Best BLEU: {repo_summary['best_bleu']['model']} ({repo_summary['best_bleu']['score']:.3f})")
            print(f"   üèÜ Best ROUGE-L: {repo_summary['best_rouge_l']['model']} ({repo_summary['best_rouge_l']['score']:.3f})")
            print(f"   üìä Avg BLEU: {repo_summary['avg_bleu']:.3f}")
            print(f"   üìä Avg ROUGE-L: {repo_summary['avg_rouge_l']:.3f}")
    
    print("\n" + "=" * 70)
    print("üéØ COMPLETE MULTI-REPOSITORY EVALUATION FINISHED")
    print("=" * 70)
    
    # Generate cross-repository analysis
    cross_repo_analysis = generate_cross_repository_analysis(all_results)
    
    # Generate final comprehensive report
    final_report = generate_comprehensive_report(all_results, repository_summaries, cross_repo_analysis)
    
    if save_results:
        # Save results to files
        import json
        import pandas as pd
        from datetime import datetime
        
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        
        # Save raw results (convert DataFrames to dicts for JSON serialization)
        serializable_results = {}
        for repo_name, repo_results in all_results.items():
            serializable_results[repo_name] = {}
            for model_name, results in repo_results.items():
                if results:
                    serializable_results[repo_name][model_name] = {
                        'aggregate_scores': results['aggregate_scores'],
                        'sample_count': results['sample_count'],
                        'success_rate': results['success_rate']
                        # Note: results_df not included to avoid size issues
                    }
                else:
                    serializable_results[repo_name][model_name] = None
        
        with open(f'multi_repo_results_{timestamp}.json', 'w') as f:
            json.dump(serializable_results, f, indent=2, default=str)
        
        # Save summary as CSV
        summary_df = pd.DataFrame(repository_summaries).T
        summary_df.to_csv(f'repository_summary_{timestamp}.csv')
        
        print(f"\nüíæ Results saved:")
        print(f"   üìÑ Raw results: multi_repo_results_{timestamp}.json")
        print(f"   üìä Summary: repository_summary_{timestamp}.csv")
    
    return {
        'all_results': all_results,
        'repository_summaries': repository_summaries,
        'cross_repository_analysis': cross_repo_analysis,
        'final_report': final_report
    }

def load_repository_samples(repo_name, repo_info, limit=None):
    """Load samples from a specific repository using the detected repository info."""
    
    qa_data_path = repo_info['qa_data_path']
    json_files = repo_info['json_files']
    
    print(f"   üìÇ Loading from: {qa_data_path}")
    print(f"   üìÑ Processing {len(json_files)} JSON files")
    
    samples = []
    
    # Load samples from JSON files
    for json_file in json_files:
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
                
            # Handle different JSON structures
            if isinstance(data, list):
                samples.extend(data)
            elif isinstance(data, dict):
                samples.append(data)
                
        except Exception as e:
            print(f"   ‚ö†Ô∏è Error loading {json_file}: {str(e)}")
            continue
    
    if not samples:
        print(f"   ‚ùå No samples found for repository: {repo_name}")
        return []
    
    print(f"   ‚úÖ Loaded {len(samples)} total samples")
    
    # Limit samples if requested
    if limit and len(samples) > limit:
        # Use stratified sampling for better representation
        indices = np.linspace(0, len(samples)-1, limit, dtype=int)
        samples = [samples[i] for i in indices]
        print(f"   üéØ Selected {len(samples)} samples (stratified sampling)")
    
    return samples

def calculate_repository_summary(repository_results):
    """Calculate summary statistics for a single repository across all models."""
    if not repository_results:
        return None
    
    summary = {
        'models_tested': 0,
        'successful_models': 0,
        'avg_bleu': 0,
        'avg_rouge_l': 0,
        'avg_rouge_1': 0,
        'avg_edit_similarity': 0,
        'best_bleu': {'model': '', 'score': 0},
        'best_rouge_l': {'model': '', 'score': 0},
        'model_rankings': {}
    }
    
    valid_results = []
    for model_name, results in repository_results.items():
        summary['models_tested'] += 1
        
        if results and 'aggregate_scores' in results:
            summary['successful_models'] += 1
            scores = results['aggregate_scores']
            valid_results.append({
                'model': model_name.split('/')[-1],
                'bleu': scores.get('bleu', 0),
                'rouge_l': scores.get('rouge_l', 0),
                'rouge_1': scores.get('rouge_1', 0),
                'edit_similarity': scores.get('edit_similarity', 0)
            })
    
    if valid_results:
        # Calculate averages
        summary['avg_bleu'] = sum(r['bleu'] for r in valid_results) / len(valid_results)
        summary['avg_rouge_l'] = sum(r['rouge_l'] for r in valid_results) / len(valid_results)
        summary['avg_rouge_1'] = sum(r['rouge_1'] for r in valid_results) / len(valid_results)
        summary['avg_edit_similarity'] = sum(r['edit_similarity'] for r in valid_results) / len(valid_results)
        
        # Find best performers
        best_bleu = max(valid_results, key=lambda x: x['bleu'])
        summary['best_bleu'] = {'model': best_bleu['model'], 'score': best_bleu['bleu']}
        
        best_rouge = max(valid_results, key=lambda x: x['rouge_l'])
        summary['best_rouge_l'] = {'model': best_rouge['model'], 'score': best_rouge['rouge_l']}
        
        # Model rankings
        for result in valid_results:
            summary['model_rankings'][result['model']] = {
                'bleu': result['bleu'],
                'rouge_l': result['rouge_l'],
                'rouge_1': result['rouge_1'],
                'edit_similarity': result['edit_similarity']
            }
    
    return summary

def generate_cross_repository_analysis(all_results):
    """Generate cross-repository analysis to understand model consistency."""
    
    analysis = {
        'model_consistency': {},
        'repository_difficulty': {},
        'overall_rankings': {},
        'statistical_significance': {}
    }
    
    # Get all model names
    model_names = set()
    for repo_results in all_results.values():
        model_names.update(repo_results.keys())
    
    # Analyze each model's consistency across repositories
    for model_name in model_names:
        model_short = model_name.split('/')[-1]
        model_scores = {'bleu': [], 'rouge_l': [], 'rouge_1': [], 'edit_similarity': []}
        successful_repos = 0
        
        for repo_name, repo_results in all_results.items():
            if model_name in repo_results and repo_results[model_name]:
                if 'aggregate_scores' in repo_results[model_name]:
                    scores = repo_results[model_name]['aggregate_scores']
                    model_scores['bleu'].append(scores.get('bleu', 0))
                    model_scores['rouge_l'].append(scores.get('rouge_l', 0))
                    model_scores['rouge_1'].append(scores.get('rouge_1', 0))
                    model_scores['edit_similarity'].append(scores.get('edit_similarity', 0))
                    successful_repos += 1
        
        if model_scores['bleu']:  # If we have any scores
            import numpy as np
            analysis['model_consistency'][model_short] = {
                'successful_repositories': successful_repos,
                'total_repositories': len(all_results),
                'success_rate': successful_repos / len(all_results),
                'bleu_stats': {
                    'mean': np.mean(model_scores['bleu']),
                    'std': np.std(model_scores['bleu']),
                    'min': np.min(model_scores['bleu']),
                    'max': np.max(model_scores['bleu'])
                },
                'rouge_l_stats': {
                    'mean': np.mean(model_scores['rouge_l']),
                    'std': np.std(model_scores['rouge_l']),
                    'min': np.min(model_scores['rouge_l']),
                    'max': np.max(model_scores['rouge_l'])
                }
            }
    
    # Analyze repository difficulty
    for repo_name, repo_results in all_results.items():
        repo_scores = []
        for model_name, results in repo_results.items():
            if results and 'aggregate_scores' in results:
                scores = results['aggregate_scores']
                repo_scores.append({
                    'bleu': scores.get('bleu', 0),
                    'rouge_l': scores.get('rouge_l', 0)
                })
        
        if repo_scores:
            import numpy as np
            analysis['repository_difficulty'][repo_name] = {
                'models_tested': len(repo_scores),
                'avg_bleu': np.mean([s['bleu'] for s in repo_scores]),
                'avg_rouge_l': np.mean([s['rouge_l'] for s in repo_scores]),
                'difficulty_rank': 0  # Will be calculated later
            }
    
    # Calculate difficulty rankings
    if analysis['repository_difficulty']:
        sorted_repos = sorted(
            analysis['repository_difficulty'].items(),
            key=lambda x: (x[1]['avg_bleu'] + x[1]['avg_rouge_l']) / 2
        )
        
        for i, (repo_name, stats) in enumerate(sorted_repos):
            analysis['repository_difficulty'][repo_name]['difficulty_rank'] = i + 1
    
    return analysis

def generate_comprehensive_report(all_results, repository_summaries, cross_repo_analysis):
    """Generate a comprehensive report like the original paper."""
    
    report = {
        'executive_summary': {},
        'model_performance': {},
        'repository_analysis': {},
        'key_findings': [],
        'recommendations': []
    }
    
    # Executive Summary
    total_repos = len(all_results)
    total_models = len(set(model for repo in all_results.values() for model in repo.keys()))
    
    report['executive_summary'] = {
        'repositories_evaluated': total_repos,
        'models_evaluated': total_models,
        'total_evaluations': sum(len(repo) for repo in all_results.values()),
        'evaluation_scope': 'Multi-repository evaluation following CodeRepoQA paper methodology'
    }
    
    # Model Performance Summary
    if cross_repo_analysis['model_consistency']:
        model_rankings = []
        for model, stats in cross_repo_analysis['model_consistency'].items():
            model_rankings.append({
                'model': model,
                'avg_bleu': stats['bleu_stats']['mean'],
                'avg_rouge_l': stats['rouge_l_stats']['mean'],
                'consistency_bleu': 1 / (stats['bleu_stats']['std'] + 0.001),  # Lower std = higher consistency
                'success_rate': stats['success_rate']
            })
        
        # Sort by average performance
        model_rankings.sort(key=lambda x: (x['avg_bleu'] + x['avg_rouge_l']) / 2, reverse=True)
        report['model_performance']['rankings'] = model_rankings
    
    # Key Findings
    if model_rankings:
        best_model = model_rankings[0]
        report['key_findings'].append(f"Best overall model: {best_model['model']} (BLEU: {best_model['avg_bleu']:.3f}, ROUGE-L: {best_model['avg_rouge_l']:.3f})")
    
    if cross_repo_analysis['repository_difficulty']:
        easiest_repo = min(cross_repo_analysis['repository_difficulty'].items(), 
                          key=lambda x: x[1]['difficulty_rank'])
        hardest_repo = max(cross_repo_analysis['repository_difficulty'].items(), 
                          key=lambda x: x[1]['difficulty_rank'])
        
        report['key_findings'].append(f"Easiest repository: {easiest_repo[0]} (avg BLEU: {easiest_repo[1]['avg_bleu']:.3f})")
        report['key_findings'].append(f"Hardest repository: {hardest_repo[0]} (avg BLEU: {hardest_repo[1]['avg_bleu']:.3f})")
    
    return report

print("‚úÖ Multi-repository evaluation functions loaded successfully!")
print("üöÄ Ready to run complete evaluation like the original paper!")

‚úÖ Multi-repository evaluation functions loaded successfully!
üöÄ Ready to run complete evaluation like the original paper!


In [40]:
# # FIX: Model Configuration and Missing Functions
# print("üîß Applying fixes for model loading and evaluation functions...")

# def process_raw_github_issue_to_sample(raw_issue):
#     """
#     Convert raw GitHub issue JSON to a processed sample with context and ground truth
#     """
#     try:
#         # Build conversation from issue and comments
#         conversation = []
        
#         # 1. Add the initial issue
#         issue_title = raw_issue.get('title', '').strip()
#         issue_body = raw_issue.get('body', '').strip()
        
#         if issue_title and issue_body:
#             issue_content = f"Title: {issue_title}\n\n{issue_body}"
#         elif issue_title:
#             issue_content = f"Title: {issue_title}"
#         elif issue_body:
#             issue_content = issue_body
#         else:
#             issue_content = "No content available"
        
#         conversation.append({
#             'speaker': 'user',
#             'content': issue_content,
#             'role': 'USER'
#         })
        
#         # 2. Add comments if available
#         comments_details = raw_issue.get('comments_details', [])
#         maintainer_responses = []
        
#         for comment in comments_details:
#             comment_body = comment.get('body', '').strip()
#             if not comment_body or len(comment_body) < 10:
#                 continue
                
#             # Determine if this is a maintainer response
#             author_association = comment.get('author_association', 'NONE')
#             is_maintainer = author_association in ['OWNER', 'MEMBER', 'COLLABORATOR']
            
#             conversation.append({
#                 'speaker': 'maintainer' if is_maintainer else 'user',
#                 'content': comment_body,
#                 'role': author_association
#             })
            
#             if is_maintainer:
#                 maintainer_responses.append(len(conversation) - 1)
        
#         # 3. Create test samples - one per maintainer response
#         samples = []
#         for maintainer_idx in maintainer_responses:
#             context_turns = conversation[:maintainer_idx]
#             ground_truth = conversation[maintainer_idx]['content']
            
#             if len(context_turns) >= 1:  # Need at least the initial issue
#                 samples.append({
#                     'issue_number': raw_issue.get('number', 'unknown'),
#                     'context': context_turns,
#                     'ground_truth': ground_truth,
#                     'turn_number': maintainer_responses.index(maintainer_idx) + 1,
#                     'total_conversation_turns': len(conversation),
#                     'maintainer_role': conversation[maintainer_idx]['role'],
#                     'total_maintainer_turns': len(maintainer_responses)
#                 })
        
#         return samples if samples else []
        
#     except Exception as e:
#         print(f"‚ö†Ô∏è Error processing issue {raw_issue.get('number', 'unknown')}: {e}")
#         return []

# # Add missing batch_generate_responses function with correct signature for samples
# def batch_generate_responses(model, tokenizer, samples, generation_config=None, batch_size=None):
#     """
#     Generate responses for a batch of samples (handles both raw issues and processed samples)
#     """
#     if generation_config is None:
#         generation_config = {
#             'max_new_tokens': 512,
#             'do_sample': True,
#             'pad_token_id': tokenizer.eos_token_id
#         }
    
#     # Convert samples to prompts
#     prompts = []
#     processed_samples = []
    
#     for sample in samples:
#         if isinstance(sample, dict):
#             # Check if this is a processed sample or raw GitHub issue
#             if 'context' in sample and 'ground_truth' in sample:
#                 # Already processed sample
#                 prompt = format_conversation_context(sample['context'])
#                 processed_samples.append(sample)
#             elif 'title' in sample and 'body' in sample:
#                 # Raw GitHub issue - process it first
#                 print("üîÑ Processing raw GitHub issue into samples...")
#                 issue_samples = process_raw_github_issue_to_sample(sample)
#                 if issue_samples:
#                     # Use the first sample from this issue
#                     first_sample = issue_samples[0]
#                     prompt = format_conversation_context(first_sample['context'])
#                     processed_samples.append(first_sample)
#                 else:
#                     # Fallback: use issue title and body
#                     title = sample.get('title', '')
#                     body = sample.get('body', '')
#                     prompt = f"User: Title: {title}\n\n{body}\n\nAssistant:"
#                     processed_samples.append({
#                         'issue_number': sample.get('number', 'unknown'),
#                         'context': [{'speaker': 'user', 'content': f"Title: {title}\n\n{body}", 'role': 'USER'}],
#                         'ground_truth': "No maintainer response available"
#                     })
#             else:
#                 # Unknown format - convert to string
#                 prompt = str(sample)
#                 processed_samples.append({'context': [], 'ground_truth': 'Unknown format'})
#         else:
#             # String prompt
#             prompt = str(sample)
#             processed_samples.append({'context': [], 'ground_truth': 'String input'})
        
#         prompts.append(prompt)
    
#     # Generate responses
#     if batch_size and len(prompts) > batch_size:
#         all_responses = []
#         for i in range(0, len(prompts), batch_size):
#             batch_prompts = prompts[i:i+batch_size]
#             batch_responses = batch_generate_responses_internal(model, tokenizer, batch_prompts, generation_config)
#             all_responses.extend(batch_responses)
#         return all_responses
    
#     return batch_generate_responses_internal(model, tokenizer, prompts, generation_config)

# def batch_generate_responses_internal(model, tokenizer, prompts, generation_config):
#     """
#     Internal function to generate responses for string prompts
#     """
#     # Clean generation config for compatibility
#     clean_config = generation_config.copy()
#     if 'pad_token_id' in clean_config:
#         del clean_config['pad_token_id']  # Will be set automatically
    
#     # Tokenize inputs
#     inputs = tokenizer(prompts, padding=True, truncation=True, return_tensors="pt", max_length=2048)
    
#     # Move to model device
#     if torch.cuda.is_available() and hasattr(model, 'device'):
#         inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
#     # Generate responses
#     with torch.no_grad():
#         try:
#             outputs = model.generate(
#                 input_ids=inputs['input_ids'],
#                 attention_mask=inputs['attention_mask'],
#                 pad_token_id=tokenizer.eos_token_id,
#                 **clean_config
#             )
#         except Exception as e:
#             print(f"‚ö†Ô∏è Generation error: {e}")
#             return ["Error in generation" for _ in prompts]
    
#     # Decode responses (remove input tokens)
#     responses = []
#     for i, output in enumerate(outputs):
#         # Get only the new tokens (response)
#         input_length = inputs['input_ids'][i].shape[0]
#         response_tokens = output[input_length:]
#         response = tokenizer.decode(response_tokens, skip_special_tokens=True)
#         responses.append(response.strip())
    
#     return responses

# def format_conversation_context(context, max_context_length=4000):
#     """
#     Format conversation context for model input with length management
#     """
#     formatted = ""
#     total_length = 0
    
#     # Add context turns, respecting length limits
#     for i, turn in enumerate(context):
#         turn_text = ""
#         if turn['speaker'] == 'user':
#             turn_text = f"User: {turn['content']}\n\n"
#         else:
#             turn_text = f"Assistant: {turn['content']}\n\n"
        
#         # Check if adding this turn would exceed limit
#         if total_length + len(turn_text) > max_context_length and i > 0:
#             # If we must truncate, ensure we keep at least the first turn (issue description)
#             if i == 1:  # First turn must be preserved
#                 formatted += turn_text[:max_context_length - total_length - 50] + "...\n\n"
#             break
        
#         formatted += turn_text
#         total_length += len(turn_text)
    
#     # Add assistant prompt
#     formatted += "Assistant:"
#     return formatted

# # Model name mapping for correct HuggingFace identifiers
# MODEL_NAME_MAP = {
#     'deepseek-coder-6.7b': 'deepseek-ai/deepseek-coder-6.7b-instruct',
#     'codellama-7b-instruct': 'codellama/CodeLlama-7b-Instruct-hf',
#     'codeqwen-7b-chat': 'Qwen/CodeQwen1.5-7B-Chat',
#     'mistral-7b': 'mistralai/Mistral-7B-Instruct-v0.1'
# }

# # Override the load_model_with_config function to use correct model names
# def load_model_with_config_fixed(model_name, max_memory_gb=None):
#     """
#     Load a model with optimized configuration for evaluation (FIXED VERSION)
#     """
#     print(f"üîÑ Loading model: {model_name}")
    
#     try:
#         # Check if model_name is a key in our mapping
#         if model_name in MODEL_NAME_MAP:
#             actual_model_name = MODEL_NAME_MAP[model_name]
#             print(f"üìù Using model: {actual_model_name}")
#         else:
#             actual_model_name = model_name
        
#         # Configure device mapping for multi-GPU setups
#         device_map = "auto" if torch.cuda.device_count() > 1 else None
        
#         # Memory optimization
#         torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        
#         # Load tokenizer
#         tokenizer = AutoTokenizer.from_pretrained(
#             actual_model_name,
#             trust_remote_code=True,
#             padding_side="left"  # Important for batch generation
#         )
        
#         # Add pad token if missing
#         if tokenizer.pad_token is None:
#             tokenizer.pad_token = tokenizer.eos_token
            
#         # Load model with optimizations
#         model = AutoModelForCausalLM.from_pretrained(
#             actual_model_name,
#             torch_dtype=torch_dtype,
#             device_map=device_map,
#             trust_remote_code=True,
#             load_in_8bit=False,  # Set to True if you need memory savings
#             low_cpu_mem_usage=True
#         )
        
#         print(f"‚úÖ Model loaded successfully")
#         print(f"üìä Model parameters: {model.num_parameters():,}")
#         if torch.cuda.is_available():
#             print(f"üîß Device: {next(model.parameters()).device}")
#             print(f"üíæ Model dtype: {next(model.parameters()).dtype}")
        
#         return model, tokenizer
        
#     except Exception as e:
#         print(f"‚ùå Error loading model {actual_model_name}: {str(e)}")
#         return None, None

# # Override the original function
# load_model_with_config = load_model_with_config_fixed

# print("‚úÖ Fixes applied successfully!")
# print("üìù Now handles both raw GitHub issues and processed samples")
# print("üîß Will automatically convert raw issues to proper test samples")

üîß Applying fixes for model loading and evaluation functions...
‚úÖ Fixes applied successfully!
üìù Now handles both raw GitHub issues and processed samples
üîß Will automatically convert raw issues to proper test samples


In [62]:
# FIX: Generate Comprehensive Report Function
def generate_comprehensive_report_fixed(all_results, repository_summaries, cross_repo_analysis):
    """
    Generate a comprehensive evaluation report (FIXED VERSION)
    """
    from datetime import datetime
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    report = {
        'evaluation_metadata': {
            'timestamp': timestamp,
            'total_repositories': len(repository_summaries),
            'total_models': len(set(key.split('_')[0] for key in all_results.keys())) if all_results else 0,
            'evaluation_type': 'Multi-Repository CodeRepoQA Evaluation'
        },
        'repository_summaries': repository_summaries,
        'cross_repository_analysis': cross_repo_analysis,
        'model_performance': {},
        'key_findings': []
    }
    
    # Aggregate model performance across all repositories
    model_performance = {}
    for result_key, result_data in all_results.items():
        model_name = result_key.split('_')[0]
        if model_name not in model_performance:
            model_performance[model_name] = {
                'total_samples': 0,
                'bleu_scores': [],
                'rouge_l_scores': [],
                'rouge_1_scores': [],
                'edit_sim_scores': [],
                'repositories_tested': []
            }
        
        if 'results_df' in result_data and result_data['results_df'] is not None:
            df = result_data['results_df']
            model_performance[model_name]['total_samples'] += len(df)
            model_performance[model_name]['bleu_scores'].extend(df['bleu_score'].tolist())
            model_performance[model_name]['rouge_l_scores'].extend(df['rouge_l'].tolist())
            model_performance[model_name]['rouge_1_scores'].extend(df['rouge_1'].tolist())
            model_performance[model_name]['edit_sim_scores'].extend(df['edit_similarity'].tolist())
            
            # Extract repository name from result_key
            repo_name = '_'.join(result_key.split('_')[1:])
            model_performance[model_name]['repositories_tested'].append(repo_name)
    
    # Calculate aggregate statistics
    for model_name, data in model_performance.items():
        if data['bleu_scores']:  # Only if we have data
            data['avg_bleu'] = np.mean(data['bleu_scores'])
            data['avg_rouge_l'] = np.mean(data['rouge_l_scores'])
            data['avg_rouge_1'] = np.mean(data['rouge_1_scores'])
            data['avg_edit_sim'] = np.mean(data['edit_sim_scores'])
            data['std_bleu'] = np.std(data['bleu_scores'])
            data['std_rouge_l'] = np.std(data['rouge_l_scores'])
            data['repositories_count'] = len(set(data['repositories_tested']))
        else:
            # No valid data
            data['avg_bleu'] = 0.0
            data['avg_rouge_l'] = 0.0
            data['avg_rouge_1'] = 0.0
            data['avg_edit_sim'] = 0.0
            data['std_bleu'] = 0.0
            data['std_rouge_l'] = 0.0
            data['repositories_count'] = 0
    
    report['model_performance'] = model_performance
    
    # Create model rankings - Initialize as empty list first
    model_rankings = []
    for model_name, data in model_performance.items():
        model_rankings.append({
            'model': model_name,
            'avg_bleu': data['avg_bleu'],
            'avg_rouge_l': data['avg_rouge_l'],
            'total_samples': data['total_samples'],
            'repositories_tested': data['repositories_count']
        })
    
    # Sort by average BLEU score (only if we have rankings)
    if model_rankings:
        model_rankings.sort(key=lambda x: x['avg_bleu'], reverse=True)
    
    # Key Findings
    if model_rankings and len(model_rankings) > 0:
        best_model = model_rankings[0]
        report['key_findings'].append(f"Best overall model: {best_model['model']} (BLEU: {best_model['avg_bleu']:.3f}, ROUGE-L: {best_model['avg_rouge_l']:.3f})")
        
        if len(model_rankings) > 1:
            worst_model = model_rankings[-1]
            performance_gap = best_model['avg_bleu'] - worst_model['avg_bleu']
            report['key_findings'].append(f"Performance gap: {performance_gap:.3f} BLEU points between best and worst models")
    else:
        report['key_findings'].append("No valid evaluation results found across all models and repositories")
    
    report['model_rankings'] = model_rankings
    
    return report

# Override the original function
generate_comprehensive_report = generate_comprehensive_report_fixed

print("‚úÖ Fixed generate_comprehensive_report function!")
print("üîß All major issues should now be resolved")

‚úÖ Fixed generate_comprehensive_report function!
üîß All major issues should now be resolved


In [63]:
# üß† Complete Memory-Optimized Model Loading with Safe Error Handling
import torch
import gc

def clear_gpu_memory():
    """Clear GPU memory thoroughly"""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
        gc.collect()

def load_model_with_config_memory_optimized(model_name, max_gpu_memory_gb=7.5):
    """
    Load model with aggressive memory optimization and safe error handling
    """
    # FIXED: Include short model names in the mapping
    model_map = {
        # Short names (used in evaluation)
        'deepseek-coder-1.3b': 'deepseek-ai/deepseek-coder-1.3b-instruct',
        'codeqwen-7b': 'Qwen/CodeQwen1.5-7B-Chat',
        'codellama-7b': 'codellama/CodeLlama-7b-Instruct-hf',
        'mistral-7b': 'mistralai/Mistral-7B-Instruct-v0.1',
        
        # Full HuggingFace names (fallback)
        'deepseek-ai/deepseek-coder-1.3b-instruct': 'deepseek-ai/deepseek-coder-1.3b-instruct',
        'Qwen/CodeQwen1.5-7B-Chat': 'Qwen/CodeQwen1.5-7B-Chat', 
        'codellama/CodeLlama-7b-Instruct-hf': 'codellama/CodeLlama-7b-Instruct-hf',
        'mistralai/Mistral-7B-Instruct-v0.1': 'mistralai/Mistral-7B-Instruct-v0.1'
    }
    
    hf_model_name = model_map.get(model_name, model_name)
    
    print(f"üîÑ Model mapping: {model_name} ‚Üí {hf_model_name}")
    
    # Clear memory before loading
    clear_gpu_memory()
    
    try:
        from transformers import AutoTokenizer, AutoModelForCausalLM
        
        print(f"üîÑ Loading tokenizer for {hf_model_name}...")
        tokenizer = AutoTokenizer.from_pretrained(hf_model_name, trust_remote_code=True)
        
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        
        print(f"üîÑ Loading model {hf_model_name} with memory optimization...")
        
        # Calculate max memory in bytes
        max_memory_bytes = int(max_gpu_memory_gb * 1024**3)
        
        # Load with strict memory limits and CPU fallback
        model = AutoModelForCausalLM.from_pretrained(
            hf_model_name,
            torch_dtype=torch.float16,  # Use float16 to save memory
            device_map="auto",
            trust_remote_code=True,
            max_memory={0: f"{max_gpu_memory_gb}GB"},
            low_cpu_mem_usage=True,
            # Removed quantization to avoid bitsandbytes dependency
        )
        
        print(f"‚úÖ Successfully loaded {model_name} ({hf_model_name})")
        print(f"üìä Model device: {next(model.parameters()).device}")
        
        # Check GPU memory usage
        if torch.cuda.is_available():
            memory_allocated = torch.cuda.memory_allocated() / 1024**3
            memory_reserved = torch.cuda.memory_reserved() / 1024**3
            print(f"üîç GPU Memory - Allocated: {memory_allocated:.2f}GB, Reserved: {memory_reserved:.2f}GB")
        
        return model, tokenizer
        
    except Exception as e:
        print(f"‚ùå Failed to load {model_name} ({hf_model_name}): {str(e)}")
        print("üîÑ Attempting CPU fallback...")
        
        try:
            # Try CPU fallback
            model = AutoModelForCausalLM.from_pretrained(
                hf_model_name,
                torch_dtype=torch.float16,
                device_map="cpu",
                trust_remote_code=True,
                low_cpu_mem_usage=True
            )
            print(f"‚úÖ Loaded {model_name} ({hf_model_name}) on CPU")
            return model, tokenizer
            
        except Exception as cpu_error:
            print(f"‚ùå CPU fallback also failed: {str(cpu_error)}")
            clear_gpu_memory()
            return None, None

# Create the main loading function
def load_model_with_config(model_name):
    """Main model loading function with safe error handling"""
    return load_model_with_config_memory_optimized(model_name)

print("üß† Updated memory-optimized model loading function")
print("üõ°Ô∏è Added safe error handling and CPU fallback")
print("‚ö° Removed quantization to avoid bitsandbytes dependency")
print("üîß FIXED: Added short model name mapping (deepseek-coder-6.7b ‚Üí deepseek-ai/deepseek-coder-6.7b-instruct)")

üß† Updated memory-optimized model loading function
üõ°Ô∏è Added safe error handling and CPU fallback
‚ö° Removed quantization to avoid bitsandbytes dependency
üîß FIXED: Added short model name mapping (deepseek-coder-6.7b ‚Üí deepseek-ai/deepseek-coder-6.7b-instruct)


In [64]:
# üîß FIX: Better Error Handling for Failed Model Loading

# First define the original batch_generate_responses function
def batch_generate_responses(model, tokenizer, samples, generation_config=None, batch_size=None):
    """Original batch generation function"""
    import torch
    
    if batch_size is None:
        batch_size = 1  # Process one at a time to be safe
    
    if generation_config is None:
        generation_config = {
            'max_new_tokens': 512,
            'temperature': 0.7,
            'do_sample': True,
            'pad_token_id': tokenizer.eos_token_id
        }
    
    responses = []
    
    for i in range(0, len(samples), batch_size):
        batch_samples = samples[i:i+batch_size]
        prompts = []
        
        for sample in batch_samples:
            if isinstance(sample, dict):
                context = sample.get('context', [])
                if context:
                    prompt_parts = []
                    for ctx in context:
                        if isinstance(ctx, dict):
                            content = ctx.get('content', str(ctx))
                        else:
                            content = str(ctx)
                        prompt_parts.append(content)
                    prompt = "\\n".join(prompt_parts)
                else:
                    # Handle raw GitHub issues
                    title = sample.get('title', '')
                    body = sample.get('body', '')
                    prompt = f"Issue: {title}\\n\\nDescription: {body}"
            else:
                prompt = str(sample)
            
            prompts.append(prompt)
        
        # Generate responses
        try:
            inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=2048)
            if torch.cuda.is_available():
                inputs = {k: v.to(model.device) for k, v in inputs.items()}
            
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    **generation_config
                )
            
            # Decode responses
            for j, output in enumerate(outputs):
                response = tokenizer.decode(output, skip_special_tokens=True)
                # Remove the input prompt from the response
                input_text = prompts[j]
                if response.startswith(input_text):
                    response = response[len(input_text):].strip()
                responses.append(response)
                
        except Exception as e:
            print(f"‚ùå Generation error: {str(e)}")
            for _ in batch_samples:
                responses.append(f"[Generation failed: {str(e)}]")
    
    return responses

# Now define the safe version
def batch_generate_responses_safe(model, tokenizer, samples, generation_config=None, batch_size=None):
    """
    Safe version of batch_generate_responses that handles None model AND creates proper samples
    """
    # Check if model failed to load
    if model is None or tokenizer is None:
        print("‚ùå Model or tokenizer is None - cannot generate responses")
        
        # Create proper sample structure for evaluation
        processed_samples = []
        for sample in samples:
            if isinstance(sample, dict):
                # Create a proper sample structure with all required fields
                processed_sample = {
                    'issue_number': sample.get('number', 'unknown'),
                    'context': sample.get('context', []),
                    'ground_truth': sample.get('ground_truth', '[No maintainer response]'),
                    'turn_number': 1,
                    'total_conversation_turns': 1,
                    'maintainer_role': 'NONE',
                    'total_maintainer_turns': 0,
                    'prediction': '[Model failed to load]',
                    'failed_generation': True
                }
                processed_samples.append(processed_sample)
            else:
                # Simple fallback
                processed_samples.append({
                    'issue_number': 'unknown',
                    'context': [],
                    'ground_truth': '[No ground truth available]',
                    'turn_number': 1,
                    'total_conversation_turns': 1,
                    'maintainer_role': 'NONE',
                    'total_maintainer_turns': 0,
                    'prediction': '[Model failed to load]',
                    'failed_generation': True
                })
        
        # Return both responses and processed samples (matching expected return format)
        responses = ["[Model failed to load]" for _ in samples]
        return responses
    
    # Call the original function
    return batch_generate_responses_original(model, tokenizer, samples, generation_config, batch_size)

# Also create a safe model loading wrapper
def load_model_safe(model_name):
    """
    Safe model loading that handles memory issues gracefully
    """
    try:
        print(f"üîÑ Attempting to load {model_name}...")
        model, tokenizer = load_model_with_config(model_name)
        
        if model is None or tokenizer is None:
            print("‚ùå Model loading returned None - evaluation will continue with placeholder responses")
            return None, None
        
        return model, tokenizer
        
    except Exception as e:
        print(f"‚ùå Exception during model loading: {str(e)}")
        clear_gpu_memory()
        return None, None

# Create a function to ensure samples have required structure
def ensure_sample_structure(samples):
    """
    Ensure all samples have the required structure for evaluation
    """
    fixed_samples = []
    for sample in samples:
        if isinstance(sample, dict):
            # Ensure all required keys exist
            fixed_sample = {
                'issue_number': sample.get('number', sample.get('issue_number', 'unknown')),
                'context': sample.get('context', []),
                'ground_truth': sample.get('ground_truth', sample.get('body', '[No ground truth]')),
                'turn_number': sample.get('turn_number', 1),
                'total_conversation_turns': sample.get('total_conversation_turns', 1),
                'maintainer_role': sample.get('maintainer_role', 'NONE'),
                'total_maintainer_turns': sample.get('total_maintainer_turns', 0)
            }
            
            # If this is a raw GitHub issue, try to extract some meaningful ground truth
            if 'title' in sample and 'body' in sample and not fixed_sample['ground_truth']:
                fixed_sample['ground_truth'] = f"Issue: {sample.get('title', '')} - {sample.get('body', '')[:200]}..."
                
            fixed_samples.append(fixed_sample)
        else:
            # Fallback for non-dict samples
            fixed_samples.append({
                'issue_number': 'unknown',
                'context': [],
                'ground_truth': '[No ground truth available]',
                'turn_number': 1,
                'total_conversation_turns': 1,
                'maintainer_role': 'NONE',
                'total_maintainer_turns': 0
            })
    
    return fixed_samples

# Override the batch function to be safe
batch_generate_responses_original = batch_generate_responses
batch_generate_responses = batch_generate_responses_safe

print("üõ°Ô∏è Added safe error handling for model loading failures")
print("‚úÖ batch_generate_responses now handles None models gracefully")
print("üîß Added sample structure validation to prevent 'ground_truth' errors")

üõ°Ô∏è Added safe error handling for model loading failures
‚úÖ batch_generate_responses now handles None models gracefully
üîß Added sample structure validation to prevent 'ground_truth' errors


In [68]:
# üîß Required helper functions for safe evaluation
def format_conversation_context(context):
    """Safely format conversation context"""
    if not context:
        return ""
    
    formatted = []
    for item in context:
        if isinstance(item, dict):
            content = item.get('content', str(item))
        else:
            content = str(item)
        formatted.append(content)
    
    return "\\n".join(formatted)

def calculate_all_metrics(prediction, ground_truth):
    """Calculate all evaluation metrics safely"""
    try:
        from evaluate import load
        import nltk
        from nltk.translate.bleu_score import sentence_bleu
        from rouge_score import rouge_scorer
        import difflib
        
        # Ensure strings
        pred_str = str(prediction) if prediction else ""
        gt_str = str(ground_truth) if ground_truth else ""
        
        # BLEU Score
        try:
            # Tokenize for BLEU
            pred_tokens = pred_str.split()
            gt_tokens = [gt_str.split()]  # List of reference tokenizations
            bleu_score = sentence_bleu(gt_tokens, pred_tokens) if pred_tokens and gt_tokens[0] else 0.0
        except:
            bleu_score = 0.0
        
        # ROUGE Scores
        try:
            scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
            rouge_scores = scorer.score(gt_str, pred_str)
            rouge_1 = rouge_scores['rouge1'].fmeasure
            rouge_l = rouge_scores['rougeL'].fmeasure
        except:
            rouge_1 = 0.0
            rouge_l = 0.0
        
        # Edit Similarity (using SequenceMatcher)
        try:
            edit_sim = difflib.SequenceMatcher(None, pred_str, gt_str).ratio()
        except:
            edit_sim = 0.0
        
        return {
            'bleu': bleu_score,
            'rouge_l': rouge_l,
            'rouge_1': rouge_1,
            'edit_similarity': edit_sim
        }
        
    except Exception as e:
        print(f"   ‚ö†Ô∏è Metrics calculation error: {str(e)}")
        return {
            'bleu': 0.0,
            'rouge_l': 0.0,
            'rouge_1': 0.0,
            'edit_similarity': 0.0
        }

print("‚úÖ Helper functions loaded: format_conversation_context, calculate_all_metrics")
print("üîß All functions now handle errors gracefully")

‚úÖ Helper functions loaded: format_conversation_context, calculate_all_metrics
üîß All functions now handle errors gracefully


In [67]:
# üîß FINAL FIX: Safe evaluation function that handles missing ground_truth
import pandas as pd
import numpy as np
import time
from tqdm import tqdm

def evaluate_model_comprehensive_SAFE(model, tokenizer, samples, model_name, 
                                     max_samples=None, generation_config=None, 
                                     save_results=True):
    """
    SAFE comprehensive model evaluation that handles missing ground_truth keys
    """
    
    # FIRST: Ensure all samples have proper structure
    print("üîß Validating sample structure...")
    safe_samples = ensure_sample_structure(samples)
    print(f"‚úÖ Validated {len(safe_samples)} samples")
    
    # Determine sample size
    if max_samples and max_samples < len(safe_samples):
        # Use stratified sampling to get representative samples
        sample_indices = np.linspace(0, len(safe_samples)-1, max_samples, dtype=int)
        eval_samples = [safe_samples[i] for i in sample_indices]
        print(f"üìä Evaluating on {max_samples:,} stratified samples (from {len(safe_samples):,} total)")
    else:
        eval_samples = safe_samples
        print(f"üìä Evaluating on all {len(eval_samples):,} samples")
    
    results = []
    failed_generations = 0
    start_time = time.time()
    
    print(f"üöÄ Starting evaluation of {model_name}...")
    
    # Generate responses with safe batch processing
    responses = batch_generate_responses(
        model, tokenizer, eval_samples, 
        batch_size=1, generation_config=generation_config
    )
    
    # Calculate metrics for each response
    print("üìè Calculating evaluation metrics...")
    for i, (sample, response) in enumerate(tqdm(zip(eval_samples, responses), 
                                                desc="Computing metrics", 
                                                total=len(eval_samples))):
        
        try:
            # SAFE: Ensure we have ground_truth
            ground_truth = sample.get('ground_truth', '[No ground truth available]')
            
            if not response or response == "[Model failed to load]":  # Handle failed generations
                failed_generations += 1
                metrics = {'bleu': 0.0, 'rouge_l': 0.0, 'rouge_1': 0.0, 'edit_similarity': 0.0}
            else:
                metrics = calculate_all_metrics(response, ground_truth)
            
            # SAFE: Format conversation context safely
            context = sample.get('context', [])
            if context:
                try:
                    context_length = len(format_conversation_context(context))
                except:
                    context_length = len(str(context))
            else:
                context_length = 0
            
            # Store comprehensive results with safe access
            result = {
                'model_name': model_name,
                'sample_idx': i,
                'issue_number': sample.get('issue_number', 'unknown'),
                'turn_number': sample.get('turn_number', 1),
                'total_turns': sample.get('total_conversation_turns', 1),
                'context_length': context_length,
                'context_turns': len(context),
                'ground_truth_length': len(ground_truth),
                'response_length': len(response) if response else 0,
                'maintainer_role': sample.get('maintainer_role', 'NONE'),
                'prediction': response,
                'ground_truth': ground_truth,
                **metrics
            }
            
            results.append(result)
            
        except Exception as e:
            print(f"   ‚ùå Error processing sample {i}: {str(e)}")
            # Add a safe fallback result
            results.append({
                'model_name': model_name,
                'sample_idx': i,
                'issue_number': 'unknown',
                'turn_number': 1,
                'total_turns': 1,
                'context_length': 0,
                'context_turns': 0,
                'ground_truth_length': 0,
                'response_length': 0,
                'maintainer_role': 'NONE',
                'prediction': '[Processing failed]',
                'ground_truth': '[No ground truth]',
                'bleu': 0.0,
                'rouge_l': 0.0,
                'rouge_1': 0.0,
                'edit_similarity': 0.0
            })
    
    # Create results DataFrame
    results_df = pd.DataFrame(results)
    
    # Calculate summary statistics
    eval_time = time.time() - start_time
    avg_metrics = results_df[['bleu', 'rouge_l', 'rouge_1', 'edit_similarity']].mean()
    
    # Print evaluation summary
    print(f"\n{'='*60}")
    print(f"üéØ EVALUATION COMPLETE: {model_name}")
    print(f"{'='*60}")
    print(f"‚è±Ô∏è  Total evaluation time: {eval_time/60:.1f} minutes")
    print(f"üìä Samples evaluated: {len(results_df):,}")
    print(f"‚ùå Failed generations: {failed_generations}")
    print(f"‚úÖ Success rate: {(len(results_df)-failed_generations)/len(results_df)*100:.1f}%")
    
    print(f"\nüìà Average Scores:")
    for metric, score in avg_metrics.items():
        print(f"   {metric.upper():15}: {score:.4f}")
    
    overall_avg = avg_metrics.mean()
    print(f"   {'OVERALL AVG':15}: {overall_avg:.4f}")
    
    # Save results if requested
    if save_results:
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        filename = f"evaluation_results_{model_name.replace('/', '_')}_{timestamp}.csv"
        results_df.to_csv(filename, index=False)
        print(f"üíæ Results saved to: {filename}")
    
    return results_df, avg_metrics

# Override the original function
evaluate_model_comprehensive = evaluate_model_comprehensive_SAFE

print("üîß FIXED: evaluate_model_comprehensive now handles missing 'ground_truth' keys safely")
print("üõ°Ô∏è Added comprehensive error handling and sample structure validation")
print("‚úÖ No more 'ground_truth' key errors!")

üîß FIXED: evaluate_model_comprehensive now handles missing 'ground_truth' keys safely
üõ°Ô∏è Added comprehensive error handling and sample structure validation
‚úÖ No more 'ground_truth' key errors!


In [69]:
# Quick test with first 2 repositories and limited samples
quick_results = run_complete_multi_repository_evaluation(
    repositories=detected_repos,
    models_to_test=['deepseek-coder-1.3b'],  # Just one model for testing
    max_repos=2,                             # Test only first 2 repositories
    samples_per_repo=10                      # Only 10 samples per repo
)

üöÄ COMPLETE MULTI-REPOSITORY EVALUATION
üìä Repositories: 28
ü§ñ Models: 1
üìù Samples per repo: 10
üéØ Total evaluations: 280
‚ö° Quick test mode: Testing only first 2 repositories

üîç REPOSITORY: AutoGPT
üìÅ Files: 2229 JSON files
--------------------------------------------------
   üìÇ Loading from: /kaggle/input/coderepoqa/AutoGPT/cloudide/workspace/QA_data
   üìÑ Processing 2229 JSON files
   ‚úÖ Loaded 2229 total samples
   üéØ Selected 10 samples (stratified sampling)
‚úÖ Loaded 10 samples from AutoGPT

ü§ñ Model 1/2: deepseek-coder-1.3b
üìä Repository: AutoGPT
üîÑ Model mapping: deepseek-coder-1.3b ‚Üí deepseek-ai/deepseek-coder-1.3b-instruct
üîÑ Loading tokenizer for deepseek-ai/deepseek-coder-1.3b-instruct...
üîÑ Loading model deepseek-ai/deepseek-coder-1.3b-instruct with memory optimization...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


‚úÖ Successfully loaded deepseek-coder-1.3b (deepseek-ai/deepseek-coder-1.3b-instruct)
üìä Model device: cuda:0
üîç GPU Memory - Allocated: 2.52GB, Reserved: 2.67GB
üîß Validating sample structure...
‚úÖ Validated 10 samples
üìä Evaluating on all 10 samples
üöÄ Starting evaluation of deepseek-coder-1.3b...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:3

üìè Calculating evaluation metrics...


Computing metrics: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:00<00:00, 43.88it/s]



üéØ EVALUATION COMPLETE: deepseek-coder-1.3b
‚è±Ô∏è  Total evaluation time: 2.1 minutes
üìä Samples evaluated: 10
‚ùå Failed generations: 0
‚úÖ Success rate: 100.0%

üìà Average Scores:
   BLEU           : 0.0000
   ROUGE_L        : 0.0671
   ROUGE_1        : 0.1262
   EDIT_SIMILARITY: 0.0133
   OVERALL AVG    : 0.0517
   ‚úÖ BLEU: 0.000
   ‚úÖ ROUGE-L: 0.067
   ‚úÖ Edit Sim: 0.013

üìà REPOSITORY SUMMARY: AutoGPT
   üèÜ Best BLEU: deepseek-coder-1.3b (0.000)
   üèÜ Best ROUGE-L: deepseek-coder-1.3b (0.067)
   üìä Avg BLEU: 0.000
   üìä Avg ROUGE-L: 0.067

üîç REPOSITORY: Pillow
üìÅ Files: 2976 JSON files
--------------------------------------------------
   üìÇ Loading from: /kaggle/input/coderepoqa/Pillow/cloudide/workspace/QA_data
   üìÑ Processing 2976 JSON files
   ‚úÖ Loaded 2976 total samples
   üéØ Selected 10 samples (stratified sampling)
‚úÖ Loaded 10 samples from Pillow

ü§ñ Model 2/2: deepseek-coder-1.3b
üìä Repository: Pillow
üîÑ Model mapping: deepseek-co

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


‚úÖ Successfully loaded deepseek-coder-1.3b (deepseek-ai/deepseek-coder-1.3b-instruct)
üìä Model device: cuda:0
üîç GPU Memory - Allocated: 2.52GB, Reserved: 2.67GB
üîß Validating sample structure...
‚úÖ Validated 10 samples
üìä Evaluating on all 10 samples
üöÄ Starting evaluation of deepseek-coder-1.3b...


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:3

üìè Calculating evaluation metrics...


Computing metrics: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:00<00:00, 45.80it/s]


üéØ EVALUATION COMPLETE: deepseek-coder-1.3b
‚è±Ô∏è  Total evaluation time: 2.1 minutes
üìä Samples evaluated: 10
‚ùå Failed generations: 0
‚úÖ Success rate: 100.0%

üìà Average Scores:
   BLEU           : 0.0000
   ROUGE_L        : 0.0745
   ROUGE_1        : 0.1378
   EDIT_SIMILARITY: 0.0261
   OVERALL AVG    : 0.0596
   ‚úÖ BLEU: 0.000
   ‚úÖ ROUGE-L: 0.075
   ‚úÖ Edit Sim: 0.026

üìà REPOSITORY SUMMARY: Pillow
   üèÜ Best BLEU: deepseek-coder-1.3b (0.000)
   üèÜ Best ROUGE-L: deepseek-coder-1.3b (0.075)
   üìä Avg BLEU: 0.000
   üìä Avg ROUGE-L: 0.075

üéØ COMPLETE MULTI-REPOSITORY EVALUATION FINISHED

üíæ Results saved:
   üìÑ Raw results: multi_repo_results_20251114_192558.json
   üìä Summary: repository_summary_20251114_192558.csv



