In [1]:
import os
# Set CUDA_VISIBLE_DEVICES so that only GPUs 0-3 are visible to this process
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

import json
import transformers
import torch
import os
from vllm import LLM, SamplingParams
import time



In [2]:
import os
import json
import transformers
import torch
from vllm import LLM, SamplingParams
import time
from tqdm import tqdm

def map_evaluation_to_score(evaluation):
    """
    Map the evaluation text to scores based on pattern matches.

    Two independent scores are computed from the lower-cased text:

    * ``score`` -- majority vote between the "yes" and "no" phrase lists:
      0 when more yes-patterns match, 1 when more no-patterns match,
      0.5 on a tie or when nothing matches.
    * ``direct_score`` -- 0 if the text contains "answer: yes", 1 if it
      contains "answer: no", 0.5 if it contains neither. "answer: yes" is
      checked first, so it wins when both are present.

    Matching is plain substring search and every pattern counts at most
    once. Patterns may overlap (e.g. "**yes**" also contains "yes"), so a
    single phrase in the text can contribute more than one vote.

    Args:
        evaluation: The evaluation text (a string) to classify.

    Returns:
        dict with keys ``yes_count``, ``no_count``, ``total_matches``,
        ``confidence`` ('low' / 'medium' / 'high'), ``direct_score``,
        ``score`` and ``decision_type`` ('no_matches', 'yes_majority',
        'no_majority' or 'tie').
    """
    # Convert to lowercase for case-insensitive matching
    eval_lower = evaluation.lower()

    # Patterns indicating different scores
    yes_patterns = (
        "yes",
        "**yes**",
        "the statement is supported by the context",
        "is indeed supported",
    )

    # Every entry needs its own trailing comma: adjacent string literals are
    # silently concatenated into a single pattern that can never match.
    no_patterns = (
        "no.",
        "**no**",
        "the statement is not supported by the context",
        "statement is not fully supported",
    )

    # Exact patterns for direct scoring (checked in insertion order)
    direct_patterns = {
        "answer: yes": 0,
        "answer: no": 1,
    }

    # Count how many patterns of each category occur in the text
    yes_matches = sum(1 for pattern in yes_patterns if pattern in eval_lower)
    no_matches = sum(1 for pattern in no_patterns if pattern in eval_lower)

    # First direct pattern found wins; 0.5 is the default if none is found
    direct_score = next(
        (score for pattern, score in direct_patterns.items() if pattern in eval_lower),
        0.5,
    )

    # Create detailed result
    result = {
        'yes_count': yes_matches,
        'no_count': no_matches,
        'total_matches': yes_matches + no_matches,
        'confidence': 'low',
        'direct_score': direct_score,
    }

    # Determine score based on majority
    if yes_matches == 0 and no_matches == 0:
        result['score'] = 0.5
        result['decision_type'] = 'no_matches'
    elif yes_matches > no_matches:
        result['score'] = 0
        result['decision_type'] = 'yes_majority'
        result['confidence'] = 'high' if yes_matches > 1 else 'medium'
    elif no_matches > yes_matches:
        result['score'] = 1
        result['decision_type'] = 'no_majority'
        result['confidence'] = 'high' if no_matches > 1 else 'medium'
    else:  # Equal, non-zero number of matches on both sides
        result['score'] = 0.5
        result['decision_type'] = 'tie'
        result['confidence'] = 'medium' if yes_matches > 0 else 'low'

    return result

def create_prompt_factuality_evaluation(context, sentence):
    """Build the factuality-evaluation prompt for one statement.

    The prompt consists of a task description, the statement, the context,
    and a closing Yes/No question, in that order.

    Args:
        context: Text the statement is checked against.
        sentence: The statement to be judged.

    Returns:
        The assembled prompt string.
    """
    task_description = (
        "You are an advanced clinical language model. Your task is to answer "
        "whether a statement is supported by a given context.\n\n"
    )
    # The statement follows "STATEMENT:" directly, with no separator.
    statement_section = f"The **statement**.\n\nSTATEMENT:{sentence}\n\n"
    context_section = f"The **context**.\n\nCONTEXT:\n{context}\n\n"
    question = "Is the statement supported by the context above?. Answer Yes or No.\n\n"
    # "ANSWER:" and "Explain your decision:" are emitted back to back.
    closing = "ANSWER:" + "Explain your decision:"

    sections = (task_description, statement_section, context_section, question, closing)
    return "".join(sections)

def process_files(llm, statement_folder, context_folder, output_folder):
    """Evaluate every statement file against its context file and save the scores.

    For each ``*.json`` file in ``statement_folder`` the file with the same
    name is loaded from ``context_folder``. For every key ``Q1`` .. ``Q15``
    present in the statement file, a prompt is built from that statement and
    the context file's ``"Final_text"`` entry, sent to ``llm.generate``, and
    the generated text is scored with ``map_evaluation_to_score``. The
    per-question results are written as JSON to ``output_folder`` under the
    same filename.

    Errors are reported with ``print`` and never propagated: a failing
    question is recorded with an ``'error'`` entry and a score of 0.5, a
    failing file is skipped, and a failure during setup ends processing.

    Args:
        llm: Object exposing ``generate(prompt, sampling_params)`` whose
            result is indexed as ``outputs[0].outputs[0].text``.
        statement_folder: Directory containing the statement JSON files.
        context_folder: Directory containing the context JSON files.
        output_folder: Directory the result JSON files are written to
            (created if missing).

    Returns:
        None.
    """
    try:
        # Create output folder
        os.makedirs(output_folder, exist_ok=True)

        # Get list of files
        statement_files = [f for f in os.listdir(statement_folder) if f.endswith('.json')]
        print(f"Found {len(statement_files)} files to process")

        # Set sampling parameters
        sampling_params = SamplingParams(temperature=0.3, top_p=0.85, max_tokens=512)

        # Process each file
        for filename in tqdm(statement_files, desc="Processing files"):
            try:
                # Load statement file
                statement_path = os.path.join(statement_folder, filename)
                with open(statement_path, "r", encoding='utf-8') as f:
                    statement_data = json.load(f)

                # Load context file (same filename, different folder)
                context_path = os.path.join(context_folder, filename)
                with open(context_path, "r", encoding='utf-8') as f:
                    context_data = json.load(f)

                # Initialize results dictionary
                file_results = {}

                # Process each question
                for question_number in range(1, 16):
                    # Set before the try so the error handler always sees the
                    # values for THIS question, never leftovers from an earlier
                    # question or file.
                    key = f"Q{question_number}"
                    sentence = None
                    try:
                        if key not in statement_data:
                            continue

                        # Create prompt
                        sentence = statement_data[key]
                        context = context_data["Final_text"]
                        prompt = create_prompt_factuality_evaluation(context, sentence)

                        # Generate output (one prompt per call)
                        outputs = llm.generate(prompt, sampling_params)
                        evaluation_text = outputs[0].outputs[0].text

                        # Get detailed scoring result
                        score_result = map_evaluation_to_score(evaluation_text)

                        # Store results with detailed information
                        file_results[key] = {
                            'statement': sentence,
                            'evaluation': evaluation_text,
                            'score': score_result['score'],
                            'direct_score': score_result['direct_score'],
                            'yes_matches': score_result['yes_count'],
                            'no_matches': score_result['no_count'],
                            'total_matches': score_result['total_matches'],
                            'decision_type': score_result['decision_type'],
                            'confidence': score_result['confidence']
                        }

                    except Exception as e:
                        print(f"Error processing {filename} Q{question_number}: {str(e)}")
                        file_results[key] = {
                            'statement': sentence,
                            'error': str(e),
                            'score': 0.5  # Default score for errors
                        }

                # Save results
                output_path = os.path.join(output_folder, f"{filename}")
                with open(output_path, 'w', encoding='utf-8') as f:
                    json.dump(file_results, f, indent=2, ensure_ascii=False)

            except Exception as e:
                print(f"Error processing file {filename}: {str(e)}")

        print("\nProcessing complete!")

    except Exception as e:
        print(f"Error initializing process: {str(e)}")

if __name__ == "__main__":
    # Input/output locations: statements to judge, the contexts they are
    # judged against, and where the evaluation results are written.
    statement_folder, context_folder, output_folder = (
        "Inference_summary",
        "Files_with_Summary",
        "Evaluation_summary",
    )

    try:
        # Load the model sharded across 4 GPUs (tensor_parallel_size=4).
        print("Initializing model...")
        llm = LLM(
            "/proj/lab_valdes/models/HF_models/Llama-3.3-70B-Instruct/",
            tensor_parallel_size=4,
            gpu_memory_utilization=0.92,
            max_num_batched_tokens=2048,
        )

        # Run the evaluation over every statement file.
        process_files(llm, statement_folder, context_folder, output_folder)

    except Exception as e:
        # Anything raised by model construction (or escaping process_files)
        # is reported here rather than propagated.
        print(f"Error initializing model: {str(e)}")

Initializing model...
INFO 01-27 10:14:05 config.py:813] Defaulting to use mp for distributed inference
INFO 01-27 10:14:05 config.py:911] Chunked prefill is enabled with max_num_batched_tokens=2048.
INFO 01-27 10:14:05 llm_engine.py:184] Initializing an LLM engine (v0.5.5) with config: model='/proj/lab_valdes/models/HF_models/Llama-3.3-70B-Instruct/', speculative_config=None, tokenizer='/proj/lab_valdes/models/HF_models/Llama-3.3-70B-Instruct/', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=4, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_tr

Loading safetensors checkpoint shards:   0% Completed | 0/30 [00:00<?, ?it/s]


[1;36m(VllmWorkerProcess pid=4000887)[0;0m INFO 01-27 10:14:31 model_runner.py:890] Loading model weights took 32.8892 GB
INFO 01-27 10:14:31 model_runner.py:890] Loading model weights took 32.8892 GB
[1;36m(VllmWorkerProcess pid=4000888)[0;0m INFO 01-27 10:14:31 model_runner.py:890] Loading model weights took 32.8892 GB
[1;36m(VllmWorkerProcess pid=4000886)[0;0m INFO 01-27 10:14:31 model_runner.py:890] Loading model weights took 32.8892 GB
INFO 01-27 10:14:34 distributed_gpu_executor.py:56] # GPU blocks: 26655, # CPU blocks: 3276
[1;36m(VllmWorkerProcess pid=4000888)[0;0m INFO 01-27 10:14:37 model_runner.py:1181] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
[1;36m(VllmWorkerProcess pid=4000888)[0;0m INFO 01-27 10:14:37 model_runner.py:1185] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of m

Processing files:   0%|                                                                                                                                                                                                                                                                                                                                       | 0/100 [00:00<?, ?it/s]
Processed prompts:   0%|                                                                                                                                                                                                                                                                                    | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s][A
Processed prompts: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████


Processing complete!



