In [1]:
import os
# Expose only GPUs 0-3 to CUDA; set before torch/vllm are imported
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

import json
import transformers
import torch
import os
from vllm import LLM, SamplingParams
import time



In [None]:
import os
import json
import transformers
import torch
from vllm import LLM, SamplingParams
import time
from tqdm import tqdm

def map_evaluation_to_score(evaluation):
    """
    Map the evaluation text to a score based on specific patterns.

    The text is matched case-insensitively. An explicit verdict at the very
    start of the text (optionally preceded by markdown/punctuation or an
    "Answer:" prefix) takes precedence; otherwise the text is scanned for
    known "yes" phrases first and known "no" phrases second.

    "yes" and "no" are matched as whole words, so words such as "eyes",
    "yesterday" or "piano." no longer trigger a verdict.

    Args:
        evaluation: Free-text answer produced by the model.

    Returns:
        0 if the text indicates "Yes" (statement supported),
        1 if the text indicates "No" (statement not supported),
        0.5 if no verdict can be determined (including empty/None input).
    """
    # Local import keeps this function self-contained within the notebook cell.
    import re

    # Nothing to inspect: fall back to the "undetermined" score.
    if not evaluation:
        return 0.5

    # Convert to lowercase for case-insensitive matching
    eval_lower = evaluation.lower()

    # A verdict that opens the answer ("Yes.", "**No**", "Answer: no, ...") is
    # the most reliable signal. The look-ahead rejects sentences that merely
    # start with the word, e.g. "No evidence of ...".
    leading = re.match(r"\W*(?:answer\W+)?(yes|no)\b(?![ \t]+\w)", eval_lower)
    if leading:
        return 0 if leading.group(1) == "yes" else 1

    # Patterns indicating "Yes" (score = 0)
    yes_patterns = (
        r"\byes\b",  # also covers "**yes**" and "answer: yes"
        r"the statement is supported by the context",
        r"is fully supported",
    )

    # Patterns indicating "No" (score = 1)
    no_patterns = (
        r"\bno\.",
        r"the statement is not supported by the context",
        r"answer: no\b",
        r"statement is not fully supported",
        r"\*\*no\*\*",
        r"partially supported",
    )

    # Check for yes patterns first, then no patterns (original precedence).
    if any(re.search(pattern, eval_lower) for pattern in yes_patterns):
        return 0
    if any(re.search(pattern, eval_lower) for pattern in no_patterns):
        return 1

    # Default case
    return 0.5

def create_prompt_factuality_evaluation(context, sentence):
    """Build the factuality-evaluation prompt for one statement/context pair.

    The prompt consists of a task instruction, the statement, the context,
    a Yes/No question and a trailing answer cue, concatenated in that order.
    """
    instruction = (
        "You are an advanced clinical language model. Your task is to answer "
        "whether a statement is supported by a given context.\n\n"
    )
    # NOTE(review): the statement follows "STATEMENT:" with no separator,
    # unlike "CONTEXT:\n" below -- confirm this is intended.
    statement_section = f"The **statement**.\n\nSTATEMENT:{sentence}\n\n"
    context_section = f"The **context**.\n\nCONTEXT:\n{context}\n\n"
    question = "Is the statement supported by the context above?. Answer Yes or No.\n\n"
    # NOTE(review): "ANSWER:" and "Explain your decision:" are emitted back to
    # back with nothing in between -- confirm this is intended.
    answer_cue = "ANSWER:Explain your decision:"

    sections = (instruction, statement_section, context_section, question, answer_cue)
    return "".join(sections)

def _evaluate_file_statements(llm, sampling_params, filename, statements, context_data):
    """Score all (key, sentence) pairs of one file using a single batched LLM call.

    Returns a dict keyed by question key. Each value holds either the
    statement/evaluation/score triple or, on failure, the statement, the
    error message and the default score of 0.5.
    """
    results = {}
    if not statements:
        return results

    def error_entry(sentence, error):
        return {
            'statement': sentence,
            'error': str(error),
            'score': 0.5  # Default score for errors
        }

    try:
        # The context is the same for every question of the file.
        context = context_data["Final_text"]
        prompts = [
            create_prompt_factuality_evaluation(context, sentence)
            for _, sentence in statements
        ]
        # One batched call lets the engine schedule all prompts together;
        # outputs are expected in the same order as the prompts.
        outputs = llm.generate(prompts, sampling_params)
        if len(outputs) != len(statements):
            raise RuntimeError(
                f"expected {len(statements)} outputs, got {len(outputs)}"
            )
    except Exception as e:
        # Prompt construction or generation failed: every question of this
        # file gets an error record tied to its own statement.
        for key, sentence in statements:
            print(f"Error processing {filename} {key}: {str(e)}")
            results[key] = error_entry(sentence, e)
        return results

    for (key, sentence), output in zip(statements, outputs):
        try:
            evaluation_text = output.outputs[0].text
            results[key] = {
                'statement': sentence,
                'evaluation': evaluation_text,
                'score': map_evaluation_to_score(evaluation_text)
            }
        except Exception as e:
            print(f"Error processing {filename} {key}: {str(e)}")
            results[key] = error_entry(sentence, e)

    return results


def process_files(llm, statement_folder, context_folder, output_folder, num_questions=15):
    """Process all files and questions.

    For every ``*.json`` file in ``statement_folder`` the statements stored
    under the keys ``Q1`` .. ``Q<num_questions>`` are evaluated against the
    ``"Final_text"`` entry of the same-named file in ``context_folder``.
    The per-question results are written as JSON to ``output_folder`` under
    the same file name.

    Args:
        llm: Object exposing ``generate(prompts, sampling_params)``.
        statement_folder: Folder containing the statement JSON files.
        context_folder: Folder containing the context JSON files.
        output_folder: Folder the result JSON files are written to; created
            if it does not exist.
        num_questions: Highest question number to look for (default 15).

    Errors are reported with ``print`` and never propagated: a failing
    question gets an error record with score 0.5, a failing file is skipped.
    """
    try:
        # Create output folder
        os.makedirs(output_folder, exist_ok=True)

        # Get list of files (sorted so the processing order is reproducible)
        statement_files = sorted(
            f for f in os.listdir(statement_folder) if f.endswith('.json')
        )
        print(f"Found {len(statement_files)} files to process")

        # Set sampling parameters
        sampling_params = SamplingParams(temperature=0.3, top_p=0.85, max_tokens=512)

        question_keys = [f"Q{number}" for number in range(1, num_questions + 1)]

        # Process each file
        for filename in tqdm(statement_files, desc="Processing files"):
            try:
                # Load statement file
                statement_path = os.path.join(statement_folder, filename)
                with open(statement_path, "r", encoding='utf-8') as f:
                    statement_data = json.load(f)

                # Load context file
                context_path = os.path.join(context_folder, filename)
                with open(context_path, "r", encoding='utf-8') as f:
                    context_data = json.load(f)

                # Collect the questions actually present in this file
                statements = [
                    (key, statement_data[key])
                    for key in question_keys
                    if key in statement_data
                ]

                file_results = _evaluate_file_statements(
                    llm, sampling_params, filename, statements, context_data
                )

                # Save results
                output_path = os.path.join(output_folder, f"{filename}")
                with open(output_path, 'w', encoding='utf-8') as f:
                    json.dump(file_results, f, indent=2, ensure_ascii=False)

            except Exception as e:
                print(f"Error processing file {filename}: {str(e)}")

        print("\nProcessing complete!")

    except Exception as e:
        print(f"Error initializing process: {str(e)}")

if __name__ == "__main__":
    # Script entry point: load the model once, then evaluate every file.
    # Any exception raised here is reported with print and not re-raised.
    try:
        # Input/output locations (relative to the working directory)
        statement_folder, context_folder, output_folder = (
            "Inference_json",
            "Files_with_Summary",
            "Evaluation_json",
        )

        # Load the model, sharded across 4 GPUs
        print("Initializing model...")
        model_path = "/proj/lab_valdes/models/HF_models/Llama-3.3-70B-Instruct/"
        llm = LLM(model_path, tensor_parallel_size=4)

        # Run the evaluation over all statement files
        process_files(
            llm=llm,
            statement_folder=statement_folder,
            context_folder=context_folder,
            output_folder=output_folder,
        )

    except Exception as e:
        print(f"Error initializing model: {e}")

Initializing model...
INFO 01-24 16:06:06 config.py:813] Defaulting to use mp for distributed inference
INFO 01-24 16:06:06 config.py:911] Chunked prefill is enabled with max_num_batched_tokens=512.
INFO 01-24 16:06:06 llm_engine.py:184] Initializing an LLM engine (v0.5.5) with config: model='/proj/lab_valdes/models/HF_models/Llama-3.3-70B-Instruct/', speculative_config=None, tokenizer='/proj/lab_valdes/models/HF_models/Llama-3.3-70B-Instruct/', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=4, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_tra

Loading safetensors checkpoint shards:   0% Completed | 0/30 [00:00<?, ?it/s]


[1;36m(VllmWorkerProcess pid=3802032)[0;0m INFO 01-24 16:06:30 model_runner.py:890] Loading model weights took 32.8892 GB
INFO 01-24 16:06:30 model_runner.py:890] Loading model weights took 32.8892 GB
[1;36m(VllmWorkerProcess pid=3802029)[0;0m INFO 01-24 16:06:30 model_runner.py:890] Loading model weights took 32.8892 GB
[1;36m(VllmWorkerProcess pid=3802031)[0;0m INFO 01-24 16:06:30 model_runner.py:890] Loading model weights took 32.8892 GB
INFO 01-24 16:06:34 distributed_gpu_executor.py:56] # GPU blocks: 25362, # CPU blocks: 3276
INFO 01-24 16:06:36 model_runner.py:1181] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 01-24 16:06:36 model_runner.py:1185] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also 

Processing files:   0%|                                                                                                                                                                                                                                                                                                                                       | 0/100 [00:00<?, ?it/s]
Processed prompts:   0%|                                                                                                                                                                                                                                                                                    | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s][A
Processed prompts: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████