In [3]:
import random
import os
from pathlib import Path

def get_random_trials(input_file, output_file, num_trials=100, seed=42):
    """
    Select a reproducible random sample of trials and save their file names.

    Reads one trial path per line from ``input_file`` (blank lines are skipped),
    keys every path by its last ``/``-separated component, draws ``num_trials``
    distinct file names, sorts them and writes them to ``output_file`` one per
    line. A short summary is printed.

    Parameters:
    - input_file: path to file containing trial paths (one per line)
    - output_file: path to save selected trial names
    - num_trials: number of trials to select; capped, with a warning, at the
      number of distinct file names available
    - seed: random seed for reproducibility

    Returns:
    - list with the full paths of the selected trials, ordered by file name,
      or None when any error occurred (the error is printed, not raised)
    """
    try:
        # A private generator yields the same sample as random.seed(seed) followed
        # by random.sample(...), but leaves the process-wide random state alone.
        rng = random.Random(seed)

        # Read and process file paths
        with open(input_file, 'r') as f:
            all_trials = [line.strip() for line in f if line.strip()]

        # Extract filenames and create a mapping (file name -> full path)
        filename_map = {path.split('/')[-1]: path for path in all_trials}
        filenames = list(filename_map.keys())

        # Paths sharing a file name collapse to one entry (the last path wins);
        # report it instead of dropping them silently.
        collapsed = len(all_trials) - len(filenames)
        if collapsed:
            print(f"Warning: {collapsed} paths share a file name with another path; "
                  f"only the last path per name is kept")

        # Validate number of trials
        available_trials = len(filenames)
        if num_trials > available_trials:
            print(f"Warning: Requested {num_trials} trials but only {available_trials} available")
            num_trials = available_trials

        # Select random trials, sorted for readability
        selected_trials = sorted(rng.sample(filenames, num_trials))

        # Save selected trial names
        with open(output_file, 'w') as f:
            f.writelines(f"{trial}\n" for trial in selected_trials)

        # Print summary
        print(f"\nSummary:")
        print(f"Total trials available: {available_trials}")
        print(f"Trials selected: {num_trials}")
        print(f"Output saved to: {output_file}")
        print(f"Random seed used: {seed}")

        # Return the full paths of selected trials
        return [filename_map[trial] for trial in selected_trials]

    except Exception as e:
        # Best effort: report the failure and signal it with None
        print(f"Error: {str(e)}")
        return None

if __name__ == "__main__":
    # Input list of trial paths and the file that receives the sampled names
    input_file, output_file = "cancer_trials.txt", "trials_with_results.txt"

    # Sample 100 trials with a fixed seed; holds their full paths (None on error)
    selected_paths = get_random_trials(input_file, output_file, num_trials=100, seed=42)


Summary:
Total trials available: 18284
Trials selected: 100
Output saved to: trials_with_results.txt
Random seed used: 42


## Copy original files

In [8]:
import shutil
import os
# Create destination folder
destination_folder = "Original_files_with_results"
source_folder = "/home/4481281/Clinical_trials/Original_format/Original_format/"
trials_file = "trials_with_results.txt"
os.makedirs(destination_folder, exist_ok=True)

# Read trial names (one per line, blank lines skipped)
with open(trials_file, 'r') as f:
    trial_names = [line.strip() for line in f if line.strip()]

# Copy files.
# Each trial is looked up directly under source_folder. The previous
# `for folder_num in range(1, 100)` loop never used folder_num, so it only
# re-checked this same path; it has been removed.
copied = 0
missing = []
for trial_name in trial_names:
    source_path = os.path.join(source_folder, trial_name)
    if not os.path.exists(source_path):
        missing.append(trial_name)
        continue
    try:
        shutil.copy2(source_path, os.path.join(destination_folder, trial_name))
        copied += 1
    except Exception as e:
        print(f"Error copying {trial_name}: {str(e)}")

# Report the outcome so missing files do not go unnoticed
print(f"Copied {copied} out of {len(trial_names)} files")
if missing:
    print(f"Not found in {source_folder}: {len(missing)}")
    for trial_name in missing:
        print(f"- {trial_name}")

## Prompt

In [7]:
def create_prompt(data, question_number):
    """
    Build the prompt asking one predefined question about a clinical trial.

    `data` is interpolated into the opening sentence and `question_number`
    (1-15) selects the question text; a number outside the table raises
    KeyError. Returns the full prompt as one string ending in "RESPONSE:".
    """
    question_bank = {
        1: "Definition: Define the title and purpose of the clinical trial.",
        2: "Condition: Describe the conditions studied.",
        3: "Design Details: Explain how the study was designed, including the number of participants enrolled.",
        4: "Interventions: Describe the interventions investigated in the clinical trial.",
        5: "Study Arms: Explain how the study arms were structured.",
        6: "Eligibility Criteria: Describe eligibility criteria for participation in the study.",
        7: "Primary Outcome: Describe the primary outcome measured.",
        8: "Primary Outcome Statistical Analysis: Describe the statistical methods used to analyze the primary outcome.",
        9: "Primary Outcome Statistical Results: Summarize the statistical results obtained for the primary outcome.",
        10: "Secondary Outcomes Overview: Provide a general summary of the secondary outcomes measured.",
        11: "Statistical Approach: Briefly describe the statistical methods used to analyze the secondary outcomes.",
        12: ("Key Results: Highlight the most important statistical results and clinically relevant findings from the secondary outcomes.\n"
             "- If no secondary outcomes are present, state: No secondary outcomes were measured in this clinical trial."),
        13: "Serious Adverse Events (SAEs): Summarize the most significant and clinically relevant serious adverse events reported.",
        14: ("Non-Serious Adverse Events: Briefly list or group the most frequent non-serious adverse events highlighting those "
             "that occurred most commonly or had the greatest impact on participant well-being."),
        15: ("Key Observations and Clinical Relevance: Provide a short overview of the overall safety profile based on the adverse events, "
             "focusing on any notable trends or conclusions about tolerability and risk.\n"
             "If no adverse events are present, state: No adverse events were reported in this clinical trial."),
    }

    # Opening sentence first (it embeds `data`), then the question lookup.
    opening = (
        "You are an advanced clinical language model. Answer the following specific question about the clinical trial "
        f"{data}. Follow the structure below to enhance clinical reasoning capabilities:"
    )
    question_section = "QUESTION:\n" + question_bank[question_number]
    requirements_section = "\n".join([
        "REQUIREMENTS:",
        "- Ensure clinical reasoning",
        "- Use medically precise terminology",
        "- Avoid introductory or concluding sentences",
        "- Be specific and focused on the requested aspect",
        "- If information is not available, explicitly state this",
    ])

    # Sections are separated by one blank line; the prompt ends at the cue.
    return "\n\n".join([opening, question_section, requirements_section, "RESPONSE:"])

In [8]:
# Preview the prompt text produced for question 2 of "trial 2"
prompts = create_prompt(data="trial 2", question_number=2)
print(prompts)

You are an advanced clinical language model. Answer the following specific question about the clinical trial trial 2. Follow the structure below to enhance clinical reasoning capabilities:

QUESTION:
Condition: Describe the conditions studied.

REQUIREMENTS:
- Ensure clinical reasoning
- Use medically precise terminology
- Avoid introductory or concluding sentences
- Be specific and focused on the requested aspect
- If information is not available, explicitly state this

RESPONSE:


## Files for GPT

In [9]:
import os
import shutil
from pathlib import Path

def copy_trial_files(trials_list_file, source_folder, destination_folder):
    """
    Copy files listed in trials_list_file from source_folder to destination_folder

    The source tree is walked once and indexed by file name; every listed
    name is then looked up by exact match, so characters such as ``[``, ``*``
    or ``?`` in a name are taken literally rather than as a glob pattern. When
    several files share a name, the first one met in a sorted top-down walk is
    used, which makes the choice repeatable.

    Parameters:
    - trials_list_file: txt file containing trial filenames (one per line)
    - source_folder: folder containing original files (with subfolders)
    - destination_folder: where to copy the files (created if missing)

    Returns:
    - None. A summary is printed; names that were not found, or whose copy
      failed, are also written to 'not_found_trials.txt' in the current
      working directory. Any unexpected error is printed, not raised.
    """
    try:
        # Create destination folder if it doesn't exist
        os.makedirs(destination_folder, exist_ok=True)

        # Read trial names from file
        with open(trials_list_file, 'r') as f:
            trial_names = [line.strip() for line in f if line.strip()]

        print(f"Found {len(trial_names)} trials to copy")

        # Index the source tree in a single pass instead of one recursive
        # search per trial.
        wanted = set(trial_names)
        located = {}
        for dirpath, dirnames, filenames in os.walk(source_folder):
            dirnames.sort()  # in-place sort fixes the order os.walk descends in
            for filename in sorted(filenames):
                if filename in wanted and filename not in located:
                    located[filename] = os.path.join(dirpath, filename)

        # Track progress
        copied = []
        not_found = []

        for trial_name in trial_names:
            source_path = located.get(trial_name)
            if source_path is None:
                not_found.append(trial_name)
                continue

            dest_path = os.path.join(destination_folder, trial_name)
            try:
                shutil.copy2(source_path, dest_path)
                copied.append(trial_name)
            except Exception as e:
                # A failed copy is reported and listed together with the
                # files that were not found.
                print(f"Error copying {trial_name}: {e}")
                not_found.append(trial_name)

        # Print summary
        print("\nCopy Summary:")
        print(f"Total files to copy: {len(trial_names)}")
        print(f"Successfully copied: {len(copied)}")
        print(f"Not found: {len(not_found)}")

        if not_found:
            print("\nFiles not found:")
            for file in not_found:
                print(f"- {file}")

            # Save list of not found files
            with open('not_found_trials.txt', 'w') as f:
                f.writelines(f"{file}\n" for file in not_found)

    except Exception as e:
        print(f"Error: {str(e)}")

# List of sampled trial names, folder to search, and folder that receives the copies
trials_list, source_dir, dest_dir = "trials_with_results.txt", "../Files_for_GPT/", "TRIALS"

copy_trial_files(trials_list_file=trials_list, source_folder=source_dir, destination_folder=dest_dir)

Found 100 trials to copy

Copy Summary:
Total files to copy: 100
Successfully copied: 100
Not found: 0


## Copy files with summary

In [2]:
import os
import shutil

import ast

# Create Summary folder
os.makedirs("Files_with_Summary", exist_ok=True)

# Read file paths from text file.
# The file is expected to contain a Python literal (e.g. a list of path
# strings). NOTE: this used to be eval(f.read()), which would execute any
# code placed in the file; ast.literal_eval accepts literals only.
with open("summary_files.txt", 'r') as f:
    file_paths = ast.literal_eval(f.read().strip())

# Copy each file into the summary folder under its base name
copied = 0
for file_path in file_paths:
    try:
        filename = os.path.basename(file_path)
        shutil.copy2(file_path, os.path.join("Files_with_Summary", filename))
        copied += 1
    except Exception as e:
        print(f"Error copying {file_path}: {e}")

print(f"Copied {copied} out of {len(file_paths)} files")

Copied 100 out of 100 files


## Split Q answers at the sentence level

In [2]:
import json
import os
import re
from nltk.tokenize import sent_tokenize
import nltk
from tqdm import tqdm

def clean_sentence(sentence):
    """Collapse each run of whitespace to one space and trim both ends."""
    return re.sub(r'\s+', ' ', sentence).strip()

def split_text_to_sentences(text):
    """Normalise whitespace, tokenize into sentences, and drop empty results."""
    # Collapse whitespace before tokenizing
    normalized = re.sub(r'\s+', ' ', text).strip()
    # Clean every tokenized sentence lazily, keeping only the non-empty ones
    cleaned = (clean_sentence(piece) for piece in sent_tokenize(normalized))
    return [sentence for sentence in cleaned if sentence]

def process_json_files(input_folder, output_folder, num_questions=15):
    """
    Process JSON files by splitting text into sentences for each question

    For every ``*.json`` file in ``input_folder``, the keys ``Q1`` ..
    ``Q<num_questions>`` that are present are split with
    ``split_text_to_sentences`` and saved, as lists of sentences, to a file of
    the same name in ``output_folder``. Other keys are not copied to the
    output. A file that raises an error is reported and skipped.

    Parameters:
    - input_folder: folder containing the input JSON files
    - output_folder: folder for the processed files (created if missing)
    - num_questions: highest question number to process. Defaults to 15, the
      number of questions defined in create_prompt; the earlier hard-coded
      limit of 7 dropped Q8-Q15 from the output. Absent keys are skipped, so
      files holding only Q1-Q7 produce the same output as before.
    """
    # Download NLTK data if needed
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')

    # Create output folder
    os.makedirs(output_folder, exist_ok=True)

    # Get all JSON files
    json_files = [f for f in os.listdir(input_folder) if f.endswith('.json')]
    print(f"Found {len(json_files)} JSON files to process")

    question_keys = [f'Q{i}' for i in range(1, num_questions + 1)]
    processed_count = 0
    failed_files = []

    # Process files with progress bar
    for file_name in tqdm(json_files, desc="Processing files"):
        try:
            input_path = os.path.join(input_folder, file_name)
            output_path = os.path.join(output_folder, file_name)

            # Read input JSON
            with open(input_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Process each question that is present in the file
            processed_data = {
                key: split_text_to_sentences(data[key])
                for key in question_keys
                if key in data
            }

            # Save processed data
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(processed_data, f, indent=2, ensure_ascii=False)

            processed_count += 1

        except Exception as e:
            failed_files.append(file_name)
            print(f"\nError processing {file_name}: {str(e)}")

    # Print summary (only files that completed count as processed)
    print("\nProcessing Summary:")
    print(f"Total files processed: {processed_count}")
    if failed_files:
        print(f"Files failed: {len(failed_files)}")
    print(f"Output folder: {output_folder}")

####################################################
# Source folder of JSON files and the folder that receives the sentence-split copies
input_folder, output_folder = "Inference_summary", "Inference_summary_Sentence"

process_json_files(input_folder=input_folder, output_folder=output_folder)

Found 100 JSON files to process


Processing files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 453.73it/s]


Processing Summary:
Total files processed: 100
Output folder: Inference_summary_Sentence





## Split GPT test data (disorders) into sentences

In [1]:
import json
import logging
import os
from typing import List, Dict
import nltk
from nltk.tokenize import sent_tokenize
# Download the punkt tokenizer data only when it is not already installed,
# so the cell does not need network access on every run.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

def split_paragraphs_to_sentences(input_file: str, output_dir: str):
    """
    Rewrite one JSON file with its Q1-Q7 paragraphs split into sentences.

    The fields 'index', 'AUI', 'CUI', 'term', 'Definition', 'reasoning' and
    'statement' are copied as they are ('' when absent). Each of Q1..Q7
    becomes a dict mapping the sentence position ("0", "1", ...) to the
    stripped sentence, or an empty dict when the field is missing or empty.
    The result is saved under the input's base name inside ``output_dir``.

    Args:
        input_file: Path to input JSON file
        output_dir: Directory where the split JSON file will be saved
            (created if missing)

    Returns:
        The processed dict, or None if an error occurred (the error is logged).
    """
    passthrough_fields = ('index', 'AUI', 'CUI', 'term', 'Definition', 'reasoning', 'statement')

    try:
        os.makedirs(output_dir, exist_ok=True)

        with open(input_file, 'r', encoding='utf-8') as source:
            data = json.load(source)

        # Fields that are carried over without splitting
        processed_data = {name: data.get(name, '') for name in passthrough_fields}

        # Q1..Q7: number the sentences of each non-empty paragraph
        for number in range(1, 8):
            q_key = f'Q{number}'
            numbered = {}
            if q_key in data and data[q_key]:
                for position, sentence in enumerate(sent_tokenize(data[q_key])):
                    numbered[str(position)] = sentence.strip()
            processed_data[q_key] = numbered

        destination = os.path.join(output_dir, os.path.basename(input_file))
        with open(destination, 'w', encoding='utf-8') as sink:
            json.dump(processed_data, sink, indent=2, ensure_ascii=False)

        return processed_data

    except Exception as e:
        logging.error(f"Error processing file {input_file}: {str(e)}")
        return None

def process_all_files(input_dir: str, output_dir: str):
    """Run split_paragraphs_to_sentences on every '.json' file in input_dir.

    The output directory is created first. Any error raised while listing or
    processing is logged rather than propagated.
    """
    try:
        os.makedirs(output_dir, exist_ok=True)

        # Only entries whose name ends in '.json' are handled
        json_names = (name for name in os.listdir(input_dir) if name.endswith('.json'))
        for filename in json_names:
            print(f"Processing {filename}...")
            split_paragraphs_to_sentences(os.path.join(input_dir, filename), output_dir)

    except Exception as e:
        logging.error(f"Error processing directory: {str(e)}")

# Example usage
if __name__ == "__main__":
    # Folder with the paragraph-level JSON files to read
    input_dir = "/home/4481281/Clinical_trials/Original_format/Results/Factuality_Eval/Inference_files/UMLS/Paragraph_level"
    # Folder that receives the sentence-level JSON files
    output_dir = "/home/4481281/Clinical_trials/Original_format/Results/Factuality_Eval/Inference_files/UMLS/Sentence_level"

    process_all_files(input_dir=input_dir, output_dir=output_dir)

Processing NCT047.json...
Processing NCT109.json...
Processing NCT039.json...
Processing NCT055.json...
Processing NCT064.json...
Processing NCT083.json...
Processing NCT108.json...
Processing NCT035.json...
Processing NCT010.json...
Processing NCT024.json...
Processing NCT119.json...
Processing NCT058.json...
Processing NCT077.json...
Processing NCT071.json...
Processing NCT001.json...
Processing NCT091.json...
Processing NCT018.json...
Processing NCT115.json...
Processing NCT004.json...
Processing NCT015.json...


[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     self-signed certificate in certificate chain
[nltk_data]     (_ssl.c:1000)>


## Check sentence splitting

In [8]:
import os
import json
from transformers import AutoTokenizer
from tqdm import tqdm

def analyze_token_counts(input_folder: str,
                         model_name: str = "/proj/lab_valdes/models/HF_models/Llama-3.3-70B-Instruct/",
                         min_tokens: int = 5,
                         num_questions: int = 15):
    """
    Analyze token counts in JSON files and report items with fewer than
    ``min_tokens`` tokens.

    Every ``*.json`` file in ``input_folder`` is read (in sorted order). For each
    key ``Q1`` .. ``Q<num_questions>`` that is present, the value (a list of
    items, or a single item treated as a one-element list) is tokenized item by
    item. Short items are printed as they are found, followed by overall and
    per-question totals. A file that raises an error is reported and skipped.

    Args:
        input_folder: Path to folder containing JSON files
        model_name: Name or local path passed to ``AutoTokenizer.from_pretrained``.
            The default is the local path this function previously hard-coded
            (the argument used to be ignored), so default calls are unchanged.
        min_tokens: Items with fewer tokens than this are reported (default 5)
        num_questions: Highest question number to inspect (default 15)
    """
    # Initialize tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    question_keys = [f"Q{i}" for i in range(1, num_questions + 1)]

    # Initialize counters
    total_items = 0
    short_items = 0
    short_items_by_q = {key: 0 for key in question_keys}
    total_items_by_q = {key: 0 for key in question_keys}

    # Get list of JSON files
    json_files = sorted(f for f in os.listdir(input_folder) if f.endswith('.json'))

    # Process each file
    for filename in tqdm(json_files, desc="Processing files"):
        try:
            with open(os.path.join(input_folder, filename), 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Process each question
            for key in question_keys:
                if key not in data:
                    continue

                items = data[key]
                if not isinstance(items, list):
                    items = [items]

                # Count tokens for each item
                for item in items:
                    # NOTE(review): encode() is called with its defaults, so the
                    # count presumably includes special tokens added by the
                    # tokenizer (recorded output shows "3." as 3 tokens) - confirm.
                    token_count = len(tokenizer.encode(item))

                    total_items += 1
                    total_items_by_q[key] += 1

                    if token_count < min_tokens:
                        short_items += 1
                        short_items_by_q[key] += 1
                        print(f"\nShort item found in {filename}, {key}:")
                        print(f"Text: {item}")
                        print(f"Token count: {token_count}")

        except Exception as e:
            print(f"\nError processing {filename}: {str(e)}")

    # Print results
    print("\nAnalysis Results:")
    print(f"Total items analyzed: {total_items}")
    print(f"Items with less than {min_tokens} tokens: {short_items}")
    if total_items > 0:
        print(f"Percentage: {(short_items/total_items)*100:.2f}%")
    else:
        # Avoid ZeroDivisionError when the folder holds no analyzable items
        print("Percentage: N/A (no items analyzed)")

    print("\nBreakdown by question:")
    for key in question_keys:
        total = total_items_by_q[key]
        short = short_items_by_q[key]
        if total > 0:
            percentage = (short/total)*100
            print(f"{key}: {short}/{total} ({percentage:.2f}%)")

if __name__ == "__main__":
    # Folder of sentence-level JSON files to check; change this to your input folder path
    input_folder = "Inference_summary_Sentence"
    analyze_token_counts(input_folder=input_folder)

Processing files:  64%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                  | 64/100 [00:00<00:00, 319.34it/s]


Short item found in NCT00136916.json, Q11:
Text: Not applicable.
Token count: 4

Short item found in NCT00136916.json, Q12:
Text: Not applicable.
Token count: 4

Short item found in NCT00448136.json, Q1:
Text: 3.
Token count: 3

Short item found in NCT00448136.json, Q1:
Text: 4.
Token count: 3

Short item found in NCT00448136.json, Q1:
Text: 3.
Token count: 3

Short item found in NCT00448136.json, Q1:
Text: **Design Details
Token count: 4

Short item found in NCT00448136.json, Q2:
Text: 2.
Token count: 3

Short item found in NCT00448136.json, Q2:
Text: 3.
Token count: 3

Short item found in NCT00448136.json, Q2:
Text: 4.
Token count: 3

Short item found in NCT00448136.json, Q2:
Text: 5.
Token count: 3

Short item found in NCT00448136.json, Q2:
Text: 6.
Token count: 3

Short item found in NCT00448136.json, Q2:
Text: 3.
Token count: 3

Short item found in NCT00448136.json, Q2:
Text: 4.
Token count: 3

Short item found in NCT00448136.json, Q2:
Text: 2.
Token count: 3

Short item found in

Processing files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 307.89it/s]


Short item found in NCT03003949.json, Q1:
Text: 1.
Token count: 3

Short item found in NCT03003949.json, Q1:
Text: 2.
Token count: 3

Short item found in NCT03003949.json, Q2:
Text: 1.
Token count: 3

Short item found in NCT03003949.json, Q3:
Text: 1.
Token count: 3

Short item found in NCT03003949.json, Q3:
Text: 2.
Token count: 3

Short item found in NCT03003949.json, Q3:
Text: 3.
Token count: 3

Short item found in NCT03003949.json, Q3:
Text: 4.
Token count: 3

Short item found in NCT03003949.json, Q4:
Text: 1.
Token count: 3

Short item found in NCT03003949.json, Q6:
Text: 1.
Token count: 3

Short item found in NCT03003949.json, Q6:
Text: 2.
Token count: 3

Short item found in NCT03003949.json, Q6:
Text: 3.
Token count: 3

Short item found in NCT03003949.json, Q6:
Text: 1.
Token count: 3

Short item found in NCT03003949.json, Q6:
Text: 2.
Token count: 3

Short item found in NCT03003949.json, Q6:
Text: 3.
Token count: 3

Short item found in NCT03009058.json, Q3:
Text: ```
Token cou


