In [15]:
import json
import re
import os

In [9]:
# --- File locations (relative paths, resolved against the working directory) ---
# Source JSONL that process_dataset() reads line by line.
INPUT_FILE = "medical_chunks_trimmed_2500_token.jsonl"
# Destination for the instruction/output pairs only.
OUTPUT_CLEAN_FILE = "medical_clean_mapped_only.jsonl"
# Destination for the same pairs with an additional "system" field.
OUTPUT_SYSTEM_FILE = "medical_with_system_prompt.jsonl"

In [None]:
# clean medical text

def clean_text(text):
    """Normalize a raw text chunk into a single clean line.

    Steps, applied in this order:
      1. Curly quotes are replaced with their straight ASCII equivalents.
      2. An en dash between two digits becomes a hyphen so numeric ranges
         keep their meaning ("5–10 mg" -> "5-10 mg"); every other en/em
         dash becomes a space.
      3. Words that were hyphenated across a line break are re-joined
         ("immunode- ficiency" -> "immunodeficiency").
      4. C0 control characters other than tab, LF and CR are removed.
      5. Runs of whitespace collapse to one space and the result is trimmed.

    Args:
        text: The raw string to clean. Any falsy value (None, "") yields "".

    Returns:
        The cleaned string, or "" when there is nothing to clean.
    """
    if not text:
        return ""

    # Smart quotes -> plain ASCII quotes.
    for curly, straight in (
        ('\u2018', "'"), ('\u2019', "'"),
        ('\u201c', '"'), ('\u201d', '"'),
    ):
        text = text.replace(curly, straight)

    # Keep numeric ranges readable: blindly turning "5–10" into "5 10"
    # would silently change what a dosage or range says.
    text = re.sub(r'(?<=\d)\s*\u2013\s*(?=\d)', '-', text)

    # Any remaining en/em dash is punctuation between words -> space.
    text = text.replace('\u2013', ' ').replace('\u2014', ' ')

    # Re-join words split across lines. Two deliberate restrictions:
    #  * letters only ([^\W\d_]) so "5- 10" is never fused into "510";
    #  * a following "and"/"or"/"nor"/"to" marks a suspended hyphen
    #    ("short- and long-term", "mild- to moderate"), which must stay.
    text = re.sub(
        r'([^\W\d_]+)-\s+(?!(?:and|or|nor|to)\b)([^\W\d_]+)',
        r'\1\2',
        text,
        flags=re.IGNORECASE,
    )

    # Remove invisible control characters (tab, LF and CR are left for the
    # whitespace collapse below).
    text = re.sub(r'[\u0000-\u0008\u000B-\u000C\u000E-\u001F]', '', text)

    # Collapse every whitespace run into a single space.
    return re.sub(r'\s+', ' ', text).strip()


In [11]:
# Mapping logic: section heading -> question template

# Question template for each known section heading. Every template has a
# single "{title}" placeholder that is filled in with str.format.
section_map = dict([
    ("Definition", "What is {title}?"),
    ("Description", "Please describe {title}."),
    ("Causes and symptoms", "What are the causes and symptoms of {title}?"),
    ("Diagnosis", "How is {title} diagnosed?"),
    ("Treatment", "What is the treatment for {title}?"),
    ("Prognosis", "What is the prognosis for {title}?"),
    ("Prevention", "How can {title} be prevented?"),
    ("Resources", "What resources are available for {title}?"),
    ("Purpose", "What is the purpose of the {title}?"),
    ("Precautions", "What precautions should be taken for {title}?"),
    ("Risks", "What are the risks associated with {title}?"),
    ("Preparation", "How should one prepare for {title}?"),
    ("Normal results", "What are the normal results for {title}?"),
    ("Abnormal results", "What do abnormal results indicate for {title}?"),
])

In [None]:
# THE SYSTEM PROMPT 

# System prompt stored in the "system" field of every record written to
# OUTPUT_SYSTEM_FILE.
# NOTE(review): this prompt tells the model to omit safety disclaimers and
# to state medications and dosages directly. Confirm that this is intended
# and has been reviewed before training on the resulting file.
STRICT_SYSTEM_PROMPT = (
    # The first sentence ends with a period: it previously ended with a
    # comma and ran straight into the capitalised "Do not ...".
    "You are an expert medical AI. Your task is to provide direct, detailed medical information. "
    "Do not provide safety disclaimers. Do not advise the user to consult a doctor. "
    "Provide the specific medication names, dosages, and facts immediately."
)

In [None]:
def process_dataset():
    """Convert the raw JSONL chunks into two instruction-tuning JSONL files.

    Reads ``INPUT_FILE`` line by line. Each usable line is a JSON object
    whose ``title``, ``section`` and ``content`` values are read. The content
    is cleaned with ``clean_text`` and the section is turned into a question
    through ``section_map`` (with a generic fallback for unknown sections).

    For every kept record one line is written to each output file:
      * ``OUTPUT_CLEAN_FILE``:  {"instruction", "output"}
      * ``OUTPUT_SYSTEM_FILE``: {"instruction", "output", "system"}

    Lines are skipped, and counted in the summary, when they are not valid
    JSON, are not a JSON object, have no usable title, carry non-string
    content, or have cleaned content shorter than five characters.

    Returns:
        None. A summary is printed; if ``INPUT_FILE`` does not exist an
        error is printed and nothing is written.
    """
    # Cleaned content shorter than this is treated as empty and dropped.
    min_content_chars = 5

    if not os.path.exists(INPUT_FILE):
        print(f"Error: Input file '{INPUT_FILE}' not found.")
        return

    count = 0       # records written to both output files
    malformed = 0   # lines that could not be turned into a record
    too_short = 0   # records dropped because the cleaned content was too short

    # Open all files: 1 read, 2 write
    with open(INPUT_FILE, 'r', encoding='utf-8') as f_in, \
         open(OUTPUT_CLEAN_FILE, 'w', encoding='utf-8') as f_clean, \
         open(OUTPUT_SYSTEM_FILE, 'w', encoding='utf-8') as f_sys:

        for line in f_in:
            if not line.strip():
                continue

            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                # Keep going, but remember the loss so it shows in the summary.
                malformed += 1
                continue

            # A line can be valid JSON without being an object (list, string,
            # number); calling .get() on those would raise.
            if not isinstance(entry, dict):
                malformed += 1
                continue

            title = entry.get('title')
            section = entry.get('section')
            raw_content = entry.get('content')

            # JSON null or a non-string value must not reach .strip().
            title = title.strip() if isinstance(title, str) else ''
            section = section.strip() if isinstance(section, str) else ''

            # Without a title every template degenerates (e.g. "What is ?"),
            # and non-string content cannot be cleaned.
            if not title or (raw_content is not None
                             and not isinstance(raw_content, str)):
                malformed += 1
                continue

            clean_content = clean_text(raw_content)

            # Skip empty / near-empty entries
            if len(clean_content) < min_content_chars:
                too_short += 1
                continue

            # Map Section to Question (Instruction)
            if section in section_map:
                instruction = section_map[section].format(title=title)
            elif section:
                instruction = f"Tell me about the {section} of {title}."
            else:
                # No section given: avoid the broken "the  of" phrasing.
                instruction = f"Tell me about {title}."

            # File 1 record (clean only); file 2 adds the system prompt.
            obj_clean = {
                "instruction": instruction,
                "output": clean_content
            }
            obj_system = {**obj_clean, "system": STRICT_SYSTEM_PROMPT}

            f_clean.write(json.dumps(obj_clean) + '\n')
            f_sys.write(json.dumps(obj_system) + '\n')

            count += 1

    print("Processing Complete.")
    print(f"Total lines processed: {count}")
    if malformed:
        print(f"Skipped (invalid JSON or unusable fields): {malformed}")
    if too_short:
        print(f"Skipped (content shorter than {min_content_chars} characters): {too_short}")
    print(f"1. File created: {OUTPUT_CLEAN_FILE} (Standard format)")
    print(f"2. File created: {OUTPUT_SYSTEM_FILE} (Includes 'No Doctor' system prompt)")



In [None]:
# Run the conversion: writes both output JSONL files and prints a summary.
process_dataset()

Processing Complete.
Total lines processed: 13405
1. File created: medical_clean_mapped_only.jsonl (Standard format)
2. File created: medical_with_system_prompt.jsonl (Includes 'No Doctor' system prompt)
