In [None]:


import os, json, re, time
from pathlib import Path
from typing import List, Dict, Any, Optional
import pandas as pd
import dotenv
import google.generativeai as genai

import concurrent.futures
from tqdm import tqdm


# --- Configuration and Setup ---

# Pull variables from a local .env file into the process environment, then
# read the Gemini API key from there; abort early when it is absent.
dotenv.load_dotenv()
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise ValueError("GEMINI_API_KEY not found in environment (.env)")
genai.configure(api_key=GEMINI_API_KEY)

# Model identifier and the directory that receives the enriched outputs.
MODEL_NAME = "gemini-2.5-flash"
OUTPUT_DIR = Path("enriched")
OUTPUT_DIR.mkdir(exist_ok=True)

# The raw dataset is expected next to the notebook as data.json.
RAW_JSON_PATH = Path("data.json")
if not RAW_JSON_PATH.exists():
    raise FileNotFoundError("data.json not found in workspace root")

# Read the whole file as UTF-8 text and decode it in one step.
raw_records = json.loads(RAW_JSON_PATH.read_text(encoding="utf-8"))

print("Loaded {} records".format(len(raw_records)))
raw_records[:1]  # notebook display of the first record


# Extract question & answer from QA_Data

def parse_qa(text: str):
    """Split a raw ``QA_Data`` string into a ``(question, answer)`` pair.

    The input is whitespace-normalised first (every run of whitespace becomes
    a single space), then parsed with these rules, in order:

    1. If the text starts with ``question:`` and later contains ``answer:``
       (case-insensitive), the question is the text between the two markers
       and the answer is everything after the first ``answer:``.
    2. Otherwise, if the literal ``Answer:`` occurs, the text is split at its
       FIRST occurrence; any ``Question:`` label is removed from the left
       part. Later ``Answer:`` occurrences stay inside the answer.
    3. Otherwise the first 200 characters are used as the question and the
       whole text as the answer.

    Runs of two or more ``?`` in the question are collapsed to one.

    Args:
        text: Raw QA string. Non-string values are tolerated.

    Returns:
        A ``(question, answer)`` tuple of strings; ``("", "")`` when ``text``
        is not a string.
    """
    if not isinstance(text, str):
        return "", ""

    cleaned = re.sub(r"\s+", " ", text).strip()
    m = re.match(r"(?i)question:\s*(.+?)\s*answer:\s*(.+)$", cleaned)
    if m:
        q_part = m.group(1).strip()
        a_part = m.group(2).strip()
    elif "Answer:" in cleaned:
        # Split only at the first marker so an answer that itself contains
        # "Answer:" is not truncated, and work on the normalised text so this
        # path yields the same whitespace as the regex path above.
        head, _, tail = cleaned.partition("Answer:")
        q_part = head.replace("Question:", "").strip()
        a_part = tail.strip()
    else:
        # No recognisable markers: best-effort fallback.
        q_part = cleaned[:200]
        a_part = cleaned

    # Collapse duplicate question marks
    q_part = re.sub(r"\?{2,}", "?", q_part)
    return q_part, a_part

# Turn every raw record into a flat row with a 1-based id. Records without a
# "QA_Data" field are parsed from an empty string.
parsed_rows = [
    {"id": row_id, "question": question, "answer": answer, "source": "medDataset"}
    for row_id, (question, answer) in enumerate(
        (parse_qa(record.get("QA_Data", "")) for record in raw_records),
        start=1,
    )
]

df = pd.DataFrame(parsed_rows)
print(df.head(2))
print("Parsed", len(df), "rows")
len(df)  # notebook display of the row count


# Instruction text that describes the annotation task and the four fields
# (category, entities, keywords, safety_level) the model is asked to produce.
SYSTEM_METADATA_PROMPT = (
    "You are a medical data annotator. Given a question and answer, produce concise structured JSON with: "
    "category (lowercase short topic), entities (list of unique medical entities: diseases, syndromes, pathogens, drugs, symptoms, risk groups), "
    "keywords (5-10 short tokens), safety_level (low|medium|high based on potential harm, pregnancy risk, biohazard, or clinical advice)."
)

# Example of the expected output shape; build_prompt serialises it into the
# prompt so the model can see the target JSON structure.
GENERATION_SCHEMA_EXAMPLE = {
    "category": "viral infection",
    "entities": ["lymphocytic choriomeningitis virus", "rodent", "meningitis"],
    "keywords": ["LCMV", "rodent", "meningitis", "encephalitis", "transmission"],
    "safety_level": "medium"
}

def build_prompt(question: str, answer: str) -> str:
    """Assemble the per-record part of the prompt.

    The result contains the question/answer pair, a ``---`` separator line and
    an instruction to return minified JSON shaped like
    ``GENERATION_SCHEMA_EXAMPLE`` (embedded as JSON text).
    """
    example_json = json.dumps(GENERATION_SCHEMA_EXAMPLE, ensure_ascii=False)
    qa_section = "Question: {}\nAnswer: {}\n---\n".format(question, answer)
    instruction = "Return ONLY valid minified JSON matching this example shape: {}".format(example_json)
    return qa_section + instruction

# Module-level model handle; call_gemini sends every prompt through it.
model = genai.GenerativeModel(MODEL_NAME)

def call_gemini(question: str, answer: str, retries: int = 3, delay: float = 2.0) -> Dict[str, Any]:
    """Ask the model for structured metadata about one question/answer pair.

    The prompt is ``SYSTEM_METADATA_PROMPT`` followed by ``build_prompt(...)``.
    The reply is stripped of Markdown code fences, parsed as JSON and
    validated. Any failure (API error, unparsable reply, missing key) triggers
    a retry after ``delay * attempt`` seconds.

    Args:
        question: Question text of the record.
        answer: Answer text of the record.
        retries: Maximum number of attempts.
        delay: Base back-off in seconds; attempt ``n`` waits ``delay * n``
            before the next attempt.

    Returns:
        On success, a dict with exactly the keys ``category``, ``entities``,
        ``keywords`` and ``safety_level`` (``entities``/``keywords`` are
        always lists). If every attempt fails, a dict with the same four keys
        set to empty placeholders (``None`` / ``[]``) plus an ``error`` key
        holding the last error message. The function never raises for
        per-request failures so one bad record cannot abort a batch run.
    """
    required_keys = ("category", "entities", "keywords", "safety_level")
    prompt = SYSTEM_METADATA_PROMPT + "\n" + build_prompt(question, answer)
    last_err: Optional[Exception] = None

    for attempt in range(1, retries + 1):
        try:
            resp = model.generate_content(prompt)
            text = resp.text.strip()
            # Drop ```json / ``` fences the model may wrap around the payload.
            json_candidate = re.sub(
                r'^```json|^```|```$', '', text,
                flags=re.IGNORECASE | re.MULTILINE,
            )
            parsed = json.loads(json_candidate)

            if not isinstance(parsed, dict):
                raise ValueError(
                    f"Expected a JSON object, got {type(parsed).__name__}"
                )
            for k in required_keys:
                if k not in parsed:
                    raise ValueError(f"Missing key {k}")

            # Keep only the expected fields: callers merge this dict into the
            # record, so stray keys from the model (e.g. "id", "question")
            # must not be able to overwrite record data.
            result: Dict[str, Any] = {k: parsed[k] for k in required_keys}
            if not isinstance(result["entities"], list):
                result["entities"] = []
            if not isinstance(result["keywords"], list):
                result["keywords"] = []
            return result
        except Exception as e:
            # Deliberately broad: this is the best-effort boundary for a
            # single record; the error is reported in the fallback result.
            last_err = e
            if attempt < retries:
                # No point in sleeping after the final attempt.
                time.sleep(delay * attempt)

    # Do NOT fall back to GENERATION_SCHEMA_EXAMPLE here: that would label the
    # failed record with the example's (unrelated) metadata and share its
    # mutable lists across results.
    return {
        "error": str(last_err) if last_err is not None else "no attempts were made",
        "category": None,
        "entities": [],
        "keywords": [],
        "safety_level": None,
    }

# Smoke test: enrich the row labelled 0 before launching the full batch run.
first_record = df.loc[0]
sample_meta = call_gemini(first_record['question'], first_record['answer'])
print("Smoke Test Result:", sample_meta)


def process_row(row: Dict[str, Any]) -> Dict[str, Any]:
    """Enrich one record with model metadata and derived text fields.

    Returns a new dict containing the row's own fields, then whatever
    ``call_gemini`` returned (overriding same-named row fields), then
    ``char_count`` (length of the answer) and ``combined_text`` (the
    question and answer rendered as one labelled string).
    """
    question, answer = row['question'], row['answer']

    enriched = dict(row)
    enriched.update(call_gemini(question, answer))
    enriched['char_count'] = len(answer)
    enriched['combined_text'] = f"Question: {question}\nAnswer: {answer}"
    return enriched


# Number of threads issuing model requests in parallel.
MAX_WORKERS = 50
print(f"Starting concurrent enrichment with {MAX_WORKERS} workers...")
rows_to_process = df.to_dict(orient='records')

# executor.map yields results in input order; tqdm wraps that iterator to
# show progress while the results are collected.
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    progress = tqdm(
        executor.map(process_row, rows_to_process),
        total=len(rows_to_process),
        desc="Enriching Records",
    )
    results = [enriched for enriched in progress]

enriched_rows = list(results)

edf = pd.DataFrame(enriched_rows)
print(edf.head(2))
print("Enriched", len(edf), "rows")



# Output locations inside OUTPUT_DIR.
ENRICHED_JSON = OUTPUT_DIR / "enriched_records.json"
ENRICHED_JSONL = OUTPUT_DIR / "enriched_records.jsonl"
ENRICHED_CSV = OUTPUT_DIR / "enriched_records.csv"
TEXT_CORPUS = OUTPUT_DIR / "metadata_prefixed_corpus.txt"

# Tabular exports: pretty JSON array, JSON Lines, and CSV.
edf.to_json(ENRICHED_JSON, orient='records', indent=2, force_ascii=False)
edf.to_json(ENRICHED_JSONL, orient='records', lines=True, force_ascii=False)
edf.to_csv(ENRICHED_CSV, index=False)

# Text corpus: for each record, one JSON metadata header line followed by the
# combined question/answer text and a blank line.
header_fields = ('category', 'entities', 'keywords', 'safety_level')
with TEXT_CORPUS.open('w', encoding='utf-8') as f:
    for record in edf.to_dict(orient='records'):
        meta_header = {'id': record['id']}
        meta_header.update((field, record.get(field)) for field in header_fields)
        header_line = json.dumps(meta_header, ensure_ascii=False)
        f.write(header_line + "\n" + record['combined_text'] + "\n\n")

print("Saved:")
for p in (ENRICHED_JSON, ENRICHED_JSONL, ENRICHED_CSV, TEXT_CORPUS):
    size_kb = p.stat().st_size / 1024
    print(" -", p, f"({size_kb:.1f} KB)")

Loaded 16407 records
   id                                           question  \
0   1  Who is at risk for Lymphocytic Choriomeningiti...   
1   2  What are the symptoms of Lymphocytic Choriomen...   

                                              answer      source  
0  LCMV infections can occur after exposure to fr...  medDataset  
1  LCMV is most commonly recognized as causing ne...  medDataset  
Parsed 16407 rows
   id                                           question  \
0   1  Who is at risk for Lymphocytic Choriomeningiti...   
1   2  What are the symptoms of Lymphocytic Choriomen...   

                                              answer      source  
0  LCMV infections can occur after exposure to fr...  medDataset  
1  LCMV is most commonly recognized as causing ne...  medDataset  
Parsed 16407 rows
Smoke Test Result: {'category': 'viral infection', 'entities': ['lymphocytic choriomeningitis', 'lymphocytic choriomeningitis virus', 'rodent', 'fetus', 'organ transplant recipien

Enriching Records: 100%|██████████| 16407/16407 [1:07:37<00:00,  4.04it/s] 



   id                                           question  \
0   1  Who is at risk for Lymphocytic Choriomeningiti...   
1   2  What are the symptoms of Lymphocytic Choriomen...   

                                              answer      source  \
0  LCMV infections can occur after exposure to fr...  medDataset   
1  LCMV is most commonly recognized as causing ne...  medDataset   

          category                                           entities  \
0  viral infection  [lymphocytic choriomeningitis, lymphocytic cho...   
1  viral infection  [Lymphocytic Choriomeningitis Virus, Lymphocyt...   

                                            keywords safety_level  char_count  \
0  [LCMV, lymphocytic choriomeningitis, transmiss...       medium         466   
1  [LCMV, symptoms, neurological, febrile, mening...         high        2468   

                                       combined_text error  
0  Question: Who is at risk for Lymphocytic Chori...   NaN  
1  Question: What are the sy