## MIMIC_III QA

In [None]:
import json
import pandas as pd

# Load the JSON file. The encoding is stated explicitly so the result does not
# depend on the platform's default text encoding.
with open('test_final.json', 'r', encoding='utf-8') as f:
    data = json.load(f)  # Top level may be a list or a dictionary

# Report the top-level structure without assuming the container is non-empty
print(f"Data type: {type(data)}")
if isinstance(data, list):
    print(f"Number of QA pairs: {len(data)}")
    if data:
        print("First item:", data[0])  # Inspect first entry
    else:
        print("First item: <none - the list is empty>")
elif isinstance(data, dict):
    print(f"Keys: {data.keys()}")
    if data:
        print("Sample item:", next(iter(data.items())))  # Inspect first key-value pair
    else:
        print("Sample item: <none - the dictionary is empty>")

In [None]:
import json

# Load the JSON file (explicit encoding so the result does not depend on the
# platform default)
with open('test_final.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Flatten data -> paragraphs -> qas into one record per question
qa_pairs = []
skipped_without_answer = 0
for item in data['data']:
    for paragraph in item['paragraphs']:
        context = paragraph['context']
        for qa in paragraph['qas']:
            answers = qa.get('answers') or []
            if not answers:
                # There is no first answer to take; count the entry instead of
                # failing with an IndexError halfway through the file
                skipped_without_answer += 1
                continue
            qa_pairs.append({
                'id': qa['id'],
                'question': qa['question'],
                'answer': answers[0]['text'],  # Take the first answer
                'context': context
            })

# Print a sample (only when something was extracted)
print(f"Total QA pairs: {len(qa_pairs)}")
if skipped_without_answer:
    print(f"Skipped {skipped_without_answer} question(s) with no answer")
if qa_pairs:
    sample = qa_pairs[0]
    print("\nSample QA pair:")
    print(f"Question: {sample['question']}")
    print(f"Answer: {sample['answer']}")
    print(f"Context snippet: {sample['context'][:200]}...")

In [None]:
import json
import pandas as pd

# Load the JSON file (explicit encoding so the result does not depend on the
# platform default)
with open('test_final.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Extract all QA pairs into a list of dictionaries, one per question
qa_records = []
skipped_without_answer = 0
for item in data['data']:
    for paragraph in item['paragraphs']:
        context = paragraph['context']
        for qa in paragraph['qas']:
            answers = qa.get('answers') or []
            if not answers:
                # No first answer to take; skip and count rather than crash
                skipped_without_answer += 1
                continue
            first_answer = answers[0]
            qa_records.append({
                'id': qa['id'],
                'question': qa['question'],
                'answer': first_answer['text'],  # Take the first answer
                'answer_start': first_answer['answer_start'],  # Start position in context
                'context': context
            })

if skipped_without_answer:
    print(f"Skipped {skipped_without_answer} question(s) with no answer")

# Convert to DataFrame
df = pd.DataFrame(qa_records)

df.head()

In [None]:
# Write the QA table to disk; the row index is left out of the file
df.to_csv(path_or_buf='mimic_qa_dataset.csv', index=False)

print("CSV file saved successfully!")

In [None]:
import os
# Show the absolute location of the CSV; the relative name is resolved against
# the current working directory.
print(f"CSV saved at: {os.path.abspath('mimic_qa_dataset.csv')}")

In [None]:
import pandas as pd
import re
import json

# Load the QA table from mimic_qa_dataset.csv (the file name an earlier cell
# writes with df.to_csv). The path is relative to the working directory.
df = pd.read_csv("mimic_qa_dataset.csv")  # adjust the path if the CSV lives elsewhere

# Function to clean and extract structured clinical input

# Emergency-care keywords are matched as whole words. A plain substring test for
# "er" or "ed" fires on nearly every sentence ("patient", "admitted", ...).
_ED_KEYWORD_RE = re.compile(r"\b(?:ems|er|ed|emergency)\b", re.IGNORECASE)
_LAB_KEYWORDS = ("wbc", "lactate", "leukocytosis", "labs")
_MEDICATION_KEYWORDS = (
    "solumedrol", "nebulizer", "levofloxacin", "azithromycin",
    "aspirin", "magnesium", "ceftriaxone", "combivent",
)
# A period ends a sentence only when whitespace or the end of the text follows,
# so decimal values such as "12.5" are not cut in half.
_SENTENCE_END_RE = re.compile(r"\.(?=\s|$)")


def refine_clinical_summary_v2(context, max_chars=1200):
    """Condense a clinical note into a short, labelled summary.

    The note is stripped of bracketed ``[** ... **]`` tokens and has its
    whitespace collapsed. Up to five labelled lines are then emitted, in this
    order: Chief Complaint, HPI, ED Notes, Lab Findings, Treatment Given. A
    line appears only when its section header or keywords are found.

    Args:
        context: Note text. Null values (``None``/NaN) give an empty summary;
            any other non-string value is converted with ``str``.
        max_chars: Maximum length of the returned summary (default 1200).

    Returns:
        The labelled lines joined with newlines, cut to ``max_chars``
        characters.
    """
    if pd.isnull(context):
        return ""

    # Remove [** ... **] tokens and normalize whitespace
    text = re.sub(r"\[\*\*.*?\*\*\]", "", str(context))
    text = re.sub(r"\s+", " ", text).strip()

    parts = []

    # Chief Complaint: text after the header, up to the next known section header
    cc = re.search(r'chief complaint:\s*(.*?)(?=major surgical|history of present illness|review of systems|past medical history|social history|physical exam|$)', text, re.IGNORECASE | re.DOTALL)
    if cc:
        parts.append("Chief Complaint: " + cc.group(1).strip())

    # History of Present Illness
    hpi = re.search(r'history of present illness:\s*(.*?)(?=review of systems|past medical history|social history|physical exam|brief hospital course|$)', text, re.IGNORECASE | re.DOTALL)
    if hpi:
        parts.append("HPI: " + hpi.group(1).strip())

    # Split once into sentences; the three keyword scans below share the result
    sentences = [piece.strip() for piece in _SENTENCE_END_RE.split(text)]
    sentences = [sentence for sentence in sentences if sentence]

    # Emergency Notes
    ed_lines = [s for s in sentences if _ED_KEYWORD_RE.search(s)]
    if ed_lines:
        parts.append("ED Notes: " + " ".join(ed_lines))

    # Lab Findings: only the lab labels are tidied. No blanket text
    # substitution is applied, because it would corrupt unrelated words.
    lab_lines = [
        s.replace("wbc-", "WBC: ").replace("lactate", "Lactate")
        for s in sentences
        if any(keyword in s.lower() for keyword in _LAB_KEYWORDS)
    ]
    if lab_lines:
        parts.append("Lab Findings: " + " ".join(lab_lines))

    # Treatment
    tx_lines = [
        s for s in sentences
        if any(med in s.lower() for med in _MEDICATION_KEYWORDS)
    ]
    if tx_lines:
        parts.append("Treatment Given: " + " ".join(tx_lines))

    return "\n".join(parts)[:max_chars]

# Apply function to context column
df["input"] = df["context"].apply(refine_clinical_summary_v2)

# Create instruction/output columns
df["instruction"] = df["question"]
df["output"] = df["answer"]

# Select final columns
final_df = df[["instruction", "input", "output"]]

# Save to CSV
csv_path = "MIMIC_III_QA_Refined_Final.csv"
final_df.to_csv(csv_path, index=False)

# Save to JSONL, one JSON object per line. Missing values are written as JSON
# null: json.dump would otherwise emit the bare token NaN, which is not valid
# JSON and is rejected by strict parsers.
jsonl_path = "MIMIC_III_QA_Refined_Final.jsonl"
with open(jsonl_path, "w", encoding="utf-8") as f:
    for _, row in final_df.iterrows():
        record = {
            column: (None if pd.isna(row[column]) else row[column])
            for column in ("instruction", "input", "output")
        }
        json.dump(record, f)
        f.write("\n")

print("✅ Saved:")
print(f"- {csv_path}")
print(f"- {jsonl_path}")

## MIMIC_IV QA

In [None]:
import pandas as pd
import json

# === FILE PATHS ===
QA_FILE = "MIMIC_IV_FINAL_QA.csv"          # question/answer table
MERGED_FILE = "merged_mimic_IV.csv"        # merged table with vitals, meds, diagnoses

# === LOAD DATA ===
# Both inputs are plain CSV files, read with pandas' default settings
qa_df, merged_df = [pd.read_csv(source) for source in (QA_FILE, MERGED_FILE)]

# === GROUP MERGED DATA BY PATIENT ===
def build_patient_context(group, max_items=5):
    """Summarise one patient's rows as a short, labelled text block.

    Args:
        group: DataFrame holding the rows of a single patient. It must contain
            the columns ``gender``, ``chiefcomplaint``, ``icd_title``,
            ``temperature``, ``heartrate``, ``resprate``, ``o2sat``, ``sbp``,
            ``dbp`` and ``name``.
        max_items: Maximum number of distinct diagnoses and of distinct
            medication names to list (the first ones in row order).

    Returns:
        Newline-joined lines for Gender, Chief Complaint, Diagnoses, Vitals
        (column means rounded to one decimal) and Medications Given. A line is
        omitted when the group has no non-null data for it.
    """
    def distinct(column):
        # Non-null values in order of first appearance. They are converted to
        # str so that numeric or mixed columns can be joined safely.
        return [str(value) for value in group[column].dropna().unique()]

    parts = []

    # Demographics
    genders = distinct("gender")
    if genders:
        parts.append(f"Gender: {genders[0]}")

    # Chief complaint
    complaints = distinct("chiefcomplaint")
    if complaints:
        parts.append(f"Chief Complaint: {complaints[0]}")

    # Diagnoses
    diagnoses = distinct("icd_title")
    if diagnoses:
        parts.append("Diagnoses: " + ", ".join(diagnoses[:max_items]))

    # Vitals (mean values); a vital whose mean is NaN is left out
    vital_columns = {
        "Temp": "temperature",
        "HR": "heartrate",
        "RR": "resprate",
        "O2Sat": "o2sat",
        "SBP": "sbp",
        "DBP": "dbp",
    }
    vitals = {label: group[column].mean() for label, column in vital_columns.items()}
    vitals_str = ", ".join(
        f"{label} {round(value, 1)}" for label, value in vitals.items() if pd.notna(value)
    )
    if vitals_str:
        parts.append("Vitals: " + vitals_str)

    # Medications
    meds = distinct("name")
    if meds:
        parts.append("Medications Given: " + ", ".join(meds[:max_items]))

    return "\n".join(parts)

# Create structured input per patient
patient_inputs = merged_df.groupby("subject_id").apply(build_patient_context).reset_index()
patient_inputs.columns = ["patient_id", "input"]

# === MERGE WITH QA ===
# Left merge: QA rows whose patient has no merged clinical data keep a missing input
qa_merged = qa_df.merge(patient_inputs, on="patient_id", how="left")
qa_merged["instruction"] = qa_merged["question"]
qa_merged["output"] = qa_merged["correct_answer"]

# Keep final columns
final_df = qa_merged[["instruction", "input", "output"]]

# === SAVE OUTPUT ===
final_df.to_csv("MIMIC_IV_QA_Structured.csv", index=False)

# One JSON object per line. Missing values are written as JSON null:
# json.dump would otherwise emit the bare token NaN, which is not valid JSON.
with open("MIMIC_IV_QA_Structured.jsonl", "w", encoding="utf-8") as f:
    for _, row in final_df.iterrows():
        record = {
            column: (None if pd.isna(row[column]) else row[column])
            for column in ("instruction", "input", "output")
        }
        json.dump(record, f)
        f.write("\n")

print("✅ Saved:")
print("- MIMIC_IV_QA_Structured.csv")
print("- MIMIC_IV_QA_Structured.jsonl")


In [None]:
import pandas as pd

# === Load QA File ===
qa_df = pd.read_csv("MIMIC_IV_FINAL_QA.csv")

# Distinct patient ids referenced by the QA file; each clinical table below is
# restricted to these ids as soon as it is read
qa_patients = qa_df["patient_id"].unique()

# === Load Clinical CSVs, keeping only rows for QA patients ===
diagnosis = pd.read_csv("diagnosis.csv").loc[lambda rows: rows["subject_id"].isin(qa_patients)]
edstays = pd.read_csv("edstays.csv").loc[lambda rows: rows["subject_id"].isin(qa_patients)]
medrecon = pd.read_csv("medrecon.csv").loc[lambda rows: rows["subject_id"].isin(qa_patients)]
pyxis = pd.read_csv("pyxis.csv").loc[lambda rows: rows["subject_id"].isin(qa_patients)]
triage = pd.read_csv("triage.csv").loc[lambda rows: rows["subject_id"].isin(qa_patients)]
vitals = pd.read_csv("vitalsign.csv").loc[lambda rows: rows["subject_id"].isin(qa_patients)]

# === Function to Build Input Summary ===
def build_input(subject_id):
    """Build the labelled clinical summary for one patient.

    Reads the notebook-level tables ``triage``, ``edstays``, ``diagnosis``,
    ``vitals``, ``medrecon`` and ``pyxis`` and selects the rows whose
    ``subject_id`` equals the argument.

    Returns a newline-joined string with, where data exists: Chief Complaint,
    Gender, Diagnoses (first five distinct), Vitals (column means rounded to
    one decimal) and Medications Given (first five distinct names from
    ``medrecon`` followed by ``pyxis``). The string is empty when nothing is
    found.
    """
    def patient_rows(table):
        # Rows of `table` that belong to this patient
        return table[table["subject_id"] == subject_id]

    def distinct(table, column):
        # Non-null distinct values of `column` for this patient, in row order
        return patient_rows(table)[column].dropna().unique()

    summary = []

    # Chief Complaint
    complaints = distinct(triage, "chiefcomplaint")
    if complaints.size > 0:
        summary.append(f"Chief Complaint: {complaints[0]}")

    # Gender
    genders = distinct(edstays, "gender")
    if genders.size > 0:
        summary.append(f"Gender: {genders[0]}")

    # Diagnoses
    titles = distinct(diagnosis, "icd_title")
    if titles.size > 0:
        summary.append("Diagnoses: " + ", ".join(titles[:5]))

    # Vitals: mean of each column over the patient's rows; NaN means are skipped
    patient_vitals = patient_rows(vitals)
    labelled_columns = (
        ("Temp", "temperature"),
        ("HR", "heartrate"),
        ("RR", "resprate"),
        ("O2Sat", "o2sat"),
        ("SBP", "sbp"),
        ("DBP", "dbp"),
    )
    averages = [(label, patient_vitals[column].mean()) for label, column in labelled_columns]
    readings = []
    for label, average in averages:
        if pd.notna(average):
            readings.append(f"{label} {round(average,1)}")
    vitals_text = ", ".join(readings)
    if vitals_text:
        summary.append("Vitals: " + vitals_text)

    # Medications: names from both tables, de-duplicated, medrecon first
    medrecon_names = distinct(medrecon, "name")
    pyxis_names = distinct(pyxis, "name")
    all_names = pd.Series(list(medrecon_names) + list(pyxis_names)).dropna().unique()
    if all_names.size > 0:
        summary.append("Medications Given: " + ", ".join(all_names[:5]))

    return "\n".join(summary)

# === Build Input Context per Patient ===
# One record per distinct QA patient, holding the text produced by build_input
input_rows = []
for patient in qa_patients:
    input_rows.append({"patient_id": patient, "input": build_input(patient)})
input_df = pd.DataFrame(input_rows)

# === Merge With QA ===
# Attach each patient's input to every one of that patient's QA rows, then
# expose the question and the correct answer under the instruction/output names
qa_final = pd.merge(qa_df, input_df, how="left", on="patient_id").assign(
    instruction=lambda frame: frame["question"],
    output=lambda frame: frame["correct_answer"],
)

final_df = qa_final.loc[:, ["instruction", "input", "output"]]
final_df.to_csv("MIMIC_IV_QA_Completed.csv", index=False)

print("✅ Saved: MIMIC_IV_QA_Completed.csv")

In [None]:
import pandas as pd

# Load the completed QA file and replace every missing input with a generic
# placeholder sentence
df = pd.read_csv("MIMIC_IV_QA_Completed.csv").assign(
    input=lambda frame: frame["input"].fillna("No structured clinical data available.")
)

# Save the new file
df.to_csv("MIMIC_IV_QA_Final_Placeholder.csv", index=False)

print("✅ Saved: MIMIC_IV_QA_Final_Placeholder.csv")

## MIMIC_QA

In [None]:
import pandas as pd
import json

# Load the mimic_iv_note_qa.csv file
qa_df = pd.read_csv("mimic_iv_note_qa.csv")

# Expand qa_pairs (a JSON list of {"question", "answer"} objects stored as text)
# into one output row per pair
expanded_rows = []
skipped_notes = 0  # rows whose qa_pairs cell could not be used

for _, row in qa_df.iterrows():
    subject_id = row.get("subject_id")
    note_id = row.get("note_id")
    hadm_id = row.get("hadm_id")

    try:
        qa_list = json.loads(row["qa_pairs"])
    except (TypeError, json.JSONDecodeError):
        # TypeError: the cell is not text (an empty cell is read back as NaN);
        # JSONDecodeError: the text is not valid JSON
        skipped_notes += 1
        continue
    if not isinstance(qa_list, list):
        skipped_notes += 1
        continue

    for qa in qa_list:
        if not isinstance(qa, dict):
            continue
        # `or ""` covers keys that are present but null
        question = str(qa.get("question") or "").strip()
        answer = str(qa.get("answer") or "").strip()
        if question and answer:
            expanded_rows.append({
                "subject_id": subject_id,
                "note_id": note_id,
                "hadm_id": hadm_id,
                "instruction": question,
                "output": answer
            })

# Convert to DataFrame; explicit columns keep the CSV header even with no rows
qa_expanded = pd.DataFrame(
    expanded_rows,
    columns=["subject_id", "note_id", "hadm_id", "instruction", "output"],
)
qa_expanded.to_csv("mimic_iv_note_qa_expanded.csv", index=False)
print("✅ Saved: mimic_iv_note_qa_expanded.csv")
if skipped_notes:
    print(f"Skipped {skipped_notes} row(s) with missing or unparseable qa_pairs")

In [None]:
# Show the column index of three clinical tables; this cell does not load them,
# so they must already exist in the notebook session
for heading, table in (
    ("Diagnosis columns:", diagnosis),
    ("Medrecon columns:", medrecon),
    ("Pyxis columns:", pyxis),
):
    print(heading, table.columns)

In [None]:
import pandas as pd

# === Load Files ===
qa_df = pd.read_csv("mimic_iv_note_qa_expanded.csv")
edstays = pd.read_csv("edstays.csv")
diagnosis = pd.read_csv("diagnosis.csv")
medrecon = pd.read_csv("medrecon.csv")
pyxis = pd.read_csv("pyxis.csv")
triage = pd.read_csv("triage.csv")
vitals = pd.read_csv("vitalsign.csv")

# === Step 1: Merge edstays to bring in stay_id and gender ===
# pandas matches null join keys with each other. Without this filter, a QA row
# with a missing hadm_id would be paired with every edstays row of the same
# subject that also has a null hadm_id, so those rows are dropped from the
# lookup first.
stay_lookup = edstays.loc[
    edstays["hadm_id"].notna(), ["subject_id", "hadm_id", "stay_id", "gender"]
]
qa_df = qa_df.merge(stay_lookup, on=["subject_id", "hadm_id"], how="left")

# === Step 2: Drop rows without stay_id ===
# .copy() makes the frame independent before the column assignments below
qa_df = qa_df.dropna(subset=["stay_id"]).copy()
qa_df["stay_id"] = qa_df["stay_id"].astype(int)
qa_df["subject_id"] = qa_df["subject_id"].astype(int)

# === Step 3: Filter structured data to only needed patients ===
patient_keys = qa_df[["subject_id", "stay_id"]].drop_duplicates()

def filter_by_keys(df, keys):
    """Inner-join *df* with *keys* on ``subject_id`` and ``stay_id``.

    Only rows of *df* whose (subject_id, stay_id) pair occurs in *keys* are
    kept. Any further columns of *keys* are carried into the result.
    """
    join_columns = ["subject_id", "stay_id"]
    return pd.merge(df, keys, how="inner", on=join_columns)

# Restrict every clinical table to the (subject_id, stay_id) pairs in patient_keys
diagnosis, medrecon, pyxis, triage, vitals = [
    filter_by_keys(table, patient_keys)
    for table in (diagnosis, medrecon, pyxis, triage, vitals)
]

# === Step 4: Build structured input per patient admission ===
def build_input(sid, stay_id, gender=None):
    """Build the labelled clinical summary for one (subject, stay) pair.

    Reads the notebook-level tables ``triage``, ``diagnosis``, ``vitals``,
    ``medrecon`` and ``pyxis`` and selects the rows matching both ``sid`` and
    ``stay_id``.

    Args:
        sid: Value compared against each table's ``subject_id`` column.
        stay_id: Value compared against each table's ``stay_id`` column.
        gender: Optional gender value. A Gender line is added only when this is
            not null.

    Returns:
        Newline-joined lines for Gender, Chief Complaint, Diagnoses (first five
        distinct), Vitals (column means rounded to one decimal) and Medications
        Given (first five distinct names, ``medrecon`` before ``pyxis``).
        Lines with no data are omitted.
    """
    def stay_rows(table):
        # Rows of `table` recorded for this subject and this stay
        matches = (table["subject_id"] == sid) & (table["stay_id"] == stay_id)
        return table[matches]

    def distinct(table, column):
        # Non-null distinct values of `column` for this stay, in row order
        return stay_rows(table)[column].dropna().unique()

    summary = []

    if pd.notna(gender):
        summary.append(f"Gender: {gender}")

    # Chief Complaint
    complaints = distinct(triage, "chiefcomplaint")
    if complaints.size > 0:
        summary.append(f"Chief Complaint: {complaints[0]}")

    # Diagnoses
    titles = distinct(diagnosis, "icd_title")
    if titles.size > 0:
        summary.append("Diagnoses: " + ", ".join(titles[:5]))

    # Vitals: mean of each column over the stay's rows; NaN means are skipped
    stay_vitals = stay_rows(vitals)
    labelled_columns = (
        ("Temp", "temperature"),
        ("HR", "heartrate"),
        ("RR", "resprate"),
        ("O2Sat", "o2sat"),
        ("SBP", "sbp"),
        ("DBP", "dbp"),
    )
    averages = [(label, stay_vitals[column].mean()) for label, column in labelled_columns]
    readings = []
    for label, average in averages:
        if pd.notna(average):
            readings.append(f"{label} {round(average,1)}")
    vitals_str = ", ".join(readings)
    if vitals_str:
        summary.append("Vitals: " + vitals_str)

    # Medications: names from both tables, de-duplicated, medrecon first
    medrecon_names = distinct(medrecon, "name")
    pyxis_names = distinct(pyxis, "name")
    all_names = pd.Series(list(medrecon_names) + list(pyxis_names)).dropna().unique()
    if all_names.size > 0:
        summary.append("Medications Given: " + ", ".join(all_names[:5]))

    return "\n".join(summary)

# === Step 5: Apply input generation ===
# QA rows can share a stay, and build_input filters every clinical table on each
# call. The text is therefore built once per distinct
# (subject_id, stay_id, gender) combination and reused for the other rows.
_stay_input_cache = {}


def _input_for_row(row):
    """Return the structured input for one QA row, building it at most once per stay."""
    gender = row.get("gender")
    if pd.isna(gender):
        # NaN never compares equal to itself, so it would defeat the cache key.
        # build_input treats None and NaN alike (both skip the Gender line).
        gender = None
    key = (row["subject_id"], row["stay_id"], gender)
    if key not in _stay_input_cache:
        _stay_input_cache[key] = build_input(*key)
    return _stay_input_cache[key]


qa_df["input"] = qa_df.apply(_input_for_row, axis=1)

# === Step 6: Save final dataset ===
qa_final = qa_df[["instruction", "input", "output"]]
qa_final.to_csv("mimic_iv_final_structured_qa_stay.csv", index=False)

print("✅ Saved: mimic_iv_final_structured_qa_stay.csv")

In [None]:
# Preview the first rows of `df`.
# NOTE(review): this cell does not assign `df`, so it shows whatever an earlier
# cell bound to that name. Confirm whether the frame built in the previous cell
# (`qa_final`) was the one meant to be inspected.
df.head()

In [None]:
import pandas as pd

# Load the stay-level QA dataset
df = pd.read_csv("mimic_iv_final_structured_qa_stay.csv")

# Tag each answer with the first of these diagnosis names it mentions. Matching
# is case-sensitive, and answers naming none of them are left out of the counts
# (adjust the list to the task at hand)
answers = df["output"]
first_label = answers.str.extract(r"(COPD|HIV|Hypertension|Pneumonia|Fracture)")[0]
diagnosis_counts = first_label.value_counts()

print(diagnosis_counts)

## Combined CSV

In [None]:
import pandas as pd

# === Load the files ===
mimic_iv_stay = pd.read_csv("mimic_iv_final_structured_qa_stay.csv")
mimic_iv_placeholder = pd.read_csv("MIMIC_IV_QA_Final_Placeholder.csv")
mimic_iii = pd.read_csv("MIMIC_III_QA_Refined_Final.csv")

# Keyed by file name so a validation failure can say which input is at fault
sources = {
    "mimic_iv_final_structured_qa_stay.csv": mimic_iv_stay,
    "MIMIC_IV_QA_Final_Placeholder.csv": mimic_iv_placeholder,
    "MIMIC_III_QA_Refined_Final.csv": mimic_iii,
}

# === Standardize column names (lowercase) and check the required columns ===
# The loop variable is deliberately not called `df`, so the notebook-level `df`
# left by earlier cells is not overwritten.
columns_needed = ["instruction", "input", "output"]
for source_name, frame in sources.items():
    frame.rename(columns=lambda x: x.strip().lower(), inplace=True)
    missing = [col for col in columns_needed if col not in frame.columns]
    if missing:
        raise ValueError(
            f"{source_name} is missing required column(s): {', '.join(missing)}"
        )

# === Merge all three ===
combined_df = pd.concat([
    mimic_iv_stay[columns_needed],
    mimic_iv_placeholder[columns_needed],
    mimic_iii[columns_needed]
], ignore_index=True)

# === Save the final merged file ===
combined_df.to_csv("mimic_qa_combined.csv", index=False)

print(f"✅ Saved: mimic_qa_combined.csv with {len(combined_df)} rows")

## Preprocessing the MedQuAD and iCliniq QA datasets

In [None]:
import pandas as pd

# Load MedQuAD
df = pd.read_csv("medquad.csv")

# Drop rows with missing fields
text_columns = ["question", "focus_area", "answer"]
df = df.dropna(subset=text_columns)

# Normalize whitespace BEFORE filtering and de-duplicating (the same order as
# the iCliniq cell). Rows that differ only in spacing or line breaks would
# otherwise survive drop_duplicates, and padding would count towards the length
# threshold. astype(str) keeps the .str accessor valid for non-text columns.
for col in text_columns:
    df[col] = df[col].astype(str).str.replace(r"\s+", " ", regex=True).str.strip()

# Remove very short responses (keep answers longer than 20 characters)
df = df[df["answer"].str.len() > 20]

# Remove duplicate Q-A pairs
df = df.drop_duplicates(subset=["question", "answer"])

# Rename to match BioMistral format
df = df.rename(columns={
    "question": "instruction",
    "focus_area": "input",
    "answer": "output"
})

# Save preprocessed version
df.to_csv("medquad_preprocessed.csv", index=False)
print("✅ Saved: medquad_preprocessed.csv with", len(df), "rows")

In [None]:
import pandas as pd

# Load the dataset, drop rows with a missing Title/Question/Answer and switch
# to the instruction/input/output column names
df = (
    pd.read_csv("icliniq_medical_qa_cleaned.csv")
    .dropna(subset=["Title", "Question", "Answer"])
    .rename(columns={
        "Question": "instruction",
        "Title": "input",
        "Answer": "output"
    })
)

# Clean whitespace: collapse every whitespace run to one space, then trim the ends
for text_column in ["instruction", "input", "output"]:
    as_text = df[text_column].astype(str)
    df[text_column] = as_text.str.replace(r"\s+", " ", regex=True).str.strip()

# Keep answers longer than 20 characters, then drop repeated question/answer pairs
has_long_answer = df["output"].str.len() > 20
df = df[has_long_answer].drop_duplicates(subset=["instruction", "output"])

# Save final processed version
df.to_csv("icliniq_preprocessed.csv", index=False)
print(f"✅ Saved: icliniq_preprocessed.csv with {len(df)} rows")

## Merge the combined MIMIC CSV with the preprocessed MedQuAD and iCliniq datasets

In [None]:
import pandas as pd

# Load preprocessed files
mimic = pd.read_csv("mimic_qa_combined.csv")
medquad = pd.read_csv("medquad_preprocessed.csv")
icliniq = pd.read_csv("icliniq_preprocessed.csv")

# Keyed by file name so a validation failure can say which input is at fault
sources = {
    "mimic_qa_combined.csv": mimic,
    "medquad_preprocessed.csv": medquad,
    "icliniq_preprocessed.csv": icliniq,
}

# Ensure column names are standardized and all required columns are present.
# The loop variable is deliberately not called `df`, so the notebook-level `df`
# left by earlier cells is not overwritten.
columns = ["instruction", "input", "output"]
for source_name, frame in sources.items():
    frame.columns = [col.strip().lower() for col in frame.columns]
    missing = [col for col in columns if col not in frame.columns]
    if missing:
        raise ValueError(
            f"{source_name} is missing required column(s): {', '.join(missing)}"
        )

# Combine
final_df = pd.concat(
    [mimic[columns], medquad[columns], icliniq[columns]],
    ignore_index=True
)

# Optional: deduplicate rows that are identical in all three fields
final_df = final_df.drop_duplicates(subset=["instruction", "input", "output"])

# Save merged dataset
final_df.to_csv("bio_mistral_qa_combined.csv", index=False)
print(f"✅ Saved: bio_mistral_qa_combined.csv with {len(final_df)} rows")