# In [5]:
import xml.etree.ElementTree as ET
import csv

# Input & Output Files
xml_file = 'DBank_FullDB.xml'  # Replace with actual filename
csv_file = 'drugbank_qna_clean.csv'

# Namespace used in DrugBank XML
ns = {'db': 'http://www.drugbank.ca'}


def _text(elem, path):
    """Return the stripped text found at *path* under *elem*, or '' if absent."""
    return (elem.findtext(path, default='', namespaces=ns) or '').strip()


def _texts(elem, path):
    """Return the non-blank, stripped texts of every match of *path* under *elem*."""
    return [
        node.text.strip()
        for node in elem.findall(path, namespaces=ns)
        if node.text and node.text.strip()
    ]


def drug_qna_rows(elem):
    """Yield ``(question, answer)`` pairs for a single ``<drug>`` record.

    Nothing is yielded when the record has no name, because every question
    refers to the drug by name. A row is emitted only when its answer is
    non-empty.
    """
    name = _text(elem, 'db:name')
    if not name:
        return

    # Prefer the id flagged primary="true"; fall back to the first id listed.
    drug_id = (_text(elem, "db:drugbank-id[@primary='true']")
               or _text(elem, 'db:drugbank-id'))
    # Keep the original direct-child lookup first; otherwise collect the
    # distinct <form> values under dosages/dosage (order preserved), which is
    # where the DrugBank schema is understood to place dosage forms.
    dosage_form = _text(elem, 'db:dosage-form') or ', '.join(
        dict.fromkeys(_texts(elem, 'db:dosages/db:dosage/db:form'))
    )
    categories_text = ', '.join(
        _texts(elem, './/db:categories/db:category/db:category')
    )

    # Question order matches the original script so output stays comparable.
    candidates = (
        (f"What is the name of the drug with ID {drug_id}?", name if drug_id else ''),
        (f"Describe {name}.", _text(elem, 'db:description')),
        (f"What is the indication for {name}?", _text(elem, 'db:indication')),
        (f"What is the mechanism of action of {name}?",
         _text(elem, 'db:mechanism-of-action')),
        (f"What is the dosage form of {name}?", dosage_form),
        (f"What are the categories of {name}?", categories_text),
        (f"Who manufactures {name}?",
         _text(elem, 'db:manufacturers/db:manufacturer')),
        (f"What are the toxicity/side effects of {name}?", _text(elem, 'db:toxicity')),
    )
    for question, answer in candidates:
        if answer:
            yield question, answer

    # Drug interactions
    for inter in elem.findall('.//db:drug-interactions/db:drug-interaction',
                              namespaces=ns):
        inter_drug = _text(inter, 'db:name')
        inter_desc = _text(inter, 'db:description')
        if inter_drug and inter_desc:
            yield f"Does {name} interact with {inter_drug}?", inter_desc


def iter_qna_rows(source):
    """Stream ``(question, answer)`` pairs from a DrugBank XML file.

    *source* is a filename or a file object, as accepted by ``ET.iterparse``.

    Only outermost ``<drug>`` elements are treated as records. ``<drug>``
    elements nested inside a record (for example in pathway listings) are
    skipped, so they do not produce duplicate or spurious rows.
    """
    drug_tag = f'{{{ns["db"]}}}drug'
    root = None
    open_drugs = 0  # number of <drug> elements currently open

    for event, elem in ET.iterparse(source, events=('start', 'end')):
        if event == 'start':
            if root is None:
                root = elem  # the first start event is the document root
            if elem.tag == drug_tag:
                open_drugs += 1
            continue

        if elem.tag != drug_tag:
            continue
        open_drugs -= 1
        if open_drugs:
            continue  # nested <drug>; it belongs to an enclosing record

        yield from drug_qna_rows(elem)

        # Clear element to free memory. Clearing the root as well drops the
        # emptied records that would otherwise pile up as its children.
        elem.clear()
        root.clear()


def write_qna_csv(xml_path, csv_path):
    """Write the Q&A pairs from *xml_path* to *csv_path*; return the row count.

    The CSV starts with an ``input,output`` header row, which is not counted.
    """
    written = 0
    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['input', 'output'])
        for row in iter_qna_rows(xml_path):
            writer.writerow(row)
            written += 1
    return written


if __name__ == '__main__':
    write_qna_csv(xml_file, csv_file)
    print(f"\u2705 Cleaned Q&A CSV generated: {csv_file}")


# Output: ✅ Cleaned Q&A CSV generated: drugbank_qna_clean.csv
