In [None]:
import lxml.etree as ET
import pandas as pd

# Extract approved, non-antimalarial drugs from a DrugBank XML dump and save
# their name, SMILES, indication text and pharmacology text to a CSV file.

# Path to your downloaded DrugBank XML file
drugbank_xml_path = 'full_database.xml'

# Parse XML. ET.parse builds the complete tree in memory before the loop runs.
tree = ET.parse(drugbank_xml_path)
root = tree.getroot()

# Prefix -> namespace URI map used by every find/findall/findtext call below.
ns = {'db': 'http://www.drugbank.ca'}

drugs_data = []

for drug in root.findall('db:drug', ns):
    # Only approved drugs (optional). A drug that lists no groups at all is
    # kept; one that lists groups must have 'approved' among them.
    groups = [g.text for g in drug.findall('db:groups/db:group', ns)]
    if groups and 'approved' not in groups:
        continue

    # findtext returns None instead of raising when <name> is absent.
    name = drug.findtext('db:name', namespaces=ns)

    # Get SMILES (first available). The search stops at the first property
    # whose kind is 'SMILES'; a missing or empty <value> leaves smiles as None
    # rather than raising AttributeError.
    smiles = None
    for prop in drug.findall('db:calculated-properties/db:property', ns):
        if prop.findtext('db:kind', namespaces=ns) == 'SMILES':
            smiles = prop.findtext('db:value', namespaces=ns) or None
            break

    # Get indications text, lower-cased for the keyword match below.
    # `or ''` covers both a missing element and an element with no text
    # (e.g. <indication/>), which previously crashed on `.text.lower()`.
    indications_text = (
        drug.findtext('db:indication', default='', namespaces=ns) or ''
    ).lower()

    # Get pharmacology text, with the same missing/empty handling.
    # NOTE(review): recent DrugBank schemas appear to name this element
    # 'pharmacodynamics' rather than 'pharmacology' - confirm against the
    # XSD of the dump in use, otherwise this column is always empty.
    pharmacology_text = (
        drug.findtext('db:pharmacology', default='', namespaces=ns) or ''
    ).lower()

    # Filter out antimalarial drugs
    if 'antimalarial' in indications_text or 'antimalarial' in pharmacology_text:
        continue

    drugs_data.append({
        'name': name,
        'smiles': smiles,
        'indications': indications_text,
        'pharmacology': pharmacology_text
    })

# Convert to DataFrame
df_drugs = pd.DataFrame(drugs_data)

# Save filtered drugs to CSV
df_drugs.to_csv('drugbank_filtered_no_antimalarial.csv', index=False)

print(f"Extracted {len(df_drugs)} drugs excluding antimalarials.")
