In [1]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

file_path = '/content/drive/MyDrive/CDR_Data/CDR_Data/CDR.Corpus.v010516/CDR_DevelopmentSet.BioC.xml'

# Load XML data into a DataFrame
df = pd.read_xml(file_path)

# Display the DataFrame
df.head(80)


Unnamed: 0,source,date,key,id,passage,relation
0,PubTator,,,,,
1,,0/0/0,,,,
2,,,PubTator.key,,,
3,,,,6794356.0,\n,\n
4,,,,6504332.0,\n,\n
...,...,...,...,...,...,...
75,,,,1928887.0,\n,\n
76,,,,1728915.0,\n,\n
77,,,,20558148.0,\n,\n
78,,,,19940105.0,\n,\n


In [None]:
import xml.etree.ElementTree as ET
import pandas as pd

# Define storage for results
annotations = []

# Parse the XML file
tree = ET.parse(file_path)
root = tree.getroot()
for document in root.findall("document"):
    pmid = document.find("id").text  # Extract PubMed ID (PMID)

    for passage in document.findall("passage"):
        passage_text = passage.find("text").text.strip() if passage.find("text") is not None else ""
        passage_offset = int(passage.find("offset").text) if passage.find("offset") is not None else -1

        # Iterate through annotations
        for annotation in passage.findall("annotation"):
            entity_type = annotation.find("infon[@key='type']")

            # Check if the entity is a 'Disease'
            if entity_type is not None and entity_type.text == "Disease":
                annotation_id = annotation.get("id", "N/A")  # Default to "N/A" if ID is missing
                mesh_id = annotation.find("infon[@key='MESH']")
                mesh_id = mesh_id.text if mesh_id is not None else "Unknown"

                # Extract location and text
                location = annotation.find("location")
                if location is not None and location.get("offset") and location.get("length"):
                    start_offset = int(location.get("offset"))
                    length = int(location.get("length"))
                    end_offset = start_offset + length
                else:
                    start_offset = length = end_offset = -1  # Default to invalid offsets

                # Extract entity text
                entity_text_element = annotation.find("text")
                entity_text = entity_text_element.text.strip() if entity_text_element is not None else ""

                # Append the annotation for diseases only
                annotations.append({
                    "PMID": pmid,
                    "Passage": passage_text,
                    "Annotation ID": annotation_id,
                    "Type": "Disease",  # Explicitly set to "Disease"
                    "MeSH ID": mesh_id,
                    "Start Offset": start_offset,
                    "End Offset": end_offset,
                    "Entity Text": entity_text
                })

# Convert annotations to a DataFrame
df = pd.DataFrame(annotations)

# Display the first few rows
df.head()

# Save the annotations to a CSV file
output_path = "/content/drive/MyDrive/CDR_Data/CDR_Data/Annotations.csv"
df.to_csv(output_path, index=False)



In [None]:
df.head()

Unnamed: 0,PMID,Passage,Annotation ID,Type,MeSH ID,Start Offset,End Offset,Entity Text
0,6794356,Tricuspid valve regurgitation and lithium carb...,0,Disease,D014262,0,29,Tricuspid valve regurgitation
1,6794356,Tricuspid valve regurgitation and lithium carb...,2,Disease,D064420,52,60,toxicity
2,6794356,A newborn with massive tricuspid regurgitation...,3,Disease,D014262,105,128,tricuspid regurgitation
3,6794356,A newborn with massive tricuspid regurgitation...,4,Disease,D001282,130,144,atrial flutter
4,6794356,A newborn with massive tricuspid regurgitation...,5,Disease,D006333,146,170,congestive heart failure


In [None]:
df[df['Entity Text'].str.len() < 3]

Unnamed: 0,PMID,Passage,Annotation ID,Type,MeSH ID,Start Offset,End Offset,Entity Text
171,16157917,Five patients with idiopathic generalized epil...,8,Disease,D009207,257,259,MJ
172,16157917,Five patients with idiopathic generalized epil...,10,Disease,D009207,297,299,MJ
173,16157917,Five patients with idiopathic generalized epil...,11,Disease,D009207,368,370,MJ
174,16157917,Five patients with idiopathic generalized epil...,14,Disease,D009207,458,460,MJ
421,17965424,"OBJECTIVE: A randomised, double-blind study to...",7,Disease,D001172,373,375,RA
...,...,...,...,...,...,...,...,...
4001,15686794,Amiodarone represents an effective antiarrhyth...,4,Disease,D001281,202,204,AF
4123,3461217,More than 50% of Lobund-Wistar (L-W) strain ra...,9,Disease,D011471,524,526,PA
4227,15096016,A resurgence of interest in the surgical treat...,1,Disease,D010300,128,130,PD
4228,15096016,A resurgence of interest in the surgical treat...,2,Disease,D010300,271,273,PD


In [None]:
!pip install pronto


Collecting pronto
  Downloading pronto-2.5.8-py2.py3-none-any.whl.metadata (9.5 kB)
Collecting fastobo~=0.12.2 (from pronto)
  Downloading fastobo-0.12.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.3 kB)
Downloading pronto-2.5.8-py2.py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.7/61.7 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fastobo-0.12.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fastobo, pronto
Successfully installed fastobo-0.12.3 pronto-2.5.8


In [None]:
import pronto

# Load the .obo file
obo_file_path = '/content/drive/MyDrive/CDR_Data/CDR_Data/doid.obo'
ontology = pronto.Ontology(obo_file_path)

# Prepare data for each term
data = []
for term in ontology.terms():
    data.append({
        "Name": term.name,
        "Definition": term.definition if term.definition else "No definition available",
        "Synonyms": ", ".join([syn.description for syn in term.synonyms]) if term.synonyms else "No synonyms"
    })

# Convert to DataFrame
df = pd.DataFrame(data)

# Save to CSV
output_path = '/content/drive/MyDrive/CDR_Data/CDR_Data/terms_dataset.csv'
df.to_csv(output_path, index=False)
print(f"Dataset saved to {output_path}")


Dataset saved to /content/drive/MyDrive/CDR_Data/CDR_Data/terms_dataset.csv


In [None]:
import pandas as pd

# Load the annotation dataset
annotations_path = '/content/drive/MyDrive/CDR_Data/CDR_Data/Annotations.csv'
annotations = pd.read_csv(annotations_path)

# Load the ontology terms dataset
terms_path = '/content/drive/MyDrive/CDR_Data/CDR_Data/terms_dataset.csv'
terms = pd.read_csv(terms_path)

annotations.head()


Unnamed: 0,PMID,Passage,Annotation ID,Type,MeSH ID,Start Offset,End Offset,Entity Text
0,6794356,Tricuspid valve regurgitation and lithium carb...,0,Disease,D014262,0,29,Tricuspid valve regurgitation
1,6794356,Tricuspid valve regurgitation and lithium carb...,1,Chemical,D016651,34,51,lithium carbonate
2,6794356,Tricuspid valve regurgitation and lithium carb...,2,Disease,D064420,52,60,toxicity
3,6794356,A newborn with massive tricuspid regurgitation...,3,Disease,D014262,105,128,tricuspid regurgitation
4,6794356,A newborn with massive tricuspid regurgitation...,4,Disease,D001282,130,144,atrial flutter


In [None]:
terms.head()

Unnamed: 0,Name,Definition,Synonyms
0,angiosarcoma,A vascular cancer that derives_from the cells ...,hemangiosarcoma
1,pterygium,A corneal disease that is characterized by a t...,surfer's eye
2,disease of metabolism,A disease that involving errors in metabolic p...,metabolic disease
3,shrimp allergy,A crustacean allergy that has_allergic_trigger...,No synonyms
4,aspirin allergy,A drug allergy that has_allergic_trigger acety...,"acetylsalicylic acid allergy, ASA allergy"


In [None]:
!pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.10.1


In [None]:
from rapidfuzz import process, fuzz
from nltk.stem import WordNetLemmatizer
import re
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet
from tqdm.notebook import tqdm

def fast_compare_entity_text(annotations, terms, return_terms_flat=False):
    """
    Perform a faster entity text comparison with fuzzy matching, normalization, and a progress bar.
    Optionally return the terms_flat DataFrame for debugging.
    """
    # Check if input DataFrames are valid
    if annotations is None or terms is None:
        raise ValueError("Input DataFrames cannot be None.")

    # Normalize text
    lemmatizer = WordNetLemmatizer()
    def normalize_text(text):
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        return lemmatizer.lemmatize(text)

    annotations['Entity Text'] = annotations['Entity Text'].apply(normalize_text)

    # Ensure ID column exists in terms
    if 'ID' not in terms.columns:
        print("Warning: 'ID' column is missing. Adding placeholder IDs.")
        terms['ID'] = range(len(terms))

    # Flatten terms for look-up
    terms_flat = (
        pd.concat([
            terms[['Name', 'ID', 'Definition']].rename(columns={'Name': 'Term'}),
            terms[['Synonyms', 'ID', 'Definition']].rename(columns={'Synonyms': 'Term'})
        ])
        .dropna(subset=['Term'])
        .drop_duplicates(subset=['Term'])  # Ensure unique terms
        .reset_index(drop=True)
    )
    terms_flat['Term'] = terms_flat['Term'].apply(normalize_text)

    # Ensure unique index for the mapping dictionary
    if not terms_flat['Term'].is_unique:
        print("Warning: Duplicate terms found. Keeping the first occurrence.")
        terms_flat = terms_flat.drop_duplicates(subset=['Term'])

    # Create a mapping dictionary
    term_map = terms_flat.set_index('Term').to_dict(orient='index')

    # Define the matching function
    def find_match(entity):
        # Ignore short terms
        if len(entity) <= 2:
            return pd.Series([None, None])

        # Exact match
        match = term_map.get(entity, None)
        if match:
            return pd.Series([match['ID'], match['Definition']])

        # Fuzzy match with stricter threshold
        fuzzy_match = process.extractOne(entity, terms_flat['Term'], scorer=fuzz.partial_ratio, score_cutoff=90)
        if fuzzy_match:
            term = fuzzy_match[0]
            match = term_map.get(term, None)
            if match:
                return pd.Series([match['ID'], match['Definition']])

        # No match
        return pd.Series([None, None])

    # Enable tqdm progress bar for pandas apply
    tqdm.pandas(desc="Matching Entities")

    # Apply matching with progress bar
    annotations[['Ontology ID', 'Ontology Definition']] = annotations['Entity Text'].progress_apply(find_match)

    # Return terms_flat if requested
    if return_terms_flat:
        return annotations, terms_flat
    else:
        return annotations





[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
mapped_annotations, terms_flat = fast_compare_entity_text(annotations, terms, return_terms_flat=True)
mapped_annotations.head()



Matching Entities:   0%|          | 0/9773 [00:00<?, ?it/s]

Unnamed: 0,PMID,Passage,Annotation ID,Type,MeSH ID,Start Offset,End Offset,Entity Text,Ontology ID,Ontology Definition
0,6794356,Tricuspid valve regurgitation and lithium carb...,0,Disease,D014262,0,29,tricuspid valve regurgitation,1648.0,An opportunistic bacterial infectious disease ...
1,6794356,Tricuspid valve regurgitation and lithium carb...,1,Chemical,D016651,34,51,lithium carbonate,1648.0,An opportunistic bacterial infectious disease ...
2,6794356,Tricuspid valve regurgitation and lithium carb...,2,Disease,D064420,52,60,toxicity,1648.0,An opportunistic bacterial infectious disease ...
3,6794356,A newborn with massive tricuspid regurgitation...,3,Disease,D014262,105,128,tricuspid regurgitation,1648.0,An opportunistic bacterial infectious disease ...
4,6794356,A newborn with massive tricuspid regurgitation...,4,Disease,D001282,130,144,atrial flutter,10723.0,A viral infectious disease that results_in_for...


In [None]:
# mapped_annotations.to_csv('/content/drive/MyDrive/CDR_Data/CDR_Data/mapped_annotations_fast.csv', index=False)


In [None]:
def compute_coverage(predictions, terms):
    """
    Compute coverage of dictionary terms in system predictions.
    """
    pred_set = set(predictions['Entity Text'].str.lower())
    terms_set = set(terms['Name'].str.lower())

    # Coverage: Proportion of predicted mentions found in the dictionary
    covered = pred_set & terms_set
    coverage = len(covered) / len(pred_set) * 100 if pred_set else 0
    missing_terms = pred_set - terms_set
    print(f"Missing Terms: {missing_terms}")

    return coverage, covered

# Compute Coverage
coverage, covered_terms = compute_coverage(mapped_annotations, terms)

print(f"Coverage: {coverage:.2f}%")
print(f"Covered Terms: {covered_terms}")


Missing Terms: {'dfo', 'thromboembolic disease', 'buprenorphine', 'polyadpribose', 'lithium', 'ischemic brain injury', 'l34dihydroxyphenylalanine', 'haemorrhagic', 'levofloxacin', 'diabetic', 'amisulpride', 'radiolabeled metaiodobenzylguanidine', 'oxaloacetate', 'lactic acid', 'pentylenetetrazole', 'hivinfected', 'choreoathetosis', 'vascular events', 'cyproterone acetate', 'alzheimers disease', 'venous thrombosis', 'amn082', 'heart failure', 'nephrotoxicity', 'alcohol', 'isoflurane', 'myotonic dystrophy', 'constipation', 'neuropathic symptoms', 'thorazine', 'ventricular ectopy', 'inability to repeat words', 'aldosterone', 'androgen', 'epileptiform activity', 'left ventricular enddiastolic volume falls', 'gtn', 'arterial or venous thromboemboli', 'valproate', 'diabenese', 'bacteremia', 'takotsubo cardiomyopathy', 'acute renal insufficiency', 'endometrial cancers', 'neurological deficits', '6ohda', 'glycopyrrolate', 'bone tumors', 'dm', 'rauwolscine', 'fk506', 'marrow toxicity', 'citalop

In [None]:
pred_set = set(mapped_annotations['Entity Text'].str.lower())
terms_set = set(terms['Name'].str.lower())
missing_terms = pred_set - terms_set


In [None]:
validated_terms = {term for term in missing_terms if len(term) > 2}
new_terms = pd.DataFrame({
    'Name': list(validated_terms),
    'ID': [None] * len(validated_terms),
    'Definition': [None] * len(validated_terms)
})
terms = pd.concat([terms, new_terms]).drop_duplicates(subset=['Name']).reset_index(drop=True)


In [None]:
coverage, covered_terms = compute_coverage(mapped_annotations, terms)
print(f"Updated Coverage: {coverage:.2f}%")


Missing Terms: {'e2', 'ka', 't', 'cl', 'dh', 'na', 'cm', 'tp', 'n', 'md', 'ee', 'ra', 'fa', 'cp', 'm', 'cc', 'pd', 'cr', 't4', 'af', 'ag', 'ne', 'dm', 'pg', 'uh', 'dx', 'mc', 'no', 'p', 'vt', 'c', 'mm', 'ad', 'pa', 'hc', 'mp', 'mi', 'mj', 'cq', 'if', 'e', 'so', 'da', 'ob', 'k', 'ma', 'ca', 'nm', 'li', 'pb', 'vf', 'lv', 'h', 'ab', 't3'}
Updated Coverage: 97.55%


In [None]:
mapped_annotations, terms_flat = fast_compare_entity_text(annotations, terms, return_terms_flat=True)
# mapped_annotations.to_csv('/content/drive/MyDrive/CDR_Data/CDR_Data/mapped_annotations_fast.csv', index=False)



Matching Entities:   0%|          | 0/9773 [00:00<?, ?it/s]

In [None]:
coverage, covered_terms = compute_coverage(mapped_annotations, terms)
print(f"Updated Coverage: {coverage:.2f}%")

Missing Terms: {'e2', 'ka', 't', 'cl', 'dh', 'na', 'cm', 'tp', 'n', 'md', 'ee', 'ra', 'fa', 'cp', 'm', 'cc', 'pd', 'cr', 't4', 'af', 'ag', 'ne', 'dm', 'pg', 'uh', 'dx', 'mc', 'no', 'p', 'vt', 'c', 'mm', 'ad', 'pa', 'hc', 'mp', 'mi', 'mj', 'cq', 'if', 'e', 'so', 'da', 'ob', 'k', 'ma', 'ca', 'nm', 'li', 'pb', 'vf', 'lv', 'h', 'ab', 't3'}
Updated Coverage: 97.55%


In [None]:
mapped_annotations.to_csv('/content/drive/MyDrive/CDR_Data/CDR_Data/mapped_annotations_fast.csv', index=False)

In [None]:
annotations_path = '/content/drive/MyDrive/CDR_Data/CDR_Data/Annotations.csv'
terms_path = '/content/drive/MyDrive/CDR_Data/CDR_Data/terms_dataset.csv'

annotations = pd.read_csv(annotations_path)
terms = pd.read_csv(terms_path)
terms['ID'] = range(len(terms))
# Normalize for case-insensitive matching
annotations['Entity Text'] = annotations['Entity Text'].str.lower()
terms['Name'] = terms['Name'].str.lower()
terms['Synonyms'] = terms['Synonyms'].str.lower()

# Create a gold standard by matching Entity Text with the symptom ontology
def verify_gold_standard(annotations, terms):
    """
    Create a gold standard dataset by matching Entity Text with terms.
    """
    # Ensure ID column exists
    if 'ID' not in terms.columns:
        print("Warning: 'ID' column is missing. Adding placeholder IDs.")
        terms['ID'] = range(len(terms))  # Add unique placeholder IDs

    # Flatten terms for matching
    terms_flat = pd.concat([
        terms[['Name', 'ID']].rename(columns={'Name': 'Term'}),
        terms[['Synonyms', 'ID']].rename(columns={'Synonyms': 'Term'})
    ]).dropna(subset=['Term']).drop_duplicates(subset=['Term'])  # Ensure unique terms
    terms_flat['Term'] = terms_flat['Term'].str.lower()

    # Create a mapping dictionary for fast look-up
    term_map = terms_flat.set_index('Term').to_dict(orient='index')  # Unique index ensured

    # Verify each annotation
    gold_standard = []
    for _, row in annotations.iterrows():
        entity = row['Entity Text'].lower()  # Normalize entity text for matching
        match = term_map.get(entity, None)
        if match:
            gold_standard.append({
                "Entity Text": row['Entity Text'],
                "Start Offset": row['Start Offset'],
                "End Offset": row['End Offset'],
                "Ontology ID": match['ID']
            })

    return pd.DataFrame(gold_standard)

# Generate gold standard dataset
gold_standard = verify_gold_standard(annotations, terms)

# Save to CSV
gold_standard_path = '/content/drive/MyDrive/CDR_Data/CDR_Data/gold_standard.csv'
gold_standard.to_csv(gold_standard_path, index=False)
print(f"Gold standard dataset saved to {gold_standard_path}")

Gold standard dataset saved to /content/drive/MyDrive/CDR_Data/CDR_Data/gold_standard.csv


In [None]:
gold_standard.head()


Unnamed: 0,Entity Text,Start Offset,End Offset,Ontology ID
0,tricuspid valve regurgitation,0,29,15092
1,lithium carbonate,34,51,14530
2,toxicity,52,60,16104
3,tricuspid regurgitation,105,128,15487
4,atrial flutter,130,144,15272


In [None]:
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

# Load gold standard and predictions
gold_standard_path = '/content/drive/MyDrive/CDR_Data/CDR_Data/gold_standard.csv'
predictions_path = '/content/drive/MyDrive/CDR_Data/CDR_Data/mapped_annotations_fast.csv'

gold_standard = pd.read_csv(gold_standard_path)
predictions = pd.read_csv(predictions_path)

# Normalize text for comparison
gold_standard['Entity Text'] = gold_standard['Entity Text'].str.lower()
predictions['Entity Text'] = predictions['Entity Text'].str.lower()

# Create sets of tuples for comparison
gold_set = set(zip(gold_standard['Entity Text'], gold_standard['Ontology ID']))
pred_set = set(zip(predictions['Entity Text'], predictions['Ontology ID']))

# Calculate True Positives, False Positives, and False Negatives
tp = len(gold_set & pred_set)
fp = len(pred_set - gold_set)
fn = len(gold_set - pred_set)

# Compute Precision, Recall, and F1-Score
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

# Print results
print(f"True Positives (TP): {tp}")
print(f"False Positives (FP): {fp}")
print(f"False Negatives (FN): {fn}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

True Positives (TP): 210
False Positives (FP): 8734
False Negatives (FN): 33
Precision: 0.02
Recall: 0.86
F1-Score: 0.05
