This notebook takes the formatted CLAMP, cTAKES, and MetaMap output/predictions and runs performance statistics on them compared to the labels generated with the benchmark (BM) set.

In [1]:
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# configurations that can be modified
ABSTRACT = True # True if running this program of abstracts, False if running on full-texts
FILTER = False # True to filter some predicted terms to increase precision, False to use raw predictions/output

RELEVANT_TUIS_FULLTEXT = ['T033', 'T048']
RELEVANT_TUIS_ABSTRACT = ['T033', 'T048']

# for naming files
if FILTER:
    filtered = "filtered_"
else:
    filtered = ""
    
STATS_DIR = "statistics" # directory where performance statistics will be saved

if not os.path.exists(STATS_DIR):
    os.makedirs(STATS_DIR)
else:
    print(f"The folder '{STATS_DIR}' already exists, so a new folder was not created.")
    

# location of full-texts/abstracts in plain text
INPUT_DIRECTORY_FULL_TEXT = "pubmed_fulltexts_544"
INPUT_DIRECTORY_ABSTRACT = "pubmed_abstracts_20408"

The folder 'statistics' already exists, so a new folder was not created.


# Helper Functions

In [3]:
# get and format CUI
def get_CUI(x):
    if pd.isna(x):
        return x
    else:
        return x.split()[0].strip()
    

# function for filtering predictions
def filter_pred(pred_df_temp, remove_non_asd=False, filter_by_cui=True, clamp_problem=False):
    
    print("Filtering")
    
    if filter_by_cui:
        pred_df_temp = pred_df_temp.dropna(subset=["CUI"]) # keep only terms with CUI
        pred_df_temp = pred_df_temp[(pred_df_temp["CUI"].str.len() == 8) & (pred_df_temp["CUI"].str[0] == 'C')] # valid CUI only
        if ABSTRACT:
            RELEVANT_TUIS = RELEVANT_TUIS_ABSTRACT
        else:
            RELEVANT_TUIS = RELEVANT_TUIS_FULLTEXT
        pred_df_temp =  pred_df_temp[(pred_df_temp["TUI"].isin(RELEVANT_TUIS)) | (pred_df_temp["CUI"]=='C0018817')] # C0018817 is atrial septal defect
    
    if clamp_problem:
        pred_df_temp = pred_df_temp[pred_df_temp["Semantic"]=="problem"]
    
    # remove non-ASD specific terms (i.e. commorbidities)
    if remove_non_asd:
        autism_comorbid = set(pd.read_csv("asd_psychiatric_commorbidities.csv")["CUI"])
        pred_df_temp = pred_df_temp[~(pred_df_temp["CUI"].isin(autism_comorbid))]
        
    pred_df = pred_df_temp
    return pred_df


# calculate precision, recall, and F1 score
def calculate_statistics(pred_df, true_df, match_cui=False, match_general=False, match_lenient=False, true_pos_df=None):
        
    if match_lenient:
        true_df = true_df.append(pred_df[(pred_df["CUI"].isin(set(true_df["CUI"])))|(pred_df["Entity_lower"]).str.contains("|".join(set(true_df["Entity_lower"])))])
        
    # count overlap and CUI match
    if match_cui:
        print("Matching CUI")
        # drop duplicate predictions on same entity span and CUI
        pred_df = pred_df.drop_duplicates(subset=["paper", "Start", "End"]) # choose one CUI per entity
        true_df = true_df.drop_duplicates(subset=["paper", "CUI", "Start", "End"])
        true_df = true_df[(true_df["CUI"].str.len() == 8) & (true_df["CUI"].str[0] == 'C')] # valid CUI only
        match_grouped = pred_df.merge(true_df, on=["paper", "CUI"], how="outer")
                
    # count overlap only - get true positives
    else:
        # drop duplicate predictions on same entity span
        pred_df = pred_df.drop_duplicates(subset=["paper", "Start", "End"])
        true_df = true_df.drop_duplicates(subset=["paper", "Start", "End"])
        match_grouped = pred_df.merge(true_df, on=["paper"], how="outer")
        
    match_grouped = match_grouped.rename(columns={"Start_x": "Start_pred", "End_x": "End_pred", "Start_y": "Start_label", "End_y": "End_label", "Entity_x": "Entity_pred", "Entity_y": "Entity_label"})
    match_grouped = match_grouped.fillna("NA")
    
    # count overlaps
    if not true_pos_df:
        temp = match_grouped[(match_grouped["Start_pred"] != "NA") & (match_grouped["Start_label"] != "NA")]
        temp = temp[((temp["Start_pred"] >= temp["Start_label"]) & (temp["Start_pred"] <= temp["End_label"])) | ((temp["Start_label"] >= temp["Start_pred"]) & (temp["Start_label"] <= temp["End_pred"]))]   
        true_pos_df = temp
    else:
        temp = true_pos_df
    
    # only count general ASD/behavioral terms
    if match_general:
        temp = temp[temp["TYPE"]=="General"]
        true_df = true_df[true_df["TYPE"]=="General"]
        num_label_pos = len(true_df)
        pred_df = pred_df.merge(true_pos_df[true_pos_df["TYPE"]!="General"][["paper", "Start_pred", "End_pred"]], left_on=["paper", "Start", "End"], right_on=["paper", "Start_pred", "End_pred"], how="outer")
        pred_df = pred_df[pred_df["Start_pred"].isnull()].drop(["Start_pred", "End_pred"], axis=1)
        num_pred_pos = len(pred_df)
        true_pos_df = temp
        num_true_pos = len(temp.drop_duplicates(["paper", "Start_label", "End_label"])) # only count max one pred per label
    elif match_cui:
        num_true_pos = len(temp.drop_duplicates(["paper", "Start_label", "End_label", "CUI"])) # only count max one pred per label
        num_label_pos = len(true_df)
        num_pred_pos = len(pred_df)
    else:
        num_true_pos = len(temp.drop_duplicates(["paper", "Start_label", "End_label"])) # only count max one pred per label
        num_label_pos = len(true_df)
        num_pred_pos = len(pred_df)

    print("Number of true positives =", num_true_pos)
    print("Number of positive labels =", num_label_pos)
    print("Number of positive predictions =", num_pred_pos)
    print()
    precision = num_true_pos/num_pred_pos
    recall = num_true_pos/num_label_pos
    print("Precision =", precision)
    print("Recall =", recall)
    print("F-Measure =", (2 * precision * recall) / (precision + recall))
    
    return true_pos_df, pred_df, true_df


# get true positives, false positives, and false negatives
def get_false_and_true_pos(true_pos_df, pred_df, true_df, cui=True):
    
    if cui:
        pred_df = pred_df.drop_duplicates(subset=["paper", "Start", "End", "CUI"])
        true_df = true_df.drop_duplicates(subset=["paper", "Start", "End", "CUI"])
    else:
        pred_df = pred_df.drop_duplicates(subset=["paper", "Start", "End"])
        true_df = true_df.drop_duplicates(subset=["paper", "Start", "End"])
    
    # group overlapping entities in true pos df
    temp = pd.DataFrame(true_pos_df.groupby(by=["Entity_label", "Entity_pred"])["Start_pred"].count()).reset_index()
    grouped = pd.DataFrame(temp.groupby(by=["Entity_label"])["Start_pred"].sum()).sort_values(by="Start_pred", ascending=False)
    temp = temp.merge(grouped, on="Entity_label")
    temp.columns = ["Entity_label", "Entity_pred", "Entity_pred count", "Entity_label count"]
    temp = temp.sort_values(by=["Entity_label count", "Entity_pred count"], ascending=False)
    true_pos_grouped = temp
    
    if cui:
        columns = ["Entity", "CUI", "TUI"]
    else:
        columns = ["Entity"]
    
    # false positives - count overlap as match
    temp = pred_df.merge(true_pos_df[["paper", "Start_pred", "End_pred"]], left_on=["paper", "Start", "End"], right_on=["paper", "Start_pred", "End_pred"], how="outer")
    
    false_pos = temp[temp["Start_pred"].isnull()].sort_values(by=["paper", "Entity"])
    false_pos_grouped = false_pos.groupby(by=columns)["Start"].count().reset_index().sort_values(by="Start", ascending=False).reset_index(drop=True)
    false_pos_grouped = false_pos_grouped.rename(columns={"Start":"count"})
    
    # false negative - count overlap as match
    temp = true_df.merge(true_pos_df[["paper", "Start_label", "End_label"]], left_on=["paper", "Start", "End"], right_on=["paper", "Start_label", "End_label"], how="outer").drop_duplicates(["paper", "Start", "End"])
    false_neg = temp[temp["Start_label"].isnull()].sort_values(by=["paper", "Entity"])
    false_neg_grouped = false_neg.groupby(by=columns)["Start"].count().reset_index().sort_values(by="Start", ascending=False).reset_index(drop=True)
    false_neg_grouped = false_neg_grouped.rename(columns={"Start":"count"})
    
    return true_pos_grouped, false_pos_grouped, false_neg_grouped, false_pos, false_neg

# Load dataframe with "true" benchmark (BM) labels 

In [4]:
BM_DIR = "BM_labelled" # folder where the results (dataframe with labels) will be stored

if ABSTRACT:
    labels_df = pd.read_csv(os.path.join(BM_DIR, "abstract_labels.csv"))
else:
    labels_df = pd.read_csv(os.path.join(BM_DIR, "full_text_labels.csv"))

In [5]:
# read in BM ASD terms and add TUI
BM_df = pd.read_csv("BM_terms.csv")
BM_df.rename(columns={"CUI": "CUI_original"}, inplace=True)
BM_df["NEGATED"] = BM_df["CUI_original"].apply(lambda x: str(x)[0] == "-")
BM_df["CUI"] = BM_df["CUI_original"].apply(lambda x: str(x).replace("-", ""))
BM_cui_to_tui_df = pd.read_csv("tui_list_BM.txt", sep="\t", index_col=0, header=None).reset_index()
BM_cui_to_tui_df.columns = ["CUI", "TUI"]
BM_df = BM_df.merge(BM_cui_to_tui_df, how="left")
BM_df["TEXT"] = BM_df["TEXT"].str.strip().str.lower()
BM_df = BM_df.drop_duplicates()

In [6]:
# merge labels with BM term information
labels_df = labels_df.merge(BM_df, left_on="Entity_lower", right_on="TEXT", how="left")

# clean-up
labels_df = labels_df.replace({'Entity_lower': {"asperger 's": "asperger's"}})
labels_df = labels_df.replace({'Entity': {"asperger 's": "asperger's"}})
labels_df = labels_df.replace({'Entity': {"Asperger 's": "Asperger's"}})

# case-sensitive for ASD and ASDs
labels_df = labels_df[~((labels_df["Entity_lower"]=="asds")&(labels_df["Entity"]!="ASDs"))]
labels_df = labels_df[~((labels_df["Entity_lower"]=="asd")&(labels_df["Entity"]!="ASD"))]

In [7]:
# check that entitiy and CUI columns are not empty
assert len(labels_df[labels_df["Entity"].str.lower() == "nan"]) == 0
assert len(labels_df[labels_df["Entity_lower"].str.lower() == "nan"]) == 0
assert len(labels_df[labels_df["CUI"].str.len() == 0]) == 0

print("Distinct true entities detected (case-sensitive):", len(set(labels_df["Entity"])))
print("Distinct true entities detected (case-insensitive):", len(set(labels_df["Entity_lower"])))

Distinct true entities detected (case-sensitive): 159
Distinct true entities detected (case-insensitive): 106


In [8]:
ASD_CUI = set(BM_df["CUI"])
print(f"There are {len(ASD_CUI)} unique CUI")

There are 101 unique CUI


In [9]:
labels_original_df = labels_df.copy()

In [10]:
# raise Exception("Manually run cells for CLAMP, cTAKES, or MetaMap anlaysis")

# CLAMP analysis

In [11]:
# configurations that can be modified
CLAMP_DIRECTORY = "clamp" # parent directory for CLAMP-related files

# formatted CLAMP output/predictions
CLAMP_RESULTS_DIRECTORY_FULL_TEXT = os.path.join(CLAMP_DIRECTORY, "clamp_results_full_text")
CLAMP_RESULTS_DIRECTORY_ABSTRACT = os.path.join(CLAMP_DIRECTORY, "clamp_results_abstract")

if ABSTRACT:
    CLAMP_RESULTS_DIRECTORY = CLAMP_RESULTS_DIRECTORY_ABSTRACT 
else:
    CLAMP_RESULTS_DIRECTORY = CLAMP_RESULTS_DIRECTORY_FULL_TEXT

In [12]:
labels_df = labels_original_df.copy()

In [13]:
# format CLAMP output/predictions
pred_df_temp = pd.read_csv(os.path.join(CLAMP_RESULTS_DIRECTORY, "clamp_preds.csv"))
pred_df_temp["CUI"] = pred_df_temp["CUI"].apply(lambda x: get_CUI(x)) # get CUI

# formatting
pred_df_temp["Entity"] = pred_df_temp["Entity_matched"].str.strip() # strip entity
pred_df_temp["Entity_lower"] = pred_df_temp["Entity"].str.strip().str.lower()

# map CUI to TUI
cui_to_tui_df = pd.read_csv("clamp_cui_to_tui_map.txt", sep="\t", header = None)
cui_to_tui_df.columns = ["CUI", "TUI"]
pred_df_temp = pred_df_temp.merge(cui_to_tui_df, on="CUI", how="left")

pred_df_temp = pred_df_temp[~(pred_df_temp["paper"].isnull())]

In [14]:
if not ABSTRACT:
    assert (len((set(labels_df["paper"])).difference(set(pred_df_temp["paper"])))) == 0 # all papers were analyzed
print("Number of papers with extra predictions:", len((set(pred_df_temp["paper"])).difference(set(labels_df["paper"])))) # number of papers with extra predictions

Number of papers with extra predictions: 511


In [15]:
# filter predictions
if FILTER:
    pred_df_temp = filter_pred(pred_df_temp, remove_non_asd=True, clamp_problem=False)
pred_df_temp = pred_df_temp.drop(["Semantic", "Assertion", "Entity_matched"], axis=1) 

In [16]:
print("before dropping duplicates:", len(pred_df_temp))
pred_df_temp = pred_df_temp.drop_duplicates(["Start", "End", "paper", "CUI"])
print("after dropping duplicates:", len(pred_df_temp))
pred_df_clamp = pred_df_temp

before dropping duplicates: 382703
after dropping duplicates: 370912


In [17]:
clamp_entities = pred_df_clamp.drop_duplicates(["Start", "End", "paper"])["Entity"]
print("CLAMP mean entity no. of words =", np.mean([len(str(ent).split(" ")) for ent in clamp_entities]))
print("CLAMP std entity no. of words =", np.std([len(str(ent).split(" ")) for ent in clamp_entities]))

CLAMP mean entity no. of words = 2.3123290184376804
CLAMP std entity no. of words = 1.6464393031840794


In [18]:
# all CLAMP entities should have 1 CUI
if FILTER:
    cui_df = pred_df_temp.groupby(["paper", "Start", "End"])["CUI"].count().reset_index()
    cui_df = cui_df[cui_df["CUI"]!=1].sort_values(by="CUI", ascending=False)
    assert len(cui_df.merge(pred_df_temp, on=["paper", "Start", "End"])) == 0
    assert len(pred_df_temp[(pred_df_temp["CUI"].str.len()==8)&((pred_df_temp["TUI"].isnull()))]) == 0 # all valid CUI should have a TUI after filtering 

In [19]:
# CUI for ASD/ASDs
asd_cui = pred_df_clamp[(pred_df_clamp["Entity"]=="ASD")|(pred_df_clamp["Entity"]=="ASDs")]

print("Fraction C0524528 (pervasive developmental disorder):", len(asd_cui[asd_cui["CUI"]=="C0524528"])/len(asd_cui))
print("Fraction C1510586 (autism spectrum disorders):", len(asd_cui[asd_cui["CUI"]=="C1510586"])/len(asd_cui))
print("Fraction C0018817 (atrial septal defect):", len(asd_cui[asd_cui["CUI"]=="C0018817"])/len(asd_cui))


Fraction C0524528 (pervasive developmental disorder): 0.0
Fraction C1510586 (autism spectrum disorders): 0.0
Fraction C0018817 (atrial septal defect): 1.0


In [20]:
asd_cui["CUI"].value_counts()

C0018817    24583
Name: CUI, dtype: int64

### Test overlapping NER

In [21]:
# save CLAMP results to file
if ABSTRACT:
    CLAMP_file = filtered + "clamp_statistics_abstract.txt"
else:
    CLAMP_file = filtered + "clamp_statistics_fulltext.txt"

original_stdout = sys.stdout # Save a reference to the original standard output
with open(os.path.join(STATS_DIR, CLAMP_file), "w") as f:
    sys.stdout = f # Change the standard output to the file we created.  
    print("CLAMP results")
    clamp_true_pos_df, x, y = calculate_statistics(pred_df_clamp, labels_df, match_cui=False)
    sys.stdout = original_stdout # Reset the standard output to its original value
    
with open(os.path.join(STATS_DIR, CLAMP_file), "r") as f:
    print(f.read())

CLAMP results
Number of true positives = 96235
Number of positive labels = 106284
Number of positive predictions = 370654

Precision = 0.2596356710031458
Recall = 0.9054514320123442
F-Measure = 0.40355350171301085



In [22]:
# get true positives, false positives, false negatives
true_pos_grouped, false_pos_grouped, false_neg_grouped, false_pos, false_neg = get_false_and_true_pos(clamp_true_pos_df, pred_df_clamp, labels_df)

In [23]:
# export results
true_pos_grouped.to_csv(os.path.join(CLAMP_RESULTS_DIRECTORY, filtered + "clamp_true_positive.csv"), index=False)
clamp_true_pos_df.to_csv(os.path.join(CLAMP_RESULTS_DIRECTORY, filtered + "clamp_true_positive_all.csv"), index=False)
false_pos_grouped.to_csv(os.path.join(CLAMP_RESULTS_DIRECTORY, filtered + "clamp_false_positive.csv"), index=False)
false_neg_grouped.to_csv(os.path.join(CLAMP_RESULTS_DIRECTORY, filtered + "clamp_false_negative.csv"), index=False)
false_pos.to_csv(os.path.join(CLAMP_RESULTS_DIRECTORY, filtered + "clamp_false_positive_all.csv"), index=False)

In [24]:
# Fraction of general words
len(clamp_true_pos_df[clamp_true_pos_df["TYPE"]=="General"])/len(clamp_true_pos_df)

0.018187858151316336

### General NER

In [25]:
# save CLAMP results to file
if ABSTRACT:
    CLAMP_file = filtered + "clamp_statistics_abstract_general.txt"
else:
    CLAMP_file = filtered + "clamp_statistics_fulltext_general.txt"

original_stdout = sys.stdout # Save a reference to the original standard output
with open(os.path.join(STATS_DIR, CLAMP_file), "w") as f:
    sys.stdout = f # Change the standard output to the file we created.  
    print("CLAMP results")
    clamp_true_pos_df, pred_df_clamp_general, labels_df_general = calculate_statistics(pred_df_clamp, labels_df, match_cui=False, match_general=True)
    sys.stdout = original_stdout # Reset the standard output to its original value
    
with open(os.path.join(STATS_DIR, CLAMP_file), "r") as f:
    print(f.read())

CLAMP results
Number of true positives = 1694
Number of positive labels = 3129
Number of positive predictions = 279637

Precision = 0.006057853574455455
Recall = 0.5413870246085011
F-Measure = 0.011981638527970124



In [26]:
# get true positives, false positives, false negatives
true_pos_grouped, false_pos_grouped, false_neg_grouped, false_pos, false_neg = get_false_and_true_pos(clamp_true_pos_df, pred_df_clamp_general, labels_df_general)

In [27]:
# export results
true_pos_grouped.to_csv(os.path.join(CLAMP_RESULTS_DIRECTORY, filtered + "clamp_true_positive_general.csv"), index=False)
false_pos_grouped.to_csv(os.path.join(CLAMP_RESULTS_DIRECTORY, filtered + "clamp_false_positive_general.csv"), index=False)
false_neg_grouped.to_csv(os.path.join(CLAMP_RESULTS_DIRECTORY, filtered + "clamp_false_negative_general.csv"), index=False)
false_pos.to_csv(os.path.join(CLAMP_RESULTS_DIRECTORY, filtered + "clamp_false_positive_all_general.csv"), index=False)

# cTAKES analysis

In [28]:
# configurations that can be modified
CTAKES_DIRECTORY = "ctakes" # parent directory for cTAKES-related files

# formatted cTAKES output/predictions
CTAKES_RESULTS_DIRECTORY_FULL_TEXT = os.path.join(CTAKES_DIRECTORY, "ctakes_results_full_text")
CTAKES_RESULTS_DIRECTORY_ABSTRACT = os.path.join(CTAKES_DIRECTORY, "ctakes_results_abstract")

if ABSTRACT:
    CTAKES_RESULTS_DIRECTORY = CTAKES_RESULTS_DIRECTORY_ABSTRACT 
else:
    CTAKES_RESULTS_DIRECTORY = CTAKES_RESULTS_DIRECTORY_FULL_TEXT

In [29]:
labels_df = labels_original_df.copy()

In [30]:
pred_df_temp = pd.read_csv(os.path.join(CTAKES_RESULTS_DIRECTORY, "ctakes_preds.csv"))

# formatting
pred_df_temp["Entity"] = pred_df_temp["Entity_matched"].str.strip()
pred_df_temp["Entity_lower"] = pred_df_temp["Entity"].str.strip().str.lower()
pred_df_temp = pred_df_temp.drop(['conditional', 'confidence', 'generic', 'id', 'negated', 'preferred_text', 'refsem', 'scheme', 'score', 'subject',
       'textsem', 'uncertainty', 'true_text', 'part_of_speech', 'Entity_matched'], axis=1)

In [31]:
if not ABSTRACT:
    assert (len((set(labels_df["paper"])).difference(set(pred_df_temp["paper"])))) == 0 # all papers were analyzed
print("Number of papers with extra predictions:", len((set(pred_df_temp["paper"])).difference(set(labels_df["paper"])))) # number of papers with extra predictions

Number of papers with extra predictions: 507


In [32]:
if FILTER:    
    pred_df_temp = filter_pred(pred_df_temp, remove_non_asd=True)


In [33]:
print("before dropping duplicates:", len(pred_df_temp))
pred_df_temp = pred_df_temp.drop_duplicates(["Start", "End", "paper", "CUI"])
print("after dropping duplicates:", len(pred_df_temp))
pred_df_ctakes = pred_df_temp

before dropping duplicates: 938888
after dropping duplicates: 511019


In [34]:
ctakes_entities = pred_df_ctakes["Entity"]
print("cTAKES mean entity no. of words =", np.mean([len(str(ent).split(" ")) for ent in ctakes_entities]))
print("cTAKES std entity no. of words =", np.std([len(str(ent).split(" ")) for ent in ctakes_entities]))

cTAKES mean entity no. of words = 1.1616280412274298
cTAKES std entity no. of words = 0.452216646287674


In [35]:
# check to see if any entities have more than 1 CUI
cui_df = pred_df_temp.groupby(["paper", "Start", "End"])["CUI"].count().reset_index()
cui_df = cui_df[cui_df["CUI"]!=1].sort_values(by="CUI", ascending=False)
cui_df.merge(pred_df_temp, on=["paper", "Start", "End"])

Unnamed: 0,paper,Start,End,CUI_x,CUI_y,TUI,Sentence_pred,Entity,Entity_lower
0,28733900.txt,923,926,5,C0164707,T116,(Epi-)genetic factors which may increase susce...,Epi,epi
1,28733900.txt,923,926,5,C0014563,T109,(Epi-)genetic factors which may increase susce...,Epi,epi
2,28733900.txt,923,926,5,C0014582,T109,(Epi-)genetic factors which may increase susce...,Epi,epi
3,28733900.txt,923,926,5,C0267963,T047,(Epi-)genetic factors which may increase susce...,Epi,epi
4,28733900.txt,923,926,5,C0451152,T060,(Epi-)genetic factors which may increase susce...,Epi,epi
...,...,...,...,...,...,...,...,...,...
39062,22285033.txt,1101,1104,2,C0337380,T060,ERP responses to eye gaze may help characteriz...,ERP,erp
39063,22285821.txt,435,442,2,C0011923,T060,"To accomplish this goal, we performed function...",imaging,imaging
39064,22285821.txt,435,442,2,C0079595,T060,"To accomplish this goal, we performed function...",imaging,imaging
39065,9990834.txt,299,307,2,C1879338,T033,Dislocation is described as not just part of t...,mourning,mourning


In [36]:
# CUI for ASD/ASDs
asd_cui = pred_df_ctakes[(pred_df_ctakes["Entity"]=="ASD")|(pred_df_ctakes["Entity"]=="ASDs")]

print("Fraction C0524528 (pervasive developmental disorder):", len(asd_cui[asd_cui["CUI"]=="C0524528"])/len(asd_cui))
print("Fraction C1510586 (autism spectrum disorders):", len(asd_cui[asd_cui["CUI"]=="C1510586"])/len(asd_cui))
print("Fraction C0018817 (atrial septal defect):", len(asd_cui[asd_cui["CUI"]=="C0018817"])/len(asd_cui))

Fraction C0524528 (pervasive developmental disorder): 0.0
Fraction C1510586 (autism spectrum disorders): 0.0
Fraction C0018817 (atrial septal defect): 1.0


In [37]:
asd_cui["CUI"].value_counts()

C0018817    37759
Name: CUI, dtype: int64

### Test overlapping NER

In [38]:
# save cTAKES results to file
if ABSTRACT:
    CTAKES_file = filtered + "ctakes_statistics_abstract.txt"
else:
    CTAKES_file = filtered + "ctakes_statistics_fulltext.txt"

original_stdout = sys.stdout # Save a reference to the original standard output
with open(os.path.join(STATS_DIR, CTAKES_file), "w") as f:
    sys.stdout = f # Change the standard output to the file we created.  
    print("cTAKES results")
    ctakes_true_pos_df, x, y = calculate_statistics(pred_df_ctakes, labels_df, match_cui=False)
    sys.stdout = original_stdout # Reset the standard output to its original value
    
with open(os.path.join(STATS_DIR, CTAKES_file), "r") as f:
    print(f.read())

  res_values = method(rvalues)


cTAKES results
Number of true positives = 101219
Number of positive labels = 106284
Number of positive predictions = 489520

Precision = 0.20677193985945416
Recall = 0.9523446614730345
F-Measure = 0.33977281119294267



In [39]:
# get true positives, false positives, false negatives
true_pos_grouped, false_pos_grouped, false_neg_grouped, false_pos, false_neg = get_false_and_true_pos(ctakes_true_pos_df, pred_df_ctakes, labels_df)

In [40]:
# export results
true_pos_grouped.to_csv(os.path.join(CTAKES_RESULTS_DIRECTORY, filtered + "ctakes_true_positive.csv"), index=False)
ctakes_true_pos_df.to_csv(os.path.join(CTAKES_RESULTS_DIRECTORY, filtered + "ctakes_true_positive_all.csv"), index=False)
false_pos_grouped.to_csv(os.path.join(CTAKES_RESULTS_DIRECTORY, filtered + "ctakes_false_positive.csv"), index=False)
false_neg_grouped.to_csv(os.path.join(CTAKES_RESULTS_DIRECTORY, filtered + "ctakes_false_negative.csv"), index=False)
false_pos.to_csv(os.path.join(CTAKES_RESULTS_DIRECTORY, filtered + "ctakes_false_positive_all.csv"), index=False)

In [41]:
# Fraction of general words
len(ctakes_true_pos_df[ctakes_true_pos_df["TYPE"]=="General"])/len(ctakes_true_pos_df)

0.007776099999192513

### General NER

In [42]:
# save cTAKES results to file
if ABSTRACT:
    CTAKES_file = filtered + "ctakes_statistics_abstract_general.txt"
else:
    CTAKES_file = filtered + "ctakes_statistics_fulltext_general.txt"

original_stdout = sys.stdout # Save a reference to the original standard output
with open(os.path.join(STATS_DIR, CTAKES_file), "w") as f:
    sys.stdout = f # Change the standard output to the file we created.  
    print("cTAKES results")
    ctakes_true_pos_df, pred_df_ctakes_general, labels_df_general = calculate_statistics(pred_df_ctakes, labels_df, match_cui=False, match_general=True)
    sys.stdout = original_stdout # Reset the standard output to its original value
    
with open(os.path.join(STATS_DIR, CTAKES_file), "r") as f:
    print(f.read())

cTAKES results
Number of true positives = 641
Number of positive labels = 3129
Number of positive predictions = 366659

Precision = 0.001748218371838684
Recall = 0.2048577820389901
F-Measure = 0.0034668512769478726



In [43]:
# get true positives, false positives, false negatives
true_pos_grouped, false_pos_grouped, false_neg_grouped, false_pos, false_neg = get_false_and_true_pos(ctakes_true_pos_df, pred_df_ctakes_general, labels_df_general)

In [44]:
# export results
true_pos_grouped.to_csv(os.path.join(CTAKES_RESULTS_DIRECTORY, filtered + "ctakes_true_positive_general.csv"), index=False)
false_pos_grouped.to_csv(os.path.join(CTAKES_RESULTS_DIRECTORY, filtered + "ctakes_false_positive_general.csv"), index=False)
false_neg_grouped.to_csv(os.path.join(CTAKES_RESULTS_DIRECTORY, filtered + "ctakes_false_negative_general.csv"), index=False)
false_pos.to_csv(os.path.join(CTAKES_RESULTS_DIRECTORY, filtered + "ctakes_false_positive_all_general.csv"), index=False)

# MetaMap analysis

In [None]:
# configurations that can be modified
METAMAP_DIRECTORY = "metamap"

# formatted MetaMap output/predictions
METAMAP_RESULTS_DIRECTORY_FULL_TEXT = os.path.join(METAMAP_DIRECTORY, "metamap_results_full_text")
METAMAP_RESULTS_DIRECTORY_ABSTRACT = os.path.join(METAMAP_DIRECTORY, "metamap_results_abstract")

if ABSTRACT:
    METAMAP_RESULTS_DIRECTORY = METAMAP_RESULTS_DIRECTORY_ABSTRACT 
else:
    METAMAP_RESULTS_DIRECTORY = METAMAP_RESULTS_DIRECTORY_FULL_TEXT

In [None]:
labels_df = labels_original_df.copy()
labels_df_temp = pd.read_csv(os.path.join(METAMAP_RESULTS_DIRECTORY, "metamap_labels.csv"))

In [None]:
# read in BM ASD terms and add TUI
BM_df = pd.read_csv("BM_terms.csv")
BM_df.rename(columns={"CUI": "CUI_original"}, inplace=True)
BM_df["NEGATED"] = BM_df["CUI_original"].apply(lambda x: str(x)[0] == "-")
BM_df["CUI"] = BM_df["CUI_original"].apply(lambda x: str(x).replace("-", ""))
BM_cui_to_tui_df = pd.read_csv("tui_list_BM.txt", sep="\t", index_col=0, header=None).reset_index()
BM_cui_to_tui_df.columns = ["CUI", "TUI"]
BM_df = BM_df.merge(BM_cui_to_tui_df, how="left")
BM_df["TEXT"] = BM_df["TEXT"].str.strip().str.lower()
BM_df = BM_df.drop_duplicates()

In [None]:
# merge labels with BM term information
labels_df_temp = labels_df_temp.merge(BM_df, left_on="Entity_lower", right_on="TEXT", how="left")

# clean-up
labels_df_temp = labels_df_temp.replace({'Entity_lower': {"asperger 's": "asperger's"}})
labels_df_temp = labels_df_temp.replace({'Entity': {"asperger 's": "asperger's"}})
labels_df_temp = labels_df_temp.replace({'Entity': {"Asperger 's": "Asperger's"}})

# case-sensitive for ASD and ASDs
labels_df_temp = labels_df_temp[~((labels_df_temp["Entity_lower"]=="asds")&(labels_df_temp["Entity"]!="ASDs"))]
labels_df_temp = labels_df_temp[~((labels_df_temp["Entity_lower"]=="asd")&(labels_df_temp["Entity"]!="ASD"))]

In [None]:
pred_df_temp = pd.read_csv(os.path.join(METAMAP_RESULTS_DIRECTORY, "metamap_preds.csv"))

# formatting
pred_df_temp = pred_df_temp[~((pred_df_temp["Entity_matched"]=="Body")&(pred_df_temp["Start"]==5))] # filter out Body separator in text
pred_df_temp["Entity_original"] = pred_df_temp["Entity"]
pred_df_temp["Entity"] = pred_df_temp["Entity_matched"].str.strip()
pred_df_temp["Entity_lower"] = pred_df_temp["Entity"].str.strip().str.lower()

# add TUI to predictions
semantic_types_df = pd.read_csv("SemanticTypes_2018AB.txt", sep="|", header=None)
semantic_types_df.columns = ["SemType", "TUI", "SemType_long"]
pred_df_temp = pred_df_temp.merge(semantic_types_df, how="left", on="SemType")

# clean up unused columns
pred_df_temp = pred_df_temp.drop(['MappingScore', 'Negated',
       'CandidateScore', 'MatchedWords', 'MatchedPhrase',
       'paper_part', 'Entity_matched',
       'Entity_original', 'SemType', 'SemType_long'], axis=1)

In [None]:
if not ABSTRACT:
    assert (len((set(labels_df["paper"])).difference(set(pred_df_temp["paper"])))) == 0 # all papers were analyzed
    assert (len((set(labels_df_temp["paper"])).difference(set(pred_df_temp["paper"])))) == 0 # all papers were analyzed

print("Number of papers with extra predictions:", len((set(pred_df_temp["paper"])).difference(set(labels_df["paper"])))) # number of papers with extra predictions

In [None]:
if FILTER:
    pred_df_temp = filter_pred(pred_df_temp, remove_non_asd=True)


In [None]:
# check to see if any entities have more than 1 CUI
cui_df = pred_df_temp.groupby(["paper", "Start", "End"])["CUI"].count().reset_index()
cui_df = cui_df[cui_df["CUI"]!=1].sort_values(by="CUI", ascending=False)
cui_df.merge(pred_df_temp, on=["paper", "Start", "End"])

In [None]:
labels_df = labels_df_temp
pred_df_metamap = pred_df_temp

In [None]:
metamap_entities = pred_df_metamap.drop_duplicates(["Start", "End", "paper"])["Entity"]
print("METAMAP mean entity no. of words =", np.mean([len(str(ent).split(" ")) for ent in metamap_entities]))
print("METAMAP std entity no. of words =", np.std([len(str(ent).split(" ")) for ent in metamap_entities]))

In [None]:
# CUI for ASD/ASDs
asd_cui = pred_df_metamap[(pred_df_metamap["Entity"]=="ASD")|(pred_df_metamap["Entity"]=="ASDs")]
print("Fraction C0524528 (pervasive developmental disorder):", len(asd_cui[asd_cui["CUI"]=="C0524528"])/len(asd_cui))
print("Fraction C1510586 (autism spectrum disorders):", len(asd_cui[asd_cui["CUI"]=="C1510586"])/len(asd_cui))
print("Fraction C0018817 (atrial septal defect):", len(asd_cui[asd_cui["CUI"]=="C0018817"])/len(asd_cui))


In [None]:
asd_cui["CUI"].value_counts()

### Test overlapping NER

In [None]:
len(pred_df_metamap)
pred_df_metamap = pred_df_metamap.drop_duplicates(["paper", "Start", "End"])
len(pred_df_metamap)

In [None]:
pred_df_metamap

In [None]:
# save cTAKES results to file
if ABSTRACT:
    METAMAP_file = filtered + "metamap_statistics_abstract.txt"
else:
    METAMAP_file = filtered + "metamap_statistics_fulltext.txt"

original_stdout = sys.stdout # Save a reference to the original standard output
with open(os.path.join(STATS_DIR, METAMAP_file), "w") as f:
    sys.stdout = f # Change the standard output to the file we created.  
    print("MetaMap results")
    metamap_true_pos_df, x, y = calculate_statistics(pred_df_metamap, labels_df, match_cui=False)
    sys.stdout = original_stdout # Reset the standard output to its original value
    
with open(os.path.join(STATS_DIR, METAMAP_file), "r") as f:
    print(f.read())

In [None]:
# get true positives, false positives, false negatives
true_pos_grouped, false_pos_grouped, false_neg_grouped, false_pos, false_neg = get_false_and_true_pos(metamap_true_pos_df, pred_df_metamap, labels_df)

In [None]:
# export results
true_pos_grouped.to_csv(os.path.join(METAMAP_RESULTS_DIRECTORY, filtered + "metamap_true_positive.csv"), index=False)
metamap_true_pos_df.to_csv(os.path.join(METAMAP_RESULTS_DIRECTORY, filtered + "metamap_true_positive_all.csv"), index=False)
false_pos_grouped.to_csv(os.path.join(METAMAP_RESULTS_DIRECTORY, filtered + "metamap_false_positive.csv"), index=False)
false_neg_grouped.to_csv(os.path.join(METAMAP_RESULTS_DIRECTORY, filtered + "metamap_false_negative.csv"), index=False)
false_pos.to_csv(os.path.join(METAMAP_RESULTS_DIRECTORY, filtered + "metamap_false_positive_all.csv"), index=False)

In [None]:
# Fraction of general words
len(metamap_true_pos_df[metamap_true_pos_df["TYPE"]=="General"])/len(metamap_true_pos_df)

### General NER

In [None]:
# save cTAKES results to file
if ABSTRACT:
    METAMAP_file = filtered + "metamap_statistics_abstract_general.txt"
else:
    METAMAP_file = filtered + "metamap_statistics_fulltext_general.txt"

original_stdout = sys.stdout # Save a reference to the original standard output
with open(os.path.join(STATS_DIR, METAMAP_file), "w") as f:
    sys.stdout = f # Change the standard output to the file we created.  
    print("MetaMap results")
    metamap_true_pos_df, pred_df_metamap_general, labels_df_general = calculate_statistics(pred_df_metamap, labels_df, match_cui=False, match_general=True)
    sys.stdout = original_stdout # Reset the standard output to its original value
    
with open(os.path.join(STATS_DIR, METAMAP_file), "r") as f:
    print(f.read())

In [None]:
# get true positives, false positives, false negatives
true_pos_grouped, false_pos_grouped, false_neg_grouped, false_pos, false_neg = get_false_and_true_pos(metamap_true_pos_df, pred_df_metamap_general, labels_df_general)

In [None]:
# export results
true_pos_grouped.to_csv(os.path.join(METAMAP_RESULTS_DIRECTORY, filtered + "metamap_true_positive_general.csv"), index=False)
false_pos_grouped.to_csv(os.path.join(METAMAP_RESULTS_DIRECTORY, filtered + "metamap_false_positive_general.csv"), index=False)
false_neg_grouped.to_csv(os.path.join(METAMAP_RESULTS_DIRECTORY, filtered + "metamap_false_negative_general.csv"), index=False)
false_pos.to_csv(os.path.join(METAMAP_RESULTS_DIRECTORY, filtered + "metamap_false_positive_all_general.csv"), index=False)