This notebook collects the true positive (TP) predictions from CLAMP, cTAKES, and MetaMap that overlap benchmark (BM) ASD terms and saves, for each BM term, the list of overlapping predictions.

In [1]:
import matplotlib
import matplotlib.pyplot as plt
from matplotlib_venn import venn3
import seaborn as sns

import pandas as pd
import os
import numpy as np
from unidecode import unidecode

  import pandas.util.testing as tm


In [2]:
# user-adjustable settings
ABSTRACT = True  # True: run on abstracts; False: run on full texts

FIGURES_DIR = "figures"  # directory where figures will be saved

# create the figures directory on first run; otherwise just report that it is already there
if os.path.exists(FIGURES_DIR):
    print(f"The folder '{FIGURES_DIR}' already exists, so a new folder was not created.")
else:
    os.makedirs(FIGURES_DIR)

The folder 'figures' already exists, so a new folder was not created.


# Load dataframe with "true" benchmark (BM) labels 

In [3]:
BM_DIR = "BM_labelled"  # folder holding the labelled dataframes that are read below

# the abstract and full-text runs read different label files from the same folder
labels_df = pd.read_csv(
    os.path.join(BM_DIR, "abstract_labels.csv" if ABSTRACT else "full_text_labels.csv")
)

In [4]:
# Load the BM ASD terms; the raw CUI column is kept under the name "CUI_original".
BM_df = pd.read_csv("BM_terms.csv").rename(columns={"CUI": "CUI_original"})

# NEGATED is True when the original CUI text starts with "-";
# the cleaned CUI is the same text with every "-" removed.
cui_text = BM_df["CUI_original"].map(str)
BM_df["NEGATED"] = cui_text.str.startswith("-")
BM_df["CUI"] = cui_text.str.replace("-", "", regex=False)

# CUI -> TUI lookup: tab-separated file without a header row.
BM_cui_to_tui_df = pd.read_csv("tui_list_BM.txt", sep="\t", index_col=0, header=None)
BM_cui_to_tui_df = BM_cui_to_tui_df.reset_index()
BM_cui_to_tui_df.columns = ["CUI", "TUI"]

# Left join so every BM term is kept; with no `on=` pandas joins on all shared column names.
BM_df = BM_df.merge(BM_cui_to_tui_df, how="left")

# Normalise the term text (trim + lowercase) before removing duplicate rows.
BM_df["TEXT"] = BM_df["TEXT"].str.strip().str.lower()
BM_df = BM_df.drop_duplicates()

In [5]:
# distinct cleaned CUIs in the BM table
ASD_CUI = set(BM_df["CUI"])
print(f"There are {len(ASD_CUI)} unique CUI")

# distinct BM term texts
BM_ents = set(BM_df["TEXT"])
print(f"There are {len(BM_ents)} unique BM terms")

# distinct BM term texts whose TYPE is "General"
BM_ents_general = set(BM_df.loc[BM_df["TYPE"] == "General", "TEXT"])
print(f"There are {len(BM_ents_general)} unique BM terms (general)")

There are 101 unique CUI
There are 827 unique BM terms
There are 783 unique BM terms (general)


In [6]:
# Attach BM term information to each label (left join keeps labels that have no BM match).
# NOTE(review): the merge runs before the "asperger 's" clean-up below, so rows spelled
# "asperger 's" are matched against BM_df using the uncleaned text — confirm this is intended.
labels_df = labels_df.merge(BM_df, how="left", left_on="Entity_lower", right_on="TEXT")

# Clean-up: collapse the "asperger 's" spelling (space before the apostrophe).
labels_df = (
    labels_df
    .replace({'Entity_lower': {"asperger 's": "asperger's"}})
    .replace({'Entity': {"asperger 's": "asperger's"}})
    .replace({'Entity': {"Asperger 's": "Asperger's"}})
)

# "asd" / "asds" are case-sensitive: drop rows whose original spelling is not exactly "ASD" / "ASDs".
wrong_case_asds = (labels_df["Entity_lower"] == "asds") & (labels_df["Entity"] != "ASDs")
wrong_case_asd = (labels_df["Entity_lower"] == "asd") & (labels_df["Entity"] != "ASD")
labels_df = labels_df[~(wrong_case_asds | wrong_case_asd)]

In [7]:
# Sanity checks: no literal "nan" strings in the entity columns and no zero-length CUI.
# NOTE(review): genuinely missing values (NaN) compare as False in all three checks,
# so they pass silently — confirm whether missing entities/CUIs should fail here too.
assert not (labels_df["Entity"].str.lower() == "nan").any()
assert not (labels_df["Entity_lower"].str.lower() == "nan").any()
assert not (labels_df["CUI"].str.len() == 0).any()

print("Distinct true entities detected (case-sensitive):", len(set(labels_df["Entity"])))
print("Distinct true entities detected (case-insensitive):", len(set(labels_df["Entity_lower"])))

Distinct true entities detected (case-sensitive): 159
Distinct true entities detected (case-insensitive): 106


# Get CLAMP TP output/predictions

In [8]:
# user-adjustable settings
CLAMP_DIRECTORY = "clamp"  # parent directory for CLAMP-related files

# locations of the formatted CLAMP output/predictions
CLAMP_RESULTS_DIRECTORY_FULL_TEXT = os.path.join(CLAMP_DIRECTORY, "clamp_results_full_text")
CLAMP_RESULTS_DIRECTORY_ABSTRACT = os.path.join(CLAMP_DIRECTORY, "clamp_results_abstract")

# choose the results folder that matches the run mode
CLAMP_RESULTS_DIRECTORY = (
    CLAMP_RESULTS_DIRECTORY_ABSTRACT if ABSTRACT else CLAMP_RESULTS_DIRECTORY_FULL_TEXT
)

In [9]:
# read the CLAMP true-positive file and keep only the Entity_pred and TEXT columns
pred_df_temp = pd.read_csv(
    os.path.join(CLAMP_RESULTS_DIRECTORY, "clamp_true_positive_all.csv")
).loc[:, ["Entity_pred", "TEXT"]]

  interactivity=interactivity, compiler=compiler, result=result)


In [10]:
# report the row count before and after removing duplicate (Entity_pred, TEXT) rows
print("before dropping duplicates:", len(pred_df_temp))
pred_df_clamp = pred_df_temp = pred_df_temp.drop_duplicates()
print("after dropping duplicates:", len(pred_df_clamp))

before dropping duplicates: 96328
after dropping duplicates: 15821


# Get cTAKES TP output/predictions

In [11]:
# user-adjustable settings
CTAKES_DIRECTORY = "ctakes"  # parent directory for cTAKES-related files

# locations of the formatted cTAKES output/predictions
CTAKES_RESULTS_DIRECTORY_FULL_TEXT = os.path.join(CTAKES_DIRECTORY, "ctakes_results_full_text")
CTAKES_RESULTS_DIRECTORY_ABSTRACT = os.path.join(CTAKES_DIRECTORY, "ctakes_results_abstract")

# choose the results folder that matches the run mode
CTAKES_RESULTS_DIRECTORY = (
    CTAKES_RESULTS_DIRECTORY_ABSTRACT if ABSTRACT else CTAKES_RESULTS_DIRECTORY_FULL_TEXT
)

In [12]:
# read the cTAKES true-positive file and keep only the Entity_pred and TEXT columns
pred_df_temp = pd.read_csv(
    os.path.join(CTAKES_RESULTS_DIRECTORY, "ctakes_true_positive_all.csv")
).loc[:, ["Entity_pred", "TEXT"]]

In [13]:
# report the row count before and after removing duplicate (Entity_pred, TEXT) rows
print("before dropping duplicates:", len(pred_df_temp))
pred_df_ctakes = pred_df_temp = pred_df_temp.drop_duplicates()
print("after dropping duplicates:", len(pred_df_ctakes))

before dropping duplicates: 123841
after dropping duplicates: 221


# Get MetaMap TP output/predictions

In [14]:
# user-adjustable settings
METAMAP_DIRECTORY = "metamap"  # parent directory for MetaMap-related files

# locations of the formatted MetaMap output/predictions
METAMAP_RESULTS_DIRECTORY_FULL_TEXT = os.path.join(METAMAP_DIRECTORY, "metamap_results_full_text")
METAMAP_RESULTS_DIRECTORY_ABSTRACT = os.path.join(METAMAP_DIRECTORY, "metamap_results_abstract")

# choose the results folder that matches the run mode
METAMAP_RESULTS_DIRECTORY = (
    METAMAP_RESULTS_DIRECTORY_ABSTRACT if ABSTRACT else METAMAP_RESULTS_DIRECTORY_FULL_TEXT
)

In [15]:
# MetaMap labels joined with BM term information (left join keeps labels without a BM match)
labels_df_temp = pd.read_csv(
    os.path.join(METAMAP_RESULTS_DIRECTORY, "metamap_labels.csv")
).merge(BM_df, how="left", left_on="Entity_lower", right_on="TEXT")

# clean-up: collapse the "asperger 's" spelling (space before the apostrophe)
labels_df_temp = (
    labels_df_temp
    .replace({'Entity_lower': {"asperger 's": "asperger's"}})
    .replace({'Entity': {"asperger 's": "asperger's"}})
    .replace({'Entity': {"Asperger 's": "Asperger's"}})
)

# "asd" / "asds" are case-sensitive: drop rows whose original spelling is not exactly "ASD" / "ASDs"
metamap_wrong_case_asds = (labels_df_temp["Entity_lower"] == "asds") & (labels_df_temp["Entity"] != "ASDs")
metamap_wrong_case_asd = (labels_df_temp["Entity_lower"] == "asd") & (labels_df_temp["Entity"] != "ASD")
labels_df_temp = labels_df_temp[~(metamap_wrong_case_asds | metamap_wrong_case_asd)]

labels_df_metamap = labels_df_temp

In [16]:
# read the MetaMap true-positive file and keep only the Entity_pred and TEXT columns
pred_df_temp = pd.read_csv(
    os.path.join(METAMAP_RESULTS_DIRECTORY, "metamap_true_positive_all.csv")
).loc[:, ["Entity_pred", "TEXT"]]

In [17]:
# report the row count before and after removing duplicate (Entity_pred, TEXT) rows
print("before dropping duplicates:", len(pred_df_temp))
pred_df_metamap = pred_df_temp = pred_df_temp.drop_duplicates()
print("after dropping duplicates:", len(pred_df_metamap))

before dropping duplicates: 100398
after dropping duplicates: 735


# Overlapping predictions with BM term

In [18]:
# stack the de-duplicated predictions of the three tools into one frame
# (original row indices are kept, so index values may repeat across tools)
pred_df = pd.concat(
    (pred_df_clamp, pred_df_ctakes, pred_df_metamap)
)

In [19]:
# Keep every (prediction, BM term) pair where the ASCII-folded TEXT is a known BM term
# and occurs verbatim (case-sensitive substring) inside the ASCII-folded prediction.
#
# Matches are collected in a plain list and the frame is built once at the end:
# DataFrame.append copied the whole frame on every iteration (quadratic) and was
# removed in pandas 2.0.
overlap_records = []
for pred_raw, true_raw in zip(pred_df["Entity_pred"], pred_df["TEXT"]):
    # str() turns missing values into the text "nan" before ASCII folding
    true = unidecode(str(true_raw))
    pred = unidecode(str(pred_raw))
    if true in BM_ents and true in pred:
        overlap_records.append({"Entity_pred": pred, "TEXT": true})

# Explicit columns keep the frame two-column (and in a fixed order) even when nothing matched,
# so later column assignment and grouping do not fail on an empty result.
df = pd.DataFrame(overlap_records, columns=["Entity_pred", "TEXT"])

In [20]:
# from here on pred_df refers to the filtered overlap frame (same object as df)
pred_df = df
pred_df.columns = ["Entity_pred", "TEXT"]

In [21]:
# one row per BM term: its distinct overlapping predictions, sorted and joined with "; "
pred_df_grouped = (
    pred_df
    .groupby("TEXT")
    .agg({"Entity_pred": lambda preds: "; ".join(sorted({str(p) for p in preds}))})
    .reset_index()
)
pred_df_grouped.columns = ["BM term", "Overlapping_predictions (separated by ';')"]

# write the table to the file that matches the run mode
pred_df_grouped.to_csv(
    "tp_overlap_abstract.csv" if ABSTRACT else "tp_overlap_fulltext.csv",
    index=False,
)

In [22]:
# display the ABSTRACT flag used for this run (True = abstracts, False = full texts)
ABSTRACT

True