# Imports

In [1]:
import os, random, string
from io import StringIO
import numpy as np
import pandas as pd
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokenizer import Tokenizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt

nlp = spacy.load("en_core_web_sm")

# Helper functions

In [2]:
def is_file_empty(DIRECTORY, filename):
    """Return True when the file contains no text or nothing but whitespace.

    DIRECTORY and filename are joined with os.path.join to locate the file.
    """
    file_path = os.path.join(DIRECTORY, filename)
    with open(file_path) as handle:
        contents = handle.read()

    if contents == "":
        return True
    return contents.isspace()

In [3]:
def calculate_statistics(pred_df, true_df, match_cui=False):
    """Print precision, recall and F-measure of predictions against labels.

    A prediction and a label form a match when they share the same paper
    (and the same CUI when match_cui is True) and their Start/End character
    ranges overlap; the comparison treats both range ends as inclusive.

    Parameters
    ----------
    pred_df, true_df : pandas.DataFrame
        Need "paper", "Start" and "End" columns, plus "CUI" when match_cui
        is True.
    match_cui : bool, default False
        Additionally require the CUI of prediction and label to agree.

    Returns
    -------
    pandas.DataFrame
        One row per overlapping (prediction, label) pair, with the merged
        columns renamed to Start_pred/End_pred/Entity_pred and
        Start_label/End_label/Entity_label. Missing values are "NA".
    """
    merge_keys = ["paper", "CUI"] if match_cui else ["paper"]
    span_keys = merge_keys + ["Start", "End"]

    # pair every prediction with every label of the same paper (and CUI)
    match_grouped = pred_df.merge(true_df, on=merge_keys, how="outer")

    # drop duplicate predictions/labels on the same entity span (and CUI)
    pred_df = pred_df.drop_duplicates(subset=span_keys)
    true_df = true_df.drop_duplicates(subset=span_keys)

    match_grouped = match_grouped.rename(columns={
        "Start_x": "Start_pred", "End_x": "End_pred",
        "Start_y": "Start_label", "End_y": "End_label",
        "Entity_x": "Entity_pred", "Entity_y": "Entity_label",
    })
    match_grouped = match_grouped.fillna("NA")

    # keep only rows where both sides exist, then keep the overlapping pairs
    temp = match_grouped[(match_grouped["Start_pred"] != "NA") & (match_grouped["Start_label"] != "NA")]
    pred_starts_in_label = (temp["Start_pred"] >= temp["Start_label"]) & (temp["Start_pred"] <= temp["End_label"])
    label_starts_in_pred = (temp["Start_label"] >= temp["Start_pred"]) & (temp["Start_label"] <= temp["End_pred"])
    temp = temp[pred_starts_in_label | label_starts_in_pred]
    true_pos_df = temp

    # only count max one pred per label
    num_true_pos = len(temp.drop_duplicates(["paper", "Start_label", "End_label"]))
    num_label_pos = len(true_df)
    num_pred_pos = len(pred_df)

    print("Number of true positives =", num_true_pos)
    print("Number of positive labels =", num_label_pos)
    print("Number of positive predictions =", num_pred_pos)
    print()

    # Guard every ratio: with no predictions, no labels or no true positives
    # the plain divisions raise ZeroDivisionError.
    precision = num_true_pos / num_pred_pos if num_pred_pos else 0.0
    recall = num_true_pos / num_label_pos if num_label_pos else 0.0
    if precision + recall:
        f_measure = (2 * precision * recall) / (precision + recall)
    else:
        f_measure = 0.0

    print("Precision =", 100 * precision)
    print("Recall =", 100 * recall)
    # NOTE: precision/recall are printed as percentages, the F-measure as a fraction.
    print("F-Measure =", f_measure)

    return true_pos_df

In [4]:
def filter_pred(pred_df_temp, remove_non_asd=False, cui=True):
    """Clean a frame of predicted entities and return the filtered copy.

    Parameters
    ----------
    pred_df_temp : pandas.DataFrame
        Needs an "Entity" column, plus a "CUI" column when cui is True.
        The caller's frame is never modified.
    remove_non_asd : bool, default False
        Drop every entity whose text contains a term listed in the "Entity"
        column of "not-asd-specific.csv" (read from the working directory).
    cui : bool, default True
        Keep only rows whose CUI is 8 characters long and starts with 'C'.

    Returns
    -------
    pandas.DataFrame
        The filtered rows with whitespace stripped from "Entity".
    """
    if cui:
        # valid CUI only
        valid_cui = (pred_df_temp["CUI"].str.len() == 8) & (pred_df_temp["CUI"].str[0] == 'C')
        pred_df_temp = pred_df_temp[valid_cui]

    # strip entity; assign() builds a new frame, so neither the caller's frame
    # nor a filtered view of it is written to (no SettingWithCopyWarning)
    pred_df_temp = pred_df_temp.assign(Entity=pred_df_temp["Entity"].str.strip())

    # remove non-asd specific terms
    if remove_non_asd:
        # dropna: a blank CSV row would otherwise become the substring "nan"
        # and remove unrelated entities such as "pregnancy"
        comorbid_terms = pd.read_csv("not-asd-specific.csv")["Entity"].dropna()
        autism_comorbid = {str(term) for term in comorbid_terms}

        # remove entities that contain comorbid term
        remove = set()
        for e in set(pred_df_temp["Entity"]):
            text = str(e)
            lowered = text.lower()
            if any(c in text or c in lowered for c in autism_comorbid):
                remove.add(e)

        pred_df_temp = pred_df_temp[~pred_df_temp["Entity"].isin(remove)]

    pred_df = pred_df_temp

    print("Number of entities predicted =", len(pred_df))
    return pred_df

In [5]:
def filter_true(true_temp_df, gold_standard, cui_to_tui):
    """Attach CUI and TUI information to dictionary-matched label rows.

    Parameters
    ----------
    true_temp_df : pandas.DataFrame
        Label rows with an "Entity_lower" column.
    gold_standard : pandas.DataFrame
        Term dictionary with "TEXT" and "CUI" columns.
    cui_to_tui : pandas.DataFrame
        Lookup with a "CUI" column; its other columns are merged in.

    Returns
    -------
    pandas.DataFrame
        Label rows whose "Entity_lower" equals a gold-standard "TEXT"
        (inner merge), left-joined with cui_to_tui on "CUI". Rows whose
        entity is the string "nan" are dropped first.
    """
    # remove nan
    true_temp_df = true_temp_df[true_temp_df["Entity_lower"].str.lower() != "nan"]

    # double check all entities are in gold standard; the lookup set is built
    # once instead of once per entity
    gold_terms = set(gold_standard["TEXT"])
    for e in set(true_temp_df["Entity_lower"]):
        if e not in gold_terms:
            print(e)

    # merge entity with CUI
    true_df = true_temp_df.merge(gold_standard, left_on="Entity_lower", right_on="TEXT", how="inner")

    # merge cui with TUI
    true_df = true_df.merge(cui_to_tui, on="CUI", how="left")

    print("Number of entities labelled =", len(true_df))
    return true_df

In [6]:
def get_CUI(x):
    """Return the first whitespace-separated token of x.

    Missing values (anything pd.isna reports as missing) are returned unchanged.
    """
    if not pd.isna(x):
        leading_token = x.split()[0]
        return leading_token.strip()
    return x
    
def remove_stop(s):
    """Drop the first word of s when it is a spaCy English stop word.

    s is converted with str(). The word is only removed when more text
    follows the first space; otherwise the string is returned whole. The
    stop-word lookup is case-sensitive.
    """
    text = str(s)
    stop_words = spacy.lang.en.stop_words.STOP_WORDS

    first_word, separator, remainder = text.partition(" ")
    if separator and first_word in stop_words:
        return remainder
    return text

In [7]:
def get_results_grouped(df, write_to_file=False, filename=None):
    """Count mentions and distinct papers for every (CUI, Entity) pair.

    df needs "CUI", "Entity", "paper" and "Start" columns. The returned frame
    has the columns CUI, Entity, count (non-null "Start" values) and
    num_papers (distinct papers), ordered by count, largest first. When
    write_to_file is True the same table is written to filename as
    tab-separated text with a header line.
    """
    keys = ["CUI", "Entity"]
    out_columns = ["CUI", "Entity", "count", "num_papers"]
    per_entity = df.groupby(by=keys)

    papers_per_entity = per_entity["paper"].nunique().reset_index().sort_values('paper', ascending=False)
    mentions_per_entity = per_entity["Start"].count().reset_index().sort_values('Start', ascending=False)

    grouped_pred = mentions_per_entity.merge(papers_per_entity, on=keys)
    grouped_pred.columns = out_columns

    if write_to_file:
        # write results to text file
        with open(filename, "w") as out:
            out.write("\t".join(out_columns) + "\n")
            for _, record in grouped_pred.iterrows():
                fields = [str(record[name]) for name in out_columns]
                out.write("\t".join(fields) + "\n")

    return grouped_pred

In [8]:
def get_false_and_true_pos(true_pos_df, pred_df, true_df, cui=True):
    """Summarise true positives and list false positives / false negatives.

    Parameters
    ----------
    true_pos_df : DataFrame of matched pairs with "paper", "Entity_label",
        "Entity_pred", "Start_pred", "End_pred", "Start_label" and "End_label"
        columns (the column names produced by calculate_statistics).
    pred_df, true_df : prediction / label frames with "paper", "Start", "End"
        and "Entity" columns, plus "CUI" and "TUI" when cui is True.
    cui : when True the false positive/negative counts are grouped by
        Entity, CUI and TUI; otherwise by Entity only.

    Returns
    -------
    tuple of five DataFrames:
        (true_pos_grouped, false_pos_grouped, false_neg_grouped,
         false_pos, false_neg)
    """
    # group similar entities in true pos df
    # count how often each (label text, predicted text) pair was matched
    temp = pd.DataFrame(true_pos_df.groupby(by=["Entity_label", "Entity_pred"])["Start_pred"].count()).reset_index()
    # total matches per label text; "Entity_label" is the index of this frame
    grouped = pd.DataFrame(temp.groupby(by=["Entity_label"])["Start_pred"].sum()).sort_values(by="Start_pred", ascending=False)
    temp = temp.merge(grouped, on="Entity_label")
    # the merge leaves two suffixed count columns; give them readable names
    temp.columns = ["Entity_label", "Entity_pred", "Entity_pred count", "Entity_label count"]
    temp = temp.sort_values(by=["Entity_label count", "Entity_pred count"], ascending=False)
    true_pos_grouped = temp
    
    # columns used to group the false positive / false negative counts
    if cui:
        columns = ["Entity", "CUI", "TUI"]
    else:
        columns = ["Entity"]
    
    # false positives - count overlap as match
    # predictions whose (paper, Start, End) has no partner row in true_pos_df
    temp = pred_df.merge(true_pos_df[["paper", "Start_pred", "End_pred"]], left_on=["paper", "Start", "End"], right_on=["paper", "Start_pred", "End_pred"], how="outer")
    false_pos = temp[temp["Start_pred"].isnull()].sort_values(by=["paper", "Entity"])
    false_pos_grouped = false_pos.groupby(by=columns)["Start"].count().reset_index().sort_values(by="Start", ascending=False).reset_index(drop=True)
    false_pos_grouped = false_pos_grouped.rename(columns={"Start":"count"})
    
    # false negative - count overlap as match
    # labels whose (paper, Start, End) has no partner row in true_pos_df;
    # duplicates created by the merge are dropped per label span
    temp = true_df.merge(true_pos_df[["paper", "Start_label", "End_label"]], left_on=["paper", "Start", "End"], right_on=["paper", "Start_label", "End_label"], how="outer").drop_duplicates(["paper", "Start", "End"])
    false_neg = temp[temp["Start_label"].isnull()].sort_values(by=["paper", "Entity"])
    false_neg_grouped = false_neg.groupby(by=columns)["Start"].count().reset_index().sort_values(by="Start", ascending=False).reset_index(drop=True)
    false_neg_grouped = false_neg_grouped.rename(columns={"Start":"count"})
    
    return true_pos_grouped, false_pos_grouped, false_neg_grouped, false_pos, false_neg

# True labels

In [9]:
# Load the gold-standard term list; the FWORD and CODE columns are dropped.
gold_standard = pd.read_csv("gold-standard.csv").drop(["FWORD", "CODE"], axis=1)

In [10]:
new_labels = pd.read_csv("new-labels.csv")
new_labels.head(10)

Unnamed: 0,CUI,TEXT
0,C0004352,Autism
1,C1510586,Autism Spectrum Disorder
2,C0004352,Autistic
3,C1510586,ASD
4,C1510586,ASDs
5,C0236792,Asperger's
6,C0236792,Asperger Syndrome


In [11]:
# Stack the extra labels under the gold standard. DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0; pd.concat builds the same frame
# (the original row labels of both frames are kept).
# NOTE: re-running this cell stacks new_labels again.
gold_standard = pd.concat([gold_standard, new_labels])
gold_standard.head(10)

Unnamed: 0,CUI,TEXT
0,C0236792,Aspergers
1,C1837434,"ASPERGER SYNDROME, SUSCEPTIBILITY TO, 3"
2,C1837646,"ASPERGER SYNDROME, SUSCEPTIBILITY TO, 1"
3,C1837697,"ASPERGER SYNDROME, SUSCEPTIBILITY TO, 2"
4,C1864961,"ASPERGER SYNDROME, SUSCEPTIBILITY TO, 4"
5,C1845334,"ASPERGER SYNDROME, X-LINKED, SUSCEPTIBILITY TO..."
6,C1845341,"ASPERGER SYNDROME, X-LINKED, SUSCEPTIBILITY TO..."
7,C3151708,"ASPERGER SYNDROME, SUSCEPTIBILITY TO, X-LINKED 1"
8,C3151722,"ASPERGER SYNDROME, SUSCEPTIBILITY TO, X-LINKED 2"
9,C0004352,Autistic Disorder


In [12]:
# Normalise the dictionary terms (trim + lower-case) and collect the distinct
# terms that the phrase matcher will search for.
gold_standard["TEXT"] = gold_standard["TEXT"].str.strip().str.lower()
autism_terms = set(gold_standard["TEXT"])
# NOTE(review): this prints the whole term set; a sample would keep the output short.
print(autism_terms)

{'autistic children', 'did not initiate any interaction, verbal or nonverbal', "likes to be around children, doesn't play with them", 'no verbal communication', 'unusual hand movements', 'he only provides a fleeting eye contact', 'interacted with examiner in pleasant, but disengaged, manner', 'blowing on hand', 'moving his jaw, rubbing his hands together', 'he has no social interaction with other children', 'arm waving when excited', 'focused attention on wheels of car', 'repetitive spinning of whole body', 'making hee-haw sounds', 'no symbolic play', 'does not play with toys', 'runs on tip toes', 'hyper- or hypo-reactivity to sensory input or unusual interest in sensory aspects of environment; (such as apparent indifference to pain/heat/cold, adverse response to specific sounds or textures, excessive smelling or touching of objects, fascination with lights or spinning objects).', 'he did not initiate eye contact', 'solitary in his play', 'play on trampoline for hours', 'initiates very

In [13]:
# create spacy token matcher (attr="LOWER" makes the match case-insensitive)
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
patterns = [nlp.make_doc(text) for text in autism_terms]
# Pass the patterns as a single list: the older add(key, None, *patterns)
# form is rejected by spaCy v3, while the list form is accepted by both
# spaCy 2.2.2+ and 3.x.
matcher.add("AutismTerms", patterns)

In [14]:
# gold standard CUI to TUI
# The file is tab-separated without a header row; its first column is read as
# the index and then moved back into a regular column so both can be named.
gold_standard_cui_to_tui = pd.read_csv("tui_list_gold_standard.txt", sep="\t", index_col=0, header=None).reset_index()
gold_standard_cui_to_tui.columns = ["CUI", "TUI"]

In [15]:
len(autism_terms)

830

# MetaMap results

In [None]:
# metamap input and output
METAMAP_DIRECTORY = "metamap"

INPUT_DIRECTORY_FULL_TEXT = "pubmed_all_fulltext_input"
INPUT_DIRECTORY_ABSTRACT = "pubmed_abstracts_24860"

METAMAP_OUTPUT_DIRECTORY_FULL_TEXT = os.path.join(METAMAP_DIRECTORY, "metamap_asciiignorexmlparsed")
METAMAP_OUTPUT_DIRECTORY_ABSTRACT = os.path.join(METAMAP_DIRECTORY, "metamap_output_abstract_ignore_parsed")

METAMAP_RESULTS_DIRECTORY_FULL_TEXT = os.path.join(METAMAP_DIRECTORY, "metamap_results_full_text")
METAMAP_RESULTS_DIRECTORY_ABSTRACT = os.path.join(METAMAP_DIRECTORY, "metamap_results_abstract")

# choose whether to use full-text or abstract - MODIFY HERE
ABSTRACT = False

# resolve the active directories for the chosen corpus
METAMAP_OUTPUT_DIRECTORY = METAMAP_OUTPUT_DIRECTORY_ABSTRACT if ABSTRACT else METAMAP_OUTPUT_DIRECTORY_FULL_TEXT
METAMAP_RESULTS_DIRECTORY = METAMAP_RESULTS_DIRECTORY_ABSTRACT if ABSTRACT else METAMAP_RESULTS_DIRECTORY_FULL_TEXT
INPUT_DIRECTORY = INPUT_DIRECTORY_ABSTRACT if ABSTRACT else INPUT_DIRECTORY_FULL_TEXT
# directory that receives the text reconstructed from the MetaMap utterances
METAMAP_FULL_TEXT_DIRECTORY = os.path.join(
    METAMAP_DIRECTORY, "metamap_abstract" if ABSTRACT else "metamap_full_text"
)

In [None]:
# list the MetaMap output files, leaving out the macOS '.DS_Store' entry
metamap_files = [name for name in os.listdir(METAMAP_OUTPUT_DIRECTORY) if name != '.DS_Store']


In [None]:
# input file names without '.DS_Store' and with every ".txt" removed
input_files = [
    name.replace(".txt", "")
    for name in os.listdir(INPUT_DIRECTORY)
    if name != '.DS_Store'
]

In [None]:
metamap_papers = set([f.split("_")[0] for f in metamap_files])

In [None]:
# Order the files by paper id (text before the first "_") and then numerically
# by the second "_"-separated field.
# NOTE(review): the key raises if a name has no "_" or if the second field is
# not a plain integer — confirm every file follows that naming scheme.
metamap_files = sorted(metamap_files, key = lambda x: (x.split("_")[0], int(x.split("_")[1])))

In [None]:
len([x for x in metamap_files if "_" in x])

In [None]:
header = "id	MappingScore	CandidateCUI	CandidateMatched	SemType	StartPos	Length	Negated	CandidateScore	MatchedWords\n"
columns = ["id", "MappingScore", "CandidateCUI", "CandidateMatched", "SemType", "StartPos", "Length", "Negated", "CandidateScore", "MatchedWords"]


def _label_paper(paper, text):
    """Save one paper's reconstructed text and label it with the phrase matcher.

    Writes `text` to METAMAP_FULL_TEXT_DIRECTORY/<paper>.txt, runs `matcher`
    over it and returns one record (dict) per non-overlapping match.
    """
    with open(os.path.join(METAMAP_FULL_TEXT_DIRECTORY, paper + ".txt"), "w") as f:
        f.write(text)

    doc = nlp(text)
    spans = [doc[start:end] for match_id, start, end in matcher(doc)]

    records = []
    # filter_spans resolves overlapping matches before they are recorded
    for span in spacy.util.filter_spans(spans):
        records.append({
            "Entity": span.text,
            "Entity_lower": span.text.lower().strip(),
            "paper": paper,
            "Start": span.start_char,
            "End": span.end_char,
            "Sentence": span.sent.text,
        })
    return records


# Rows/frames are collected in lists and combined once after the loop:
# DataFrame.append inside a loop copies the whole frame on every call and no
# longer exists in pandas 2.0.
true_records = []
pred_frames = []

i = 0  # number of MetaMap part files processed
papers_analyzed = []
full_text = ""
for filename in metamap_files:

    if "PMC6061181" in filename: # skip this paper
        continue

    if ".DS_Store" in filename:
        continue

    if "_" not in filename:
        continue

    paper = filename.split("_")[0]

    # new paper
    if paper not in papers_analyzed:

        # the previous paper is complete: save its text and label it
        if len(papers_analyzed) > 0:
            true_records.extend(_label_paper(papers_analyzed[-1], full_text))

        if len(papers_analyzed) % 100 == 0:
            print(len(papers_analyzed), paper)

        full_text = ""
        papers_analyzed.append(paper)

    with open(os.path.join(METAMAP_OUTPUT_DIRECTORY, filename), "r") as f:
        data = f.read()

    if header not in data:
        print(filename, "has no header")

    splits = data.split(header)

    # this part contains the pmid and utterances
    info = splits[0].split("\n")
    utterance = False
    # character offset of this part inside the paper's reconstructed text
    start_idx = len(full_text)

    for line in info:
        if "PMID: " in line:
            pmid_found = line.replace("PMID: ", "")
            pmid_found = pmid_found.split("_")[0]

            # check if pmid matches paper
            if pmid_found != paper:
                raise Exception("PMID doesn't match paper:", line)

        # the line that follows an "UttText:" line carries the utterance text
        if utterance:
            full_text = full_text + line

        utterance = "UttText:" in line

    full_text = full_text + " "

    # no terms detected
    if len(splits) < 2:
        i = i + 1
        continue

    temp = pd.read_csv(StringIO(splits[1]), sep="\t", header=None)
    temp.columns = columns

    temp["MatchedPhrase"] = temp["MatchedWords"].apply(lambda x: " ".join(str(x).split(",")))
    temp["paper"] = paper
    temp["paper_part"] = filename
    temp = temp.rename(columns={"StartPos": "Start", "CandidateCUI":"CUI"})
    # shift the part-relative positions to positions in the reconstructed text
    temp["Start"] = temp["Start"] + start_idx
    temp["End"] = temp["Start"] + temp["Length"]
    temp = temp.drop(["id", "Length", "MappingScore"], axis=1)
    temp["Entity_matched"] = temp.apply(lambda row: full_text[row['Start']:row['End']], axis=1)
    # keep one row per (Start, End): after the ascending sort the row with the
    # smallest CandidateScore comes first and is the one kept.
    # NOTE(review): this picks the best candidate only if better scores are
    # more negative — confirm against the score format of the parsed output.
    temp = temp.sort_values('CandidateScore', ascending=True).drop_duplicates(['Start','End']).sort_values(by="Start").reset_index(drop=True)
    temp = temp.rename(columns={'CandidateMatched':'Entity'})
    pred_frames.append(temp)

    i = i + 1


# analyze the last paper (nothing to do when no file was processed)
if papers_analyzed:
    true_records.extend(_label_paper(papers_analyzed[-1], full_text))

true_temp_df = pd.DataFrame(true_records)
pred_temp_df = pd.concat(pred_frames) if pred_frames else pd.DataFrame()

            

In [None]:
print("MetaMap number of predicted entities =", len(pred_temp_df))
pred_temp_df.to_csv(os.path.join(METAMAP_RESULTS_DIRECTORY, "metamap_pred_temp_df.csv"))

In [None]:
print("MetaMap number of labelled entities = ", len(true_temp_df))
true_temp_df.to_csv(os.path.join(METAMAP_RESULTS_DIRECTORY, "metamap_true_temp_df.csv"))

### Start here if you want to load metamap results

In [None]:
# metamap input and output
# Directory settings used when re-loading saved MetaMap results. This cell
# redefines the names set by the earlier MetaMap configuration cell.
METAMAP_DIRECTORY = "metamap"

INPUT_DIRECTORY_FULL_TEXT = "pubmed_all_fulltext_input"
INPUT_DIRECTORY_ABSTRACT = "pubmed_abstracts_24860"

# NOTE(review): these output folder names differ from the ones in the earlier
# MetaMap configuration cell — confirm which ones exist on disk.
METAMAP_OUTPUT_DIRECTORY_FULL_TEXT = os.path.join(METAMAP_DIRECTORY, "metamap_output_full_text") 
METAMAP_OUTPUT_DIRECTORY_ABSTRACT = os.path.join(METAMAP_DIRECTORY, "metamap_output_abstract") 

METAMAP_RESULTS_DIRECTORY_FULL_TEXT = os.path.join(METAMAP_DIRECTORY, "metamap_results_full_text")
METAMAP_RESULTS_DIRECTORY_ABSTRACT = os.path.join(METAMAP_DIRECTORY, "metamap_results_abstract")

# choose whether to use full-text or abstract - MODIFY HERE
ABSTRACT = True

# resolve the active directories for the chosen corpus
if ABSTRACT:
    METAMAP_OUTPUT_DIRECTORY = METAMAP_OUTPUT_DIRECTORY_ABSTRACT
    METAMAP_RESULTS_DIRECTORY = METAMAP_RESULTS_DIRECTORY_ABSTRACT 
    INPUT_DIRECTORY = INPUT_DIRECTORY_ABSTRACT
else:
    METAMAP_OUTPUT_DIRECTORY = METAMAP_OUTPUT_DIRECTORY_FULL_TEXT
    METAMAP_RESULTS_DIRECTORY = METAMAP_RESULTS_DIRECTORY_FULL_TEXT
    INPUT_DIRECTORY = INPUT_DIRECTORY_FULL_TEXT

In [None]:
# Both corpus settings use the same file names; only METAMAP_RESULTS_DIRECTORY
# differs between them.
true_temp_df = pd.read_csv(os.path.join(METAMAP_RESULTS_DIRECTORY, "metamap_true_temp_df.csv"), index_col=0)
pred_temp_df = pd.read_csv(os.path.join(METAMAP_RESULTS_DIRECTORY, "metamap_pred_temp_df.csv"), index_col=0)


In [None]:
len(set(true_temp_df["Entity_lower"]))

In [None]:
pred_temp_df = pred_temp_df.rename(columns={"SemType":"TUI"})

In [None]:
pred_temp_df.head()

In [None]:
print("Distinct true entities detected:", len(set(true_temp_df["Entity"])))

In [None]:
FILTER = True

In [None]:
if not FILTER:
    pred_df_metamap = filter_pred(pred_temp_df, remove_non_asd=False)
else:
    # filter predicted terms for relevant semtypes
    relevant_semtypes = ["fndg", "mobd"]
    has_relevant_semtype = pred_temp_df["TUI"].isin(relevant_semtypes)
    mentions_asd = pred_temp_df["Entity"].str.contains("ASD")
    pred_temp_df_filtered = pred_temp_df[has_relevant_semtype | mentions_asd]
    # use this to only use relevant semtypes
    pred_df_metamap = filter_pred(pred_temp_df_filtered, remove_non_asd=True)


In [None]:
# filter true df and add CUI
true_df_metamap = filter_true(true_temp_df, gold_standard, gold_standard_cui_to_tui)

In [None]:
# drop duplicate predictions on same entity span
pred_df_metamap = pred_df_metamap.drop_duplicates(subset=["paper", "Start", "End"])
true_df_metamap = true_df_metamap.drop_duplicates(subset=["paper", "Start", "End"])

In [None]:
metamap_entities = pred_df_metamap["Entity"]
# number of space-separated words in every predicted entity
words_per_entity = [len(str(ent).split(" ")) for ent in metamap_entities]
print("MetaMap mean entity no. of words =", np.mean(words_per_entity))
# this line reports the standard deviation and is now labelled accordingly
print("MetaMap std entity no. of words =", np.std(words_per_entity))

In [None]:
true_df_metamap_small = true_df_metamap[["Entity", "TUI", "Start", "End", "paper"]]

In [None]:
pred_df_metamap_small = pred_df_metamap[["Entity", "TUI", "Start", "End", "paper"]]

In [None]:
# get results for metamap
print("MetaMap results:")
metamap_true_pos_df = calculate_statistics(pred_df_metamap_small, true_df_metamap_small)

### Analyze true positives, false positives, and false negatives

In [None]:
true_pos_grouped, false_pos_grouped, false_neg_grouped, false_pos, false_neg = get_false_and_true_pos(metamap_true_pos_df, pred_df_metamap, true_df_metamap)


In [None]:
# file-name prefix that marks results computed from the filtered predictions
filtered = "filtered_" if FILTER else ""

In [None]:
metamap_true_pos_df.head(20)

In [None]:
metamap_true_pos_df.to_csv(os.path.join(METAMAP_RESULTS_DIRECTORY, filtered + "metamap_true_positive.csv"))

In [None]:
false_pos_grouped.head(20)

In [None]:
false_pos_grouped.to_csv(os.path.join(METAMAP_RESULTS_DIRECTORY, filtered + "metamap_false_positive.csv"))

In [None]:
false_neg_grouped.head(20)

In [None]:
false_neg_grouped.to_csv(os.path.join(METAMAP_RESULTS_DIRECTORY, filtered + "metamap_false_negative.csv"))

# Labels from PLOS ONE

In [None]:
#INPUT_DIRECTORY = "pubmed_all_fulltext_input"
#INPUT_DIRECTORY = "pubmed_abstracts_24860"

INPUT_DIRECTORY = "full_text_train_test/test"
#INPUT_DIRECTORY = "abstract_train_test/test"

In [None]:
# get true/labelled terms/entities

# One record per matched term. The rows are gathered in a list and turned into
# a frame once at the end: DataFrame.append inside a loop copies the frame on
# every call and no longer exists in pandas 2.0.
true_records = []
for idx, filename in enumerate(os.listdir(INPUT_DIRECTORY)):

    if not filename.endswith(".txt"):
        continue

    path = os.path.join(INPUT_DIRECTORY, filename)

    if idx % 5 == 0:
        print(idx, filename)

    # tag entities in the text of this file
    with open(path, "r") as f:
        data = f.read()

    doc = nlp(data)
    spans = [doc[start:end] for match_id, start, end in matcher(doc)]

    # filter_spans resolves overlapping matches before they are recorded
    for span in spacy.util.filter_spans(spans):
        true_records.append({
            "Entity": span.text,
            "Entity_lower": span.text.lower().strip(),
            "paper": filename,
            "Start": span.start_char,
            "End": span.end_char,
            "Sentence": span.sent.text,
        })

true_temp_df = pd.DataFrame(true_records)

            

In [None]:
#true_temp_df.to_csv("full_text_labels_df.csv")
#true_temp_df.to_csv("abstract_labels_df.csv")

true_temp_df.to_csv("full_text_labels_test_df_spacy.csv")
#true_temp_df.to_csv("abstract_labels_test_df.csv")

# Start here to load true labels for training

In [None]:
ABSTRACT = False

# folder holding the labelled test files of the selected corpus
corpus_root = "abstract_train_test" if ABSTRACT else "full_text_train_test"
LABELLED_DIR = os.path.join(corpus_root, "labelled_text", "test")

In [None]:
# get true/labelled terms/entities

# Read every labelled file and keep its AutismTerm rows. The per-file frames
# are concatenated once at the end (DataFrame.append no longer exists in
# pandas 2.0 and copies the growing frame on every call).
label_frames = []
for filename in os.listdir(LABELLED_DIR):

    if filename.endswith(".txt"):
        path = os.path.join(LABELLED_DIR, filename)

        temp = pd.read_csv(path, sep="\t")
        temp = temp[temp["Semantic"]=="AutismTerm"]
        temp = temp.drop(["CUI", "Assertion"], axis=1)
        temp["Entity_lower"] = temp["Entity"].str.lower()
        temp["paper"] = filename

        label_frames.append(temp)

true_temp_df = pd.concat(label_frames) if label_frames else pd.DataFrame()

In [None]:
# store the labels under the file name of the selected corpus
labels_csv = "abstract_labels_clamp_test_df.csv" if ABSTRACT else "full_text_labels_clamp_test_df.csv"
true_temp_df.to_csv(labels_csv)

# Start here to load true labels

In [18]:
# labels from CLAMP dictionary match

#true_temp_df = pd.read_csv("full_text_labels_clamp_test_df.csv", index_col=0)

#true_temp_df = pd.read_csv("abstract_labels_clamp_test_df.csv", index_col=0)

# spacy labels
#true_temp_df = pd.read_csv("full_text_labels_test_df_spacy.csv", index_col=0)

In [19]:
ABSTRACT = False

# load the stored label frame of the selected corpus
labels_csv = "abstract_labels_df.csv" if ABSTRACT else "full_text_labels_df.csv"
true_temp_df = pd.read_csv(labels_csv, index_col=0)

In [21]:
true_df_full_text = filter_true(true_temp_df, gold_standard, gold_standard_cui_to_tui)

Number of entities labelled = 48366


In [22]:
true_df_full_text = true_df_full_text.drop_duplicates(["paper", "Start", "End"])

In [23]:
len(true_df_full_text)

48309

In [24]:
# group the labels per (CUI, Entity) and store the table as CSV; the .txt name
# is only used by get_results_grouped when write_to_file is True
labels_stem = "abstract-labels" if ABSTRACT else "full-text-labels"
grouped_true_full_text = get_results_grouped(true_df_full_text, write_to_file=False, filename=labels_stem + ".txt")
grouped_true_full_text.to_csv(labels_stem + ".csv")

    

In [26]:
print("Distinct true entities detected (case-sensitive):", len(set(true_df_full_text["Entity"])))
print("Distinct true entities detected (case-insensitive):", len(set(true_df_full_text["Entity_lower"])))

Distinct true entities detected (case-sensitive): 162
Distinct true entities detected (case-insensitive): 98


In [None]:
true_df_full_text[true_df_full_text["Entity_lower"] == "head shaking"]

In [None]:
true_df_full_text = true_df_full_text.drop_duplicates(["paper", "Start", "End"])

In [None]:
grouped_true_full_text.head(25)

In [None]:
# entity text -> mention count; a later row with the same entity text
# overwrites the earlier one
freq_dict = {
    row["Entity"]: row["count"]
    for _, row in grouped_true_full_text.iterrows()
}

In [None]:
freq_dict

In [None]:
plt.figure(figsize=(16,8))
wc = WordCloud(background_color="white", max_words=1000, width=4000, height=2000, min_font_size=10, relative_scaling=0.2, max_font_size=500)
# generate word cloud from the entity -> count mapping
wc.generate_from_frequencies(freq_dict)

# show
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")

FIGURES_DIR = 'figures'
# savefig does not create missing folders, so make sure the target exists
os.makedirs(FIGURES_DIR, exist_ok=True)
fig_title = "word_cloud_abstract" if ABSTRACT else "word_cloud_full_text"
plt.savefig(os.path.join(FIGURES_DIR, fig_title + ".png"), dpi=300)
plt.show()

# CLAMP get predictions from trained model

In [None]:
CLAMP_DIRECTORY = "clamp"

ABSTRACT = False

# every path below differs only in the corpus name it embeds
corpus_name = "abstract" if ABSTRACT else "full_text"

CLAMP_OUTPUT_DIRECTORY = os.path.join(CLAMP_DIRECTORY, "clamp_crf_output_" + corpus_name)
CLAMP_OUTPUT_DIRECTORY_UPDATED = os.path.join(CLAMP_DIRECTORY, "clamp_crf_output_" + corpus_name + "_updated")
CLAMP_RESULTS_DIRECTORY = os.path.join(CLAMP_DIRECTORY, "clamp_crf_results_" + corpus_name)
INPUT_DIRECTORY = os.path.join(corpus_name + "_train_test", "test")

In [None]:
def extract_entity(full_text, row):
    """Return the slice of full_text from row['Start'] up to row['End']."""
    begin, finish = row['Start'], row['End']
    return full_text[begin:finish]

In [None]:
def extract_sentence(sentences, row):
    """Return the text of the sentence that contains row['Start'].

    Parameters
    ----------
    sentences : iterable of sentence spans exposing start_char, end_char and
        text (for example doc.sents of a spaCy Doc).
    row : mapping with a 'Start' character offset.

    Returns
    -------
    str
        The text of the first sentence whose [start_char, end_char) range
        contains the offset, or "" when no sentence does.
    """
    ent_start = row['Start']
    # iterate the sentences that were passed in rather than a global `doc`
    for s in sentences:
        if s.start_char <= ent_start < s.end_char:
            return s.text
    return ""

In [None]:
# keep only relevant predictions
def update_clamp_output(file, file_out):
    for line in file:
        splits = line.split("\t")
        if splits[0].strip() == "NamedEntity" and splits[3].strip() == "semantic=AutismTerm":
            splits_updated = []
            for s in splits:
                # keep only part after "="
                if "=" in s:
                    splits_updated.append(s.split("=")[-1].strip())
                else:
                    splits_updated.append(s)
            file_out.write("\t".join(splits_updated) + "\n")


In [None]:
# the filtered copies go to a separate folder; create it when it is missing,
# because open(..., "w") does not create directories
os.makedirs(CLAMP_OUTPUT_DIRECTORY_UPDATED, exist_ok=True)

for filename in os.listdir(CLAMP_OUTPUT_DIRECTORY):

    if not filename.endswith(".txt"):
        continue

    file_in_path = os.path.join(CLAMP_OUTPUT_DIRECTORY, filename)
    file_out_path = os.path.join(CLAMP_OUTPUT_DIRECTORY_UPDATED, filename)

    with open(file_in_path) as file_in, open(file_out_path, "w") as file_out:
        update_clamp_output(file_in, file_out)

        

In [None]:
# pandas 1.3 introduced on_bad_lines and pandas 2.0 removed error_bad_lines;
# pick whichever spelling the installed version understands. "warn" skips a
# malformed line and reports it, like error_bad_lines=False did by default.
_pandas_version = tuple(int(part) for part in pd.__version__.split(".")[:2])
if _pandas_version >= (1, 3):
    bad_line_kwargs = {"on_bad_lines": "warn"}
else:
    bad_line_kwargs = {"error_bad_lines": False}

idx = 0
# per-file frames are concatenated once after the loop (DataFrame.append no
# longer exists in pandas 2.0 and copies the growing frame on every call)
pred_frames = []
for filename in os.listdir(CLAMP_OUTPUT_DIRECTORY_UPDATED):

    if not filename.endswith(".txt"):
        continue

    if idx % 100 == 0:
        print(idx, filename)

    # ignore empty files
    if is_file_empty(CLAMP_OUTPUT_DIRECTORY_UPDATED, filename):
        continue

    with open(os.path.join(INPUT_DIRECTORY, filename)) as f:
        full_text = f.read()
    doc = nlp(full_text)

    temp = pd.read_csv(os.path.join(CLAMP_OUTPUT_DIRECTORY_UPDATED, filename), sep="\t", quoting=3, header=None, **bad_line_kwargs)
    temp.columns = ["type", "Start", "End", "semantic", "assertion", "CUI", "sentProb", "conceptProb", "Entity"]
    temp["paper"] = filename
    # text actually found at the predicted offsets, and its sentence
    temp["Entity_matched"] = list(temp.apply(lambda row: extract_entity(full_text, row), axis=1))
    temp["Sentence_pred"] = temp.apply(lambda row: extract_sentence(doc.sents, row), axis=1)

    pred_frames.append(temp)

    idx = idx + 1

pred_df_original = pd.concat(pred_frames) if pred_frames else pd.DataFrame()

In [None]:
pred_df_temp = pred_df_original

In [None]:
pred_df_temp.to_csv(os.path.join(CLAMP_RESULTS_DIRECTORY, "clamp_preds.csv"))

# CLAMP get predictions

In [None]:
# clamp input and output
# Directory settings for reading the CLAMP output of the full corpora.
CLAMP_DIRECTORY = "clamp"

INPUT_DIRECTORY_FULL_TEXT = "pubmed_all_fulltext_input"
INPUT_DIRECTORY_ABSTRACT = "pubmed_abstracts_24860"

CLAMP_OUTPUT_DIRECTORY_FULL_TEXT = os.path.join(CLAMP_DIRECTORY, "clamp_output_full_text") 
CLAMP_OUTPUT_DIRECTORY_ABSTRACT = os.path.join(CLAMP_DIRECTORY, "clamp_output_abstract") 

CLAMP_RESULTS_DIRECTORY_FULL_TEXT = os.path.join(CLAMP_DIRECTORY, "clamp_results_full_text")
CLAMP_RESULTS_DIRECTORY_ABSTRACT = os.path.join(CLAMP_DIRECTORY, "clamp_results_abstract")

# choose whether to use full-text or abstract - MODIFY HERE
ABSTRACT = True

# resolve the active directories for the chosen corpus
if ABSTRACT:
    CLAMP_OUTPUT_DIRECTORY = CLAMP_OUTPUT_DIRECTORY_ABSTRACT
    CLAMP_RESULTS_DIRECTORY = CLAMP_RESULTS_DIRECTORY_ABSTRACT 
    INPUT_DIRECTORY = INPUT_DIRECTORY_ABSTRACT
else:
    CLAMP_OUTPUT_DIRECTORY = CLAMP_OUTPUT_DIRECTORY_FULL_TEXT
    CLAMP_RESULTS_DIRECTORY = CLAMP_RESULTS_DIRECTORY_FULL_TEXT
    INPUT_DIRECTORY = INPUT_DIRECTORY_FULL_TEXT

In [None]:
def extract_entity(full_text, row):
    """Return the characters of full_text between row['Start'] and row['End']."""
    # NOTE: this notebook defines extract_entity twice; this later definition
    # replaces the earlier one when the cell runs.
    span_begin = row['Start']
    span_end = row['End']
    return full_text[span_begin:span_end]

In [None]:
def extract_sentence(sentences, row):
    """Return the text of the sentence that contains the entity start offset.

    Parameters:
        sentences: iterable of sentence objects exposing ``start_char``,
            ``end_char`` and ``text`` (callers in this notebook pass ``doc.sents``).
        row: mapping-like object providing the entity's ``'Start'`` character offset.

    Returns:
        The ``text`` of the first sentence whose half-open range
        ``[start_char, end_char)`` contains ``row['Start']``, or ``""`` if none does.
    """
    ent_start = row['Start']
    # Iterate the argument rather than a global ``doc`` so the function does not
    # depend on whatever ``doc`` happens to be bound in the notebook namespace.
    for s in sentences:
        if s.start_char <= ent_start < s.end_char:
            return s.text
    return ""

In [None]:
# Compile the CLAMP output files into a single DataFrame of predictions.
idx = 0
pred_frames = []  # one frame per output file; concatenated once after the loop
for filename in os.listdir(CLAMP_OUTPUT_DIRECTORY):

    if not filename.endswith(".txt"):
        continue

    # progress indicator
    if idx % 500 == 0:
        print(idx, filename)

    # the input text with the same file name supplies entity and sentence strings
    with open(os.path.join(INPUT_DIRECTORY, filename)) as f:
        full_text = f.read()
    doc = nlp(full_text)

    temp = pd.read_csv(os.path.join(CLAMP_OUTPUT_DIRECTORY, filename), sep="\t", quoting=3)
    temp["paper"] = filename
    temp["Entity_matched"] = list(temp.apply(lambda row: extract_entity(full_text, row), axis=1))
    temp["Sentence_pred"] = temp.apply(lambda row: extract_sentence(doc.sents, row), axis=1)

    pred_frames.append(temp)

    idx = idx + 1

# A single concat avoids the quadratic cost of growing a DataFrame inside the loop
# (and DataFrame.append no longer exists in pandas 2.x).
pred_df_original = pd.concat(pred_frames) if pred_frames else pd.DataFrame()

In [None]:
# Alias (not a copy) of the compiled predictions; both names refer to the same DataFrame.
pred_df_temp = pred_df_original

In [None]:
# Persist the compiled CLAMP predictions. Create the results directory first so
# the write does not fail when the directory does not exist yet.
os.makedirs(CLAMP_RESULTS_DIRECTORY, exist_ok=True)
pred_df_temp.to_csv(os.path.join(CLAMP_RESULTS_DIRECTORY, "clamp_preds.csv"))

# Start here to load CLAMP predictions and output results

In [None]:
# CLAMP input and output locations (repeated here so this section can be run on its own)
CLAMP_DIRECTORY = "clamp"

# directories holding the input text files
INPUT_DIRECTORY_FULL_TEXT = "pubmed_all_fulltext_input"
INPUT_DIRECTORY_ABSTRACT = "pubmed_abstracts_24860"

# CLAMP output files, one directory per corpus
CLAMP_OUTPUT_DIRECTORY_FULL_TEXT = os.path.join(CLAMP_DIRECTORY, "clamp_output_full_text")
CLAMP_OUTPUT_DIRECTORY_ABSTRACT = os.path.join(CLAMP_DIRECTORY, "clamp_output_abstract")

# where compiled predictions and result tables are read from / written to
CLAMP_RESULTS_DIRECTORY_FULL_TEXT = os.path.join(CLAMP_DIRECTORY, "clamp_results_full_text")
CLAMP_RESULTS_DIRECTORY_ABSTRACT = os.path.join(CLAMP_DIRECTORY, "clamp_results_abstract")

# choose whether to use full-text or abstract - MODIFY HERE
ABSTRACT = True

# pick the (output, results, input) directory triple for the selected corpus
CLAMP_OUTPUT_DIRECTORY, CLAMP_RESULTS_DIRECTORY, INPUT_DIRECTORY = (
    (CLAMP_OUTPUT_DIRECTORY_ABSTRACT, CLAMP_RESULTS_DIRECTORY_ABSTRACT, INPUT_DIRECTORY_ABSTRACT)
    if ABSTRACT
    else (CLAMP_OUTPUT_DIRECTORY_FULL_TEXT, CLAMP_RESULTS_DIRECTORY_FULL_TEXT, INPUT_DIRECTORY_FULL_TEXT)
)

In [None]:
# Switch read by the CLAMP filtering cells below and by the "filtered_" prefix on
# the exported result file names.
FILTER = True # filter some predicted terms to increase precision

In [None]:
# Alternative configuration for getting the results of the trained model (uncomment to use)

# CLAMP_DIRECTORY = "clamp"

# ABSTRACT = False

# FILTER = False

# if ABSTRACT:
#     CLAMP_OUTPUT_DIRECTORY = os.path.join(CLAMP_DIRECTORY, "clamp_crf_output_abstract") 
#     CLAMP_OUTPUT_DIRECTORY_UPDATED = os.path.join(CLAMP_DIRECTORY, "clamp_crf_output_abstract_updated") 
#     CLAMP_RESULTS_DIRECTORY = os.path.join(CLAMP_DIRECTORY, "clamp_crf_results_abstract") 
#     INPUT_DIRECTORY = os.path.join("abstract_train_test", "test")
# else:
#     CLAMP_OUTPUT_DIRECTORY = os.path.join(CLAMP_DIRECTORY, "clamp_crf_output_full_text") 
#     CLAMP_OUTPUT_DIRECTORY_UPDATED = os.path.join(CLAMP_DIRECTORY, "clamp_crf_output_full_text_updated") 
#     CLAMP_RESULTS_DIRECTORY = os.path.join(CLAMP_DIRECTORY, "clamp_crf_results_full_text") 
#     INPUT_DIRECTORY = os.path.join("full_text_train_test", "test")

In [None]:
# Load the saved CLAMP predictions; the first CSV column is used as the index.
pred_df_temp = pd.read_csv(os.path.join(CLAMP_RESULTS_DIRECTORY, "clamp_preds.csv"), index_col=0)

In [None]:
if FILTER:
    # keep only terms where semantic is "problem" ...
    is_problem = pred_df_temp["Semantic"] == "problem"
    # ... and that have a CUI
    pred_df_temp = pred_df_temp[is_problem].dropna(subset=["CUI"])

# Normalise the CUI column with get_CUI. assign() builds a new frame, so the
# column is never written into a filtered slice (avoids SettingWithCopyWarning).
pred_df_temp = pred_df_temp.assign(CUI=pred_df_temp["CUI"].apply(get_CUI)) # get CUI
    

In [None]:
# filter predictions
# map CUI to TUI
cui_to_tui_map = pd.read_csv("clamp_cui_to_tui_map.txt", sep="\t", header = None)
cui_to_tui_map.columns = ["CUI", "TUI"]

if FILTER:

    # filter predicted terms for relevant semtypes/TUIs
    #relevant_tuis = ["T184", "T048", "T028", "T052", "T041", "T080", "T054", "T058", "T101", "T056", "T047", "T033", "T055"]

    relevant_tuis = ["T033", "T048"]

    # inner merge: predictions whose CUI is absent from the map are dropped
    pred_temp_df_filtered = pred_df_temp.merge(cui_to_tui_map, on="CUI")
    #pred_temp_df_filtered = pred_temp_df_filtered[(pred_temp_df_filtered["TUI"].isin(relevant_tuis))]

    # keep rows with a relevant TUI, or whose entity text contains "ASD";
    # na=False makes rows with a missing Entity count as "no match" instead of
    # putting NaN into the boolean mask
    has_relevant_tui = pred_temp_df_filtered["TUI"].isin(relevant_tuis)
    mentions_asd = pred_temp_df_filtered["Entity"].str.contains("ASD", na=False)
    pred_temp_df_filtered = pred_temp_df_filtered[has_relevant_tui | mentions_asd]

    # correct CUI
    #pred_temp_df_filtered = pred_temp_df_filtered.replace({'CUI': 'C0018817'}, "C1510586")

    pred_df_clamp = filter_pred(pred_temp_df_filtered, remove_non_asd=True) # use clamp preds filtered by semtype

else:
    # left merge: keep every prediction, attaching a TUI where the map has one
    pred_df_temp = pred_df_temp.merge(cui_to_tui_map, on="CUI", how="left")
    pred_df_clamp = filter_pred(pred_df_temp, remove_non_asd=False)


In [None]:
# Keep one row per (Start, End, paper) span in both predictions and labels.
pred_df_clamp = pred_df_clamp.drop_duplicates(subset=["Start", "End", "paper"])
true_df_full_text = true_df_full_text.drop_duplicates(subset=["Start", "End", "paper"])

In [None]:
clamp_entities = pred_df_clamp["Entity"]
# number of space-separated words in each predicted entity (computed once, used twice)
clamp_word_counts = [len(str(ent).split(" ")) for ent in clamp_entities]
print("CLAMP mean entity no. of words =", np.mean(clamp_word_counts))
# this line reports the standard deviation, so label it as such
print("CLAMP std entity no. of words =", np.std(clamp_word_counts))

In [None]:
print("CLAMP results:")
# match_cui=False: predictions and labels are merged on "paper" only and matched by
# span overlap, without requiring the CUI to agree (see calculate_statistics).
clamp_true_pos_df = calculate_statistics(pred_df_clamp, true_df_full_text, match_cui=False)

### Analyze true positives, false positives, and false negatives

In [None]:
# get_false_and_true_pos is defined elsewhere in this notebook; its five return
# values are unpacked here. NOTE(review): the names suggest grouped and ungrouped
# TP/FP/FN tables — confirm against the helper.
true_pos_grouped, false_pos_grouped, false_neg_grouped, false_pos, false_neg = get_false_and_true_pos(clamp_true_pos_df, pred_df_clamp, true_df_full_text)


In [None]:
# File-name prefix marking results that were produced with FILTER enabled.
filtered = "filtered_" if FILTER else ""

In [None]:
# Preview the first 20 rows of the CLAMP true-positive table.
true_pos_grouped.head(20)

In [None]:
# Write the table to the CLAMP results directory; `filtered` prefixes the file name.
true_pos_grouped.to_csv(os.path.join(CLAMP_RESULTS_DIRECTORY, filtered + "clamp_true_positive.csv"))

In [None]:
# Preview the first 20 rows of the CLAMP false-positive table.
false_pos_grouped.head(20)

In [None]:
# Write the table to the CLAMP results directory; `filtered` prefixes the file name.
false_pos_grouped.to_csv(os.path.join(CLAMP_RESULTS_DIRECTORY, filtered + "clamp_false_positive.csv"))

In [None]:
# Preview the first 20 rows of the CLAMP false-negative table.
false_neg_grouped.head(20)

In [None]:
# Write the table to the CLAMP results directory; `filtered` prefixes the file name.
false_neg_grouped.to_csv(os.path.join(CLAMP_RESULTS_DIRECTORY, filtered + "clamp_false_negative.csv"))

# cTAKES results

In [None]:
# ctakes input and output
CTAKES_DIRECTORY = "ctakes"

# directories holding the input text files
INPUT_DIRECTORY_FULL_TEXT = "pubmed_all_fulltext_input"
INPUT_DIRECTORY_ABSTRACT = "pubmed_abstracts_24860"

# cTAKES output files, one directory per corpus
CTAKES_OUTPUT_DIRECTORY_FULL_TEXT = os.path.join(CTAKES_DIRECTORY, "ctakes_output_full_text") 
CTAKES_OUTPUT_DIRECTORY_ABSTRACT = os.path.join(CTAKES_DIRECTORY, "ctakes_output_abstract") 

# where compiled predictions and result tables are written
CTAKES_RESULTS_DIRECTORY_FULL_TEXT = os.path.join(CTAKES_DIRECTORY, "ctakes_results_full_text")
CTAKES_RESULTS_DIRECTORY_ABSTRACT = os.path.join(CTAKES_DIRECTORY, "ctakes_results_abstract")

# choose whether to use full-text or abstract - MODIFY HERE
ABSTRACT = True

if ABSTRACT:
    CTAKES_OUTPUT_DIRECTORY = CTAKES_OUTPUT_DIRECTORY_ABSTRACT
    CTAKES_RESULTS_DIRECTORY = CTAKES_RESULTS_DIRECTORY_ABSTRACT 
    INPUT_DIRECTORY = INPUT_DIRECTORY_ABSTRACT
else:
    CTAKES_OUTPUT_DIRECTORY = CTAKES_OUTPUT_DIRECTORY_FULL_TEXT
    # was misspelled CTAKE_RESULTS_DIRECTORY, which left CTAKES_RESULTS_DIRECTORY
    # unset (or stale from an earlier run) in full-text mode
    CTAKES_RESULTS_DIRECTORY = CTAKES_RESULTS_DIRECTORY_FULL_TEXT
    INPUT_DIRECTORY = INPUT_DIRECTORY_FULL_TEXT

In [None]:
# compile predicted entities for ctakes

ctakes_frames = []  # one frame per output file; concatenated once after the loop
i = 0
for filename in os.listdir(CTAKES_OUTPUT_DIRECTORY):
    if not filename.endswith(".csv"):
        continue

    # ignore empty files
    if is_file_empty(CTAKES_OUTPUT_DIRECTORY, filename):
        continue

    # in abstract mode the ".txt" part is stripped from the output file name
    if ABSTRACT:
        filename_updated = filename.replace(".txt", "")
    else:
        filename_updated = filename

    # progress indicator
    if i % 5 == 0:
        print(i, filename_updated)

    # name of the matching input text file
    input_filename = filename_updated.replace(".csv", ".txt")

    if input_filename == "PMC6061181.txt": # skip this paper
        continue

    with open(os.path.join(INPUT_DIRECTORY, input_filename)) as f:
        full_text = f.read()

    temp = pd.read_csv(os.path.join(CTAKES_OUTPUT_DIRECTORY, filename))
    temp["paper"] = filename.replace(".csv", ".txt")
    temp = temp.rename(columns={"cui":"CUI", "tui":"TUI", "pos_start":"Start", "pos_end":"End"})
    # entity string = slice of the input text between Start and End
    temp["Entity"] = temp.apply(lambda row: full_text[row['Start']:row['End']], axis=1)
    ctakes_frames.append(temp)

    i = i + 1

# A single concat avoids the quadratic cost of growing a DataFrame inside the loop
# (and DataFrame.append no longer exists in pandas 2.x).
pred_df_temp = pd.concat(ctakes_frames) if ctakes_frames else pd.DataFrame()

In [None]:
# Persist the compiled cTAKES predictions. Create the results directory first so
# the write does not fail when the directory does not exist yet.
os.makedirs(CTAKES_RESULTS_DIRECTORY, exist_ok=True)
pred_df_temp.to_csv(os.path.join(CTAKES_RESULTS_DIRECTORY, "ctakes_preds.csv"))

### Start here to load cTAKES results

In [None]:
# cTAKES input and output locations (repeated here so this section can be run on its own)
CTAKES_DIRECTORY = "ctakes"

# directories holding the input text files
INPUT_DIRECTORY_FULL_TEXT = "pubmed_all_fulltext_input"
INPUT_DIRECTORY_ABSTRACT = "pubmed_abstracts_24860"

# cTAKES output files, one directory per corpus
CTAKES_OUTPUT_DIRECTORY_FULL_TEXT = os.path.join(CTAKES_DIRECTORY, "ctakes_output_full_text")
CTAKES_OUTPUT_DIRECTORY_ABSTRACT = os.path.join(CTAKES_DIRECTORY, "ctakes_output_abstract")

# where compiled predictions and result tables are read from / written to
CTAKES_RESULTS_DIRECTORY_FULL_TEXT = os.path.join(CTAKES_DIRECTORY, "ctakes_results_full_text")
CTAKES_RESULTS_DIRECTORY_ABSTRACT = os.path.join(CTAKES_DIRECTORY, "ctakes_results_abstract")

# choose whether to use full-text or abstract - MODIFY HERE
ABSTRACT = True

# pick the (output, results, input) directory triple for the selected corpus
CTAKES_OUTPUT_DIRECTORY, CTAKES_RESULTS_DIRECTORY, INPUT_DIRECTORY = (
    (CTAKES_OUTPUT_DIRECTORY_ABSTRACT, CTAKES_RESULTS_DIRECTORY_ABSTRACT, INPUT_DIRECTORY_ABSTRACT)
    if ABSTRACT
    else (CTAKES_OUTPUT_DIRECTORY_FULL_TEXT, CTAKES_RESULTS_DIRECTORY_FULL_TEXT, INPUT_DIRECTORY_FULL_TEXT)
)

In [None]:
# Switch read by the cTAKES filtering cell below and by the "filtered_" prefix on
# the exported result file names.
FILTER = True

In [None]:
# Load the saved cTAKES predictions; the first CSV column is used as the index.
pred_df_temp = pd.read_csv(os.path.join(CTAKES_RESULTS_DIRECTORY, "ctakes_preds.csv"), index_col=0)

In [None]:
if FILTER:
    # filter predicted terms for relevant semtypes/TUIs
    relevant_tuis = ["T048", "T033"]
    relevant_textsem = ["DiseaseDisorderMention", "SignSymptomMention"]

    # keep rows with a relevant TUI, or whose entity text contains "ASD";
    # na=False makes rows with a missing Entity count as "no match" instead of
    # putting NaN into the boolean mask
    has_relevant_tui = pred_df_temp["TUI"].isin(relevant_tuis)
    mentions_asd = pred_df_temp["Entity"].str.contains("ASD", na=False)
    pred_temp_df_filtered = pred_df_temp[has_relevant_tui | mentions_asd]

    # then keep only the listed textsem mention types
    pred_temp_df_filtered = pred_temp_df_filtered[pred_temp_df_filtered["textsem"].isin(relevant_textsem)]

    pred_df_ctakes = filter_pred(pred_temp_df_filtered, remove_non_asd=True) # use ctakes preds filtered by semtype

else:
    pred_df_ctakes = filter_pred(pred_df_temp, remove_non_asd=False)

In [None]:
# Keep one row per (Start, End, paper) span in both predictions and labels.
pred_df_ctakes = pred_df_ctakes.drop_duplicates(subset=["Start", "End", "paper"])
true_df_full_text = true_df_full_text.drop_duplicates(subset=["Start", "End", "paper"])

In [None]:
ctakes_entities = pred_df_ctakes["Entity"]
# number of space-separated words in each predicted entity (computed once, used twice)
ctakes_word_counts = [len(str(ent).split(" ")) for ent in ctakes_entities]
print("cTAKES mean entity no. of words =", np.mean(ctakes_word_counts))
# this line reports the standard deviation, so label it as such
print("cTAKES std entity no. of words =", np.std(ctakes_word_counts))

In [None]:
if ABSTRACT:
    # Remove the first literal ".txt" from each paper name. regex=False keeps "."
    # from acting as a wildcard (the default on pandas < 2.0); n is passed by keyword.
    pred_df_ctakes["paper"] = pred_df_ctakes["paper"].str.replace(".txt", "", n=1, regex=False)

In [None]:
# Overwrite the Entity column with the values of the true_text column.
# NOTE(review): true_text presumably comes from the cTAKES output files — confirm.
pred_df_ctakes["Entity"] = pred_df_ctakes["true_text"]

In [None]:
print("cTAKES results:")
# match_cui defaults to False: predictions and labels are merged on "paper" only
# and matched by span overlap (see calculate_statistics).
ctakes_true_pos_df = calculate_statistics(pred_df_ctakes, true_df_full_text)

### Analyze true positives, false positives, and false negatives

In [None]:
# get_false_and_true_pos is defined elsewhere in this notebook; its five return
# values are unpacked here and rebind the names used for earlier results.
true_pos_grouped, false_pos_grouped, false_neg_grouped, false_pos, false_neg = get_false_and_true_pos(ctakes_true_pos_df, pred_df_ctakes, true_df_full_text)


In [None]:
# File-name prefix marking results that were produced with FILTER enabled.
filtered = "filtered_" if FILTER else ""

In [None]:
# Preview the first 20 rows of the cTAKES true-positive table.
true_pos_grouped.head(20)

In [None]:
# Write the table to the cTAKES results directory; `filtered` prefixes the file name.
true_pos_grouped.to_csv(os.path.join(CTAKES_RESULTS_DIRECTORY, filtered + "ctakes_true_positive.csv"))

In [None]:
# Preview the first 20 rows of the cTAKES false-positive table.
false_pos_grouped.head(20)

In [None]:
# Write the table to the cTAKES results directory; `filtered` prefixes the file name.
false_pos_grouped.to_csv(os.path.join(CTAKES_RESULTS_DIRECTORY, filtered + "ctakes_false_positive.csv"))

In [None]:
# Preview the first 20 rows of the cTAKES false-negative table.
false_neg_grouped.head(20)

In [None]:
# Write the table to the cTAKES results directory; `filtered` prefixes the file name.
false_neg_grouped.to_csv(os.path.join(CTAKES_RESULTS_DIRECTORY, filtered + "ctakes_false_negative.csv"))

# BioBERT results

In [None]:
# BioBERT input and output locations
BIOBERT_DIRECTORY = "biobert"

# directories holding the input text files
INPUT_DIRECTORY_FULL_TEXT = "pubmed_all_fulltext_input"
INPUT_DIRECTORY_ABSTRACT = "pubmed_abstracts_24860"

# BioBERT output files, one directory per corpus
BIOBERT_OUTPUT_DIRECTORY_FULL_TEXT = os.path.join(BIOBERT_DIRECTORY, "biobert_output_full_text")
BIOBERT_OUTPUT_DIRECTORY_ABSTRACT = os.path.join(BIOBERT_DIRECTORY, "biobert_output_abstract")

# where predictions, labels and result tables are read from / written to
BIOBERT_RESULTS_DIRECTORY_FULL_TEXT = os.path.join(BIOBERT_DIRECTORY, "biobert_results_full_text")
BIOBERT_RESULTS_DIRECTORY_ABSTRACT = os.path.join(BIOBERT_DIRECTORY, "biobert_results_abstract")

# choose whether to use full-text or abstract - MODIFY HERE
ABSTRACT = True

# pick the (output, results, input) directory triple for the selected corpus
BIOBERT_OUTPUT_DIRECTORY, BIOBERT_RESULTS_DIRECTORY, INPUT_DIRECTORY = (
    (BIOBERT_OUTPUT_DIRECTORY_ABSTRACT, BIOBERT_RESULTS_DIRECTORY_ABSTRACT, INPUT_DIRECTORY_ABSTRACT)
    if ABSTRACT
    else (BIOBERT_OUTPUT_DIRECTORY_FULL_TEXT, BIOBERT_RESULTS_DIRECTORY_FULL_TEXT, INPUT_DIRECTORY_FULL_TEXT)
)

In [None]:
# Load the saved BioBERT predictions and labels (first CSV column used as index).
# Note: this rebinds true_df_full_text, the label frame that the CLAMP and cTAKES
# sections also read.
pred_df_temp = pd.read_csv(os.path.join(BIOBERT_RESULTS_DIRECTORY, "biobert_preds.csv"), index_col=0)
true_df_full_text = pd.read_csv(os.path.join(BIOBERT_RESULTS_DIRECTORY, "biobert_labels.csv"), index_col=0)

In [None]:
# Display the prediction rows whose paper name contains "PMC6061181"
# (the paper that the cTAKES compile loop skips).
pred_df_temp[pred_df_temp["paper"].str.contains("PMC6061181")]

In [None]:
# Post-process the BioBERT predictions with filter_pred (defined elsewhere in this
# notebook). NOTE(review): confirm what remove_non_asd=False and cui=False switch off.
pred_df_biobert = filter_pred(pred_df_temp, remove_non_asd=False, cui=False)

In [None]:
# Keep one row per (Start, End, paper) span in both predictions and labels.
pred_df_biobert = pred_df_biobert.drop_duplicates(subset=["Start", "End", "paper"])
true_df_full_text = true_df_full_text.drop_duplicates(subset=["Start", "End", "paper"])

In [None]:
biobert_entities = pred_df_biobert["Entity"]
# number of space-separated words in each predicted entity (computed once, used twice)
biobert_word_counts = [len(str(ent).split(" ")) for ent in biobert_entities]
print("BIOBERT mean entity no. of words =", np.mean(biobert_word_counts))
# this line reports the standard deviation, so label it as such
print("BIOBERT std entity no. of words =", np.std(biobert_word_counts))

In [None]:
print("BIOBERT results:")
# match_cui defaults to False: predictions and labels are merged on "paper" only
# and matched by span overlap (see calculate_statistics).
biobert_true_pos_df = calculate_statistics(pred_df_biobert, true_df_full_text)

### Analyze true positives, false positives, and false negatives

In [None]:
# get_false_and_true_pos is defined elsewhere in this notebook; called here with
# cui=False. Its five return values rebind the names used for earlier results.
true_pos_grouped, false_pos_grouped, false_neg_grouped, false_pos, false_neg = get_false_and_true_pos(biobert_true_pos_df, pred_df_biobert, true_df_full_text, cui=False)


In [None]:
# Preview the first 20 rows of the BioBERT true-positive table.
true_pos_grouped.head(20)

In [None]:
# Write the table to the BioBERT results directory.
true_pos_grouped.to_csv(os.path.join(BIOBERT_RESULTS_DIRECTORY, "biobert_true_positive.csv"))

In [None]:
# Preview the first 20 rows of the BioBERT false-positive table.
false_pos_grouped.head(20)

In [None]:
# Write the table to the BioBERT results directory.
false_pos_grouped.to_csv(os.path.join(BIOBERT_RESULTS_DIRECTORY, "biobert_false_positive.csv"))

In [None]:
# Preview the first 20 rows of the BioBERT false-negative table.
false_neg_grouped.head(20)

In [None]:
# Write the table to the BioBERT results directory.
false_neg_grouped.to_csv(os.path.join(BIOBERT_RESULTS_DIRECTORY, "biobert_false_negative.csv"))