In [1]:
import csv
import os
from io import StringIO
import pandas as pd
import spacy
from spacy.matcher import PhraseMatcher
from unidecode import unidecode

# spaCy English pipeline: supplies the vocab for the PhraseMatcher and the
# tokenization / sentence boundaries used when texts are labelled below
nlp = spacy.load("en_core_web_sm")

In [2]:
# ---------------------------------------------------------------------------
# Configuration (edit these values to change what the notebook processes)
# ---------------------------------------------------------------------------

# True  -> run this program on abstracts
# False -> run this program on full-texts
ABSTRACT = True
METAMAP_DIRECTORY = "metamap"

# plain-text inputs (full-texts / abstracts)
INPUT_DIRECTORY_FULL_TEXT = "pubmed_fulltexts_544"
INPUT_DIRECTORY_ABSTRACT = "pubmed_abstracts_20408"

# raw MetaMap output/predictions
METAMAP_OUTPUT_DIRECTORY_FULL_TEXT = os.path.join(METAMAP_DIRECTORY, "metamap_output_fulltexts")
METAMAP_OUTPUT_DIRECTORY_ABSTRACT = os.path.join(METAMAP_DIRECTORY, "metamap_output_abstracts")

# formatted MetaMap output/predictions
METAMAP_RESULTS_DIRECTORY_FULL_TEXT = os.path.join(METAMAP_DIRECTORY, "metamap_results_full_text")
METAMAP_RESULTS_DIRECTORY_ABSTRACT = os.path.join(METAMAP_DIRECTORY, "metamap_results_abstract")

# pick the directories that belong to the selected mode
METAMAP_OUTPUT_DIRECTORY = (
    METAMAP_OUTPUT_DIRECTORY_ABSTRACT if ABSTRACT else METAMAP_OUTPUT_DIRECTORY_FULL_TEXT
)
METAMAP_RESULTS_DIRECTORY = (
    METAMAP_RESULTS_DIRECTORY_ABSTRACT if ABSTRACT else METAMAP_RESULTS_DIRECTORY_FULL_TEXT
)
INPUT_DIRECTORY = INPUT_DIRECTORY_ABSTRACT if ABSTRACT else INPUT_DIRECTORY_FULL_TEXT

# combined text directory since MetaMap splits up text if too long
METAMAP_FULL_TEXT_DIRECTORY = os.path.join(
    METAMAP_DIRECTORY, "metamap_abstract" if ABSTRACT else "metamap_full_text"
)

if not ABSTRACT:
    # tables in full-texts that couldn't be processed by MetaMap (only defined for full-text runs)
    METAMAP_TABLES_DIR = os.path.join(METAMAP_DIRECTORY, "metamap_tables")

In [3]:
# helper functions

def extract_sentence(sentences, row):
    """Return the text of the sentence that contains an entity's start offset.

    Parameters
    ----------
    sentences : iterable
        Sentence-like objects exposing ``start_char``, ``end_char`` and ``text``.
    row : mapping
        Record whose ``'Start'`` entry is the entity's starting character offset.

    Returns
    -------
    str
        Text of the first sentence with ``start_char <= Start < end_char``,
        or ``""`` when no sentence covers that offset.
    """
    offset = row['Start']
    covering = (
        sent.text
        for sent in sentences
        if sent.start_char <= offset < sent.end_char
    )
    return next(covering, "")

def is_file_empty(DIRECTORY, filename):
    """Tell whether a file holds nothing but whitespace (or nothing at all).

    Parameters
    ----------
    DIRECTORY : str
        Directory that contains the file.
    filename : str
        Name of the file inside ``DIRECTORY``.

    Returns
    -------
    bool
        ``True`` when the file content is empty or whitespace-only.
    """
    path = os.path.join(DIRECTORY, filename)
    with open(path) as handle:
        contents = handle.read()

    # stripping whitespace leaves an empty string exactly when there is no real content
    return not contents.strip()

In [4]:
# Load the BM ASD terms and build the BM set; every term is stripped and lower-cased
BM_df = pd.read_csv("BM_terms.csv").assign(
    TEXT=lambda frame: frame["TEXT"].str.strip().str.lower()
)
autism_terms = set(BM_df["TEXT"])
print(f"There are {len(autism_terms)} autism terms")

There are 827 autism terms


In [5]:
# Build the spaCy PhraseMatcher used for labelling BM terms.
# attr="LOWER" makes the matcher compare lower-cased token text.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
# make_doc only tokenizes, which is all a phrase pattern needs
patterns = list(map(nlp.make_doc, autism_terms))
# NOTE(review): the positional None is the on_match slot of the spaCy v2 add() signature;
# spaCy v3 expects matcher.add(key, patterns) -- confirm against the installed version.
matcher.add("AutismTerms", None, *patterns)

In [6]:
# Arrange files so the parts of each paper are processed in order
# (MetaMap splits up text if too long).

def _ordered_key(name, numeric_fields):
    """Sort key: first "_"-separated piece as text, then the next ``numeric_fields`` pieces as ints."""
    pieces = name.split("_")
    return (pieces[0], *[int(pieces[pos]) for pos in range(1, numeric_fields + 1)])


metamap_files = [name for name in os.listdir(METAMAP_OUTPUT_DIRECTORY) if ".txt" in name]
if ABSTRACT:
    # abstract parts carry one numeric field after the paper name
    metamap_files = sorted(metamap_files, key=lambda name: _ordered_key(name, 1))
else:
    # full-text parts carry two numeric fields after the paper name
    metamap_files = sorted(metamap_files, key=lambda name: _ordered_key(name, 2))
    metamap_tables = sorted(
        (name for name in os.listdir(METAMAP_TABLES_DIR) if ".txt" in name),
        key=lambda name: _ordered_key(name, 1),
    )

# names of the input texts with the ".txt" marker removed
input_files = [
    name.replace(".txt", "")
    for name in os.listdir(INPUT_DIRECTORY)
    if ".txt" in name
]


In [7]:
# Format MetaMap output/predictions in csv format where one row is one NER prediction
# (metamap_preds.csv), and label the BM terms of every paper with the spaCy
# PhraseMatcher (metamap_labels.csv).

# create the output locations up front so the writes below cannot fail on a missing directory
os.makedirs(METAMAP_RESULTS_DIRECTORY, exist_ok=True)
os.makedirs(METAMAP_FULL_TEXT_DIRECTORY, exist_ok=True)

# output files -- intentionally left open here: later cells still write to and then close them.
# newline="" is what the csv module asks for so the writer's own line endings are not translated.
labels_file = open(os.path.join(METAMAP_RESULTS_DIRECTORY, "metamap_labels.csv"), "w", newline="")
preds_file = open(os.path.join(METAMAP_RESULTS_DIRECTORY, "metamap_preds.csv"), "w", newline="")

labels_csv_writer = csv.writer(labels_file, delimiter=',', quoting=csv.QUOTE_MINIMAL)
labels_csv_writer.writerow(["Entity", "Entity_lower", "paper", "Start", "End", "Sentence"])

preds_csv_writer = csv.writer(preds_file, delimiter=',', quoting=csv.QUOTE_MINIMAL)
metamap_columns = ["id", "MappingScore", "CandidateCUI", "CandidateMatched", "SemType", "StartPos", "Length", "Negated", "CandidateScore", "MatchedWords"]
metamap_columns_formatted = ['MappingScore', 'CUI', 'Entity', 'SemType', 'Start', 'Negated',
       'CandidateScore', 'MatchedWords', 'MatchedPhrase', 'paper',
       'paper_part', 'End', 'Entity_matched', "Sentence_pred"]
preds_csv_writer.writerow(metamap_columns_formatted)


def _label_paper(paper, text):
    """Save the combined text of ``paper`` and write its BM-term matches to the labels CSV.

    Parameters
    ----------
    paper : str
        Paper identifier (first "_"-separated piece of the MetaMap filename);
        also used as the file name of the saved combined text.
    text : str
        Concatenated utterance text of all MetaMap parts of the paper.

    Returns
    -------
    str
        The unidecode-transliterated text that was saved and labelled.
    """
    text = unidecode(text)

    with open(os.path.join(METAMAP_FULL_TEXT_DIRECTORY, paper), "w") as f:
        f.write(text)

    # label BM terms with spaCy
    labelled_doc = nlp(text)
    spans = [labelled_doc[start:end] for _match_id, start, end in matcher(labelled_doc)]

    for span in spacy.util.filter_spans(spans):  # use longest match
        labels_csv_writer.writerow(
            [span.text, span.text.lower().strip(), paper, span.start_char, span.end_char, span.sent.text]
        )

    return text


# metamap formatting
# tab-separated column line that separates the PMID/utterance section from the predictions
header = "\t".join(metamap_columns) + "\n"
papers_analyzed = []
seen_papers = set()  # same content as papers_analyzed, for constant-time membership tests
full_text = ""
empty_metamap_output = []
progress_every = 1000 if ABSTRACT else 5  # how often (in papers) progress is printed

for filename in metamap_files:
    paper = filename.split("_")[0]

    # new paper
    if paper not in seen_papers:

        # the previous paper is complete: save its text and label its BM terms
        if papers_analyzed:
            _label_paper(papers_analyzed[-1], full_text)

        idx = len(papers_analyzed)
        if idx % progress_every == 0:
            print(idx, filename)

        full_text = ""
        papers_analyzed.append(paper)
        seen_papers.add(paper)

    if is_file_empty(METAMAP_OUTPUT_DIRECTORY, filename): # ignore empty file
        empty_metamap_output.append(filename)
        continue

    with open(os.path.join(METAMAP_OUTPUT_DIRECTORY, filename), "r") as f:
        data = f.read()

    if header not in data:
        print(filename, "has no header")

    splits = data.split(header)

    # this part contains the pmid and utterances
    info = splits[0].split("\n")
    pmid = ""
    utterance = False
    # offset of this part inside the paper's combined text; MetaMap positions are shifted by it
    start_idx = len(full_text)

    for line in info:
        if "PMID: " in line:
            pmid_found = line.replace("PMID: ", "")
            pmid_found = pmid_found.split("_")[0]

            # check if pmid matches paper
            if pmid_found != paper:
                raise Exception("PMID doesn't match paper:", line)
            else:
                pmid = pmid_found

        # the line that follows an "UttText:" line is appended to the combined text
        if utterance:
            full_text += line

        utterance = "UttText:" in line

    full_text += " "

    # no terms detected (nothing follows the header, or the header is missing)
    if len(splits) < 2:
        continue

    # sentence boundaries over the text gathered so far for this paper
    doc = nlp(full_text)

    temp = pd.read_csv(StringIO(splits[1]), sep="\t", header=None)
    temp.columns = metamap_columns
    temp["MatchedPhrase"] = temp["MatchedWords"].apply(lambda x: " ".join(str(x).split(",")))
    temp["paper"] = paper
    temp["paper_part"] = filename
    temp = temp.rename(columns={"StartPos": "Start", "CandidateCUI":"CUI"})
    temp["Start"] = temp["Start"] + start_idx
    temp["End"] = temp["Start"] + temp["Length"]
    temp["Entity_matched"] = temp.apply(lambda row: full_text[row['Start']:row['End']], axis=1)
    temp = temp.rename(columns={'CandidateMatched':'Entity'})
    temp = temp.drop(["id", "Length"], axis=1)
    temp["Sentence_pred"] = temp.apply(lambda row: extract_sentence(doc.sents, row), axis=1)

    # columns are now in the same order as metamap_columns_formatted
    preds_csv_writer.writerows(temp.itertuples(index=False, name=None))


# label the last paper (inside the loop a paper is only labelled once the next one starts);
# skipped when there was no MetaMap output at all
if papers_analyzed:
    full_text = _label_paper(papers_analyzed[-1], full_text)


0 10026453.txt_0
1000 12352267.txt_0
2000 15590241.txt_0
3000 16835068.txt_0
4000 17825125.txt_0
5000 18998843.txt_0
6000 2004485.txt_0
7000 21610188.txt_0
8000 22987894.txt_0
9000 24384067.txt_0
10000 25621974.txt_0
11000 26273832.txt_0
12000 26856821.txt_0
13000 27483248.txt_0
14000 28189493.txt_0
15000 28707805.txt_0
16000 29266810.txt_0
17000 29885454.txt_0
18000 30581125.txt_0
19000 3666327.txt_0
20000 8894948.txt_0


In [8]:
# Full-text runs only: label BM terms in the table texts read from METAMAP_TABLES_DIR
if not ABSTRACT:
    for filename in metamap_tables:
        print(filename)
        paper = filename.split("_")[0]

        table_path = os.path.join(METAMAP_TABLES_DIR, filename)
        with open(table_path) as table_file:
            full_text = unidecode(table_file.read())

        doc = nlp(full_text)
        spans = [doc[start:end] for _match_id, start, end in matcher(doc)]

        # use longest match
        for span in spacy.util.filter_spans(spans):
            labels_csv_writer.writerow([
                span.text,
                span.text.lower().strip(),
                paper,
                span.start_char,
                span.end_char,
                span.sent.text,
            ])

In [9]:
# all rows are written: close both CSV outputs (labels first, then predictions)
for _handle in (labels_file, preds_file):
    _handle.close()

In [10]:
# report how many distinct papers appeared in the MetaMap output
n_papers_analyzed = len(papers_analyzed)
print(n_papers_analyzed, "papers analyzed by MetaMap")

20408 papers analyzed by MetaMap


In [11]:
# MetaMap output files that were skipped because they were empty or whitespace-only
empty_metamap_output

[]