This notebook labels the PubMed full-texts and abstracts using the benchmark (BM) ASD vocabulary set. The labels/matches are exported as a .csv file. These labels are considered the "true" labels for the named-entity recognition (NER) task.

In [1]:
import csv
import os
import pandas as pd
import spacy
from spacy.matcher import PhraseMatcher
from unidecode import unidecode

# spaCy pipeline shared by the whole notebook: it tokenises the BM terms and processes each paper
nlp = spacy.load("en_core_web_sm")

In [2]:
# configurations that can be modified
ABSTRACT = True # True if running this program on abstracts, False if running on full-texts
OUTPUT_DIR = "BM_labelled" # folder where the results (csv file of labels) will be stored

In [3]:
# folder holding the .txt files to label: the abstracts corpus or the full-texts corpus
INPUT_DIRECTORY = "pubmed_abstracts_20408" if ABSTRACT else "pubmed_fulltexts_544"

In [4]:
# read in BM ASD terms and create BM set (all lowercase, surrounding whitespace removed)
BM_df = pd.read_csv("BM_terms.csv")
BM_df["TEXT"] = BM_df["TEXT"].str.strip().str.lower()
# Drop missing values before building the set: an empty TEXT cell is read as NaN, which
# would otherwise be counted as a "term" and is not a string that can be turned into a
# matcher pattern.
autism_terms = set(BM_df["TEXT"].dropna())
print(f"There are {len(autism_terms)} autism terms")

There are 827 autism terms


In [5]:
# Build the spaCy PhraseMatcher used to label BM terms.
# attr="LOWER" makes the matcher compare the lowercase form of each token.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
# one tokenised Doc per BM term; make_doc only runs the tokenizer
patterns = list(map(nlp.make_doc, autism_terms))
matcher.add("AutismTerms", None, *patterns)

In [6]:
# path of the csv file the labels are exported to; the file name depends on the corpus
csv_path = os.path.join(
    OUTPUT_DIR,
    "abstract_labels.csv" if ABSTRACT else "full_text_labels.csv",
)

In [7]:
# label BM terms and write the results to the csv file where one row is a label/match

# create the output folder if needed so that opening the csv does not fail on a fresh checkout
output_folder = os.path.dirname(csv_path)
if output_folder:
    os.makedirs(output_folder, exist_ok=True)

# progress is printed every 1000 files for abstracts and every 5 files for full-texts
progress_every = 1000 if ABSTRACT else 5

# newline="" is required by the csv module: without it, rows get an extra blank line on
# Windows and newlines embedded in the quoted "Sentence" field can be mangled.
# The encoding is explicit so the output does not depend on the platform's locale.
with open(csv_path, "w", newline="", encoding="utf-8") as csv_file:
    csv_writer = csv.writer(csv_file, delimiter=',', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerow(["Entity", "Entity_lower", "paper", "Start", "End", "Sentence"]) # header
    for idx, filename in enumerate(os.listdir(INPUT_DIRECTORY)):

        # only the .txt files are papers; anything else in the folder is skipped
        if not filename.endswith(".txt"):
            continue

        if idx % progress_every == 0:
            print(idx, filename)

        # read the paper (abstract or full-text)
        # NOTE(review): assumes the input .txt files are UTF-8 encoded -- confirm
        path = os.path.join(INPUT_DIRECTORY, filename)
        with open(path, "r", encoding="utf-8") as f:
            data = f.read()

        # The text is processed exactly as read (no ASCII transliteration), so the
        # Start/End columns are character offsets into the file contents.
        doc = nlp(data)
        matches = matcher(doc)
        spans = [doc[start:end] for _match_id, start, end in matches]

        # when matches overlap, keep only the longest BM term match
        filtered = spacy.util.filter_spans(spans)

        for span in filtered:
            row = [
                span.text,                  # Entity: matched text as it appears in the paper
                span.text.lower().strip(),  # Entity_lower: normalised like the BM terms
                filename,                   # paper
                span.start_char,            # Start
                span.end_char,              # End
                span.sent.text,             # Sentence containing the match
            ]
            csv_writer.writerow(row)

0 28770039.txt
1000 24761747.txt
2000 29873809.txt
3000 18760197.txt
4000 17517680.txt
5000 28856484.txt
6000 18563708.txt
7000 16389586.txt
8000 29927797.txt
9000 18702558.txt
10000 23535821.txt
11000 25882392.txt
12000 8050988.txt
13000 29664902.txt
14000 7169196.txt
15000 25939529.txt
16000 19589455.txt
17000 24604922.txt
18000 24167375.txt
19000 23159942.txt
20000 17896119.txt


In [8]:
# end-of-notebook marker, printed when this final cell runs
print("DONE")

DONE
