This notebook formats the cTAKES output (i.e. predictions) for the PubMed abstracts and full-texts.

In [1]:
import csv
import os
import pandas as pd
import spacy

# Load the spaCy pipeline named "en_core_web_sm". Further down, `nlp` is applied to each
# document's plain text and the resulting `doc.sents` are used to look up the sentence
# that contains each cTAKES prediction.
nlp = spacy.load("en_core_web_sm")

In [2]:
# ---- user-tunable settings ----
# True  -> process the abstracts
# False -> process the full-texts
ABSTRACT = False
# parent directory for every cTAKES-related file
CTAKES_DIRECTORY = "ctakes"

# plain-text inputs (full-texts / abstracts)
INPUT_DIRECTORY_FULL_TEXT = "pubmed_fulltexts_544"
INPUT_DIRECTORY_ABSTRACT = "pubmed_abstracts_20408"

# raw cTAKES output/predictions
CTAKES_OUTPUT_DIRECTORY_FULL_TEXT = os.path.join(CTAKES_DIRECTORY, "ctakes_output_full_text")
CTAKES_OUTPUT_DIRECTORY_ABSTRACT = os.path.join(CTAKES_DIRECTORY, "ctakes_output_abstract")

# formatted cTAKES output/predictions
CTAKES_RESULTS_DIRECTORY_FULL_TEXT = os.path.join(CTAKES_DIRECTORY, "ctakes_results_full_text")
CTAKES_RESULTS_DIRECTORY_ABSTRACT = os.path.join(CTAKES_DIRECTORY, "ctakes_results_abstract")

# ---- derived settings: pick the directory triple that matches ABSTRACT ----
CTAKES_OUTPUT_DIRECTORY, CTAKES_RESULTS_DIRECTORY, INPUT_DIRECTORY = (
    (
        CTAKES_OUTPUT_DIRECTORY_ABSTRACT,
        CTAKES_RESULTS_DIRECTORY_ABSTRACT,
        INPUT_DIRECTORY_ABSTRACT,
    )
    if ABSTRACT
    else (
        CTAKES_OUTPUT_DIRECTORY_FULL_TEXT,
        CTAKES_RESULTS_DIRECTORY_FULL_TEXT,
        INPUT_DIRECTORY_FULL_TEXT,
    )
)

In [3]:
# helper functions
def extract_sentence(sentences, row):
    """Return the text of the first sentence whose character span contains the entity start.

    Args:
        sentences: iterable of objects exposing ``start_char``, ``end_char`` and ``text``.
        row: mapping-like object; ``row['Start']`` is the entity's start offset.

    Returns:
        The ``text`` of the first sentence with ``start_char <= Start < end_char``,
        or the empty string when no sentence qualifies.
    """
    offset = row['Start']
    matching_texts = (
        sent.text
        for sent in sentences
        if sent.start_char <= offset < sent.end_char
    )
    # next() stops at the first hit; "" is the fallback when nothing matched
    return next(matching_texts, "")

def is_file_empty(DIRECTORY, filename):
    """Tell whether a text file has no content other than whitespace.

    Args:
        DIRECTORY: directory that holds the file.
        filename: name of the file inside ``DIRECTORY``.

    Returns:
        True when the file's text is the empty string or consists solely of
        whitespace characters, False otherwise.
    """
    path = os.path.join(DIRECTORY, filename)
    with open(path) as handle:
        contents = handle.read()

    # stripping whitespace leaves "" exactly when the text was empty or all-whitespace
    return not contents.strip()

In [4]:
# Format the cTAKES output/predictions as one CSV file in which one row is one NER prediction.
# For every cTAKES CSV in CTAKES_OUTPUT_DIRECTORY, the matching plain-text input is read, each
# prediction is annotated with the matched text span and the sentence containing its start
# offset, and the rows are appended to CTAKES_RESULTS_DIRECTORY/ctakes_preds.csv.

# input .txt files that were empty (or whitespace-only) and were therefore skipped
empty_input_files = []

# Make sure the results directory exists so opening the output file cannot fail on a fresh run.
os.makedirs(CTAKES_RESULTS_DIRECTORY, exist_ok=True)

# newline="" is required by the csv module for files passed to csv.writer; without it line
# endings (including newlines embedded in quoted fields) may be translated on some platforms.
with open(os.path.join(CTAKES_RESULTS_DIRECTORY, "ctakes_preds.csv"), "w", newline="") as csv_file:
    csv_writer = csv.writer(csv_file, delimiter=',', quoting=csv.QUOTE_MINIMAL)
    # NOTE(review): rows are written in each DataFrame's own column order, so this header
    # presumably mirrors the column order of the cTAKES CSVs after renaming — confirm.
    header = ['conditional', 'confidence', 'CUI', 'generic', 'id', 'negated', 'End',
       'Start', 'preferred_text', 'refsem', 'scheme', 'score', 'subject',
       'textsem', 'TUI', 'uncertainty', 'true_text', 'part_of_speech', 'paper',
       'Entity_matched', 'Sentence_pred']
    csv_writer.writerow(header)

    ctakes_files = [filename for filename in os.listdir(CTAKES_OUTPUT_DIRECTORY) if filename.endswith(".csv")]

    # progress is reported every 1000 files for abstracts and every 5 files for full-texts
    progress_every = 1000 if ABSTRACT else 5

    for idx, filename in enumerate(ctakes_files):

        if idx % progress_every == 0:
            print(idx, filename)

        # swap only the extension; str.replace would also rewrite ".csv" elsewhere in the name
        input_filename = os.path.splitext(filename)[0] + ".txt"

        # ignore empty input files, but remember them for inspection afterwards
        if is_file_empty(INPUT_DIRECTORY, input_filename):
            empty_input_files.append(input_filename)
            continue

        # ignore empty cTAKES outputs
        if is_file_empty(CTAKES_OUTPUT_DIRECTORY, filename):
            continue

        df = pd.read_csv(os.path.join(CTAKES_OUTPUT_DIRECTORY, filename))
        # A CSV with a header but no predictions yields an empty frame. Row-wise apply on an
        # empty frame does not return a Series, so the column assignments below can fail;
        # there is nothing to write anyway, so skip it before running the NLP pipeline.
        if df.empty:
            continue

        with open(os.path.join(INPUT_DIRECTORY, input_filename)) as f:
            plain_text = f.read()
        doc = nlp(plain_text)
        # materialise the sentences once per document instead of re-segmenting for every row
        sentences = list(doc.sents)

        df["paper"] = input_filename
        df = df.rename(columns={"cui":"CUI", "tui":"TUI", "pos_start":"Start", "pos_end":"End"})
        # text covered by the prediction's [Start, End) character offsets
        df["Entity_matched"] = df.apply(lambda row: plain_text[row['Start']:row['End']], axis=1)
        # sentence that contains the prediction's start offset ("" if none does)
        df["Sentence_pred"] = df.apply(lambda row: extract_sentence(sentences, row), axis=1)

        # plain tuples in column order; avoids building a Series per row as iterrows does
        csv_writer.writerows(df.itertuples(index=False, name=None))

0 PMC5681971.csv
5 PMC5751211.csv
10 PMC4437371.csv
15 PMC6509633.csv
20 PMC5894192.csv
25 PMC5414938.csv
30 PMC5570931.csv
35 PMC5516334.csv
40 PMC3134991.csv
45 PMC6099375.csv
50 PMC6394789.csv
55 PMC6445349.csv
60 PMC6263710.csv
65 PMC5967343.csv
70 PMC5879814.csv
75 PMC4938426.csv
80 PMC5800535.csv
85 PMC5577136.csv
90 PMC4973977.csv
95 PMC3183050.csv
100 PMC3150222.csv
105 PMC6373074.csv
110 PMC6370186.csv
115 PMC5264460.csv
120 PMC6153902.csv
125 PMC6308558.csv
130 PMC5443871.csv
135 PMC6245048.csv
140 PMC6022232.csv
145 PMC6341103.csv
150 PMC6050263.csv
155 PMC5360852.csv
160 PMC5225659.csv
165 PMC6606653.csv
170 PMC6223803.csv
175 PMC5947578.csv
180 PMC5697632.csv
185 PMC6017261.csv
190 PMC6539237.csv
195 PMC5526322.csv
200 PMC6550041.csv
205 PMC5678184.csv
210 PMC6181799.csv
215 PMC5457440.csv
220 PMC5832686.csv
225 PMC6258483.csv
230 PMC6651320.csv
235 PMC3411605.csv
240 PMC5241332.csv
245 PMC6085908.csv
250 PMC5623420.csv
255 PMC5317002.csv
260 PMC5455122.csv
265 PMC6813284.

In [5]:
print(len(empty_input_files))

0


In [6]:
empty_input_files

[]