This notebook formats the CLAMP output (i.e. predictions) for the PubMed abstracts and full-texts.

In [1]:
import csv
import os
import pandas as pd
import spacy

nlp = spacy.load("en_core_web_sm")

In [2]:
# configurations that can be modified
ABSTRACT = False # True if running this program of abstracts, False if running on full-texts
CLAMP_DIRECTORY = "clamp" # parent directory for CLAMP-related files

# location of full-texts/abstracts in plain text
INPUT_DIRECTORY_FULL_TEXT = "pubmed_fulltexts_544"
INPUT_DIRECTORY_ABSTRACT = "pubmed_abstracts_20408"

# CLAMP output/predictions
CLAMP_OUTPUT_DIRECTORY_FULL_TEXT = os.path.join(CLAMP_DIRECTORY, "clamp_output_full_text") 
CLAMP_OUTPUT_DIRECTORY_ABSTRACT = os.path.join(CLAMP_DIRECTORY, "clamp_output_abstract") 

# formatted CLAMP output/predictions
CLAMP_RESULTS_DIRECTORY_FULL_TEXT = os.path.join(CLAMP_DIRECTORY, "clamp_results_full_text")
CLAMP_RESULTS_DIRECTORY_ABSTRACT = os.path.join(CLAMP_DIRECTORY, "clamp_results_abstract")

if ABSTRACT:
    CLAMP_OUTPUT_DIRECTORY = CLAMP_OUTPUT_DIRECTORY_ABSTRACT
    CLAMP_RESULTS_DIRECTORY = CLAMP_RESULTS_DIRECTORY_ABSTRACT 
    INPUT_DIRECTORY = INPUT_DIRECTORY_ABSTRACT
else:
    CLAMP_OUTPUT_DIRECTORY = CLAMP_OUTPUT_DIRECTORY_FULL_TEXT
    CLAMP_RESULTS_DIRECTORY = CLAMP_RESULTS_DIRECTORY_FULL_TEXT
    INPUT_DIRECTORY = INPUT_DIRECTORY_FULL_TEXT

In [3]:
# helper functions
def extract_entity(full_text, row):
    ent = full_text[row['Start']:row['End']]
    return ent

def extract_sentence(sentences, row):
    ent_start = row['Start']
    for s in sentences:
        if s.start_char <= ent_start and ent_start < s.end_char:
            return s.text
    return ""

def is_file_empty(DIRECTORY, filename):
    with open(os.path.join(DIRECTORY, filename)) as f:
        data = f.read()
        
    return data.isspace() or data == ""

In [4]:
# format CLAMP output/predictions in csv format where one row is one NER prediction
empty_input_files = []
with open(os.path.join(CLAMP_RESULTS_DIRECTORY, "clamp_preds.csv"), "w") as csv_file:
    csv_writer = csv.writer(csv_file, delimiter=',', quoting=csv.QUOTE_MINIMAL)
    header = ['Start', 'End', 'Semantic', 'CUI', 'Assertion', 'Entity', 'paper',
       'Entity_matched', 'Sentence_pred']
    csv_writer.writerow(header)
    
    clamp_files = [filename for filename in os.listdir(CLAMP_OUTPUT_DIRECTORY) if filename.endswith(".txt")]
    for idx, filename in enumerate(clamp_files):

        if ABSTRACT and idx % 1000 == 0:
            print(idx, filename)
        elif not ABSTRACT and idx % 5 == 0:
            print(idx, filename)
            
        if is_file_empty(INPUT_DIRECTORY, filename):
            empty_input_files.append(filename)
            continue
            
        # ignore empty files
        if is_file_empty(CLAMP_OUTPUT_DIRECTORY, filename):
            continue

        with open(os.path.join(INPUT_DIRECTORY, filename)) as f:
            full_text = f.read()
            doc = nlp(full_text)

        df = pd.read_csv(os.path.join(CLAMP_OUTPUT_DIRECTORY, filename), sep="\t", quoting=3)
        df["paper"] = filename
        df["Entity_matched"] = list(df.apply(lambda row: extract_entity(full_text, row), axis=1))
        df["Sentence_pred"] = df.apply(lambda row: extract_sentence(doc.sents, row), axis=1)

        for i, row in df.iterrows():
            csv_writer.writerow(list(row))

0 PMC5419910.txt
5 PMC5331586.txt
10 PMC4359103.txt
15 PMC6479357.txt
20 PMC6096788.txt
25 PMC6450841.txt
30 PMC5948870.txt
35 PMC6706698.txt
40 PMC5789214.txt
45 PMC3641085.txt
50 PMC5432605.txt
55 PMC4946778.txt
60 PMC5352792.txt
65 PMC6696098.txt
70 PMC5627471.txt
75 PMC6361977.txt
80 PMC4619728.txt
85 PMC6414208.txt
90 PMC5609014.txt
95 PMC5481972.txt
100 PMC6218441.txt
105 PMC5775320.txt
110 PMC6282609.txt
115 PMC6133091.txt
120 PMC5360849.txt
125 PMC6199253.txt
130 PMC5889781.txt
135 PMC6521002.txt
140 PMC6136574.txt
145 PMC5416705.txt
150 PMC6025870.txt
155 PMC4534101.txt
160 PMC3827355.txt
165 PMC2903486.txt
170 PMC6450836.txt
175 PMC6417160.txt
180 PMC5681818.txt
185 PMC6061016.txt
190 PMC5680520.txt
195 PMC6045598.txt
200 PMC2663032.txt
205 PMC5304941.txt
210 PMC6628259.txt
215 PMC3501481.txt
220 PMC5192959.txt
225 PMC6373295.txt
230 PMC2731203.txt
235 PMC6389139.txt
240 PMC2582449.txt
245 PMC6546658.txt
250 PMC5700871.txt
255 PMC6082980.txt
260 PMC4638355.txt
265 PMC6466354.

In [5]:
print("Number of empty input files", len(empty_input_files))

Number of empty input files 0


In [6]:
empty_input_files

[]