# SciBERT: Examining the Filtering Script.

This notebook examines the `filter_hypernyms.py` script used in the SciBERT preprocessing step.

## 1. Imports

In [1]:
## Imports from the original py file
import argparse
import codecs
from collections import defaultdict

## 2. Arg Parse
- We first load the arguments needed for the script.
- This basically just specifies where to load and save the text files.

In [41]:
## Build the same CLI parser as the script, then hand it a fixed argument list
## (there is no real command line inside a notebook).
parser = argparse.ArgumentParser()
parser.add_argument(
    '-i', '--input_file',
    help='input file in 13col tsv',
    required=True,
)
parser.add_argument(
    '-m', '--mesh_file',
    help='mesh file to get hierarchy from',
    required=True,
)
parser.add_argument(
    '-o', '--output_file',
    help='write results to this file',
    required=True,
)

args = parser.parse_args([
    '--input_file', './test_nolabel_og.txt',
    '--mesh_file', './2017MeshTree.txt',
    '--output_file', './test_output.txt',
])

In [8]:
## Quickly checking the args, looks okay.
## NOTE(review): the recorded output below shows different paths than the ones passed to
## parse_args in the parser cell (and an earlier execution count) - re-run to refresh it.
args.input_file, args.mesh_file, args.output_file

('./test_nolabel_og.data', './2017MeshTree.txt', './test_output')

In [11]:
## This is a helper function used in the script.  Nothing special here
def chunks(l, n):
    """
    Lazily split ``l`` into consecutive slices of exactly ``n`` items.

    Slices are produced in order, starting at index 0. An ``AssertionError``
    is raised when the generator reaches a slice shorter than ``n``, i.e. when
    ``len(l)`` is not a multiple of ``n``; slices before it are still yielded.
    """
    total = len(l)
    for start in range(0, total, n):
        piece = l[start:start + n]
        assert len(piece) == n
        yield piece

## 3. Step 1: Loading MeSH data
- The first step involves loading in the MeSH tree
- We focus on making sure it loads and see what the format is like

In [13]:
# read in mesh hierarchy
# ent_tree_map: second column of the mesh file -> list of first-column values,
# in file order (a key can collect several values).
ent_tree_map = defaultdict(list)

with codecs.open(args.mesh_file, 'r') as f:
    lines = []
    for line_no, raw_line in enumerate(f):
        if line_no == 0:
            # the first line of the file is skipped
            continue
        lines.append(raw_line.rstrip().split('\t'))

    for fields in lines:
        ent_tree_map[fields[1]].append(fields[0])

In [14]:
## Number of distinct keys collected from the second column of the mesh file
len(ent_tree_map)

28470

In [15]:
## Show the mapping itself: each key maps to the list of first-column values it was
## listed with (a key can have more than one, e.g. 'D042361' in the output below)
ent_tree_map

defaultdict(list,
            {'D001829': ['A01'],
             'D059925': ['A01.111'],
             'D001940': ['A01.236'],
             'D042361': ['A01.236.249', 'A10.336.532'],
             'D009558': ['A01.236.500'],
             'D005121': ['A01.378'],
             'D000672': ['A01.378.100'],
             'D035002': ['A01.378.610'],
             'D002081': ['A01.378.610.100'],
             'D005528': ['A01.378.610.250'],
             'D000842': ['A01.378.610.250.149'],
             'D005545': ['A01.378.610.250.300'],
             'D008684': ['A01.378.610.250.300.480'],
             'D014034': ['A01.378.610.250.300.792'],
             'D006214': ['A01.378.610.250.300.792.380'],
             'D006365': ['A01.378.610.250.510'],
             'D006615': ['A01.378.610.400'],
             'D007717': ['A01.378.610.450'],
             'D007866': ['A01.378.610.500'],
             'D013848': ['A01.378.610.750'],
             'D034941': ['A01.378.800'],
             'D001132': ['A01.378.800.

## 4. Step 2: Loading in input
- This step loads the input file and counts the positive and negative examples.
- Naturally, since the input is unlabeled, we should not have any positive ones.

In [16]:
# Per-document record lists, keyed by the first field of each input line (the PubMed ID).
pos_doc_examples = defaultdict(list)
neg_doc_examples = defaultdict(list)

unfilitered_pos_count = 0
unfilitered_neg_count = 0
text = {}
with open(args.input_file, 'r') as f:
    lines = [raw.strip().split('\t') for raw in f]

    for fields in lines:
        pmid, body = fields[0], fields[1]
        text[pmid] = '\t'.join((pmid, body))

        # Everything after the first two fields is a flat run of 17-field records.
        for r in chunks(fields[2:], 17):
            label = r[0]
            if label not in ('1:NR:2', '1:CID:2'):
                # records with any other label are neither counted nor stored
                continue

            assert ((r[7] == 'Chemical') and (r[13] == 'Disease'))
            if label == '1:NR:2':
                neg_doc_examples[pmid].append(r)
                unfilitered_neg_count += 1
            else:
                pos_doc_examples[pmid].append(r)
                unfilitered_pos_count += 1


In [17]:
## (positive, negative) record counts as loaded, before any hypernym filtering
unfilitered_pos_count, unfilitered_neg_count

(0, 5405)

In [25]:
## NOTE(review): "which is correct" is part of the message itself - the count is
## printed but not compared against an expected value.
print(f"The number of texts is: {len(text)}, which is correct.")

## Looking at the first two texts
list(text.values())[:2]

The number of texts is: 500, which is correct.


['8701013\tFamotidine - associated delirium .|A series of six cases .|Famotidine is a histamine H2 - receptor antagonist used in inpatient settings for prevention of stress ulcers and is showing increasing popularity because of its low cost .|Although all of the currently available H2 - receptor antagonists have shown the propensity to cause delirium , only two previously reported cases have been associated with famotidine .|The authors report on six cases of famotidine - associated delirium in hospitalized patients who cleared completely upon removal of famotidine .|The pharmacokinetics of famotidine are reviewed , with no change in its metabolism in the elderly population seen .|The implications of using famotidine in elderly persons are discussed .',
 '439781\tIndomethacin induced hypotension in sodium and volume depleted rats .|After a single oral dose of 4 mg / kg indomethacin ( IDM ) to sodium and volume depleted rats plasma renin activity ( PRA ) and systolic blood pressure fell

In [26]:
## Expect an empty mapping here: no record carried the positive label '1:CID:2'
pos_doc_examples

defaultdict(list, {})

In [28]:
## So all examples are negative examples, which is correct.
len(neg_doc_examples)

500

In [31]:
## And the keys are all PubMed IDs
list(neg_doc_examples.keys())[:2]

['8701013', '439781']

In [35]:
## The usual chunks of 17 fields we see following texts.
list(neg_doc_examples.values())[:1]

[[['1:NR:2',
   'L2R',
   'NON-CROSS',
   '0-1',
   '3-4',
   'D015738',
   'Famotidine|Famotidine|famotidine|famotidine|famotidine|famotidine|famotidine',
   'Chemical',
   '0:11:66:75:88:93:113',
   '1:12:67:76:89:94:114',
   '0:2:3:4:4:5:6',
   'D003693',
   'delirium|delirium|delirium',
   'Disease',
   '3:55:78',
   '4:56:79',
   '0:3:4'],
  ['1:NR:2',
   'L2R',
   'NON-CROSS',
   '11-12',
   '27-28',
   'D015738',
   'Famotidine|Famotidine|famotidine|famotidine|famotidine|famotidine|famotidine',
   'Chemical',
   '0:11:66:75:88:93:113',
   '1:12:67:76:89:94:114',
   '0:2:3:4:4:5:6',
   'D014456',
   'ulcers',
   'Disease',
   '27',
   '28',
   '2']]]

## 5. Step 3: Output writing
- Things look okay so far, but the next step filters the examples and writes them straight to a text file, so there is no intermediate result to inspect.
- Just to be sure, we run that part of the script once as-is to see whether it indeed outputs a blank file.

In [37]:
# iterate over docs (verbatim loop from the script; only the output path is a local test file)
hypo_count = 0  # running sum of example_hyponyms over the negatives relabelled 'not_include'
negative_count = 0  # negatives written with their label left unchanged

all_pos = 0  # number of positive records written
with open("./testing_output_from_filtering.text", 'w') as out_f:  # scratch file, not args.output_file
    for doc_id in pos_doc_examples.keys():
        # Only documents that have at least one positive example are visited, so a
        # document with only negative examples never reaches out_f.write below.
        # One output line per visited document: its text entry, then every positive
        # record, then every negative record, all tab-separated.
        towrite = text[doc_id]

        for r in pos_doc_examples[doc_id]:
            towrite += '\t'
            towrite += '\t'.join(r)
        all_pos += len(pos_doc_examples[doc_id])

        # get nodes for all the positive diseases
        # (tree node, positive record) pairs, looked up via field 11 (entity 2 id).
        pos_e2_examples = [(pos_node, pe) for pe in pos_doc_examples[doc_id]
                           for pos_node in ent_tree_map[pe[11]]]

        # Same pairing, looked up via field 5 (entity 1 id).
        pos_e1_examples = [(pos_node, pe) for pe in pos_doc_examples[doc_id]
                           for pos_node in ent_tree_map[pe[5]]]

        filtered_neg_exampled = []  # never read or appended to in this cell
        for ne in neg_doc_examples[doc_id]:
            neg_e1 = ne[5]
            neg_e2 = ne[11]
            example_hyponyms = 0
            # For each tree node of the negative's entity 2, collect positive nodes that
            # contain it (`in` on strings is a substring test) where the other entity id
            # matches. NOTE(review): the second list tests the entity-2 node against the
            # positives' entity-1 nodes - confirm this is intended.
            for neg_node in ent_tree_map[ne[11]]:
                hyponyms = [pos_node for pos_node, pe in pos_e2_examples
                            if neg_node in pos_node and neg_e1 == pe[5]] \
                           + [pos_node for pos_node, pe in pos_e1_examples
                              if neg_node in pos_node and neg_e2 == pe[11]]
                example_hyponyms += len(hyponyms)
            if example_hyponyms == 0:
                # no matching positive node: keep the negative as it is
                towrite += '\t' + '\t'.join(ne)
                negative_count += 1
            else:
                # The record is still written, but with its label field overwritten.
                # This assignment also changes the list stored in neg_doc_examples.
                ne[0] = 'not_include'  # just don't include the negative pairs, but keep the entities
                towrite += '\t' + '\t'.join(ne)
                hypo_count += example_hyponyms
        out_f.write(towrite + '\n')

In [39]:
## It is indeed blank!
## Read the scratch output back, one list item per line, and display it.
with open("./testing_output_from_filtering.text", 'r') as f:
    testing_file_lines = list(f)
testing_file_lines

[]

## 6. Refinement: Changing the original script
- A quick examination of the script shows that the issue lies in line 7 of the code cell above, i.e. `for doc_id in pos_doc_examples.keys():`.
- Specifically, `pos_doc_examples.keys()` is empty when every example defaults to a negative one, so the loop body never runs and nothing is written.
- Looking at the loop body, iterating over `neg_doc_examples.keys()` instead should also work, and it makes sure the script considers all of our examples.

In [42]:
# iterate over docs
# Same filtering pass as the original script, driven by the documents that have
# negative examples. For each such document one line is written to args.output_file:
# its text entry, then its positive records, then its negative records (tab-separated).
# A negative whose entity-2 tree node matches a positive's node is still written, but
# with 'not_include' as its label.
#
# Unlike the verbatim loop, this version leaves the loaded data untouched so the cell
# can be re-run and earlier displays stay valid:
#   * lookups use .get() - indexing a defaultdict with a missing key inserts it;
#   * the 'not_include' label is applied to a copy of the record, not in place.
hypo_count = 0      # running sum of matches over the negatives relabelled 'not_include'
negative_count = 0  # negatives written with their label left unchanged

all_pos = 0         # number of positive records written
with open(args.output_file, 'w') as out_f:
    for doc_id in neg_doc_examples.keys():
        doc_pos_examples = pos_doc_examples.get(doc_id, [])
        all_pos += len(doc_pos_examples)

        # Collect the pieces of the output line and join them once at the end.
        out_parts = [text[doc_id]]
        out_parts.extend('\t'.join(pe) for pe in doc_pos_examples)

        # (tree node, positive record) pairs, looked up via field 11 (entity 2 id)
        # and via field 5 (entity 1 id) respectively.
        pos_e2_examples = [(pos_node, pe) for pe in doc_pos_examples
                           for pos_node in ent_tree_map.get(pe[11], ())]
        pos_e1_examples = [(pos_node, pe) for pe in doc_pos_examples
                           for pos_node in ent_tree_map.get(pe[5], ())]

        for ne in neg_doc_examples[doc_id]:
            neg_e1 = ne[5]
            neg_e2 = ne[11]
            example_hyponyms = 0
            for neg_node in ent_tree_map.get(neg_e2, ()):
                # `in` on strings is a substring test, kept exactly as in the script.
                # NOTE(review): the second count tests the entity-2 node against the
                # positives' entity-1 nodes - confirm this is intended.
                example_hyponyms += sum(
                    1 for pos_node, pe in pos_e2_examples
                    if neg_node in pos_node and neg_e1 == pe[5])
                example_hyponyms += sum(
                    1 for pos_node, pe in pos_e1_examples
                    if neg_node in pos_node and neg_e2 == pe[11])

            if example_hyponyms == 0:
                negative_count += 1
                out_parts.append('\t'.join(ne))
            else:
                # just don't include the negative pairs, but keep the entities
                hypo_count += example_hyponyms
                out_parts.append('\t'.join(['not_include'] + ne[1:]))
        out_f.write('\t'.join(out_parts) + '\n')

In [44]:
## So naturally it found no hyponyms
hypo_count

0

In [45]:
## Let's also get a sense of what a node is in the ent_tree_map
## Pick the document explicitly (the last one with negative examples) instead of relying
## on a loop variable left over from an earlier cell, and use .get() so that looking up
## an unknown id does not add an empty entry to ent_tree_map.
sample_doc_id = list(neg_doc_examples)[-1]
neg_e2_examples = [(neg_node, ne) for ne in neg_doc_examples[sample_doc_id]
                   for neg_node in ent_tree_map.get(ne[11], ())]

In [47]:
## First (tree node, negative record) pair
neg_e2_examples[0]

('F03.615.250',
 ['1:NR:2',
  'L2R',
  'NON-CROSS',
  '4-5',
  '7-9',
  'D005690',
  'galactose|d - galactose|galactose|galactose|galactose|galactose|galactose|galactose|galactose|galactose|galactose|galactose|galactose',
  'Chemical',
  '4:71:115:135:154:197:225:235:251:258:266:285:289',
  '5:74:116:136:155:198:226:236:252:259:267:286:290',
  '0:2:3:3:4:5:6:6:7:7:8:8:9',
  'D003072',
  'cognitive deficits|cognitive deficits|cognitive deficits|cognitive deficits',
  'Disease',
  '7:165:218:308',
  '9:167:220:310',
  '0:4:5:9'])

In [48]:
## D003072 is "cognitive deficits"
ent_tree_map['D003072']

['F03.615.250']

In [49]:
## D005690 is "galactose"
ent_tree_map['D005690']

['D09.546.359.377']

In [57]:
## Sanity check: is entity 1 always a chemical and entity 2 always a disease?
## Any record that breaks the pattern stops the cell with an AssertionError.
for record in (rec for doc_records in neg_doc_examples.values() for rec in doc_records):
    assert record[7] == 'Chemical'
    assert record[13] == 'Disease'

print("Hypothesis confirmed, all examples have chemicals as entity 1 and diseases as entity 2")

Hypothesis confirmed, all examples have chemicals as entity 1 and diseases as entity 2
