In [None]:
# Install scispaCy, spaCy, and the scispaCy BC5CDR NER model (release v0.4.0).
# NOTE(review): scispacy and spacy are installed unpinned while the model is
# pinned to v0.4.0 — presumably the library versions should be pinned to
# matching releases; confirm before relying on a fresh install.
!pip install scispacy
!pip install spacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_bc5cdr_md-0.4.0.tar.gz

In [1]:
import scispacy
import json
import spacy
from sklearn.utils import shuffle
import random

In [None]:
# Mount Google Drive at /content/drive so the dataset file can be read from it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Switch the working directory to drive/MyDrive/ (relative to the current
# directory); the relative paths used later resolve against it.
%cd drive/MyDrive/

In [None]:
# List the contents of the current working directory.
%ls

## Data

In [2]:
# Custom entity labels: registered on the NER component and used as the
# denominator when the per-label F1 scores are averaged.
types = (
    'LOC',
    'CORONAVIRUS',
    'LIVESTOCK',
    'WILDLIFE',
    'EVOLUTION',
    'PHYSICAL_SCIENCE',
    'SUBSTRATE',
    'MATERIAL',
    'IMMUNE_RESPONSE',
)

In [3]:
# Accumulator for the parsed dataset records (one dict per input line).
X = []

In [4]:
# The dataset is in JSON-lines form: decode each line and append it to X.
with open('./SS-NER-prune.json') as file:
    X.extend(json.loads(raw_line) for raw_line in file)

In [6]:
# Hold out the first 20% of the records for testing; the rest is for training.
split_at = int(len(X) * (0.2))
test_X, train_X = X[:split_at], X[split_at:]

## spaCy

### en_ner_bc5cdr_md

In [7]:
from spacy.util import minibatch, compounding
import random
from spacy.training.example import Example

In [8]:
# Load the installed scispaCy BC5CDR pipeline that will be fine-tuned below.
bc5cdr = spacy.load("en_ner_bc5cdr_md")

In [9]:
# Show the names of the components in the loaded pipeline.
bc5cdr.pipe_names

['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer', 'parser', 'ner']

In [10]:
# Keep a handle on the NER component; labels are added to it and the
# optimizer is created from it.
ner = bc5cdr.get_pipe('ner')

In [11]:
# Disable every pipeline component except the NER one.
non_ner_pipes = [pipe for pipe in bc5cdr.pipe_names if pipe != "ner"]
for pipe in non_ner_pipes:
    bc5cdr.disable_pipe(pipe)

In [12]:
# Register each custom entity label on the NER component.
for label in types:
    ner.add_label(label)

In [13]:
# Optimizer created from the NER component; passed as `sgd` to the update calls.
optimizer = ner.create_optimizer()

In [50]:
# Convert one slice of the training records into (text, annotations) pairs,
# where annotations is {"entities": [(start, end, label), ...]}. Records that
# lack a 'body' or an 'entities' field are skipped.
new_train = [
    (
        record['body'],
        {"entities": [(ent['start'], ent['end'], ent['type'])
                      for ent in record['entities']]},
    )
    for record in train_X[200:300]
    if 'body' in record and 'entities' in record
]

In [51]:
# Number of passes over new_train made by the training loop.
n_iter = 5

In [17]:
# Silence FutureWarning messages for the rest of the session.
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Fine-tune for n_iter passes; every pass reshuffles the data, starts a fresh
# loss dict and prints it once all batches have been processed.
for epoch in range(n_iter):
    random.shuffle(new_train)
    losses = {}
    # Batch size compounds from 4 towards 32.
    for batch in minibatch(new_train, size=compounding(4., 32., 1.001)):
        # Texts longer than 100000 characters are left out of the batch.
        batch_examples = [
            Example.from_dict(bc5cdr.make_doc(text), annots)
            for text, annots in batch
            if len(text) <= 100000
        ]
        # One update per batch with 50% dropout.
        bc5cdr.update(batch_examples, sgd=optimizer, drop=0.5, losses=losses)
    print('Losses', losses)

In [20]:
bc5cdr.meta['name'] = 'model80'  # rename model
# Serialize the pipeline into the current working directory.
bc5cdr.to_disk('./')

#### Evaluation

In [None]:
from spacy.scorer import Scorer
from spacy.tokens import Doc
from spacy.training.example import Example

In [None]:
def evaluate(ner_model, examples):
    """Score a pipeline's predictions against gold annotations.

    Args:
        ner_model: Callable pipeline that turns a text into a predicted ``Doc``.
        examples: Iterable whose items are either ``(text, annotations)``
            pairs, with ``annotations`` a dict accepted by
            ``Example.from_dict`` (e.g. ``{"entities": [(start, end, label)]}``),
            or ready-made ``Example`` objects. For an ``Example`` the text of
            its reference doc is re-predicted and the reference doc is reused
            as the gold standard.

    Returns:
        The scores returned by ``Scorer.score`` for the collected examples.
    """
    scorer = Scorer()
    scored = []
    for item in examples:
        if isinstance(item, Example):
            # Already an Example: keep its gold side, refresh the prediction.
            prediction = ner_model(item.reference.text)
            scored.append(Example(prediction, item.reference))
            continue
        input_, annot = item
        pred = ner_model(input_)
        # Pass the annotations through unchanged: dict.fromkeys(annot) keeps
        # only the keys and sets every value (the gold spans) to None.
        scored.append(Example.from_dict(pred, annot))
    return scorer.score(scored)

In [None]:
# Scorer constructed from the bc5cdr pipeline; used to score the test examples.
scorer = Scorer(bc5cdr)

In [None]:
# Build one Example per held-out record: the predicted side is the pipeline's
# output for the text, the reference side comes from the gold entity spans.
new_test = []
for t in test_X:
    # Same guard as the training-set construction: skip incomplete records
    # instead of failing with a KeyError.
    if 'body' not in t or 'entities' not in t:
        continue
    txt = t['body']
    et = {'entities': [(e['start'], e['end'], e['type']) for e in t['entities']]}
    # Run the pipeline (not just the tokenizer via make_doc) so the predicted
    # doc actually carries entities when the examples are handed to a Scorer.
    example = Example.from_dict(bc5cdr(txt), et)
    new_test.append(example)

In [None]:
# Score the test examples with the Scorer instance.
scores = scorer.score(new_test)

In [None]:
# Run the evaluate helper on the test examples and display its result.
evaluate(bc5cdr, new_test)

In [21]:
# Spot check: run the pipeline on the body of the first test record.
doc = bc5cdr(test_X[0]['body'])

In [23]:
# Entities predicted for the spot-check document.
entss = doc.ents

In [26]:
from collections import defaultdict

In [None]:
# Gather, per entity label, the predicted entity texts (pred_y) and the gold
# entity texts (ground_y) over the whole test split.
pred_y = defaultdict(list)
ground_y = defaultdict(list)
for idx, record in enumerate(test_X):
    # Progress marker every 100 records.
    if idx % 100 == 0:
        print(idx)
    for ent in bc5cdr(record['body']).ents:
        pred_y[ent.label_].append(ent.text)
    for gold in record['entities']:
        ground_y[gold['type']].append(gold['text'])

In [30]:
from collections import Counter

In [36]:
def f1(y_true, y_pred):
    """Multiset F1 between two collections of items.

    The overlap is the multiset intersection (``Counter & Counter``), so a
    repeated item only counts as often as it occurs on both sides.

    Args:
        y_true: Sized iterable of hashable gold items.
        y_pred: Sized iterable of hashable predicted items.

    Returns:
        The harmonic mean of precision and recall, or 0 when either input is
        empty or there is no overlap.
    """
    # With an empty side, precision or recall is undefined (division by zero);
    # report 0 like the no-overlap case instead of raising.
    if len(y_true) == 0 or len(y_pred) == 0:
        return 0
    overlap = sum((Counter(y_true) & Counter(y_pred)).values())
    recall = overlap / len(y_true)
    precision = overlap / len(y_pred)
    if recall + precision == 0:
        return 0
    return 2 * (precision * recall) / (precision + recall)


In [37]:
def avgf1(true_y, pred_y):
    """Average the pairwise ``f1`` score over two parallel sequences.

    Items are paired positionally with ``zip`` (surplus items of the longer
    sequence are ignored) and each pair is scored with ``f1``.

    NOTE(review): this notebook calls it with two lists of entity texts
    (presumably strings), so each pair is two strings and ``f1`` compares
    them character by character — confirm this is the intended metric.

    Args:
        true_y: Sequence of gold items.
        pred_y: Sequence of predicted items.

    Returns:
        The mean of the pairwise scores, or 0 when there is no pair to score
        (either sequence is empty).
    """
    pair_scores = [f1(t, p) for t, p in zip(true_y, pred_y)]
    # No pairs (e.g. a label that was never predicted): avoid dividing by zero.
    if not pair_scores:
        return 0
    return sum(pair_scores) / len(pair_scores)

In [54]:
# Print the averaged F1 for each gold label, then the mean over all labels
# listed in `types`.
total = 0
for label, gold_texts in ground_y.items():
    label_score = avgf1(gold_texts, pred_y[label])
    total = total + label_score
    print(label, label_score)
print('over all f1', total/len(types))

CORONAVIRUS 0.3685243168788647
EVOLUTION 0.5431761478394596
WILDLIFE 0.49609891431014996
PHYSICAL_SCIENCE 0.7311053722422336
LIVESTOCK 0.33202250434657105
SUBSTRATE 0.4034442244471359
LOC 0.33667715268349346
IMMUNE_RESPONSE 0.6283439344864685
MATERIAL 0.500207453644279
over all f1 0.48217778009762846
