# Disease Extraction (NER problem) 
https://datahack.analyticsvidhya.com/contest/innoplexus-online-hiring-hackathon-saving-lives-wi/

#### Imports

In [23]:
import pandas as pd
import numpy as np

from collections import namedtuple
from itertools import repeat
from copy import deepcopy

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

import spacy
import scispacy
from spacy.util import minibatch, compounding

import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm_notebook as tqdm

import gc
import random
from pathlib import Path

from multiprocessing import Pool
import multiprocessing
N_CORES = multiprocessing.cpu_count()
print('Number of CPUs: ', N_CORES)

Number of CPUs:  4


#### Evaluation Metric

In [2]:
def collect_named_entities(tokens): # Helper Function for score calculation
    """
    Creates a list of Entity named-tuples, storing the entity type and the start and end
    offsets of the entity.
    :param tokens: a list of labels
    :return: a list of Entity named-tuples
    """
    Entity = namedtuple("Entity", "e_type start_offset end_offset")
    named_entities = []
    start_offset = None
    end_offset = None
    ent_type = None

    for offset, token_tag in enumerate(tokens):

        if token_tag == 'O':
            if ent_type is not None and start_offset is not None:
                end_offset = offset - 1
                named_entities.append(Entity(ent_type, start_offset, end_offset))
                start_offset = None
                end_offset = None
                ent_type = None

        elif ent_type is None:
            ent_type = token_tag[2:]
            start_offset = offset

        elif ent_type != token_tag[2:] or (ent_type == token_tag[2:] and token_tag[:1] == 'B'):

            end_offset = offset - 1
            named_entities.append(Entity(ent_type, start_offset, end_offset))

            # start of a new entity
            ent_type = token_tag[2:]
            start_offset = offset
            end_offset = None

    # catches an entity that goes up until the last token
    if ent_type and start_offset and end_offset is None:
        named_entities.append(Entity(ent_type, start_offset, len(tokens)-1))

    return named_entities

def compute_metrics(true_named_entities, pred_named_entities): # Helper Function for score calculation
    eval_metrics = {'correct': 0, 'partial': 0, 'missed': 0, 'spurius': 0}
    target_tags_no_schema = ['indications']

    # overall results
    evaluation = {'partial': deepcopy(eval_metrics)}


    true_which_overlapped_with_pred = []  # keep track of entities that overlapped

    # go through each predicted named-entity
    for pred in pred_named_entities:
        found_overlap = False

        # check if there's an exact match, i.e.: boundary and entity type match
        if pred in true_named_entities:
            true_which_overlapped_with_pred.append(pred)
            evaluation['partial']['correct'] += 1

        else:

            # check for overlaps with any of the true entities
            for true in true_named_entities:

                
                # 2. check for an overlap i.e. not exact boundary match, with true entities
                if pred.start_offset <= true.end_offset and true.start_offset <= pred.end_offset:

                    true_which_overlapped_with_pred.append(true)

                    evaluation['partial']['partial'] += 1

                    found_overlap = True
                    break

            # count spurius (i.e., False Positive) entities
            if not found_overlap:
                # overall results
                evaluation['partial']['spurius'] += 1

    # count missed entities (i.e. False Negative)
    for true in true_named_entities:
        if true in true_which_overlapped_with_pred:
            continue
        else:
            # overall results
            evaluation['partial']['missed'] += 1

    # Compute 'possible', 'actual'
    for eval_type in ['partial']:

        correct = evaluation[eval_type]['correct']
        partial = evaluation[eval_type]['partial']
        missed = evaluation[eval_type]['missed']
        spurius = evaluation[eval_type]['spurius']

        # possible: nr. annotations in the gold-standard which contribute to the final score
        evaluation[eval_type]['possible'] = correct + partial + missed

        # actual: number of annotations produced by the NER system
        evaluation[eval_type]['actual'] = correct + partial + spurius

        actual = evaluation[eval_type]['actual']
        possible = evaluation[eval_type]['possible']

    return evaluation

def list_converter(df): # Helper Function for score calculation
    keys, values = df.sort_values('Sent_ID_x').values.T
    ukeys, index = np.unique(keys,True)
    lists = [list(array) for array in np.split(values,index[1:])]
    return lists

# ideal and pred respectively represent dataframes containing actual labels and predictions for the set of sentences in the test data. 
# It has the same format as the sample submission (id, Sent_ID, tag)

def calculate_score(ideal, pred): # Calculates the final F1 Score

    merged = ideal.merge(pred, on = "id", how="inner").drop(['Sent_ID_y'],axis = 1)
    
    
    # The scores are calculated sentence wise and then aggregated to calculate the overall score, for this
    # List converter function groups the labels by sentence to generate a list of lists with each inner list representing a sentence in sequence
    ideal_ = list_converter(merged.drop(['id','tag_y'],axis = 1))
    pred_ = list_converter(merged.drop(['id','tag_x'],axis = 1))

    metrics_results = {'correct': 0, 'partial': 0,
                   'missed': 0, 'spurius': 0, 'possible': 0, 'actual': 0}

    results = {'partial': deepcopy(metrics_results)}


    for true_ents, pred_ents in zip(ideal_, pred_):    
    # compute results for one sentence
        tmp_results = compute_metrics(collect_named_entities(true_ents),collect_named_entities(pred_ents))
    
    # aggregate overall results
        for eval_schema in results.keys():
            for metric in metrics_results.keys():
                results[eval_schema][metric] += tmp_results[eval_schema][metric]
    correct = results['partial']['correct']
    partial = results['partial']['partial']
    missed = results['partial']['missed']
    spurius = results['partial']['spurius']
    actual = results['partial']['actual']
    possible = results['partial']['possible']


    precision = (correct + 0.5 * partial) / actual if actual > 0 else 0
    recall = (correct + 0.5 * partial) / possible if possible > 0 else 0


    score = (2 * precision * recall)/(precision + recall) if (precision + recall) >0 else 0
    
    # final score
    return score

#### Load Data and analyse

In [3]:
train_df = pd.read_csv('../data/train.csv')

In [4]:
train_df.head()

Unnamed: 0,id,Doc_ID,Sent_ID,Word,tag
0,1,1,1,Obesity,O
1,2,1,1,in,O
2,3,1,1,Low-,O
3,4,1,1,and,O
4,5,1,1,Middle-Income,O


In [5]:
train_df.shape

(4543833, 5)

In [6]:
train_df['Doc_ID'].nunique()

30000

In [7]:
train_df['Sent_ID'].nunique()

191282

In [8]:
train_df['id'].nunique()

4543833

We have total 30000 documents and 191282 sentances in total.

We have total 4543833 unique id, which means each record is assigned unique id.

#### Sci-spacy demo

In [9]:
nlp = spacy.load("en_ner_bc5cdr_md")

In [10]:
text = """Obesity in Low- and Middle-Income Countries : Burden , Drivers , and Emerging Challenges . We have reviewed the distinctive features of excess weight , its causes , and related prevention and management efforts , as well as data gaps and recommendations for future research in low- and middle-income countries ( LMICs ) . Obesity is rising in every region of the world , and no country has been successful at reversing the epidemic once it has begun . In LMICs , overweight is higher in women compared with men , in urban compared with rural settings , and in older compared with younger individuals ; however , the urban-rural overweight differential is shrinking in many countries . Overweight occurs alongside persistent burdens of underweight in LMICs , especially in young women . Changes in the global diet and physical activity are among the hypothesized leading contributors to obesity . Emerging risk factors include environmental contaminants , chronic psychosocial stress , neuroendocrine dysregulation , and genetic/epigenetic mechanisms . Data on effective strategies to prevent the onset of obesity in LMICs or elsewhere are limited . Expanding the research in this area is a key priority and has important possibilities for reverse innovation that may also inform interventions in high-income countries . """
doc = nlp(text)

In [11]:
[(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]

[('Obesity', 0, 7, 'DISEASE'),
 ('Obesity', 322, 329, 'DISEASE'),
 ('LMICs', 455, 460, 'DISEASE'),
 ('LMICs', 750, 755, 'DISEASE'),
 ('obesity', 886, 893, 'DISEASE'),
 ('neuroendocrine dysregulation', 985, 1013, 'DISEASE'),
 ('obesity', 1105, 1112, 'DISEASE'),
 ('LMICs', 1116, 1121, 'DISEASE')]

In [12]:
# [(X, X.ent_iob_, X.ent_type_) for X in doc]

### Create Spacy NER prediction dataset

Let's create train and validation set.

We are doing this based on if a doc has any entity in it or not.

We will create stratified train and test split based on that.

In [13]:
has_ent = train_df.groupby('Doc_ID')['tag'].apply(lambda x: 'B-indications' in x.values).reset_index()

In [14]:
has_ent.head()

Unnamed: 0,Doc_ID,tag
0,1,True
1,2,True
2,3,True
3,4,False
4,5,False


Get train Doc ids and validation Doc ids

In [15]:
trn_doc_ids, tst_doc_ids = train_test_split(has_ent['Doc_ID'].values, test_size = 0.33, stratify=has_ent['tag'].values, random_state=0)

In [16]:
print(len(trn_doc_ids), " training documents.")

20100  training documents.


In [17]:
print(len(tst_doc_ids), " test documents.")

9900  test documents.


In [18]:
test_df = train_df[train_df['Doc_ID'].isin(tst_doc_ids)].reset_index(drop=True)
train_df = train_df[train_df['Doc_ID'].isin(trn_doc_ids)].reset_index(drop=True)

We want to train our model on document level and also predict on document level, that's why we need to group data by Doc_ID.

The reason behid this is that in training out model can use context words and learn better.

Same goes for prediction. If we just single word for prediction, then model doesn't know its context words, and model can't predict better.


Main problem we face in using using spacy for prediction is that it takes whole document, do tokenization on its own and give us IOB (inside, outside, begining) prediction on tokens it has generated. Here we might face mismatch between tokens of spacy and our own, we need IOB predictions on our own tokens.

That's why we have created a function which gives use prediction based on word location in document.

In [20]:
# Function for creating test data for prediction
def gb_ops_test(df):
    ids = df['id'].tolist()
    st_inds = []
    doc = ""
    ls_ind = 0
    for w in df['Word']:
        st_inds.append(ls_ind)
        w_len = len(str(w))
        ls_ind = ls_ind + w_len + 1
        doc = doc + str(w) + " "
    return pd.Series(dict(ids = ids, st_inds = st_inds, doc = doc))

In [21]:
test_data = test_df.groupby('Doc_ID').apply(gb_ops_test)

In [22]:
test_data.head()

Unnamed: 0_level_0,ids,st_inds,doc
Doc_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,"[212, 213]","[0, 14]",MICROCEPHALIA VERA
3,"[214, 215, 216, 217, 218, 219, 220, 221, 222, ...","[0, 10, 26, 29, 35, 43, 52, 60, 63, 70, 75, 89...",Excellent reproducibility of laser speckle con...
4,"[519, 520, 521, 522, 523, 524, 525, 526, 527, ...","[0, 9, 19, 26, 29, 44, 47, 51, 63, 68, 74]",Positive inotropic action of cholinesterase on...
6,"[830, 831, 832, 833, 834, 835, 836, 837, 838, ...","[0, 15, 20, 29, 37, 39, 44, 46, 48, 51, 60, 65...",Self-assembled drug delivery systems . Part 8 ...
7,"[1066, 1067, 1068, 1069, 1070, 1071, 1072, 107...","[0, 12, 16, 23, 34, 37, 44, 54, 57, 77, 87, 92...",Hyperphagia and leptin resistance in tissue in...


Here, ids contains word ids in doc, st_inds has starting index of all words which are in doc. and doc is text document.

ids and st_inds has samesize.

#### Make a benchmark model using pre trained Sci Spacy 'en_ner_bc5cdr_md' model

Which is a NER for DISEASE and CHEMICHAL entities.

In [45]:
# function for prediction on document and returns 'B' and 'I' word ids in our dataset

def get_tag(ids, st_inds, doc, nlp_obj):
    ids = pd.Series(ids)
    st_inds = pd.Series(st_inds)

    doc = nlp_obj(str(doc))
    out = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]

    ans = {'B': [], 'I': []}

    for o in out:
        if o[3] == 'DISEASE':
            inss = st_inds[(st_inds >= o[1]) & (st_inds <= o[2])].index.tolist()
            if (st_inds == o[1]).sum() == 0:
                inss = [(st_inds[st_inds > o[1]].index[0] - 1)] + inss
            w_ids = ids.iloc[inss].tolist()
            ans['B'].append(w_ids[0])
            ans['I'].extend(w_ids[1:])
    return ans

def get_B_I_ids(temp_data):
    B_pred_ids = []
    I_pred_ids = []
    for d in temp_data:
        B_pred_ids.extend(d['B'])
        I_pred_ids.extend(d['I'])
        
    return B_pred_ids, I_pred_ids

def make_prediction(test_df, B_pred_ids, I_pred_ids):
    ans = test_df[['id', 'Sent_ID']].copy()
    ans['tag'] = 'O'
    ans.loc[ans['id'].isin(B_pred_ids), 'tag'] = 'B-indications'
    ans.loc[ans['id'].isin(I_pred_ids), 'tag'] = 'I-indications'
    return ans

In [41]:
import time

In [44]:
# spacy NLP object
nlp = spacy.load("en_ner_bc5cdr_md")

# function to put nlp object in get_tag function
def mp_get_tag(ids, st_inds, doc):
    return get_tag(ids, st_inds, doc, nlp)

## Multiprocess code
t1 = time.time()
p = Pool(N_CORES)
temp_data = list(p.starmap(mp_get_tag, zip(list(test_data['ids'].values), list(test_data['st_inds'].values), list(test_data['doc'].values))))
p.close()
p.join()
p.terminate()
print((time.time() - t1)/60, " minutes")

4.6307554562886555  minutes


In [46]:
B_pred_ids, I_pred_ids = get_B_I_ids(temp_data)
test_pred_bm = make_prediction(test_df, B_pred_ids, I_pred_ids)

In [51]:
test_pred_bm.head()

Unnamed: 0,id,Sent_ID,tag
0,212,10,O
1,213,10,O
2,214,11,O
3,215,11,O
4,216,11,O


In [52]:
calculate_score(test_df[['id', 'Sent_ID', 'tag']], test_pred_bm)

0.489518489355064

We got 0.489518489355064 score on validation set using sci spacy pre trained model.

On leader board highest score is 0.82.

We can Update (further train) this sci spacy model on our train dataset to increase score upto 0.80 and even higher.

### Create Spacy train dataset

In [53]:
def create_trn_data(df):
    ent_pres = False
    doc = ""
    ents = []
    new_ind = 0
    
    for i, r in df.iterrows():
        w_len = len(str(r['Word']))
        doc = doc + str(r['Word']) + " "
        
        if r['tag'] == 'O':
            new_ind = new_ind + w_len + 1
            
        if r['tag'] == 'B-indications':
            st = new_ind
            en = st + w_len
            ents.append((st, en, 'DISEASE'))
            new_ind = en + 1
            
        if r['tag'] == 'I-indications':
            en = new_ind + w_len
            st = ents[-1][0]
            ents = ents[0:-1]
            ents.append((st, en, 'DISEASE'))
            new_ind = en + 1
    
    if ents:
        ent_pres = True
    
    out = (doc, {"entities": ents})
    return pd.Series(dict(trn_data = out, ent_pres = ent_pres))


In [56]:
train_data = train_df.groupby('Sent_ID').apply(create_trn_data)

In [57]:
train_data.head()

Unnamed: 0_level_0,trn_data,ent_pres
Sent_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,(Obesity in Low- and Middle-Income Countries :...,False
2,(We have reviewed the distinctive features of ...,False
3,(Obesity is rising in every region of the worl...,False
4,"(In LMICs , overweight is higher in women comp...",False
5,(Overweight occurs alongside persistent burden...,False


In [58]:
train_data.iloc[0]['trn_data']

('Obesity in Low- and Middle-Income Countries : Burden , Drivers , and Emerging Challenges . ',
 {'entities': []})

### Train sci spacy model

In [63]:
def trn_model(model= None, output_dir=None, n_iter=10):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly â€“ but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in tqdm(batches):
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.3,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)
    
            # save model to output directory
            if output_dir is not None:
                output_dir = Path(output_dir)
                if not output_dir.exists():
                    output_dir.mkdir()
                nlp.to_disk(output_dir)
                print("Saved model to", output_dir)

    return nlp

In [61]:
TRAIN_DATA = train_data['trn_data'].tolist()

In [64]:
MODEL = "en_ner_bc5cdr_md"
OUT_DIR = "./model_final/"

In [65]:
nlp1 = trn_model(MODEL, OUT_DIR, 5)

Loaded model 'en_ner_bc5cdr_md'


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Losses {'ner': 129.74318817221916}
Saved model to model_final


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Losses {'ner': 86.04977100593487}
Saved model to model_final


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Losses {'ner': 74.31878978241406}
Saved model to model_final


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Losses {'ner': 61.91225983773926}
Saved model to model_final


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Losses {'ner': 57.74447276394376}
Saved model to model_final


#### Now let's predict on test set using this new updated model

In [66]:
# function to put nlp object in get_tag function
def mp_get_tag(ids, st_inds, doc):
    return get_tag(ids, st_inds, doc, nlp1)

## Multiprocess code
t1 = time.time()
p = Pool(N_CORES)
temp_data = list(p.starmap(mp_get_tag, zip(list(test_data['ids'].values), list(test_data['st_inds'].values), list(test_data['doc'].values))))
p.close()
p.join()
p.terminate()
print((time.time() - t1)/60, " minutes")

4.970260135332743  minutes


In [67]:
B_pred_ids, I_pred_ids = get_B_I_ids(temp_data)
test_pred_final = make_prediction(test_df, B_pred_ids, I_pred_ids)

In [68]:
test_pred_final.head()

Unnamed: 0,id,Sent_ID,tag
0,212,10,O
1,213,10,O
2,214,11,O
3,215,11,O
4,216,11,O


In [69]:
test_pred_final['tag'].value_counts()

O                1466397
B-indications      16878
I-indications      13501
Name: tag, dtype: int64

In [70]:
calculate_score(test_df[['id', 'Sent_ID', 'tag']], test_pred_final)

0.7855966520062774

Great! We got a huge improvement over base pre trained model.

We can train for few more epoch to increase the score.

Just load this trained model and train it for few more epochs.

#### Model score : 0.7855966520062774