In [1]:
%load_ext autoreload
%autoreload 2
from datasets import load_dataset
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import wordnet as wn
from gensim.models import TfidfModel
from gensim import matutils
from gensim.corpora import Dictionary
from nltk.tokenize import word_tokenize
import gensim.downloader as api
import nltk
import re, unicodedata

# Bio Creative V

Marks chemicals and diseases in Pubmed articles as named entities. For further details, see [the HuggingFace page](https://huggingface.co/datasets/tner/bc5cdr)

In [2]:
ner_dataset = load_dataset(
    "tner/bc5cdr", 
)

print(f'The dataset is a dictionary with {len(ner_dataset)} splits: \n\n{ner_dataset}')

The dataset is a dictionary with 3 splits: 

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 5228
    })
    validation: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 5330
    })
    test: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 5865
    })
})


In [3]:
# btain the data in a list format for some sequence tagging methods
train_sentences_ner = [item['tokens'] for item in ner_dataset['train']]
train_labels_ner = [[str(tag) for tag in item['tags']] for item in ner_dataset['train']]

val_sentences_ner = [item['tokens'] for item in ner_dataset['validation']]
val_labels_ner = [[str(tag) for tag in item['tags']] for item in ner_dataset['validation']]

test_sentences_ner = [item['tokens'] for item in ner_dataset['test']]
test_labels_ner = [[str(tag) for tag in item['tags']] for item in ner_dataset['test']]

In [4]:
ner_dataset['train']['tokens']

[['Naloxone',
  'reverses',
  'the',
  'antihypertensive',
  'effect',
  'of',
  'clonidine',
  '.'],
 ['In',
  'unanesthetized',
  ',',
  'spontaneously',
  'hypertensive',
  'rats',
  'the',
  'decrease',
  'in',
  'blood',
  'pressure',
  'and',
  'heart',
  'rate',
  'produced',
  'by',
  'intravenous',
  'clonidine',
  ',',
  '5',
  'to',
  '20',
  'micrograms',
  '/',
  'kg',
  ',',
  'was',
  'inhibited',
  'or',
  'reversed',
  'by',
  'nalozone',
  ',',
  '0',
  '.'],
 ['2', 'to', '2', 'mg', '/', 'kg', '.'],
 ['The',
  'hypotensive',
  'effect',
  'of',
  '100',
  'mg',
  '/',
  'kg',
  'alpha-methyldopa',
  'was',
  'also',
  'partially',
  'reversed',
  'by',
  'naloxone',
  '.'],
 ['Naloxone',
  'alone',
  'did',
  'not',
  'affect',
  'either',
  'blood',
  'pressure',
  'or',
  'heart',
  'rate',
  '.'],
 ['In',
  'brain',
  'membranes',
  'from',
  'spontaneously',
  'hypertensive',
  'rats',
  'clonidine',
  ',',
  '10',
  '(-',
  '8',
  ')',
  'to',
  '10',
  '(-',
  '

In [5]:
ner_dataset['train']['tags']

[[1, 0, 0, 0, 0, 0, 1, 0],
 [0,
  0,
  0,
  0,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0],
 [0, 0, 0, 0, 0, 0, 0],
 [0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0,
  0,
  0,
  0,
  0,
  2,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1],
 [1, 0, 0, 2, 3, 0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


In [6]:
# show tag values
np.unique(np.concatenate(train_labels_ner))

array(['0', '1', '2', '3', '4'], dtype='<U1')

In [7]:
pd.DataFrame(list(zip(train_sentences_ner,train_labels_ner)))

Unnamed: 0,0,1
0,"[Naloxone, reverses, the, antihypertensive, ef...","[1, 0, 0, 0, 0, 0, 1, 0]"
1,"[In, unanesthetized, ,, spontaneously, hyperte...","[0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[2, to, 2, mg, /, kg, .]","[0, 0, 0, 0, 0, 0, 0]"
3,"[The, hypotensive, effect, of, 100, mg, /, kg,...","[0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0]"
4,"[Naloxone, alone, did, not, affect, either, bl...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...
5223,"[In, mice, ,, apomorphine, produced, qualitati...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5224,"[Drug, -, induced, gross, activity, counts, we...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5225,"[By, contrast, ,, apomorphine, -, induced, loc...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
5226,"[Dopamine, turnover, ratios, (, DOPAC, :, DA, ...","[1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, ..."


In [8]:
# mapping with label2id dictionary

ner_tag_mapping = {
    "O": 0,
    "B-Chemical": 1,
    "B-Disease": 2,
    "I-Disease": 3,
    "I-Chemical": 4
}

ner_tag_mapping = {v:k for k, v in ner_tag_mapping.items()}
ner_tag_mapping

{0: 'O', 1: 'B-Chemical', 2: 'B-Disease', 3: 'I-Disease', 4: 'I-Chemical'}

In [9]:
# assign ner tag to dataset
def convertner(dataset):
    new_tag_data = []
    for i in range(len(dataset)):
        ner_string = []
        for j in range(len(dataset[i])):
            label = dataset[i][j]
            label = int(label)
            if label in ner_tag_mapping:
                ner_string.append(ner_tag_mapping[label])
            else:
                ner_string.append(label)
        new_tag_data.append(ner_string)
    return new_tag_data


train_labels_map = convertner(train_labels_ner)
val_labels_map = convertner(val_labels_ner)
test_labels_map =  convertner(test_labels_ner)

train_set = pd.DataFrame(list(zip(train_sentences_ner, train_labels_map)),columns=['Sentences','Labels'])
train_set

Unnamed: 0,Sentences,Labels
0,"[Naloxone, reverses, the, antihypertensive, ef...","[B-Chemical, O, O, O, O, O, B-Chemical, O]"
1,"[In, unanesthetized, ,, spontaneously, hyperte...","[O, O, O, O, B-Disease, O, O, O, O, O, O, O, O..."
2,"[2, to, 2, mg, /, kg, .]","[O, O, O, O, O, O, O]"
3,"[The, hypotensive, effect, of, 100, mg, /, kg,...","[O, B-Disease, O, O, O, O, O, O, B-Chemical, O..."
4,"[Naloxone, alone, did, not, affect, either, bl...","[B-Chemical, O, O, O, O, O, O, O, O, O, O, O]"
...,...,...
5223,"[In, mice, ,, apomorphine, produced, qualitati...","[O, O, O, B-Chemical, O, O, O, O, O, O, O, O, ..."
5224,"[Drug, -, induced, gross, activity, counts, we...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
5225,"[By, contrast, ,, apomorphine, -, induced, loc...","[O, O, O, B-Chemical, O, O, O, O, O, O, O, O, ..."
5226,"[Dopamine, turnover, ratios, (, DOPAC, :, DA, ...","[B-Chemical, O, O, O, B-Chemical, O, B-Chemica..."


In [10]:
def prepare_dataset(sentences_ner, labels):
    train_set = []
    for i in range(len(sentences_ner)):
        sentences = []
        for j in range(len(sentences_ner[i])):
           sentences.append((sentences_ner[i][j], labels[i][j]))
        train_set.append(sentences)
    return train_set

In [11]:
train_set = prepare_dataset(train_sentences_ner, train_labels_map)
val_set = prepare_dataset(val_sentences_ner,val_labels_map)
test_set = prepare_dataset(test_sentences_ner, test_labels_map)

In [12]:
train_set

[[('Naloxone', 'B-Chemical'),
  ('reverses', 'O'),
  ('the', 'O'),
  ('antihypertensive', 'O'),
  ('effect', 'O'),
  ('of', 'O'),
  ('clonidine', 'B-Chemical'),
  ('.', 'O')],
 [('In', 'O'),
  ('unanesthetized', 'O'),
  (',', 'O'),
  ('spontaneously', 'O'),
  ('hypertensive', 'B-Disease'),
  ('rats', 'O'),
  ('the', 'O'),
  ('decrease', 'O'),
  ('in', 'O'),
  ('blood', 'O'),
  ('pressure', 'O'),
  ('and', 'O'),
  ('heart', 'O'),
  ('rate', 'O'),
  ('produced', 'O'),
  ('by', 'O'),
  ('intravenous', 'O'),
  ('clonidine', 'B-Chemical'),
  (',', 'O'),
  ('5', 'O'),
  ('to', 'O'),
  ('20', 'O'),
  ('micrograms', 'O'),
  ('/', 'O'),
  ('kg', 'O'),
  (',', 'O'),
  ('was', 'O'),
  ('inhibited', 'O'),
  ('or', 'O'),
  ('reversed', 'O'),
  ('by', 'O'),
  ('nalozone', 'B-Chemical'),
  (',', 'O'),
  ('0', 'O'),
  ('.', 'O')],
 [('2', 'O'),
  ('to', 'O'),
  ('2', 'O'),
  ('mg', 'O'),
  ('/', 'O'),
  ('kg', 'O'),
  ('.', 'O')],
 [('The', 'O'),
  ('hypotensive', 'B-Disease'),
  ('effect', 'O'),
  ('

In [13]:
val_sentences_ner

[['Tricuspid',
  'valve',
  'regurgitation',
  'and',
  'lithium',
  'carbonate',
  'toxicity',
  'in',
  'a',
  'newborn',
  'infant',
  '.'],
 ['A',
  'newborn',
  'with',
  'massive',
  'tricuspid',
  'regurgitation',
  ',',
  'atrial',
  'flutter',
  ',',
  'congestive',
  'heart',
  'failure',
  ',',
  'and',
  'a',
  'high',
  'serum',
  'lithium',
  'level',
  'is',
  'described',
  '.'],
 ['This',
  'is',
  'the',
  'first',
  'patient',
  'to',
  'initially',
  'manifest',
  'tricuspid',
  'regurgitation',
  'and',
  'atrial',
  'flutter',
  ',',
  'and',
  'the',
  '11th',
  'described',
  'patient',
  'with',
  'cardiac',
  'disease',
  'among',
  'infants',
  'exposed',
  'to',
  'lithium',
  'compounds',
  'in',
  'the',
  'first',
  'trimester',
  'of',
  'pregnancy',
  '.'],
 ['Sixty',
  '-',
  'three',
  'percent',
  'of',
  'these',
  'infants',
  'had',
  'tricuspid',
  'valve',
  'involvement',
  '.'],
 ['Lithium',
  'carbonate',
  'may',
  'be',
  'a',
  'factor',
 

In [14]:
train_set

[[('Naloxone', 'B-Chemical'),
  ('reverses', 'O'),
  ('the', 'O'),
  ('antihypertensive', 'O'),
  ('effect', 'O'),
  ('of', 'O'),
  ('clonidine', 'B-Chemical'),
  ('.', 'O')],
 [('In', 'O'),
  ('unanesthetized', 'O'),
  (',', 'O'),
  ('spontaneously', 'O'),
  ('hypertensive', 'B-Disease'),
  ('rats', 'O'),
  ('the', 'O'),
  ('decrease', 'O'),
  ('in', 'O'),
  ('blood', 'O'),
  ('pressure', 'O'),
  ('and', 'O'),
  ('heart', 'O'),
  ('rate', 'O'),
  ('produced', 'O'),
  ('by', 'O'),
  ('intravenous', 'O'),
  ('clonidine', 'B-Chemical'),
  (',', 'O'),
  ('5', 'O'),
  ('to', 'O'),
  ('20', 'O'),
  ('micrograms', 'O'),
  ('/', 'O'),
  ('kg', 'O'),
  (',', 'O'),
  ('was', 'O'),
  ('inhibited', 'O'),
  ('or', 'O'),
  ('reversed', 'O'),
  ('by', 'O'),
  ('nalozone', 'B-Chemical'),
  (',', 'O'),
  ('0', 'O'),
  ('.', 'O')],
 [('2', 'O'),
  ('to', 'O'),
  ('2', 'O'),
  ('mg', 'O'),
  ('/', 'O'),
  ('kg', 'O'),
  ('.', 'O')],
 [('The', 'O'),
  ('hypotensive', 'B-Disease'),
  ('effect', 'O'),
  ('

## Experiment 1: Token itself

In [15]:
# train CRF NER tagger
def train_CRF_NER_tagger(train_set):
    tagger = nltk.tag.CRFTagger()
    tagger.train(train_set, 'model.crf.tagger')
    return tagger

tagger = train_CRF_NER_tagger(train_set)

In [16]:
def extract_spans(tagged_sents):
    """
    Extract a list of tagged spans for each named entity type, 
    where each span is represented by a tuple containing the 
    start token and end token indexes.
    
    returns: a dictionary containing a list of spans for each entity type.
    """
    spans = {}
        
    for sidx, sent in enumerate(tagged_sents):
        start = -1
        entity_type = None
        for i, (tok, lab) in enumerate(sent):
            if 'B-' in lab:
                start = i
                end = i + 1
                entity_type = lab[2:]
            elif 'I-' in lab:
                end = i + 1
            elif lab == 'O' and start >= 0:
                
                if entity_type not in spans:
                    spans[entity_type] = []
                
                spans[entity_type].append((start, end, sidx))
                start = -1      

        if start >= 0:    
            if entity_type not in spans:
                spans[entity_type] = []
                
            spans[entity_type].append((start, end, sidx))
                
    return spans


def cal_span_level_f1(test_sents, test_sents_with_pred):
    gold_spans = extract_spans(test_sents)

    pred_spans = extract_spans(test_sents_with_pred)
    
    f1_per_class = []
    
    ne_types = gold_spans.keys() 
    
    for ne_type in ne_types:
        # compute the confusion matrix
        true_pos = 0
        false_pos = 0
        
        for span in pred_spans[ne_type]:
            if span in gold_spans[ne_type]:
                true_pos += 1
            else:
                false_pos += 1
                
        false_neg = 0
        for span in gold_spans[ne_type]:
            if span not in pred_spans[ne_type]:
                false_neg += 1
                
        if true_pos + false_pos == 0:
            precision = 0
        else:
            precision = true_pos / float(true_pos + false_pos)
            
        if true_pos + false_neg == 0:
            recall = 0
        else:
            recall = true_pos / float(true_pos + false_neg)
        
        if precision + recall == 0:
            f1 = 0
        else:
            f1 = 2 * precision * recall / (precision + recall)
            
        f1_per_class.append(f1)
        print(f'F1 score for class {ne_type} = {f1}')
        
    print(f'Macro-average f1 score = {np.mean(f1_per_class)}')
    return np.mean(f1_per_class)


In [17]:
predicted_tags_val = tagger.tag_sents(val_sentences_ner)
predicted_tags_val

[[('Tricuspid', 'O'),
  ('valve', 'O'),
  ('regurgitation', 'O'),
  ('and', 'O'),
  ('lithium', 'B-Chemical'),
  ('carbonate', 'I-Chemical'),
  ('toxicity', 'B-Disease'),
  ('in', 'O'),
  ('a', 'O'),
  ('newborn', 'O'),
  ('infant', 'O'),
  ('.', 'O')],
 [('A', 'O'),
  ('newborn', 'O'),
  ('with', 'O'),
  ('massive', 'O'),
  ('tricuspid', 'O'),
  ('regurgitation', 'O'),
  (',', 'O'),
  ('atrial', 'O'),
  ('flutter', 'O'),
  (',', 'O'),
  ('congestive', 'B-Disease'),
  ('heart', 'I-Disease'),
  ('failure', 'I-Disease'),
  (',', 'O'),
  ('and', 'O'),
  ('a', 'O'),
  ('high', 'O'),
  ('serum', 'O'),
  ('lithium', 'B-Chemical'),
  ('level', 'O'),
  ('is', 'O'),
  ('described', 'O'),
  ('.', 'O')],
 [('This', 'O'),
  ('is', 'O'),
  ('the', 'O'),
  ('first', 'O'),
  ('patient', 'O'),
  ('to', 'O'),
  ('initially', 'O'),
  ('manifest', 'O'),
  ('tricuspid', 'O'),
  ('regurgitation', 'O'),
  ('and', 'O'),
  ('atrial', 'O'),
  ('flutter', 'O'),
  (',', 'O'),
  ('and', 'O'),
  ('the', 'O'),
  ('

In [18]:
# Training
predicted_tags_train = tagger.tag_sents(train_sentences_ner)
cal_span_level_f1(train_set, predicted_tags_train)

F1 score for class Chemical = 0.9209246949102093
F1 score for class Disease = 0.7760555852485302
Macro-average f1 score = 0.8484901400793698


0.8484901400793698

In [19]:
# Validation
cal_span_level_f1(val_set, predicted_tags_val)

F1 score for class Disease = 0.6537563521494301
F1 score for class Chemical = 0.8271844660194174
Macro-average f1 score = 0.7404704090844237


0.7404704090844237

## Experiment 2: Extract Basic Features
- Current word
- is it capitalized?
- Does it have punctuation?
- Does it have a number?
- Suffixes up to length 3

In [20]:
class CustomCRFTagger(nltk.tag.CRFTagger):
    _current_tokens = None
    
    def _get_features(self, tokens, idx):

            token = tokens[idx]

            feature_list = []

            if not token:
                return feature_list

            # capitalization
            if token[0].isupper():
                feature_list.append("CAPITALIZATION")

            # number
            if re.search(self._pattern, token) is not None:
                feature_list.append("HAS_NUM")

            # punctuation
            punc_cat = {"Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"}
            if all(unicodedata.category(x) in punc_cat for x in token):
                feature_list.append("PUNCTUATION")

            # suffix up to length 3
            if len(token) > 1:
                feature_list.append("SUF_" + token[-1:])
            if len(token) > 2:
                feature_list.append("SUF_" + token[-2:])
            if len(token) > 3:
                feature_list.append("SUF_" + token[-3:])

                
            # current word
            feature_list.append("WORD_" + token)
            
            if idx > 0:
                feature_list.append("PREVWORD_" + tokens[idx-1])
            if idx < len(tokens)-1:
                feature_list.append("NEXTWORD_" + tokens[idx+1])
                

            return feature_list
                

In [21]:
# train_set

In [22]:
# train CRF NER tagger
def train_CustomCRF_NER_tagger(train_set):
    tagger = CustomCRFTagger()
    tagger.train(train_set, 'model.crf.tagger')
    return tagger

tagger = train_CustomCRF_NER_tagger(train_set)

In [23]:
# training set
predicted_tags_train = tagger.tag_sents(train_sentences_ner)
cal_span_level_f1(train_set, predicted_tags_train)

F1 score for class Chemical = 0.943269420834157
F1 score for class Disease = 0.8589179394253171
Macro-average f1 score = 0.9010936801297371


0.9010936801297371

In [24]:
# validation
predicted_tags_val = tagger.tag_sents(val_sentences_ner)
predicted_tags_val

[[('Tricuspid', 'O'),
  ('valve', 'B-Disease'),
  ('regurgitation', 'I-Disease'),
  ('and', 'O'),
  ('lithium', 'B-Chemical'),
  ('carbonate', 'I-Chemical'),
  ('toxicity', 'B-Disease'),
  ('in', 'O'),
  ('a', 'O'),
  ('newborn', 'O'),
  ('infant', 'O'),
  ('.', 'O')],
 [('A', 'O'),
  ('newborn', 'O'),
  ('with', 'O'),
  ('massive', 'O'),
  ('tricuspid', 'O'),
  ('regurgitation', 'O'),
  (',', 'O'),
  ('atrial', 'B-Disease'),
  ('flutter', 'I-Disease'),
  (',', 'O'),
  ('congestive', 'B-Disease'),
  ('heart', 'I-Disease'),
  ('failure', 'I-Disease'),
  (',', 'O'),
  ('and', 'O'),
  ('a', 'O'),
  ('high', 'O'),
  ('serum', 'O'),
  ('lithium', 'B-Chemical'),
  ('level', 'O'),
  ('is', 'O'),
  ('described', 'O'),
  ('.', 'O')],
 [('This', 'O'),
  ('is', 'O'),
  ('the', 'O'),
  ('first', 'O'),
  ('patient', 'O'),
  ('to', 'O'),
  ('initially', 'O'),
  ('manifest', 'O'),
  ('tricuspid', 'O'),
  ('regurgitation', 'O'),
  ('and', 'O'),
  ('atrial', 'B-Disease'),
  ('flutter', 'I-Disease'),
  

In [25]:
cal_span_level_f1(val_set, predicted_tags_val)

F1 score for class Disease = 0.6868005932317649
F1 score for class Chemical = 0.8361536123214105
Macro-average f1 score = 0.7614771027765876


0.7614771027765876

## POS

In [26]:
# the package for PoS tagging
nltk.download('averaged_perceptron_tagger')

# try
example_sentence = ["PoS", "tags", "often", "provide", "useful", "information", "for", "identifying", "entities"]
pos_tagged_tokens = nltk.pos_tag(example_sentence)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/blackrose/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [27]:
pos_tagged_tokens

[('PoS', 'NNP'),
 ('tags', 'NNS'),
 ('often', 'RB'),
 ('provide', 'VBP'),
 ('useful', 'JJ'),
 ('information', 'NN'),
 ('for', 'IN'),
 ('identifying', 'VBG'),
 ('entities', 'NNS')]

In [28]:
class CRFTaggerWithPOS(CustomCRFTagger):
    _current_tokens = None
    
    def _get_features(self, tokens, index):
        """
        Extract the features for a token and append the POS tag as an additional feature.
        """
        basic_features = super()._get_features(tokens, index)
        
        # get the pos tags for the current sentence and save it
        if tokens != self._current_tokens:
            self._pos_tagged_tokens = nltk.pos_tag(tokens)
            self._current_tokens = tokens
            
            
        basic_features.append(self._pos_tagged_tokens[index][1])

        
        return basic_features

In [29]:
# train CRF NER tagger
def train_CRF_NER_tagger_with_POS(train_set):
    tagger = CRFTaggerWithPOS()
    tagger.train(train_set, 'model.crf.tagger')
    return tagger 

tagger = train_CRF_NER_tagger_with_POS(train_set)

In [30]:
# training set
predicted_tags_train = tagger.tag_sents(train_sentences_ner)
cal_span_level_f1(train_set, predicted_tags_train)

F1 score for class Chemical = 0.9444444444444444
F1 score for class Disease = 0.8575863407062475
Macro-average f1 score = 0.9010153925753459


0.9010153925753459

In [31]:
# validation
predicted_tags_val = tagger.tag_sents(val_sentences_ner)
cal_span_level_f1(val_set, predicted_tags_val)

F1 score for class Disease = 0.68578352180937
F1 score for class Chemical = 0.8413848631239935
Macro-average f1 score = 0.7635841924666817


0.7635841924666817

## Dependency parsing

In [32]:
# pip install spacy

In [33]:
# python -m spacy download en_core_web_sm

In [34]:
import spacy
nlp = spacy.load("en_core_web_sm")


In [35]:
class CRFTaggerWithDependency(CRFTaggerWithPOS):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.nlp = spacy.load("en_core_web_sm")
        self._doc = None
        self._pos_tagged_tokens = None
        self._current_tokens = None

    def _get_features(self, tokens, index):
        if self._current_tokens is None or tokens != self._current_tokens:
            self._doc = self.nlp(" ".join(tokens))
            self._pos_tagged_tokens = nltk.pos_tag(tokens)
            self._current_tokens = tokens
        
        # using ex class features
        features = super()._get_features(tokens, index)
        
        # dependency as a feature
        if self._doc:
            features.append("DEP_REL=" + self._doc[index].dep_)

        return features


In [36]:
def train_CRF_NER_tagger_with_dependency(train_set):
    tagger = CRFTaggerWithDependency()
    tagger.train(train_set, 'model.crf.tagger')
    return tagger

tagger = train_CRF_NER_tagger_with_dependency(train_set)


In [37]:
# training set
predicted_tags_train = tagger.tag_sents(train_sentences_ner)
cal_span_level_f1(train_set, predicted_tags_train)

F1 score for class Chemical = 0.9460924411156008
F1 score for class Disease = 0.8601705867149134
Macro-average f1 score = 0.9031315139152571


0.9031315139152571

In [38]:
# validation set
predicted_tags_val = tagger.tag_sents(val_sentences_ner)
cal_span_level_f1(val_set, predicted_tags_val)

F1 score for class Disease = 0.6934835076427996
F1 score for class Chemical = 0.8413001912045889
Macro-average f1 score = 0.7673918494236942


0.7673918494236942

## WordNet Dataset

In [39]:
class CRFTaggerWithWordNet(CRFTaggerWithDependency):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._current_tokens = None
        self._wordnet_features = None


    def _get_wordnet_features(self, token):
        synsets = wn.synsets(token)
        wordnet_features = []
        for synset in synsets:
            wordnet_features.append("synonyms=" + ",".join([lemma.name() for lemma in synset.lemmas()]))
            wordnet_features.append("hypernyms=" + ",".join([hypernym.name().split('.')[0] for hypernym in synset.hypernyms()]))
            for lemma in synset.lemmas():
                if lemma.name() != token:
                    wordnet_features.append("similar_to_" + token + "=" + lemma.name())
        return wordnet_features



    def _get_features(self, tokens, index):
        if tokens != self._current_tokens:
            self._current_tokens = tokens
            self._pos_tagged_tokens = nltk.pos_tag(tokens)
            self._wordnet_features = [self._get_wordnet_features(token) for token in tokens]

        features = super()._get_features(tokens, index)
        features.extend(self._wordnet_features[index])
        return features

In [40]:
def train_CRF_NER_tagger_with_WordNet(train_set):
    tagger = CRFTaggerWithWordNet()
    tagger.train(train_set, 'model.crf.tagger')
    return tagger

tagger = train_CRF_NER_tagger_with_WordNet(train_set)


In [41]:
# training set
predicted_tags_train = tagger.tag_sents(train_sentences_ner)
cal_span_level_f1(train_set, predicted_tags_train)

F1 score for class Chemical = 0.9747306562193927
F1 score for class Disease = 0.9409756097560975
Macro-average f1 score = 0.9578531329877451


0.9578531329877451

In [42]:
# validation set
predicted_tags_val = tagger.tag_sents(val_sentences_ner)
cal_span_level_f1(val_set, predicted_tags_val)

F1 score for class Disease = 0.7506574827802128
F1 score for class Chemical = 0.8662471281590252
Macro-average f1 score = 0.808452305469619


0.808452305469619

Test the  best model on test set

In [43]:
# test set
predicted_tags_test = tagger.tag_sents(test_sentences_ner)
cal_span_level_f1(test_set, predicted_tags_test)

F1 score for class Chemical = 0.8574557037626918
F1 score for class Disease = 0.7478072810284754
Macro-average f1 score = 0.8026314923955835


0.8026314923955835

In [44]:
test_sentences_ner

[['Famotidine', '-', 'associated', 'delirium', '.'],
 ['A', 'series', 'of', 'six', 'cases', '.'],
 ['Famotidine',
  'is',
  'a',
  'histamine',
  'H2',
  '-',
  'receptor',
  'antagonist',
  'used',
  'in',
  'inpatient',
  'settings',
  'for',
  'prevention',
  'of',
  'stress',
  'ulcers',
  'and',
  'is',
  'showing',
  'increasing',
  'popularity',
  'because',
  'of',
  'its',
  'low',
  'cost',
  '.'],
 ['Although',
  'all',
  'of',
  'the',
  'currently',
  'available',
  'H2',
  '-',
  'receptor',
  'antagonists',
  'have',
  'shown',
  'the',
  'propensity',
  'to',
  'cause',
  'delirium',
  ',',
  'only',
  'two',
  'previously',
  'reported',
  'cases',
  'have',
  'been',
  'associated',
  'with',
  'famotidine',
  '.'],
 ['The',
  'authors',
  'report',
  'on',
  'six',
  'cases',
  'of',
  'famotidine',
  '-',
  'associated',
  'delirium',
  'in',
  'hospitalized',
  'patients',
  'who',
  'cleared',
  'completely',
  'upon',
  'removal',
  'of',
  'famotidine',
  '.'],


In [45]:
test_set

[[('Famotidine', 'B-Chemical'),
  ('-', 'O'),
  ('associated', 'O'),
  ('delirium', 'B-Disease'),
  ('.', 'O')],
 [('A', 'O'),
  ('series', 'O'),
  ('of', 'O'),
  ('six', 'O'),
  ('cases', 'O'),
  ('.', 'O')],
 [('Famotidine', 'B-Chemical'),
  ('is', 'O'),
  ('a', 'O'),
  ('histamine', 'O'),
  ('H2', 'O'),
  ('-', 'O'),
  ('receptor', 'O'),
  ('antagonist', 'O'),
  ('used', 'O'),
  ('in', 'O'),
  ('inpatient', 'O'),
  ('settings', 'O'),
  ('for', 'O'),
  ('prevention', 'O'),
  ('of', 'O'),
  ('stress', 'O'),
  ('ulcers', 'B-Disease'),
  ('and', 'O'),
  ('is', 'O'),
  ('showing', 'O'),
  ('increasing', 'O'),
  ('popularity', 'O'),
  ('because', 'O'),
  ('of', 'O'),
  ('its', 'O'),
  ('low', 'O'),
  ('cost', 'O'),
  ('.', 'O')],
 [('Although', 'O'),
  ('all', 'O'),
  ('of', 'O'),
  ('the', 'O'),
  ('currently', 'O'),
  ('available', 'O'),
  ('H2', 'O'),
  ('-', 'O'),
  ('receptor', 'O'),
  ('antagonists', 'O'),
  ('have', 'O'),
  ('shown', 'O'),
  ('the', 'O'),
  ('propensity', 'O'),
  (

In [46]:
# show examples of mislabelled in test set
predicted_tags = [tagger.tag([word for word, _ in sent]) for sent in test_set]

mislabelled = []

for sent, predicted in zip(test_set, predicted_tags):
    actual_tag = [tag for _, tag in sent]
    predicted_tag = [tag for _, tag in predicted]
    if actual_tag != predicted_tag:
        mislabelled.append((" ".join([word for word, _ in sent]), actual_tag, predicted_tag))

for sent, actual, predicted in mislabelled[:5]:
    print("token:", sent)
    print("actual tags:", actual)
    print("predicted tags:", predicted)
    print("\n")

token: Famotidine is a histamine H2 - receptor antagonist used in inpatient settings for prevention of stress ulcers and is showing increasing popularity because of its low cost .
actual tags: ['B-Chemical', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Disease', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
predicted tags: ['B-Chemical', 'O', 'O', 'B-Chemical', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Disease', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


token: After a single oral dose of 4 mg / kg indomethacin ( IDM ) to sodium and volume depleted rats plasma renin activity ( PRA ) and systolic blood pressure fell significantly within four hours .
actual tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Chemical', 'O', 'B-Chemical', 'O', 'O', 'B-Chemical', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
predicted tags: ['O', 'O', 'O', 'O', 'O', 'O'

# 2.3 Computing similarity between words or phrases.

## Term-Document Matrices using TF-IDF and Word Embdedding

In [47]:
glove_model = api.load("glove-twitter-25")

In [48]:
def get_mean_vector(word2vec_model, text):
    words = word_tokenize(text.lower())
    vectors = [word2vec_model[word] for word in words if word in word2vec_model]
    if len(vectors) == 0:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(vectors, axis=0)

In [49]:
texts = []
disease_names = []

for item in ner_dataset['test']:
    tokens = item['tokens']
    tags = item['tags']
    text = " ".join(tokens)
    disease_tokens = [t for t, tag in zip(tokens, tags) if tag in [2, 3]]
    if disease_tokens:
        texts.append(text)
        disease_names.append(" ".join(disease_tokens))

# GloVe vectors
glove_vectors = np.array([get_mean_vector(glove_model, text) for text in texts])

# TF-IDF vector
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectors = tfidf_vectorizer.fit_transform(texts)

In [50]:
# cosine similarity 
def cossim(vec1, vec2):
    dot_prod = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_prod / (norm_vec1 * norm_vec2) if norm_vec1 and norm_vec2 else 0

# GloVe
diabetes_texts = [text for text in texts if 'diabetes' in text.lower()]
diabetes_glove_vector = np.mean([get_mean_vector(glove_model, text) for text in diabetes_texts], axis=0)

glove_similarity = np.array([cossim(diabetes_glove_vector, vec) for vec in glove_vectors])

# TF-IDF
diabetes_index = next((i for i, text in enumerate(texts) if 'diabetes' in text.lower()), None)
diabetes_tfidf_vector = tfidf_vectors[diabetes_index].toarray()[0]

tfidf_similarity = np.array([cossim(diabetes_tfidf_vector, tfidf_vectors[i].toarray()[0]) for i in range(tfidf_vectors.shape[0])])





In [51]:
# sort indices
sorted_glove_indices = np.argsort(-glove_similarity)
sorted_tfidf_indices = np.argsort(-tfidf_similarity)

In [52]:
# find top 5 most similar and dissimilar 
most_similar_glove = sorted_glove_indices[:5]
most_dissimilar_glove = sorted_glove_indices[-5:]
most_similar_tfidf = sorted_tfidf_indices[:5]
most_dissimilar_tfidf = sorted_tfidf_indices[-7:]

In [53]:
print("five similar diseases to diabetes (word embeddings):")
for idx in most_similar_glove:
    print(f"diseases {idx}: {disease_names[idx]} - Similarity: {glove_similarity[idx]:.4f}")
    print(f"content: {texts[idx]}\n")



five similar diseases to diabetes (word embeddings):
diseases 1444: MI bradydysrhythmias ventricular tachycardia ventricular fibrillation seizures toxicity MI cardiovascular or central nervous system toxicity - Similarity: 0.9959
content: RESULTS : Of 29 patients who received lidocaine in the setting of cocaine - associated MI , no patient died ; exhibited bradydysrhythmias , ventricular tachycardia , or ventricular fibrillation ; or experienced seizures after administration of lidocaine ( 95 % confidence interval , 0 % to 11 %). CONCLUSION : Despite theoretical concerns that lidocaine may enhance cocaine toxicity , the use of lidocaine in patients with cocaine - associated MI was not associated with significant cardiovascular or central nervous system toxicity

diseases 2356: acute renal failure renal injury - Similarity: 0.9953
content: We report a 23 - year - old woman who developed acute renal failure following prolonged use of a proprietary Chinese herbal slimming pill that contai

In [54]:
print("\nfive dissimilar diseases to diabetes (word embeddings):")
for idx in most_dissimilar_glove:
    print(f"diseases: {idx}: {disease_names[idx]} - Similarity: {glove_similarity[idx]:.4f}")
    print(f"content: {texts[idx]}\n")



five dissimilar diseases to diabetes (word embeddings):
diseases: 795: Growth retardation - Similarity: 0.5827
content: Growth retardation

diseases: 1951: interstitial cystitis - Similarity: 0.5457
content: This pathology resembles interstitial cystitis

diseases: 1382: nephrotoxicity - Similarity: 0.5354
content: Severe nephrotoxicity

diseases: 2099: necrosis - Similarity: 0.3697
content: Although cortical laminar necrosis

diseases: 1683: hypokalemia - Similarity: 0.1577
content: Symptomatic hypokalemia



In [55]:

print("\nfive similar diseases to diabetes (term-document matrices using TF-IDF):")
for idx in most_similar_tfidf:
    print(f"diseases {idx}: {disease_names[idx]} - Similarity: {tfidf_similarity[idx]:.4f}")
    print(f"content: {texts[idx]}\n")




five similar diseases to diabetes (term-document matrices using TF-IDF):
diseases 535: nephrogenic diabetes insipidus - Similarity: 1.0000
content: Absence of PKC - alpha attenuates lithium - induced nephrogenic diabetes insipidus .

diseases 1684: Hyperosmolar nonketotic coma nephrogenic diabetes insipidus - Similarity: 0.4382
content: Hyperosmolar nonketotic coma precipitated by lithium - induced nephrogenic diabetes insipidus .

diseases 536: nephrogenic diabetes insipidus NDI - Similarity: 0.4236
content: Lithium , an effective antipsychotic , induces nephrogenic diabetes insipidus ( NDI ) in 40 % of patients .

diseases 1688: type 2 diabetes polyuria nephrogenic diabetes insipidus dehydration - Similarity: 0.3055
content: We hypothesize that when this man developed type 2 diabetes , chronic polyuria due to nephrogenic diabetes insipidus was sufficient to precipitate hyperosmolar dehydration

diseases 1687: hyperglycaemia polyuric nephrogenic diabetes insipidus - Similarity: 0.292

In [56]:
print("\nfive dissimilar diseases to diabetes (term-document matrices using TF-ID):")
for idx in most_dissimilar_tfidf:
    print(f"diseases {idx}: {disease_names[idx]} - Similarity: {tfidf_similarity[idx]:.4f}")
    print(f"content: {texts[idx]}\n")


five dissimilar diseases to diabetes (term-document matrices using TF-ID):
diseases 1374: Accelerated junctional rhythms - Similarity: 0.0000
content: Accelerated junctional rhythms during oral verapamil therapy .

diseases 1370: hypotension - Similarity: 0.0000
content: Cardiac work was significantly reduced during SNP hypotension .

diseases 1368: hypotension - Similarity: 0.0000
content: Hemodynamics and myocardial metabolism under deliberate hypotension .

diseases 1366: apnoea - Similarity: 0.0000
content: It is concluded that anticholinesterases are only partially effective in restoring neuromuscular function in succinylcholine apnoea

diseases 1364: apnoea - Similarity: 0.0000
content: Succinylcholine apnoea : attempted reversal with anticholinesterases .

diseases 1390: Hepatitis renal tubular acidosis - Similarity: 0.0000
content: Hepatitis and renal tubular acidosis after anesthesia with methoxyflurane .

diseases 1422: prolonged jaw rigidity myalgia - Similarity: 0.0000
con