In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:
import spacy
import json
import scispacy
from spacy import displacy
from spacy.tokens import Doc, DocBin
from spacy.util import filter_spans
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.2.0/en_core_web_lg-3.2.0-py3-none-any.whl (777.4 MB)
     ------------------------------------ 777.4/777.4 MB 106.6 kB/s eta 0:00:00
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')


In [4]:
nlp = spacy.load("en_core_web_lg")
nlp

<spacy.lang.en.English at 0x118135e77f0>

In [5]:
doc = nlp('Elephants are found in Africa and India')

In [6]:
doc.ents

(Africa, India)

In [7]:
displacy.render(doc, style="ent", jupyter=True)

In [8]:
# import json
with open('Corona2.json', 'r') as file:
    data = json.load(file)

In [9]:
data['examples'][1]

{'id': '487c93e3-0d45-4088-a378-cf3a01c8953d',
 'content': 'Diarrhea, also spelled diarrhoea, is the condition of having at least three loose, liquid, or watery bowel movements each day.[2] It often lasts for a few days and can result in dehydration due to fluid loss.[2] Signs of dehydration often begin with loss of the normal stretchiness of the skin and irritable behaviour.[2] This can progress to decreased urination, loss of skin color, a fast heart rate, and a decrease in responsiveness as it becomes more severe.[2] Loose but non-watery stools in babies who are exclusively breastfed, however, are normal.[2]',
 'metadata': {},
 'annotations': [{'id': '28601a42-c8a9-44e2-aeea-8939cb1db1a9',
   'tag_id': '03eb3e50-d4d8-4261-a60b-fa5aee5deb4a',
   'end': 382,
   'start': 364,
   'example_id': '487c93e3-0d45-4088-a378-cf3a01c8953d',
   'tag_name': 'MedicalCondition',
   'value': 'loss of skin color',
   'correct': None,
   'human_annotations': [{'timestamp': '2020-03-21T00:23:08.961000Z

In [10]:
data['examples'][1].keys()

dict_keys(['id', 'content', 'metadata', 'annotations', 'classifications'])

In [11]:
data['examples'][1]['content']

'Diarrhea, also spelled diarrhoea, is the condition of having at least three loose, liquid, or watery bowel movements each day.[2] It often lasts for a few days and can result in dehydration due to fluid loss.[2] Signs of dehydration often begin with loss of the normal stretchiness of the skin and irritable behaviour.[2] This can progress to decreased urination, loss of skin color, a fast heart rate, and a decrease in responsiveness as it becomes more severe.[2] Loose but non-watery stools in babies who are exclusively breastfed, however, are normal.[2]'

In [12]:
data['examples'][1]['annotations'][0]

{'id': '28601a42-c8a9-44e2-aeea-8939cb1db1a9',
 'tag_id': '03eb3e50-d4d8-4261-a60b-fa5aee5deb4a',
 'end': 382,
 'start': 364,
 'example_id': '487c93e3-0d45-4088-a378-cf3a01c8953d',
 'tag_name': 'MedicalCondition',
 'value': 'loss of skin color',
 'correct': None,
 'human_annotations': [{'timestamp': '2020-03-21T00:23:08.961000Z',
   'annotator_id': 1,
   'tagged_token_id': '28601a42-c8a9-44e2-aeea-8939cb1db1a9',
   'name': 'Ashpat123',
   'reason': 'exploration'}],
 'model_annotations': []}

## We need the content and annotations.

In [13]:
train_data = []
for example in data['examples']:
    ent_dict = {}
    ent_dict['text'] = example['content']
    ent_dict['entities'] = []
    for annotation in example['annotations']:
        start = annotation['start']
        end = annotation['end']
        label = annotation['tag_name'].upper()
        ent_dict['entities'].append((start, end, label))
    train_data.append(ent_dict)

print(train_data[1])

{'text': 'Diarrhea, also spelled diarrhoea, is the condition of having at least three loose, liquid, or watery bowel movements each day.[2] It often lasts for a few days and can result in dehydration due to fluid loss.[2] Signs of dehydration often begin with loss of the normal stretchiness of the skin and irritable behaviour.[2] This can progress to decreased urination, loss of skin color, a fast heart rate, and a decrease in responsiveness as it becomes more severe.[2] Loose but non-watery stools in babies who are exclusively breastfed, however, are normal.[2]', 'entities': [(364, 382, 'MEDICALCONDITION'), (0, 8, 'MEDICALCONDITION'), (94, 116, 'MEDICALCONDITION'), (178, 189, 'MEDICALCONDITION'), (221, 232, 'MEDICALCONDITION'), (23, 32, 'MEDICALCONDITION'), (409, 435, 'MEDICALCONDITION'), (386, 401, 'MEDICALCONDITION')]}


In [14]:
train_data[1]['text']

'Diarrhea, also spelled diarrhoea, is the condition of having at least three loose, liquid, or watery bowel movements each day.[2] It often lasts for a few days and can result in dehydration due to fluid loss.[2] Signs of dehydration often begin with loss of the normal stretchiness of the skin and irritable behaviour.[2] This can progress to decreased urination, loss of skin color, a fast heart rate, and a decrease in responsiveness as it becomes more severe.[2] Loose but non-watery stools in babies who are exclusively breastfed, however, are normal.[2]'

In [15]:
train_data[1]['entities']

[(364, 382, 'MEDICALCONDITION'),
 (0, 8, 'MEDICALCONDITION'),
 (94, 116, 'MEDICALCONDITION'),
 (178, 189, 'MEDICALCONDITION'),
 (221, 232, 'MEDICALCONDITION'),
 (23, 32, 'MEDICALCONDITION'),
 (409, 435, 'MEDICALCONDITION'),
 (386, 401, 'MEDICALCONDITION')]

In [16]:
train_data[1]['text'][364:382]

'loss of skin color'

In [17]:
train_data[5]

{'text': "Hantaviruses, usually found in rodents and shrews, were discovered in two species of bats. The MouyassuÃ© virus (MOUV) was isolated from banana pipistrelle bats captured near MouyassuÃ© village in Cote d'Ivoire, West Africa. The Magboi virus was isolated from hairy slit-faced bats found near the Magboi River in Sierra Leone in 2011. They are single-stranded, negative sense, RNA viruses in the Bunyaviridae family.[29][30][31][32]",
 'entities': [(0, 12, 'PATHOGEN'),
  (394, 406, 'PATHOGEN'),
  (227, 239, 'PATHOGEN'),
  (95, 110, 'PATHOGEN')]}

In [18]:
train_data[5]['text'][0:12]

'Hantaviruses'

## We need to initialize a blank spacy model to build our custom NER model.

In [19]:
nlp_custom = spacy.blank('en')
doc_bin = DocBin()

In [20]:
for train_doc in tqdm(train_data):
    text = train_doc['text']
    labels = train_doc['entities']
    doc = nlp_custom.make_doc(text)
    ents = []
    for start, end, label in labels:
        span = doc.char_span(start, end, label=label, alignment_mode='contract')
        if span is None:
            print('No Span found')
        else:
            ents.append(span)
    final_ents = filter_spans(ents)
    doc.ents = final_ents
    doc_bin.add(doc)
    
doc_bin.to_disk('custom_ner.spacy')

100%|██████████| 31/31 [00:00<00:00, 119.69it/s]

No Span found
No Span found
No Span found
No Span found
No Span found
No Span found
No Span found
No Span found
No Span found
No Span found
No Span found
No Span found
No Span found
No Span found
No Span found
No Span found
No Span found
No Span found
No Span found
No Span found
No Span found
No Span found
No Span found
No Span found
No Span found
No Span found
No Span found





In [21]:
!python -m spacy init fill-config base_config.cfg config.cfg

[+] Auto-filled config with all values
[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
# !python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./train.spacy

In [None]:
# print(doc.ents)

In [None]:
# custom_med_NER = spacy.load('output/model-best')

In [None]:

# docx = custom_med_NER('Diarrhea, also spelled diarrhoea, is the condition of having at least three loose, liquid, or watery bowel movements each day.[2] It often lasts for a few days and can result in dehydration due to fluid loss.[2] Signs of dehydration often begin with loss of the normal stretchiness of the skin and irritable behaviour.[2] This can progress to decreased urination, loss of skin color, a fast heart rate, and a decrease in responsiveness as it becomes more severe.[2] Loose but non-watery stools in babies who are exclusively breastfed, however, are normal')

# colors= {'PATHOGEN': '#D49137', 'MEDICINE': '#BE398D', 'MEDICALCONDITION': '#F07857'}
# options = {'colors': colors}

# spacy.displacy.render(docx, style='ent', options=options, jupyter=True)

In [None]:
# print(doc.ents.label_, doc.ents.text)