<a href="https://colab.research.google.com/github/RakeshSharma21/Sessions_Notebook/blob/main/Spacy_Custom_model_for_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### SPACY 101

In [None]:
# installation
!pip install spacy

In [4]:
# version of spacy library
import spacy
print(spacy.__version__)

3.7.3


#### Linguistic annotations

In [6]:
nlp=spacy.load("en_core_web_sm")

In [7]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

In [9]:
for token in doc:
  print(token.text,token.pos_,token.dep_)

Apple PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.K. PROPN dobj
startup NOUN dep
for ADP prep
$ SYM quantmod
1 NUM compound
billion NUM pobj


In [None]:
#### tokenisation can done through token.text

In [10]:
### More linguistic annotations
for token in doc:
  print(token.text,token.lemma_,token.pos_,token.shape_,token.is_alpha,token.is_stop)

Apple Apple PROPN Xxxxx True False
is be AUX xx True True
looking look VERB xxxx True False
at at ADP xx True True
buying buy VERB xxxx True False
U.K. U.K. PROPN X.X. False False
startup startup NOUN xxxx True False
for for ADP xxx True True
$ $ SYM $ False False
1 1 NUM d False False
billion billion NUM xxxx True False


In [12]:
### visualisation:
from spacy import displacy
displacy.render(doc,style="dep",jupyter=True)

#### Named Entities

In [14]:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

doc.ents
for ent in doc.ents:
  print(ent.text,ent.start_char,ent.end_char,ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [15]:
type(doc)

spacy.tokens.doc.Doc

In [17]:
type(doc.ents[0])

spacy.tokens.span.Span

In [None]:
### (DOC)==> entities==(Span)

In [18]:
#### inspecting spacy pipeline
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7dbfa4500820>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7dbfa4500ac0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7dbfa4dede00>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7dbfa4626100>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7dbfa44f1780>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7dbfa4dedd90>)]

In [19]:
#### visualise the ner
displacy.render(doc,style="ent",jupyter=True)

#### Train Custom NER model

In [None]:
### https://www.kaggle.com/datasets/finalepoch/medical-ner

In [20]:
### unzip the training data
!unzip archive\ \(2\).zip

Archive:  archive (2).zip
  inflating: Corona2.json            


In [21]:
import json
with open('Corona2.json','r') as f:
  data=json.load(f)


In [30]:
data['examples'][0]['annotations'][0]

{'id': '0825a1bf-6a6e-4fa2-be77-8d104701eaed',
 'tag_id': 'c06bd022-6ded-44a5-8d90-f17685bb85a1',
 'end': 371,
 'start': 360,
 'example_id': '18c2f619-f102-452f-ab81-d26f7e283ffe',
 'tag_name': 'Medicine',
 'value': 'Diosmectite',
 'correct': None,
 'human_annotations': [{'timestamp': '2020-03-21T00:24:32.098000Z',
   'annotator_id': 1,
   'tagged_token_id': '0825a1bf-6a6e-4fa2-be77-8d104701eaed',
   'name': 'Ashpat123',
   'reason': 'exploration'}],
 'model_annotations': []}

In [None]:
#{'text':value,'entities':[{'start':start,'end':end,'tag_name':tag}]}

In [32]:
training_data=[]
for example in data['examples']:
  temp_dict={}
  temp_dict['text']=example['content']
  temp_dict['entities']=[]
  for annotation in example['annotations']:
    start=annotation['start']
    end = annotation['end']
    label= annotation['tag_name'].upper()
    temp_dict['entities'].append((start,end,label))
  training_data.append(temp_dict)


In [35]:
training_data[0]

{'text': "While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]",
 'entities': [(360, 371, 'MEDICINE'),
  (383, 408, 'MEDICINE'),
  (104, 112, 'MEDICALCONDITION'),


In [36]:
#### custom model training data prep
from spacy.util import filter_spans
from spacy.tokens import DocBin
nlp =spacy.blank("en")
doc_bin = DocBin()
for training_example in training_data:
  text = training_example['text']
  labels=training_example['entities']
  doc=nlp.make_doc(text)
  ents=[]
  for start,end,label in labels:
    span=doc.char_span(start,end,label=label)
    if span is None:
      print('skipping entity')
    else:
      ents.append(span)
    fitered_ents=filter_spans(ents)
    doc.ents=fitered_ents
    doc_bin.add(doc)
doc_bin.to_disk("train.spacy")

skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity


In [37]:
### https://spacy.io/usage/training#quickstart
#### settting the config file from base_config file
!python -m spacy init fill-config base_config.cfg config.cfg


[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


##### training the model for the given custom dataset

In [None]:
### downloading large model for spacy
!python -m spacy download en_core_web_lg

In [47]:
!python -m spacy train config.cfg --output ./ --paths.train ./train.spacy --paths.dev ./train.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     76.71    1.68    0.89   13.94    0.02
  0     200        347.51   4322.89   38.94   48.50   32.52    0.39
[38;5;2m✔ Saved pipeline to output directory[0m
model-last


In [48]:
### looking few prediction
nlp_ner = spacy.load("model-best")
doc = nlp_ner("While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]")

In [49]:
colors={"PATHOGEN":'blue','MEDICINE':'yellow',"MEDICALCONDITION":'red'}
options={"colors":colors}

spacy.displacy.render(doc,style='ent',options=options,jupyter=True)