# Goal: Explore other great models, components and pipelines from other sources like 'scispacy' which comes from the Allen Institute for Artificial Intelligence (AllenAI) 

In [1]:
import spacy
from scispacy.abbreviation import AbbreviationDetector

In [2]:
text = "Spinal and bulbar muscular atrophy (SBMA) is an \
           inherited motor neuron disease caused by the expansion \
           of a polyglutamine tract within the androgen receptor (AR). \
           SBMA can be caused by this easily."

In [3]:
default_nlp = spacy.load("en_core_sci_sm")

In [4]:
for pipe in default_nlp.pipeline:
    print(pipe)

('tagger', <spacy.pipeline.pipes.Tagger object at 0x7fafc6dc8f60>)
('parser', <spacy.pipeline.pipes.DependencyParser object at 0x7fafc5e66228>)
('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x7fafc5e66288>)


In [5]:
def count_document_extensions(spacy_doc):
    document_extensions = spacy_doc._._extensions.keys()
    print(document_extensions)
    return len(document_extensions)

In [13]:
doc = default_nlp(text)

default_pipeline_extension_count = count_document_extensions(doc)
    
print('Total tokens with extension values : {}'.format(default_pipeline_extension_count))

dict_keys(['abbreviations'])
Total tokens with extension values : 1


In [7]:
abbreviation_nlp = spacy.load("en_core_sci_sm")

# Add the abbreviation pipe to the spacy pipeline.
abbreviation_pipe = AbbreviationDetector(abbreviation_nlp)
abbreviation_nlp.add_pipe(abbreviation_pipe)

In [8]:
# let's look at the pipeline components
for pipe in abbreviation_nlp.pipeline:
    print(pipe)

('tagger', <spacy.pipeline.pipes.Tagger object at 0x7fafc527c5c0>)
('parser', <spacy.pipeline.pipes.DependencyParser object at 0x7fafc51d09a8>)
('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x7fafc51d0a08>)
('AbbreviationDetector', <scispacy.abbreviation.AbbreviationDetector object at 0x7fafc527c550>)


In [9]:
doc2 = abbreviation_nlp(text)

In [10]:
abbreviation_pipeline_extension_count = count_document_extensions(doc2)
    
print('Total tokens with extension values : {}'.format(abbreviation_pipeline_extension_count))

dict_keys(['abbreviations'])
Total tokens with extension values : 1


# Let's look at the abbreviations

In [11]:
print("Abbreviation", "\t", "Definition")
for abrv in doc2._.abbreviations:
    print(f"{abrv} \t ({abrv.start}, {abrv.end}) {abrv._.long_form}")

Abbreviation 	 Definition
SBMA 	 (6, 7) Spinal and bulbar muscular atrophy
SBMA 	 (33, 34) Spinal and bulbar muscular atrophy
AR 	 (29, 30) androgen receptor


In [12]:
print(doc2._._extensions.keys())

dict_keys(['abbreviations'])
