# Goal: Explore other great models, components, and pipelines from other sources, such as 'scispacy', which comes from the Allen Institute for Artificial Intelligence (AllenAI).

In [1]:
import spacy

from spacy import displacy

from scispacy.abbreviation import AbbreviationDetector

# The scispacy models and their bundled components offer many features that are useful for biomedical text, and potentially even clinical text, since they provide an improved tokenizer for medical tokens and improved sentence boundary detection, both trained on large amounts of biomedical literature.
## Today we will look at the AbbreviationDetector

In [2]:
# Sample biomedical passage containing two abbreviations, SBMA and AR, each
# introduced in parentheses right after its long form.
# The fragments are joined by a 12-space gap (one separating space plus 11
# columns of padding), so the resulting string contains runs of spaces.
# NOTE(review): the padding looks incidental — confirm before collapsing it,
# since that would change the exact string fed to the pipelines.
text = (" " * 12).join(
    [
        "Spinal and bulbar muscular atrophy (SBMA) is an",
        "inherited motor neuron disease caused by the expansion",
        "of a polyglutamine tract within the androgen receptor (AR).",
        "SBMA can be caused by this easily.",
    ]
)

In [3]:
# Load the "en_core_sci_sm" model with its default components; this pipeline is
# left unmodified so it can serve as a baseline for comparison.
default_nlp = spacy.load("en_core_sci_sm")

In [4]:
# List what the baseline pipeline is made of; each entry prints as a
# (name, component) tuple.
for name_and_component in default_nlp.pipeline:
    print(name_and_component)

('tagger', <spacy.pipeline.pipes.Tagger object at 0x7fd59bc60c50>)
('parser', <spacy.pipeline.pipes.DependencyParser object at 0x7fd59bb36048>)
('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x7fd59bb360a8>)


# Scispacy, like other components and models that extend beyond the base spacy functionality, uses a special variable inside Documents, Tokens, and Spans called the extension variable.  This is a class member that components can use to register the names and values of any new knowledge or linguistic information they add.  The name of this member variable is the literal underscore character (_)

In [5]:
def count_document_extensions(spacy_doc):
    """Print and count the extension names found on a document.

    Looks up the `_extensions` mapping on the document's `_` attribute, prints
    its keys, and returns how many there are.

    Args:
        spacy_doc: Object whose `_` attribute exposes an `_extensions` mapping
            (in this notebook, a document produced by a spaCy pipeline).

    Returns:
        int: Number of keys in the `_extensions` mapping.
    """
    # NOTE(review): `_extensions` is underscore-prefixed, so it is presumably an
    # internal detail of the library rather than documented API — confirm.
    extension_registry = spacy_doc._._extensions
    print(extension_registry.keys())
    return len(extension_registry)

In [6]:
# Process the sample text with the baseline pipeline.
doc = default_nlp(text)

# Count (and print) the extension attribute names exposed on the document.
default_pipeline_extension_count = count_document_extensions(doc)

# The value is the number of extension attribute names on the document, not a
# token count, so the label says exactly that.
print('Total document extension attributes : {}'.format(default_pipeline_extension_count))

dict_keys([])
Total tokens with extension values : 0


# Add the Abbreviation Detector to our pipeline and we can get extended information from its processing

In [7]:
# Load a second, independent copy of the model so the baseline pipeline stays untouched.
abbreviation_nlp = spacy.load("en_core_sci_sm")

# Add the abbreviation pipe to the spacy pipeline.
# spaCy 2.x accepts a component instance in add_pipe(); spaCy 3.x only accepts
# the registered factory name and returns the component it created.
if int(spacy.__version__.split(".")[0]) < 3:
    abbreviation_pipe = AbbreviationDetector(abbreviation_nlp)
    abbreviation_nlp.add_pipe(abbreviation_pipe)
else:
    # Importing scispacy.abbreviation registers the "abbreviation_detector" factory.
    abbreviation_pipe = abbreviation_nlp.add_pipe("abbreviation_detector")

In [8]:
# Inspect the extended pipeline: every entry prints as a (name, component)
# tuple, and the abbreviation detector should now be listed last.
for name_and_component in abbreviation_nlp.pipeline:
    print(name_and_component)

('tagger', <spacy.pipeline.pipes.Tagger object at 0x7fd59afbec18>)
('parser', <spacy.pipeline.pipes.DependencyParser object at 0x7fd59ae9e288>)
('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x7fd59ae9e2e8>)
('AbbreviationDetector', <scispacy.abbreviation.AbbreviationDetector object at 0x7fd59af4d0f0>)


In [9]:
# Process the same sample text, this time with the pipeline that includes the
# abbreviation detector.
doc2 = abbreviation_nlp(text)

In [10]:
# Count (and print) the extension attribute names exposed on the document that
# went through the abbreviation-aware pipeline.
abbreviation_pipeline_extension_count = count_document_extensions(doc2)

# The value is the number of extension attribute names on the document, not a
# token count, so the label says exactly that.
print('Total document extension attributes : {}'.format(abbreviation_pipeline_extension_count))

dict_keys(['abbreviations'])
Total tokens with extension values : 1


# Let's look at the abbreviations

In [11]:
# Table header; note that each row also carries the (start, end) positions
# of the abbreviation between the two labelled columns.
print("Abbreviation", "\t", "Definition")
for short_form in doc2._.abbreviations:
    # Build the "(start, end)" part separately to keep the row format readable.
    span_bounds = f"({short_form.start}, {short_form.end})"
    print(f"{short_form} \t {span_bounds} {short_form._.long_form}")

Abbreviation 	 Definition
SBMA 	 (6, 7) Spinal and bulbar muscular atrophy
SBMA 	 (33, 34) Spinal and bulbar muscular atrophy
AR 	 (29, 30) androgen receptor


In [12]:
# Show the extension names available on the processed document once more.
print(doc2._._extensions.keys())

dict_keys(['abbreviations'])
