In [1]:
!pip install spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import spacy
nlp = spacy.load('en_core_web_sm') # en_core_web_md

In [3]:
text = """
This is one of the greatest films ever made. Brilliant acting by George C. Scott and Diane Riggs. 
This movie is both disturbing and extremely deep. Don't be fooled into believing this is just a comedy. 
It is a brilliant satire about the medical profession. It is not a pretty picture.
 Healthy patients are killed by incompetent surgeons, who spend all their time making money outside the hospital. 
 And yet, you really believe that this is a hospital. 
 The producers were very careful to include real medical terminology and real medical cases. 
 This movie really reveals how difficult in is to run a hospital, and how badly things already were in 1971. 
 I loved this movie. P.S. - I noticed that the incompetent, wheeler dealer surgeon played the head of the firm in 
 LA Law. The young doctor played in Lou Grant. 
 I also noticed that the registration nurse has appeared since in Becker and other shows.
"""

In [4]:
doc = nlp(text) # tagger, parser, NER
print(dir(doc))

['_', '__bytes__', '__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__ne__', '__new__', '__pyx_vtable__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__unicode__', '_bulk_merge', '_context', '_get_array_attrs', '_realloc', '_vector', '_vector_norm', 'cats', 'char_span', 'copy', 'count_by', 'doc', 'ents', 'extend_tensor', 'from_array', 'from_bytes', 'from_dict', 'from_disk', 'from_docs', 'from_json', 'get_extension', 'get_lca_matrix', 'has_annotation', 'has_extension', 'has_unknown_spaces', 'has_vector', 'is_nered', 'is_parsed', 'is_sentenced', 'is_tagged', 'lang', 'lang_', 'mem', 'noun_chunks', 'noun_chunks_iterator', 'remove_extension', 'retokenize', 'sentiment', 'sents', 'set_ents', 'set_extension', 'similarity', 'spans', 'tensor', 'text', 'te

In [5]:
from IPython.core.display import display, HTML
# SpaCy pipeline
spacy_url = 'https://spacy.io/pipeline-7a14d4edd18f3edfee8f34393bff2992.svg'
iframe = '<iframe src={} width=1000 height=200></iframe>'.format(spacy_url)
HTML(iframe)



# Tokenizor

In [6]:
# SpaCy pipeline
spacy_url = 'https://spacy.io/tokenization-57e618bd79d933c4ccd308b5739062d6.svg'
iframe = '<iframe src={} width=1500 height=200></iframe>'.format(spacy_url)
HTML(iframe)

In [7]:
text = "This is one of the greatest movie. I loved It"
doc = nlp(text)

# print column headers
print('{:15} | {:15} | {:8} | {:8} | {:11} | {:8} | {:8} | {:8} | '.format('TEXT','LEMMA_','POS_','TAG_','DEP_','SHAPE_','IS_ALPHA','IS_STOP'))

for token in doc:
  print('{:15} | {:15} | {:8} | {:8} | {:11} | {:8} | {:8} | {:8} |'.format(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop))

TEXT            | LEMMA_          | POS_     | TAG_     | DEP_        | SHAPE_   | IS_ALPHA | IS_STOP  | 
This            | this            | PRON     | DT       | nsubj       | Xxxx     |        1 |        1 |
is              | be              | AUX      | VBZ      | ROOT        | xx       |        1 |        1 |
one             | one             | NUM      | CD       | attr        | xxx      |        1 |        1 |
of              | of              | ADP      | IN       | prep        | xx       |        1 |        1 |
the             | the             | DET      | DT       | det         | xxx      |        1 |        1 |
greatest        | great           | ADJ      | JJS      | amod        | xxxx     |        1 |        0 |
movie           | movie           | NOUN     | NN       | pobj        | xxxx     |        1 |        0 |
.               | .               | PUNCT    | .        | punct       | .        |        0 |        0 |
I               | I               | PRON     | PRP    

In [8]:
spacy.explain('nsubj')

'nominal subject'

In [9]:
spacy.explain('AUX')

'auxiliary'

In [10]:
previous_token = doc[0]  # set first token

for token in doc[1:]:    
    # identify adjective noun pairs
    if previous_token.pos_ == 'ADJ' and token.pos_ == 'NOUN':
        print(f'{previous_token.text}_{token.text}')
    
    previous_token = token

greatest_movie


In [11]:
ner_text = "I love my country India, and I love Machine learning"

ner_doc = nlp(ner_text)
print('{:10} | {:15}'.format('LABEL','ENTITY'))

for ent in ner_doc.ents:
    print('{:10} | {:50}'.format(ent.label_, ent.text))

LABEL      | ENTITY         
GPE        | India                                             


In [12]:
from spacy import displacy

In [13]:
displacy.render(docs=ner_doc, style='ent', jupyter=True)

In [14]:
spacy.explain('GPE')

'Countries, cities, states'