In [None]:
### Reference: https://spacy.io/

In [None]:
!pip install spacy
!python -m spacy download en_core_web_lg

In [None]:
!pip install spacy-langdetect

In [2]:
import spacy
print(spacy.__version__)

3.3.0


In [3]:
nlp = spacy.load("en_core_web_lg")

### Tokenization

In [4]:
doc = nlp("I am flying to Manila")
print([w.text for w in doc])

['I', 'am', 'flying', 'to', 'Manila']


### Lemmatization

In [5]:
doc = nlp("this product integrates both libraries for downloading and applying patches")
for token in doc:
    print(token.text, token.lemma_)

this this
product product
integrates integrate
both both
libraries library
for for
downloading download
and and
applying apply
patches patch


### POS tagging

In [7]:
doc=nlp('I have flown to Cebu. Now I am flying to Manila.')
for token in doc:
    print(token.text, token.pos_, token.tag_)

I PRON PRP
have AUX VBP
flown VERB VBN
to ADP IN
Cebu PROPN NNP
. PUNCT .
Now ADV RB
I PRON PRP
am AUX VBP
flying VERB VBG
to ADP IN
Manila PROPN NNP
. PUNCT .


In [8]:
spacy.explain("NNP")

'noun, proper singular'

### Segmentation

In [11]:
doc=nlp('I have flown to Cebu. Now I am flying to Manila.')
for sent in doc.sents:
    print([sent[i] for i in range(len(sent))])

[I, have, flown, to, Cebu, .]
[Now, I, am, flying, to, Manila, .]


### Retokenization

In [18]:
doc=nlp('The Golden State Bridge is an iconic landmark in San Francisco')
[doc[i] for i in range(len(doc))]

[The, Golden, State, Bridge, is, an, iconic, landmark, in, San, Francisco]

In [19]:
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[1:4])
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[7:9])

In [20]:
[doc[i] for i in range(len(doc))]

[The, Golden State Bridge, is, an, iconic, landmark, in, San Francisco]

### Syntactic Parsing

In [22]:
doc = nlp('I want a green apple,')
for token in doc:
    print(token.text, token.pos_, token.dep_, spacy.explain(token.dep_))

I PRON nsubj nominal subject
want VERB ROOT None
a DET det determiner
green ADJ amod adjectival modifier
apple NOUN dobj direct object
, PUNCT punct punctuation


In [23]:
from spacy import displacy
displacy.serve(doc, style='dep')




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [24]:
from IPython.core.display import display, HTML

doc = nlp('I want to fly to Manila.')

from spacy import displacy
html = displacy.render(doc, style='ent', page=True)

display(HTML(html))

  from IPython.core.display import display, HTML


<IPython.core.display.HTML object>

In [25]:
spacy.explain('GPE')

'Countries, cities, states'

### Similarity

In [26]:
doc = nlp('I want a green apple.')

In [27]:
doc[2:5]

a green apple

In [28]:
doc.similarity(doc[2:5])

0.8776482403927138

In [29]:
nlp('apple').similarity(nlp('banana'))

0.5831844567891399

In [30]:
nlp('lovelife').similarity(nlp('forever'))

0.11223901797214983

In [31]:
nlp('apple').vector

array([-3.6391e-01,  4.3771e-01, -2.0447e-01, -2.2889e-01, -1.4227e-01,
        2.7396e-01, -1.1435e-02, -1.8578e-01,  3.7361e-01,  7.5339e-01,
       -3.0591e-01,  2.3741e-02, -7.7876e-01, -1.3802e-01,  6.6992e-02,
       -6.4303e-02, -4.0024e-01,  1.5309e+00, -1.3897e-02, -1.5657e-01,
        2.5366e-01,  2.1610e-01, -3.2720e-01,  3.4974e-01, -6.4845e-02,
       -2.9501e-01, -6.3923e-01, -6.2017e-02,  2.4559e-01, -6.9334e-02,
       -3.9967e-01,  3.0925e-02,  4.9033e-01,  6.7524e-01,  1.9481e-01,
        5.1488e-01, -3.1149e-01, -7.9939e-02, -6.2096e-01, -5.3277e-03,
       -1.1264e-01,  8.3528e-02, -7.6947e-03, -1.0788e-01,  1.6628e-01,
        4.2273e-01, -1.9009e-01, -2.9035e-01,  4.5630e-02,  1.0120e-01,
       -4.0855e-01, -3.5000e-01, -3.6175e-01, -4.1396e-01,  5.9485e-01,
       -1.1524e+00,  3.2424e-02,  3.4364e-01, -1.9209e-01,  4.3255e-02,
        4.9227e-02, -5.4258e-01,  9.1275e-01,  2.9576e-01,  2.3658e-02,
       -6.8737e-01, -1.9503e-01, -1.1059e-01, -2.2567e-01,  2.41

### Language Detection: https://spacy.io/universe/project/spacy-langdetect

In [32]:
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector

def get_lang_detector(nlp, name):
    return LanguageDetector()

nlp = spacy.load("en_core_web_sm")
Language.factory("language_detector", func=get_lang_detector)
nlp.add_pipe('language_detector', last=True)

<spacy_langdetect.spacy_langdetect.LanguageDetector at 0x1cf802c1df0>

In [33]:
text = 'This is an english text.'
doc = nlp(text)

print(doc._.language)

{'language': 'en', 'score': 0.9999961945863722}


In [34]:
text = 'magandang gabi!'
doc = nlp(text)

print(doc._.language)

{'language': 'tl', 'score': 0.9999975120359708}
