In [2]:
# Imports and configuration
import spacy
import os

# Ask spaCy to prefer the GPU; the result of the call is kept in gpu_activated.
# This runs before spacy.load() so the device preference is set before the
# pipeline is loaded.
gpu_activated = spacy.prefer_gpu()
# Load the small English pipeline; the other cells call it as nlp(text).
nlp = spacy.load("en_core_web_sm")
# Show general information about the spaCy installation
# (version, location, platform, Python version, installed pipelines).
spacy.info()

{'spacy_version': '3.4.1',
 'location': 'C:\\prj\\elevate\\venv\\lib\\site-packages\\spacy',
 'platform': 'Windows-10-10.0.19044-SP0',
 'python_version': '3.10.5',
 'pipelines': {'en_core_web_sm': '3.4.0'}}

In [5]:
# One-time setup: download the "en_core_web_sm" language model that spacy.load() needs (uncomment the line below to run it)
# os.system('cmd /c "python -m spacy download en_core_web_sm"')

0

In [3]:
# Part-of-speech tagging: print every token with its fine-grained tag and a
# human-readable description of that tag.
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for word in doc:
    print(
        word.text,
        word.tag_,
        # spacy.explain() maps a tag code such as "NNP" to its description;
        # without it the third column of this cell's output is missing.
        spacy.explain(word.tag_),
    )

Apple NNP noun, proper singular
is VBZ verb, 3rd person singular present
looking VBG verb, gerund or present participle
at IN conjunction, subordinating or preposition
buying VBG verb, gerund or present participle
U.K. NNP noun, proper singular
startup NN noun, singular or mass
for IN conjunction, subordinating or preposition
$ $ symbol, currency
1 CD cardinal number
billion CD cardinal number


In [68]:
# Tokenization, lemmatization, POS, tags, dependency parse and more
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
    # Character span of the token in the original text: token.idx is the
    # start offset and adding the token's length gives the exclusive end
    # offset, so no one-token Span has to be sliced out of the doc.
    start_char = token.idx
    end_char = token.idx + len(token)
    # Columns: index, text, char span, lemma, coarse POS, fine tag, tag description.
    print(
        f"{token.i}\t{token.text}\t{start_char}-{end_char}\t{token.lemma_}\t{token.pos_}\t{token.tag_}\t{spacy.explain(token.tag_)}"
    )
    # Morphological features as "Feat=Val|Feat=Val" (empty when there are none).
    print(f"{token.morph!s}\t")
    # Index of the syntactic head token and the dependency label.
    print(f"{token.head.i}\t{token.dep_}\t")
    print("------------------------------")

0	Apple	0-5	Apple	PROPN	NNP	noun, proper singular
Number=Sing	
2	nsubj	
------------------------------
1	is	6-8	be	AUX	VBZ	verb, 3rd person singular present
Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	
2	aux	
------------------------------
2	looking	9-16	look	VERB	VBG	verb, gerund or present participle
Aspect=Prog|Tense=Pres|VerbForm=Part	
2	ROOT	
------------------------------
3	at	17-19	at	ADP	IN	conjunction, subordinating or preposition
	
2	prep	
------------------------------
4	buying	20-26	buy	VERB	VBG	verb, gerund or present participle
Aspect=Prog|Tense=Pres|VerbForm=Part	
3	pcomp	
------------------------------
5	U.K.	27-31	U.K.	PROPN	NNP	noun, proper singular
Number=Sing	
4	dobj	
------------------------------
6	startup	32-39	startup	NOUN	NN	noun, singular or mass
Number=Sing	
4	dobj	
------------------------------
7	for	40-43	for	ADP	IN	conjunction, subordinating or preposition
	
6	prep	
------------------------------
8	$	44-45	$	SYM	$	symbol, currency
	
10	quantmod	

In [64]:
# Named Entity Recognition
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Show each entity with its character span and label.
for ent in doc.ents:
    print(f"{ent.text}\t{ent.start_char}-{ent.end_char}\t{ent.label_}\t")

# Collect the entities into a dict keyed by their position in doc.ents.
# enumerate() supplies the running index, so no manual counter is needed.
data = {
    index: {
        "text": ent.text,
        "start_char": ent.start_char,
        "end_char": ent.end_char,
        "type": ent.label_,
    }
    for index, ent in enumerate(doc.ents)
}
print(data)

Apple	0-5	ORG	
U.K.	27-31	GPE	
$1 billion	44-54	MONEY	
{0: {'text': 'Apple', 'start_char': 0, 'end_char': 5, 'type': 'ORG'}, 1: {'text': 'U.K.', 'start_char': 27, 'end_char': 31, 'type': 'GPE'}, 2: {'text': '$1 billion', 'start_char': 44, 'end_char': 54, 'type': 'MONEY'}}


In [48]:
# Word vectors and similarity
# Note: this notebook loads a small pipeline (name ending in "sm",
# "en_core_web_sm"). Every token below is reported as out-of-vocabulary,
# which is consistent with small pipelines not shipping static word vectors.
# For meaningful vectors and similarity scores, load a larger pipeline such as
# "en_core_web_md" or "en_core_web_lg" instead.
# NOTE(review): the original note suggested "en_core_web_trf"; transformer
# pipelines do not appear to include static word vectors either - confirm
# against the spaCy documentation before relying on it here.
tokens = nlp("sun moon cat dog funny sad zzzz")

# Columns: token text, has_vector, vector_norm, is_oov.
for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

sun True 8.730866 True
moon True 8.087774 True
cat True 7.08495 True
dog True 7.1998067 True
funny True 7.778815 True
sad True 7.6599216 True
zzzz True 8.588857 True
