In [5]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.4.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [6]:
import spacy
from spacy.lang.en import English
nlp = English()

text = """Phillip Williams, who lives in georgia, said he that was 12 years old and balding."""

my_doc = nlp(text)

token_list = []
for token in my_doc:
    token_list.append(token.text)
print(token_list)

['Phillip', 'Williams', ',', 'who', 'lives', 'in', 'georgia', ',', 'said', 'he', 'that', 'was', '12', 'years', 'old', 'and', 'balding', '.']


In [7]:
# Cleaning Text of "Stop Words"
spacy_stopwords =  spacy.lang.en.stop_words.STOP_WORDS

print('Number of stop words: %d' % len(spacy_stopwords))

print('First ten stop words: %s' % list(spacy_stopwords)[:500])

Number of stop words: 326
First ten stop words: ['its', 'alone', 'over', '‘d', 'twenty', 'or', 'seeming', 'rather', 'whose', 'has', 'cannot', 'although', 'he', "'d", 'meanwhile', 'enough', 'as', 'everyone', 'last', 'seems', 'among', 'well', 'nor', 'own', 'wherein', 'is', 'of', 'no', 'however', 'being', 'fifty', 'should', 'make', 'name', 'himself', 'almost', 'who', 'seemed', 're', 'yourselves', 'his', 'seem', 'quite', 'might', 'therein', 'into', 'about', 'either', 'throughout', 'any', 'we', 'when', 'made', 'not', 'nothing', 'am', 'nevertheless', 'mine', 'by', 'used', 'except', 'towards', 'perhaps', 'whom', 'now', 'your', 'none', 'whence', 'third', 'be', 'just', 'keep', 'everywhere', '‘ll', 'nine', 'does', 'mostly', 'afterwards', 'i', 'hereby', 'only', 'indeed', 'itself', 'others', 'every', 'thereby', 'unless', 'nowhere', 'in', 'during', 'more', 'thereafter', 'please', 'somehow', 'until', 'also', 'after', 'full', "'m", 'move', 'may', 'put', 'themselves', 'each', 'up', 'her', 'could', 'mo

In [8]:
# Filtered Text of "Stop Words"
from spacy.lang.en.stop_words import STOP_WORDS

filtered_sent=[]

doc = nlp(text)

for word in doc:
    if word.is_stop==False:
        filtered_sent.append(word)
print("Filtered Sentence:",filtered_sent)

Filtered Sentence: [Phillip, Williams, ,, lives, georgia, ,, said, 12, years, old, balding, .]


In [9]:
# Lemmatization—this addresses how several words may mean the same thing.
nlp = spacy.load("en_core_web_sm")
doc = nlp("think thinks thinking thinker thought")

for token in doc:
    print(token.lemma_)

think
think
think
thinker
think


In [10]:
# Part of Speech Tagging
import en_core_web_sm

nlp = en_core_web_sm.load()

docs = nlp(u"Phillip Williams, who lives in georgia, said he that was 12 years old and balding.")

for word in docs:
    print(word.text,word.pos_)


Phillip PROPN
Williams PROPN
, PUNCT
who PRON
lives VERB
in ADP
georgia PROPN
, PUNCT
said VERB
he PRON
that PRON
was AUX
12 NUM
years NOUN
old ADJ
and CCONJ
balding ADJ
. PUNCT


In [12]:
# Entity Detection
from spacy import displacy
nytimes= nlp(u"""phillip williams, who lives in georgia, said that he was 12 years old and balding.""")

entities=[(i, i.label_, i.label) for i in nytimes.ents]
entities

[(phillip williams, 'PERSON', 380),
 (georgia, 'GPE', 384),
 (12 years old, 'DATE', 391)]

In [13]:
# Entity Visualization
from spacy import displacy
nytimes= nlp(u"""Phillip Williams, who lives in georgia, said he that was 12 years old and balding.""")

entities=[(i, i.label_, i.label) for i in nytimes.ents]
entities
displacy.render(nytimes, style = "ent",jupyter = True)