In [3]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


### The doc object for Processed Text

In [4]:
import spacy

In [5]:
nlp = spacy.load("en_core_web_sm")
nlp

<spacy.lang.en.English at 0x718b53aa7aa0>

In [6]:
introduction_doc = nlp("This tutorial is about Natural Language Processing in English")
type(introduction_doc)

spacy.tokens.doc.Doc

In [7]:
[token.text for token in introduction_doc]

['This',
 'tutorial',
 'is',
 'about',
 'Natural',
 'Language',
 'Processing',
 'in',
 'English']

In [8]:
import pathlib
file_name = "example.txt"
introduction_doc = nlp(pathlib.Path(file_name).read_text(encoding="utf-8"))
print([token.text for token in introduction_doc])



### Sentence Detection

In [9]:
about_text = (
...    "Gus Proto is a python developer currently"
...    " working for a London-based Fintech"
...    " company. He is interested in learning"
...    " Natural Language Processing."
... )

In [10]:
about_doc = nlp(about_text)
sentences = list(about_doc.sents)
len(sentences)

2

In [11]:
for sentence in sentences:
    print(f"{sentence[:5]}...")

Gus Proto is a python...
He is interested in learning...


In [12]:
ellipsis_text = (
    "Gus, can you, ... never mind, I forgot"
    " what I was saying. So, do you think"
    " we should ..."
)

In [13]:
from spacy.language import Language
@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if token.text == "...":
            doc[token.i + 1].is_sent_start = True
    return doc

In [14]:
custom_nlp = spacy.load("en_core_web_sm")

In [15]:
custom_nlp.add_pipe("set_custom_boundaries", before="parser")

<function __main__.set_custom_boundaries(doc)>

In [16]:
custom_ellipsis_doc = custom_nlp(ellipsis_text)

In [17]:
custom_ellipsis_sentences = list(custom_ellipsis_doc.sents)

In [18]:
for sentence in custom_ellipsis_sentences:
    print(sentence)

Gus, can you, ...
never mind, I forgot what I was saying.
So, do you think we should ...


### Tokens in Spacy

In [19]:
import spacy
nlp = spacy.load("en_core_web_sm")
about_text = (
    "Gus Proto is a Python developer currently"
    " working for a London-based Fintech"
    " company. He is interested in learning"
    " Natural Language Processing."
)
about_doc = nlp(about_text)

for token in about_doc:
    print (token, token.idx)

Gus 0
Proto 4
is 10
a 13
Python 15
developer 22
currently 32
working 42
for 50
a 54
London 56
- 62
based 63
Fintech 69
company 77
. 84
He 86
is 89
interested 92
in 103
learning 106
Natural 115
Language 123
Processing 132
. 142


In [20]:
print(
    f'{"Text with Whitespace":22}'
    f'{"Is Alphanumeric?":15}'
    f'{"Is Punctuation?":18}'
    f'{"Is Stop Word?"}'
)

Text with Whitespace  Is Alphanumeric?Is Punctuation?   Is Stop Word?


In [21]:
for token in about_doc:
    print(
        f'{str(token.text_with_ws):22}'
        f'{str(token.is_alpha):15}'
        f'{str(token.is_punct):18}'
        f'{str(token.is_stop)}'
)

Gus                   True           False             False
Proto                 True           False             False
is                    True           False             True
a                     True           False             True
Python                True           False             False
developer             True           False             False
currently             True           False             False
working               True           False             False
for                   True           False             True
a                     True           False             True
London                True           False             False
-                     False          True              False
based                 True           False             False
Fintech               True           False             False
company               True           False             False
.                     False          True              False
He                    True  

In [22]:
custom_about_text = (
    "Gus Proto is a Python developer currently"
    " working for a London@based Fintech"
    " company. He is interested in learning"
    " Natural Language Processing."
)

tokens_to_print = [token.text for token in about_doc[8:15]]
print(tokens_to_print)

['for', 'a', 'London', '-', 'based', 'Fintech', 'company']


In [23]:
import re
import spacy
from spacy.tokenizer import Tokenizer

custom_nlp = spacy.load("en_core_web_sm")

prefix_re = spacy.util.compile_prefix_regex(custom_nlp.Defaults.prefixes)
suffix_re = spacy.util.compile_suffix_regex(custom_nlp.Defaults.suffixes)

custom_infixes = [r"@"]
infix_re = spacy.util.compile_infix_regex(
    list(custom_nlp.Defaults.infixes) + custom_infixes
)

custom_nlp.tokenizer = Tokenizer(
    custom_nlp.vocab,
    prefix_search=prefix_re.search,
    suffix_search=suffix_re.search,
    infix_finditer=infix_re.finditer,
    token_match=None,
)

custom_about_text = (
    "Gus Proto is a Python developer currently"
    " working for a London@based Fintech"
    " company. He is interested in learning"
    " Natural Language Processing."

)

custom_tokenizer_about_doc = custom_nlp(custom_about_text)

print([token.text for token in custom_tokenizer_about_doc[8:15]])

['for', 'a', 'London', '@', 'based', 'Fintech', 'company']


### Stopwords

In [24]:
import spacy

spacy_stopwords = spacy. lang.en.stop_words.STOP_WORDS
len(spacy_stopwords)

for stop_word in list(spacy_stopwords) [:10]:
    print(stop_word)

whether
thus
used
latterly
via
beside
myself
due
noone
using


In [25]:
import spacy

custom_about_text = (
    "Gus Proto is a Python developer currently"
    " working for a London-based Fintech"
    " company. He is interested in learning"
    " Natural Language Processing."
)

nlp = spacy. load("en_core_web_sm")
about_doc = nlp(custom_about_text)

filtered_tokens = [token.text for token in about_doc if not token.is_stop]

print(filtered_tokens)

['Gus', 'Proto', 'Python', 'developer', 'currently', 'working', 'London', '-', 'based', 'Fintech', 'company', '.', 'interested', 'learning', 'Natural', 'Language', 'Processing', '.']


### Lemmatization

In [26]:
import spacy

nlp = spacy.load("en_core_web_sm")
conference_help_text = (
    "Gus is helping organize a developer"
    " conference on Applications of Natural Language"
    " Processing. He keeps organizing local Python meetups"
    " and several internal talks at his workplace."
)

conference_help_doc = nlp(conference_help_text)

for token in conference_help_doc:
    if str(token) != str(token.lemma_):
        print(f"{str(token):>20}: {str(token.lemma_)}")

                  is: be
          Processing: processing
                  He: he
               keeps: keep
          organizing: organize
             meetups: meetup
               talks: talk


### Word Frequency

In [27]:
import spacy
from collections import Counter

nlp = spacy.load("en_core_web_sm")

complete_text = (
    "Gus Proto is a Python developer currently"
    " working for a London-based Fintech company. He is"
    " interested in learning Natural Language Processing."
    " There is a developer conference happening on 21 July"
    ' 2019 in London. It is titled "Applications of Natural'
    ' Language Processing". There is a helpline number'
    " available at +44-1234567891. Gus is helping organize it."
    " He keeps organizing local Python meetups and several"
    " internal talks at his workplace. Gus is also presenting"
    ' a talk. The talk will introduce the reader about "Use'
    ' cases of Natural Language Processing in Fintech".'
    " Apart from his work, he is very passionate about music."
    " Gus is learning to play the Piano. He has enrolled"
    " himself in the weekend batch of Great Piano Academy."
    " Great Piano Academy is situated in Mayfair or the City"
    " of London and has world-class piano instructors."
)

complete_doc = nlp(complete_text)

words = [
    token.text
    for token in complete_doc
    if not token. is_stop and not token.is_punct
]

print(Counter(words).most_common(5))

[('Gus', 4), ('London', 3), ('Natural', 3), ('Language', 3), ('Processing', 3)]


In [28]:
Counter(
    [token.text for token in complete_doc if not token. is_punct]
).most_common(5)

[('is', 10), ('a', 5), ('in', 5), ('Gus', 4), ('of', 4)]

### Part of speech tagging

In [29]:
import spacy

nlp = spacy.load("en_core_web_sm")

about_text = (
    "Gus Proto is a Python developer currently"
    " working for a London-based Fintech"
    " company. He is interested in learning"
    " Natural Language Processing."
)

about_doc = nlp(about_text)

for token in about_doc:
    print(
        f"""
TOKEN: {str(token)}
=====
TAG: {str(token.tag_):10} POS: {token.pos_}
EXPLANATION: {spacy.explain(token.tag_)}"""
)


TOKEN: Gus
=====
TAG: NNP        POS: PROPN
EXPLANATION: noun, proper singular

TOKEN: Proto
=====
TAG: NNP        POS: PROPN
EXPLANATION: noun, proper singular

TOKEN: is
=====
TAG: VBZ        POS: AUX
EXPLANATION: verb, 3rd person singular present

TOKEN: a
=====
TAG: DT         POS: DET
EXPLANATION: determiner

TOKEN: Python
=====
TAG: NNP        POS: PROPN
EXPLANATION: noun, proper singular

TOKEN: developer
=====
TAG: NN         POS: NOUN
EXPLANATION: noun, singular or mass

TOKEN: currently
=====
TAG: RB         POS: ADV
EXPLANATION: adverb

TOKEN: working
=====
TAG: VBG        POS: VERB
EXPLANATION: verb, gerund or present participle

TOKEN: for
=====
TAG: IN         POS: ADP
EXPLANATION: conjunction, subordinating or preposition

TOKEN: a
=====
TAG: DT         POS: DET
EXPLANATION: determiner

TOKEN: London
=====
TAG: NNP        POS: PROPN
EXPLANATION: noun, proper singular

TOKEN: -
=====
TAG: HYPH       POS: PUNCT
EXPLANATION: punctuation mark, hyphen

TOKEN: based
=====
TAG

In [30]:
nouns = []
adjectives = []

for token in about_doc:
    if token.pos_ == "NOUN":
        nouns.append(token)
    if token.pos_ == "ADJ":
        adjectives.append(token)

In [31]:
nouns

[developer, company]

In [32]:
adjectives

[interested]

### Visualize using displaCy

In [33]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")

about_interest_text = "He is interested in learning Natural Language Processing"
about_interest_doc = nlp(about_interest_text)

html_code = displacy.render(about_interest_doc, style="dep", options={"distance":100})
print(html_code)

None


### Preprocessing Functions

In [34]:
import spacy

nlp = spacy.load("en_core_web_sm")

complete_text = (
    "Gus Proto is a Python developer currently"
    " working for a London-based Fintech company. He is"
    " interested in learning Natural Language Processing."
    " There is a developer conference happening on 21 July"
    ' 2019 in London. It is titled "Applications of Natural'
    ' Language Processing". There is a helpline number'
    "available at +44-1234567891. Gus is helping organize it."
    " He keeps organizing local Python meetups and several"
    " internal talks at his workplace. Gus is also presenting"
    ' a talk. The talk will introduce the reader about "Use'
    ' cases of Natural Language Processing in Fintech".'
    "Apart from his work, he is very passionate about music."
    " Gus is learning to play the Piano. He has enrolled"
    " himself in the weekend batch of Great Piano Academy."
    " Great Piano Academy is situated in Mayfair or the City"
    " of London and has world-class piano instructors."
)

complete_doc = nlp(complete_text)

def is_token_allowed(token):
    return bool(
        token
        and str(token).strip()
        and not token.is_stop
        and not token.is_punct
)

def preprocess_token(token):
    return token. lemma_. strip().lower()

complete_filtered_tokens = [
    preprocess_token(token)
    for token in complete_doc
    if is_token_allowed(token)
]
print(complete_filtered_tokens)

['gus', 'proto', 'python', 'developer', 'currently', 'work', 'london', 'base', 'fintech', 'company', 'interested', 'learn', 'natural', 'language', 'processing', 'developer', 'conference', 'happen', '21', 'july', '2019', 'london', 'title', 'application', 'natural', 'language', 'processing', 'helpline', 'numberavailable', '+44', '1234567891', 'gus', 'helping', 'organize', 'keep', 'organize', 'local', 'python', 'meetup', 'internal', 'talk', 'workplace', 'gus', 'present', 'talk', 'talk', 'introduce', 'reader', 'use', 'case', 'natural', 'language', 'processing', 'fintech"', 'apart', 'work', 'passionate', 'music', 'gus', 'learn', 'play', 'piano', 'enrol', 'weekend', 'batch', 'great', 'piano', 'academy', 'great', 'piano', 'academy', 'situate', 'mayfair', 'city', 'london', 'world', 'class', 'piano', 'instructor']


### Rule based matching spacy

In [35]:
import spacy

nlp = spacy.load("en_core_web_sm")

about_text = (
    "Gus Proto is a Python developer currently"
    " working for a London-based Fintech"
    " company. He is interested in learning"
    " Natural Language Processing." 
)

about_doc = nlp(about_text)

from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)

def extract_full_name(nlp_doc):
    pattern = [{"POS": "PROPN"}, {"POS": "PROPN"} ]
    matcher.add("FULL_NAME", [pattern])
    matches = matcher(nlp_doc)
    for _, start, end in matches:
        span = nlp_doc[start:end]
        yield span.text

print(next(extract_full_name(about_doc)))

Gus Proto


### Dependency parsing using spacy

In [36]:
import spacy

nlp = spacy.load("en_core_web_sm")
piano_text = "Gus is learning piano"
piano_doc = nlp(piano_text)

for token in piano_doc:
    print(
        f"""
TOKEN: {token.text}

{token.tag_ =}
{token.head.text =}
{token. dep_ =}"""
)


TOKEN: Gus

token.tag_ ='NNP'
token.head.text ='learning'
token. dep_ ='nsubj'

TOKEN: is

token.tag_ ='VBZ'
token.head.text ='learning'
token. dep_ ='aux'

TOKEN: learning

token.tag_ ='VBG'
token.head.text ='learning'
token. dep_ ='ROOT'

TOKEN: piano

token.tag_ ='NN'
token.head.text ='learning'
token. dep_ ='dobj'


In [37]:
displacy.serve(piano_doc, style="dep")




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [38]:
import spacy

nlp = spacy. load("en_core_web_sm")

one_line_about_text = (
    "Gus Proto is a Python developer"
    " currently working for a London-based Fintech company"
)

one_line_about_doc = nlp(one_line_about_text)
print([token.text for token in one_line_about_doc[5].children])
print(one_line_about_doc[5].nbor(-1))
print(one_line_about_doc[5].nbor())
print([token.text for token in one_line_about_doc[5].lefts])
print([token.text for token in one_line_about_doc[5].rights])
print(list(one_line_about_doc[5].subtree))

['a', 'Python', 'working']
Python
currently
['a', 'Python']
['working']
[a, Python, developer, currently, working, for, a, London, -, based, Fintech, company]


### Named Entity Recogniition 

In [39]:
import spacy

nlp = spacy.load("en_core_web_sm")

piano_class_text = (
    "Great Piano Academy is situated"
    " in Mayfair or the City of London and has"
    " world-class piano instructors."
)

piano_class_doc = nlp(piano_class_text)

for ent in piano_class_doc.ents:
    print(
        f"""
{ent.text =}
{ent.start_char =}
{ent.end_char =}
{ent.label_ =}
spacy.explain('{ent.label_}') = {spacy.explain(ent.label_)}"""
)


ent.text ='Great Piano Academy'
ent.start_char =0
ent.end_char =19
ent.label_ ='ORG'
spacy.explain('ORG') = Companies, agencies, institutions, etc.

ent.text ='Mayfair'
ent.start_char =35
ent.end_char =42
ent.label_ ='FAC'
spacy.explain('FAC') = Buildings, airports, highways, bridges, etc.

ent.text ='the City of London'
ent.start_char =46
ent.end_char =64
ent.label_ ='GPE'
spacy.explain('GPE') = Countries, cities, states


In [40]:
displacy.serve(piano_class_doc, style="ent")


Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...



127.0.0.1 - - [13/Dec/2024 08:54:38] "GET / HTTP/1.1" 200 1425
127.0.0.1 - - [13/Dec/2024 08:54:39] "GET /favicon.ico HTTP/1.1" 200 1425


Shutting down server on port 5000.


In [41]:
survey_text = (
    "Out of 5 people surveyed, James Robert,"
    " Julie Fuller and Benjamin Brooks like"
    " apples. Kelly Cox and Matthew Evans"
    " like oranges."
)

def replace_person_names(token):
    if token.ent_iob != 0 and token.ent_type_ == "PERSON":
        return "[REDACTED] "
    return token.text_with_ws

def redact_names(nlp_doc):
    with nlp_doc.retokenize() as retokenizer:
        for ent in nlp_doc.ents:
            retokenizer.merge(ent)
    tokens = map(replace_person_names, nlp_doc)
    return "".join(tokens)

survey_doc = nlp(survey_text)
print(redact_names(survey_doc))

Out of 5 people surveyed, [REDACTED] , [REDACTED] and [REDACTED] like apples. [REDACTED] and [REDACTED] like oranges.
