In [1]:
import spacy
nlp = spacy.load('en_core_web_sm')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
about_text = ('I am a Python developer currently'
             ' working for a London-based Fintech'
              ' company. I am interested in learning'
              ' Natural Language Processing.')
about_doc = nlp(about_text)
sentences = list(about_doc.sents)
len(sentences)

2

In [3]:
for sentence in sentences:
    print (sentence)

I am a Python developer currently working for a London-based Fintech company.
I am interested in learning Natural Language Processing.


In [4]:
for token in about_doc:
     print (token, token.idx)

I 0
am 2
a 5
Python 7
developer 14
currently 24
working 34
for 42
a 46
London 48
- 54
based 55
Fintech 61
company 69
. 76
I 78
am 80
interested 83
in 94
learning 97
Natural 106
Language 114
Processing 123
. 133


In [6]:
import re

In [7]:
from spacy.tokenizer import Tokenizer
custom_nlp = spacy.load('en_core_web_sm')
prefix_re = spacy.util.compile_prefix_regex(custom_nlp.Defaults.prefixes)
suffix_re = spacy.util.compile_suffix_regex(custom_nlp.Defaults.suffixes)
infix_re = re.compile(r'''[-~]''')
def customize_tokenizer(nlp):
     # Adds support to use `-` as the delimiter for tokenization
     return Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
                      suffix_search=suffix_re.search,
                      infix_finditer=infix_re.finditer,
                      token_match=None
                      )


custom_nlp.tokenizer = customize_tokenizer(custom_nlp)
custom_tokenizer_about_doc = custom_nlp(about_text)
print([token.text for token in custom_tokenizer_about_doc])

['I', 'am', 'a', 'Python', 'developer', 'currently', 'working', 'for', 'a', 'London', '-', 'based', 'Fintech', 'company', '.', 'I', 'am', 'interested', 'in', 'learning', 'Natural', 'Language', 'Processing', '.']


In [8]:
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
len(spacy_stopwords)

326

In [9]:
for stop_word in list(spacy_stopwords)[:10]:
   print(stop_word)

should
it
‘re
others
around
above
mostly
’ll
itself
ourselves


In [10]:
for token in about_doc:
     if not token.is_stop:
         print (token)

Python
developer
currently
working
London
-
based
Fintech
company
.
interested
learning
Natural
Language
Processing
.


In [11]:
conference_help_text = ('Raj is helping organize a developer'
     'conference on Applications of Natural Language'
     ' Processing. He keeps organizing local Python meetups'
     ' and several internal talks at his workplace.')
conference_help_doc = nlp(conference_help_text)
for token in conference_help_doc:
    print (token, token.lemma_)

Raj raj
is be
helping helping
organize organize
a a
developerconference developerconference
on on
Applications Applications
of of
Natural Natural
Language Language
Processing Processing
. .
He he
keeps keep
organizing organize
local local
Python Python
meetups meetup
and and
several several
internal internal
talks talk
at at
his his
workplace workplace
. .


In [12]:
from collections import Counter
complete_text = ('J K is a Python developer currently'
    'working for a London-based Fintech company. He is'
    ' interested in learning Natural Language Processing.'
    ' There is a conference happening on 21 June'
     ' 2019 in London. It is titled "Applications of Natural'
    ' Language Processing". There is a helpline number '
    ' available . J is helping organize it.'
    ' He keeps organizing local Python meetups and several'
    ' internal talks at his workplace. J is also presenting'
     ' a talk. The talk will introduce the reader about "Use'
     ' cases of Natural Language Processing in Fintech".'
     ' Apart from his work, he is very passionate about music.'
     ' J is learning to play the Piano. He has enrolled '
     ' himself in the weekend batch of Great Piano Academy.'
     ' Great Piano Academy is situated in Mayfair or the City'
     ' of London and has world-class piano instructors.')

complete_doc = nlp(complete_text)
# Remove stop words and punctuation symbols
words = [token.text for token in complete_doc
          if not token.is_stop and not token.is_punct]
word_freq = Counter(words)
# 5 commonly occurring words with their frequencies
common_words = word_freq.most_common(5)
print (common_words)

[('J', 4), ('London', 3), ('Natural', 3), ('Language', 3), ('Processing', 3)]


In [13]:
# Unique words
unique_words = [word for (word, freq) in word_freq.items() if freq == 1]
print (unique_words)

['K', 'developer', 'currentlyworking', 'based', 'company', 'interested', 'conference', 'happening', '21', 'June', '2019', 'titled', 'Applications', 'helpline', 'number', 'available', 'helping', 'organize', 'keeps', 'organizing', 'local', 'meetups', 'internal', 'talks', 'workplace', 'presenting', 'introduce', 'reader', 'Use', 'cases', 'Apart', 'work', 'passionate', 'music', 'play', 'enrolled', 'weekend', 'batch', 'situated', 'Mayfair', 'City', 'world', 'class', 'piano', 'instructors']


In [14]:
piano_text = ('Piano Academy is situated'
     ' in Mayfair or the City of London and has'
     ' world-class piano instructors.')
piano_doc = nlp(piano_text)
for ent in piano_doc.ents:
     print(ent.text, ent.start_char, ent.end_char,
           ent.label_, spacy.explain(ent.label_))

Piano Academy 0 13 ORG Companies, agencies, institutions, etc.
Mayfair 29 36 GPE Countries, cities, states
the City of London 40 58 GPE Countries, cities, states
