## NLP using Spacy - Preprocessing

In [2]:
!pip install spacy



In [3]:
import spacy
nlp = spacy.load('en_core_web_sm')

## 1. Word Tokenization

In [4]:
txt = ('I am going to learn deep neural networks now only.')
intro = nlp(txt)
# Extract tokens for the given doc
print ([token.text for token in intro])

['I', 'am', 'going', 'to', 'learn', 'deep', 'neural', 'networks', 'now', 'only', '.']


## 2. Sentence Tokenization

In [5]:
about_text = ('Bob Ross is a Python developer currently'
              ' working for a London-based Fintech'
              ' company. He is interested in learning'
              ' Natural Language Processing.')
about_doc = nlp(about_text)
sentences = list(about_doc.sents)
print(len(sentences))

for sentence in sentences:
     print (sentence)

2
Bob Ross is a Python developer currently working for a London-based Fintech company.
He is interested in learning Natural Language Processing.


In [6]:
txt = ('I am going to play'
              ' football in new delhi'
              ' after'
              ' two days.')
doc = nlp(txt)
sentences = list(doc.sents)
print(len(sentences))

for sentence in sentences:
     print (sentence)

1
I am going to play football in new delhi after two days.


## 3. Stop Words - Removal

In [7]:
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
len(spacy_stopwords)

326

In [8]:
for stop_word in list(spacy_stopwords)[:11]:
  print(stop_word)

each
itself
another
would
which
why
via
off
regarding
through
always


In [9]:
for token in about_doc:
     if not token.is_stop:
         print (token)

Bob
Ross
Python
developer
currently
working
London
-
based
Fintech
company
.
interested
learning
Natural
Language
Processing
.


##4. Lemmatization


In [10]:
conference_help_text = ('She is helping organize a developer'
    'conference on Applications of Natural Language'
     ' Processing. He keeps organizing local Python meetups'
     ' and several internal talks at his workplace.')
conference_help_doc = nlp(conference_help_text)
for token in conference_help_doc:
     print (token, token.lemma_)

She -PRON-
is be
helping help
organize organize
a a
developerconference developerconference
on on
Applications Applications
of of
Natural Natural
Language Language
Processing Processing
. .
He -PRON-
keeps keep
organizing organize
local local
Python Python
meetups meetup
and and
several several
internal internal
talks talk
at at
his -PRON-
workplace workplace
. .


## 5. Work Frequency

In [12]:
from collections import Counter
complete_text = ('Bran Adams is a Python developer currently'
    'working for a London-based Fintech company. He is'
    ' interested in learning Natural Language Processing.'
    ' There is a developer conference happening on 21 July'
    ' 2019 in London. It is titled "Applications of Natural'
    ' Language Processing". There is a helpline number '
    ' available at +1-1234567891. Bran is helping organize it.'
    ' He keeps organizing local Python meetups and several'
    ' internal talks at his workplace. Bran is also presenting'
    ' a talk. The talk will introduce the reader about "Use'
    ' cases of Natural Language Processing in Fintech".'
    ' Apart from his work, he is very passionate about music.'
    ' Bran is learning to play the Piano. He has enrolled '
    ' himself in the weekend batch of Great Piano Academy.'
    ' Great Piano Academy is situated in Mayfair or the City'
    ' of London and has world-class piano instructors.')

In [13]:
complete_doc = nlp(complete_text)
# Remove stop words and punctuation symbols
words = [token.text for token in complete_doc
          if not token.is_stop and not token.is_punct]

word_freq = Counter(words)

# 5 commonly occurring words with their frequencies
common_words = word_freq.most_common(5)
print (common_words)

[('Bran', 4), ('London', 3), ('Natural', 3), ('Language', 3), ('Processing', 3)]


In [14]:
# Unique words
unique_words = [word for (word, freq) in word_freq.items() if freq == 1]
print (unique_words)

['Adams', 'currentlyworking', 'based', 'company', 'interested', 'conference', 'happening', '21', 'July', '2019', 'titled', 'Applications', 'helpline', 'number', 'available', '+1', '1234567891', 'helping', 'organize', 'keeps', 'organizing', 'local', 'meetups', 'internal', 'talks', 'workplace', 'presenting', 'introduce', 'reader', 'Use', 'cases', 'Apart', 'work', 'passionate', 'music', 'play', 'enrolled', 'weekend', 'batch', 'situated', 'Mayfair', 'City', 'world', 'class', 'piano', 'instructors']
