In [1]:
# NLP Operations in Python
# Two packages:
# 1. NLTK (Natural Language Toolkit)
# 2. spaCy

In [2]:
# spacy Basics

In [3]:
#If spacy is not installed --------> Anaconda Prompt --------> pip install spacy
# -or-
#  conda install -c conda-forge spacy

In [4]:
import spacy

In [5]:
# Download Language Model (Mandatory Step !)
# https://spacy.io/models
#Doc for English Model: https://spacy.io/models/en

In [6]:
# To download the english core model (small)
! python3 -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 7.7 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [7]:
# Quick Check if Spacy is working 

In [8]:
import spacy

In [9]:
# Load the small English pipeline; the later cells call `model(...)` on it.
MODEL_NAME = "en_core_web_sm"
model = spacy.load(MODEL_NAME)

In [11]:
# POS tagging demo: print each token next to its part-of-speech tag (`pos_`).
# Python 3 string literals are already Unicode, so no u"" prefix is needed
# for text handed to the spaCy pipeline.
textExample = "Tesla is looking at buying U.S. startups for $6 million"

tokenData = model(textExample)
for tok in tokenData:
  print("{} {}".format(tok.text, tok.pos_))


Tesla PROPN
is AUX
looking VERB
at ADP
buying VERB
U.S. PROPN
startups NOUN
for ADP
$ SYM
6 NUM
million NUM


In [12]:
#Tokenization

In [14]:
# Baseline tokenizer: split on single spaces only. Punctuation stays attached
# to the neighbouring word (e.g. "you!").
demoText = "We're here to help you! Send snail-mail @ support@prashant.com or visit us @ https://www.prashant.com"
naive_tokens = demoText.split(" ")
naive_tokens

["We're",
 'here',
 'to',
 'help',
 'you!',
 'Send',
 'snail-mail',
 '@',
 'support@prashant.com',
 'or',
 'visit',
 'us',
 '@',
 'https://www.prashant.com']

In [15]:
# spaCy tokenization of the same text: the contraction, punctuation and the
# hyphenated word are split apart, while the e-mail address and URL stay whole.
demoDoc = model(demoText)
for tok in demoDoc:
  print(tok.text)

We
're
here
to
help
you
!
Send
snail
-
mail
@
support@prashant.com
or
visit
us
@
https://www.prashant.com


In [16]:
# Abbreviations such as "St." and "U.S." come out as single tokens.
travelDoc = model("Let's visit St. Louis in the U.S. next year!")
for tok in travelDoc:
  print(tok.text)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
!


In [17]:
# NER - Named Entity Recognition
# List the tokens first, then every entity span together with its label and
# spaCy's plain-English description of that label.
result = model("Apple to build a factory in Hong Kong and Mumbai with an initial investment of $100 million in collaboration with Microsoft and Simplilearn")

for tok in result:
  print(tok)

print("==========================================")

for ent in result.ents:
  label = ent.label_
  print(ent.text, label, spacy.explain(label))

Apple
to
build
a
factory
in
Hong
Kong
and
Mumbai
with
an
initial
investment
of
$
100
million
in
collaboration
with
Microsoft
and
Simplilearn
Apple ORG Companies, agencies, institutions, etc.
Hong Kong GPE Countries, cities, states
Mumbai GPE Countries, cities, states
$100 million MONEY Monetary values, including unit
Microsoft ORG Companies, agencies, institutions, etc.
Simplilearn ORG Companies, agencies, institutions, etc.


In [19]:
# Same token/entity listing for a proverb. "Apple" is the fruit here, yet the
# recorded output shows the small model still labels it ORG.
result = model("An Apple a day keeps the doctor away")

for tok in result:
  print(tok)

print("==========================================")

for ent in result.ents:
  label = ent.label_
  print(ent.text, label, spacy.explain(label))

An
Apple
a
day
keeps
the
doctor
away
Apple ORG Companies, agencies, institutions, etc.


In [20]:
# Same token/entity listing for a sentence containing a person and a country.
result = model("Narendra Modi is the current PM of India")

for tok in result:
  print(tok)

print("==========================================")

for ent in result.ents:
  label = ent.label_
  print(ent.text, label, spacy.explain(label))

Narendra
Modi
is
the
current
PM
of
India
Narendra Modi PERSON People, including fictional
India GPE Countries, cities, states


In [21]:
# Visualizing the NER components in a sentence!

In [22]:
# NER - Named Entity Recognition
from spacy import displacy

# Re-run the pipeline on the business sentence and draw its entity spans
# inline in the notebook.
result = model("Apple to build a factory in Hong Kong and Mumbai with an initial investment of $100 million in collaboration with Microsoft and Simplilearn")
displacy.render(result, style="ent", jupyter=True)

In [23]:
# Stemming -- Using NLTK Package
# Reasons (as of 13 Feb 2022):
#    1. NLTK's stemmer and lemmatizer have the lowest time complexity and the fastest turnaround time
#    2. NLTK's approach is widely adopted in enterprise application design frameworks


# If nltk is not present ------------> pip install nltk 

In [24]:
import nltk

In [25]:
from nltk.stem.porter import PorterStemmer

# Single Porter stemmer instance; the stemming loop calls stemObject.stem(word).
stemObject = PorterStemmer()

In [26]:
# Print each word beside its Porter stem. Stems need not be dictionary words
# (see "cach" and "easili" in the output).
wordList = ["branching", "branched", "branches", "caching", "cached", "caches", "easily"]

for original, stemmed in zip(wordList, map(stemObject.stem, wordList)):
  print(original, stemmed)

branching branch
branched branch
branches branch
caching cach
cached cach
caches cach
easily easili


In [27]:
# Lemmatization Example


In [30]:
# Lemmatize the same word list with spaCy; each line shows the token text and
# its lemma separated by two spaces.
result = model("branching branched branches caching cached caches easily")
for tok in result:
  print("{}  {}".format(tok.text, tok.lemma_))

branching  branching
branched  branch
branches  branch
caching  cache
cached  cached
caches  cache
easily  easily


In [34]:
#Using NLTK
import nltk
from nltk.stem import WordNetLemmatizer

In [35]:
# Fetch the WordNet corpus for the lemmatizer used below. The call's return
# value (True in the recorded run) is displayed as the cell result.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [40]:
lemmaObject = WordNetLemmatizer()

# "n" (noun) is lemmatize()'s default part of speech; spelled out for clarity.
lemmaObject.lemmatize("trains", pos="n")

'train'