<a href="https://colab.research.google.com/github/PrathameshS26/CustomerAngular/blob/master/NLP_Week1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Tokenization Without Spacy Lib**

In [73]:
import re
text = 'I\'m with you for the entire life in U.K.!'
words = re.split(r'\W+', text)
print(words[:100])

['I', 'm', 'with', 'you', 'for', 'the', 'entire', 'life', 'in', 'U', 'K', '']


In [74]:
import string
import re
words = text.split()
re_punc = re.compile('[%s]' % re.escape(string.punctuation))
stripped = [re_punc.sub('',w) for w in words]
print(stripped[:100])

['Im', 'with', 'you', 'for', 'the', 'entire', 'life', 'in', 'UK']


In [75]:
re_print = re.compile('[^%s]' % re.escape(string.printable))
result = [re_print.sub('',w) for w in words]
print(result)

["I'm", 'with', 'you', 'for', 'the', 'entire', 'life', 'in', 'U.K.!']


In [76]:
words = text.split()
words = [word.lower() for word in words]
print(words[:100])

["i'm", 'with', 'you', 'for', 'the', 'entire', 'life', 'in', 'u.k.!']


# **USING SPACY LIB**

In [77]:
import spacy
nlp = spacy.load('en_core_web_sm') #Core english language in small kind

In [78]:
string = '"I\'m with you for the entire life in U.K.!"'
print(string)

"I'm with you for the entire life in U.K.!"


In [79]:
doc = nlp(string)
for token in doc:
  print(token.text, end=' | ')

" | I | 'm | with | you | for | the | entire | life | in | U.K. | ! | " | 

In [80]:
doc2 = nlp(u"We're here to help! Send snail-mail, email ps226@exeter.com or visit us at https://prathamesh.com/!")
for t in doc2:
  print(t)

We
're
here
to
help
!
Send
snail
-
mail
,
email
ps226@exeter.com
or
visit
us
at
https://prathamesh.com/
!


In [81]:
doc3 = nlp(u'A 5km NYC cab ride costs $10.30')
for t  in doc3:
  print(t)

A
5
km
NYC
cab
ride
costs
$
10.30


In [82]:
len(doc)

13

In [83]:
len(doc.vocab)

793

In [84]:
doc4 = nlp(u'It is better to give than to receive.')
doc4[2]

better

In [85]:
doc4[2:5]

better to give

In [86]:
doc4[-4:]

than to receive.

In [87]:
doc5 = nlp(u'Apple to build a Hong Kong fcatory for $6 million')

for token in doc5:
  print(token.text, end= ' | ')

print('\n----')

for ent in doc5.ents:
  print(ent.text + ' - ' + ent.label_ + ' - ' + str(spacy.explain(ent.label_)))

Apple | to | build | a | Hong | Kong | fcatory | for | $ | 6 | million | 
----
Apple - ORG - Companies, agencies, institutions, etc.
Hong Kong - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [88]:
len(doc5.ents)

3

In [89]:
doc6 = nlp(u'Autonomous cars shift insurance liability toward manufacturers')

for chunk in doc6.noun_chunks:
  print(chunk.text)

Autonomous cars
insurance liability
manufacturers


In [90]:
from spacy import displacy

doc = nlp(u'Apple is going to build a U.K. factory for $6 million.')
displacy.render(doc, style='dep', jupyter=True, options={'distance' : 110})

In [91]:
doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.')
displacy.render(doc, style='ent', jupyter=True)

In [92]:
doc = nlp(u'This is a sentence.')
displacy.serve(doc, style='dep')




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


# **Lemmatization**

In [93]:
var1 = nlp(u"John Adam is one of the researcher who invent the direction of way towards success")

for token in var1:
  print(token.text, '\t','\t', token.pos_, '\t','\t', token.lemma, '\t', token.lemma_)

John 	 	 PROPN 	 	 11174346320140919546 	 John
Adam 	 	 PROPN 	 	 14264057329400597350 	 Adam
is 	 	 AUX 	 	 10382539506755952630 	 be
one 	 	 NUM 	 	 17454115351911680600 	 one
of 	 	 ADP 	 	 886050111519832510 	 of
the 	 	 DET 	 	 7425985699627899538 	 the
researcher 	 	 NOUN 	 	 1317581537614213870 	 researcher
who 	 	 PRON 	 	 3876862883474502309 	 who
invent 	 	 VERB 	 	 5373681334090504585 	 invent
the 	 	 DET 	 	 7425985699627899538 	 the
direction 	 	 NOUN 	 	 895834437038626927 	 direction
of 	 	 ADP 	 	 886050111519832510 	 of
way 	 	 NOUN 	 	 6878210874361030284 	 way
towards 	 	 ADP 	 	 9315050841437086371 	 towards
success 	 	 NOUN 	 	 16089821935113899987 	 success


In [94]:
def show_lemmas(text):
  for token in text:
    print(f'{token.text:{12}} {token.pos_:{6}} {token.lemma:<{22}} {token.lemma_}')

In [95]:
var2 = nlp(u"John Adam is one of the researcher who invent the direction of way towards success")
show_lemmas(var2)

John         PROPN  11174346320140919546   John
Adam         PROPN  14264057329400597350   Adam
is           AUX    10382539506755952630   be
one          NUM    17454115351911680600   one
of           ADP    886050111519832510     of
the          DET    7425985699627899538    the
researcher   NOUN   1317581537614213870    researcher
who          PRON   3876862883474502309    who
invent       VERB   5373681334090504585    invent
the          DET    7425985699627899538    the
direction    NOUN   895834437038626927     direction
of           ADP    886050111519832510     of
way          NOUN   6878210874361030284    way
towards      ADP    9315050841437086371    towards
success      NOUN   16089821935113899987   success


# **STOP WORDS**

In [96]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [97]:
print(nlp.Defaults.stop_words)

{'even', 'five', 'into', '’m', 'with', 'become', 'already', 'very', 'top', 'many', 'perhaps', 'put', 'am', 'both', 'say', 'really', 'she', 'yet', 'please', 'becomes', 'another', 'whose', 'using', "'s", 'twelve', 'above', 'when', 'bottom', 'and', 'might', 'where', 'thus', 'which', 'his', 'used', 'out', 'whence', 'such', 'there', 'beside', 'front', 'nor', 'otherwise', "n't", 'everywhere', 'itself', 'us', 'off', 'amongst', 'a', 'fifteen', 'thereupon', 'anyone', 'formerly', 'less', 'anything', 'each', 'this', 'mystery', 'moreover', 'every', 'had', 'sometimes', 'whither', 'much', '‘s', 'n’t', 'twenty', 'somewhere', 'hereafter', 'it', 'n‘t', 'serious', 'done', 'everyone', 'show', 'thereby', 'doing', 'cannot', 'the', 'made', 'ourselves', 'therein', 'were', 'more', 'if', 'since', 'should', 'so', 'until', 'sixty', 'quite', 'least', 'than', 'i', 'rather', 'your', 'but', 'up', 'throughout', 'some', 'always', 'either', 'ours', 'most', 'forty', 'upon', 'whereas', 'hundred', 'we', 'nevertheless', 't

In [98]:
len(nlp.Defaults.stop_words)

326

# To see if a word is a stop **word**

In [99]:
nlp.vocab['myself'].is_stop

True

In [100]:
nlp.vocab['mystery'].is_stop

True

In [101]:
nlp.Defaults.stop_words.add('mystery')

In [102]:
nlp.vocab['mystery'].is_stop = True

In [103]:
len(nlp.Defaults.stop_words)

326

In [104]:
nlp.vocab['mystery'].is_stop

True

In [106]:
#nlp.Defaults.stop_words.remove('beyond')
#nlp.vocab['beyond'].is_stop = False

In [107]:
len(nlp.Defaults.stop_words)

326

In [108]:
nlp.vocab['beyond'].is_stop

False

In [109]:
import string
import re
import nltk
nltk.download('punkt')
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

text = 'The Quick brown fox jump over the lazy dog!'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [110]:
tokens = word_tokenize(text)
print(tokens)

['The', 'Quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog', '!']


In [111]:
tokens = [w.lower() for w in tokens]
print(tokens)

['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog', '!']


In [112]:
re_punc = re.compile('[%s]' % re.escape(string.punctuation))
print(re_punc)

re.compile('[!"\\#\\$%\\&\'\\(\\)\\*\\+,\\-\\./:;<=>\\?@\\[\\\\\\]\\^_`\\{\\|\\}\\~]')


In [113]:
stripped = [re_punc.sub('', w) for w in tokens]
print(stripped)

['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog', '']


In [114]:
words = [word for word in stripped if word.isalpha()]
print(words)

['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog']


In [115]:
stop_words = set(stopwords.words('english'))
words = [w for w in words if not w in stop_words]
print(words)

['quick', 'brown', 'fox', 'jump', 'lazy', 'dog']


In [116]:
nlp.vocab['quick'].is_stop

False

# Vocabulary and **Matching**

In [117]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

In [118]:
pattern1 = [{'LOWER' : 'solarpower'}]
pattern2 = [{'LOWER' : 'solar'}, {'LOWER' : 'power'}]
pattern3 = [{'LOWER' : 'solar'}, {'IS_PUNCT' : True}, {'LOWER':'power'}]
patterns = [pattern1, pattern2, pattern3]
matcher.add('SolarPower', patterns)

In [119]:
doc = nlp(u'The Solar Power industry continues to grow as demand for solarpower increases. Solar-power cars are gaining popularity.')

In [120]:
found_matches = matcher(doc)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16)]


In [121]:
for match_id, start, end in found_matches:
  string_id = nlp.vocab.strings[match_id]
  span = doc[start:end]
  print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 10 11 solarpower
8656102463236116519 SolarPower 13 16 Solar-power


# Setting pattern options and **quantifiers**

In [122]:
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT':True, 'OP':'*'}, {'LOWER':'power'}]
patterns = [pattern1, pattern2]
matcher.add('SolarPower', patterns)

In [123]:
found_matches = matcher(doc)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16)]


In [124]:
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT':True, 'OP':'*'}, {'LEMMA':'power'}]
patterns = [pattern1, pattern2]
matcher.add('SolarPower', patterns)

In [125]:
doc2 = nlp(u'Solar-powered energy runs solar-powered cars.')

In [126]:
found_matches = matcher(doc2)
print(found_matches)

[(8656102463236116519, 0, 3), (8656102463236116519, 5, 8)]


# PHRASE **MATCHER**

In [127]:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab, attr='LOWER')

In [128]:
terms = ['Galaxy Note','iPhone 11','iPhone XS','Google Pixel']
patterns = [nlp(text) for text in terms]
matcher.add("TerminologyList",None, *patterns)

In [129]:
text_doc = nlp("Glowing review overall, and some really interesting side-by-side""photograpgy tests pitting the iPhone 11 Pro against the ""Galaxy Note 10Plus and last year's iPhone XS and Google Pixel 3.")
matches = matcher(text_doc)
print(matches)

[(3766102292120407359, 16, 18), (3766102292120407359, 21, 23), (3766102292120407359, 28, 30), (3766102292120407359, 31, 33)]


# **POS** BASICS

In [130]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for token in doc:
  print(token.text,"\t", token.lemma_,"\t",  token.pos_,"\t",  token.tag_,"\t",  token.dep_,"\t",  token.shape_,"\t",  token.is_alpha,"\t",  token.is_stop)

Apple 	 Apple 	 PROPN 	 NNP 	 nsubj 	 Xxxxx 	 True 	 False
is 	 be 	 AUX 	 VBZ 	 aux 	 xx 	 True 	 True
looking 	 look 	 VERB 	 VBG 	 ROOT 	 xxxx 	 True 	 False
at 	 at 	 ADP 	 IN 	 prep 	 xx 	 True 	 True
buying 	 buy 	 VERB 	 VBG 	 pcomp 	 xxxx 	 True 	 False
U.K. 	 U.K. 	 PROPN 	 NNP 	 dobj 	 X.X. 	 False 	 False
startup 	 startup 	 NOUN 	 NN 	 dep 	 xxxx 	 True 	 False
for 	 for 	 ADP 	 IN 	 prep 	 xxx 	 True 	 True
$ 	 $ 	 SYM 	 $ 	 quantmod 	 $ 	 False 	 False
1 	 1 	 NUM 	 CD 	 compound 	 d 	 False 	 False
billion 	 billion 	 NUM 	 CD 	 pobj 	 xxxx 	 True 	 False


In [131]:
doc = nlp(u"The Quick brown fox jump over the lazy dog's back.")

In [132]:
print(doc.text)

The Quick brown fox jump over the lazy dog's back.


In [133]:
print(doc[4].text, doc[4].pos_, doc[4].tag_, spacy.explain(doc[4].tag_))

jump NOUN NN noun, singular or mass


In [134]:
for token in doc:
  print(f'{token.text:{10}} {token.pos_:{8}} {token.tag_:{6}} {spacy.explain(token.tag_)}')

The        DET      DT     determiner
Quick      PROPN    NNP    noun, proper singular
brown      ADJ      JJ     adjective (English), other noun-modifier (Chinese)
fox        NOUN     NN     noun, singular or mass
jump       NOUN     NN     noun, singular or mass
over       ADP      IN     conjunction, subordinating or preposition
the        DET      DT     determiner
lazy       ADJ      JJ     adjective (English), other noun-modifier (Chinese)
dog        NOUN     NN     noun, singular or mass
's         PART     POS    possessive ending
back       NOUN     NN     noun, singular or mass
.          PUNCT    .      punctuation mark, sentence closer


In [135]:
POS_counts = doc.count_by(spacy.attrs.POS)
POS_counts

{90: 2, 96: 1, 84: 2, 92: 4, 85: 1, 94: 1, 97: 1}

In [136]:
doc.vocab[83].text

'LANG'




#  Create frequency list of POS tags from the entire document



In [137]:
for k,v in sorted(POS_counts.items()):
  print(f'{k}.{doc.vocab[k].text:{5}}:{v}')

84.ADJ  :2
85.ADP  :1
90.DET  :2
92.NOUN :4
94.PART :1
96.PROPN:1
97.PUNCT:1


In [138]:
TAG_counts = doc.count_by(spacy.attrs.TAG)

for k, v in sorted(TAG_counts.items()):
  print(f'{k}.{doc.vocab[k].text:{4}}:{v}')

74.POS :1
1292078113972184607.IN  :1
10554686591937588953.JJ  :2
12646065887601541794..   :1
15267657372422890137.DT  :2
15308085513773655218.NN  :4
15794550382381185553.NNP :1


In [139]:
DEP_counts = doc.count_by(spacy.attrs.DEP)

for k,v in sorted(DEP_counts.items()):
  print(f'{k}.{doc.vocab[k].text:{4}}:{v}')

402.amod:2
415.det :2
426.nmod:1
439.pobj:1
440.poss:1
443.prep:1
445.punct:1
7037928807040764755.compound:1
8110129090154140942.case:1
8206900633647566924.ROOT:1


# **Named-Entity Recognition (NER)**

In [140]:
def show_ents(doc):
  if doc.ents:
    for ent in doc.ents:
      print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))
    else:
      print('No named entities found.')

In [141]:
doc = nlp(u'Hi, everyone I\'m Prathamesh Sawant CS')

show_ents(doc)