# NLP Tasks
 1. Tokenization
 2. Part of speech tagging
 3. Name entity recognition
 4. Sentiment analysis
 5. Embedding - a) word2vec and b) sentence(post) to vec




In [11]:
!pip install spacy



## Tokenizatation with spacy


https://spacy.io/usage/processing-pipelines


In [12]:
#Tokenize with spacy
import spacy
nlp = spacy.load('en_core_web_sm')

text = """Here’s to the crazy ones, the misfits, the rebels, the troublemakers, the round pegs in the square holes. The ones who see things differently — they’re not fond of rules. You can quote them, disagree with them, glorify or vilify them, but the only thing you can’t do is ignore them because they change things. They push the human race forward, and while some may see them as the crazy ones, we see genius, because the ones who are crazy enough to think
that they can change the world, are the ones who do."""

doc = nlp(text)

tokens = [tokens for tokens in doc]
print(tokens)

tokens = [tokens.lower_ for tokens in doc]
tokens = [tokens for tokens in doc if (tokens.is_stop == False)]
tokens = [tokens for tokens in tokens if (tokens.is_punct == False)]
tokens = [token.lemma_ for token in tokens]
print(tokens)

[Here, ’s, to, the, crazy, ones, ,, the, misfits, ,, the, rebels, ,, the, troublemakers, ,, the, round, pegs, in, the, square, holes, ., The, ones, who, see, things, differently, —, they, ’re, not, fond, of, rules, ., You, can, quote, them, ,, disagree, with, them, ,, glorify, or, vilify, them, ,, but, the, only, thing, you, ca, n’t, do, is, ignore, them, because, they, change, things, ., They, push, the, human, race, forward, ,, and, while, some, may, see, them, as, the, crazy, ones, ,, we, see, genius, ,, because, the, ones, who, are, crazy, enough, to, think, 
, that, they, can, change, the, world, ,, are, the, ones, who, do, .]
['crazy', 'one', 'misfit', 'rebel', 'troublemaker', 'round', 'peg', 'square', 'hole', 'one', 'thing', 'differently', 'fond', 'rule', 'quote', 'disagree', 'glorify', 'vilify', 'thing', 'ignore', 'change', 'thing', 'push', 'human', 'race', 'forward', 'crazy', 'one', 'genius', 'one', 'crazy', 'think', '\n', 'change', 'world', 'one']


## Part of speech tagging

* Tagging part of speech on each word
* <img src="https://machinelearningknowledge.ai/wp-content/uploads/2021/06/Untitled-696x209.png?ezimgfmt=ng:webp/ngcb1"/>





In [13]:
text = "Apple is looking at buying U.K. startup for $1 billion"

doc = nlp(text)

for token in doc:
    print( token.text, token.pos_, token.dep_)

print()
print(" ".join(token.text for token in doc)  )
print(" ".join(token.pos_ for token in doc)  )


# doc = nlp("They refuse to permit us to obtain the refuse permit")
# for token in doc:
#     print( token.text, token.pos_, token.dep_)


Apple PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.K. PROPN dobj
startup NOUN dep
for ADP prep
$ SYM quantmod
1 NUM compound
billion NUM pobj

Apple is looking at buying U.K. startup for $ 1 billion
PROPN AUX VERB ADP VERB PROPN NOUN ADP SYM NUM NUM


## Name entity recognition

* DATE - Absolute or relative dates or periods
* PERSON - People, including fictional
* GPE - Countries, cities, states
* LOC - Non-GPE locations, mountain ranges, bodies of water
* MONEY - Monetary values, including unit
* TIME - Times smaller than a day
* PRODUCT - Objects, vehicles, foods, etc. (not services)
* CARDINAL - Numerals that do not fall under another type
* ORDINAL - "first", "second", etc.
* QUANTITY - Measurements, as of weight or distance
* EVENT - Named hurricanes, battles, wars, sports events, etc.
* FAC - Buildings, airports, highways, bridges, etc.
* LANGUAGE - Any named language
* LAW - Named documents made into laws.
* NORP - Nationalities or religious or political groups
* PERCENT - Percentage, including "%"
* WORK_OF_ART - Titles of books, songs, etc.



In [14]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
# doc= nlp("The Indian Space Research Organisation or is the national space agency of India, headquartered in Bengaluru. It operates under Department \
# of Space which is directly overseen by the Prime Minister of India while Chairman of ISRO acts as executive of DOS as well.")

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


## Sentiment Analysis

In [3]:
!pip install spacytextblob

Collecting spacytextblob
  Downloading spacytextblob-4.0.0-py3-none-any.whl (4.5 kB)
Collecting textblob<0.16.0,>=0.15.3 (from spacytextblob)
  Downloading textblob-0.15.3-py2.py3-none-any.whl (636 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m636.5/636.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: textblob, spacytextblob
  Attempting uninstall: textblob
    Found existing installation: textblob 0.17.1
    Uninstalling textblob-0.17.1:
      Successfully uninstalled textblob-0.17.1
Successfully installed spacytextblob-4.0.0 textblob-0.15.3


### The sentiment property returns a namedtuple of the form Sentiment(polarity, subjectivity).

* The polarity score is a float within the range [-1.0, 1.0].
* The subjectivity is a float within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective.



In [4]:
from spacytextblob.spacytextblob import SpacyTextBlob
import spacy
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('spacytextblob')

text = 'I had a really horrible day. It was the worst day ever! But every now and then I have a really good day that makes me happy.'

doc = nlp(text)
print(doc._.blob.polarity)
print(doc._.blob.subjectivity)
print(doc._.blob.sentiment_assessments.assessments )



-0.125
0.9
[(['really', 'horrible'], -1.0, 1.0, None), (['worst', '!'], -1.0, 1.0, None), (['really', 'good'], 0.7, 0.6000000000000001, None), (['happy'], 0.8, 1.0, None)]


In [5]:
# Sentence Segmentation with spacy
for sent in doc.sents:
    print(sent)

for sent in doc.sents:
    print(sent._.polarity,sent)

I had a really horrible day.
It was the worst day ever!
But every now and then I have a really good day that makes me happy.
-1.0 I had a really horrible day.
-1.0 It was the worst day ever!
0.75 But every now and then I have a really good day that makes me happy.


## Embedding


1.   Word2Vec
2.   Sentence2Vec

To make them compact and fast, spaCy’s small pipeline packages (all packages that end in **sm**) don’t ship with word vectors, and only include context-sensitive tensors. This means you can still use the similarity() methods to compare documents, spans and tokens – but the result won’t be as good, and individual tokens won’t have any vectors assigned. So in order to use real word vectors, you need to **download a larger** pipeline package:


In [6]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.6.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


## Word2Vec (Token to vec)

In [15]:
#Word2vec
nlp = spacy.load("en_core_web_md")
# tokens = nlp("dog cat banana afskfsd")

tokens = nlp("dog cat king queen man women")

for token in tokens:
     print(token.text, token.has_vector, token.vector_norm, token.is_oov)

for token in tokens:
     print(token.vector.size, token.vector)


for token1 in tokens:
    for token2 in tokens:
        print(token1.text, token2.text, token1.similarity(token2))




dog True 75.254234 False
cat True 63.188496 False
king True 69.68691 False
queen True 44.440845 False
man True 68.725006 False
women True 61.391197 False
300 [ 1.2330e+00  4.2963e+00 -7.9738e+00 -1.0121e+01  1.8207e+00  1.4098e+00
 -4.5180e+00 -5.2261e+00 -2.9157e-01  9.5234e-01  6.9880e+00  5.0637e+00
 -5.5726e-03  3.3395e+00  6.4596e+00 -6.3742e+00  3.9045e-02 -3.9855e+00
  1.2085e+00 -1.3186e+00 -4.8886e+00  3.7066e+00 -2.8281e+00 -3.5447e+00
  7.6888e-01  1.5016e+00 -4.3632e+00  8.6480e+00 -5.9286e+00 -1.3055e+00
  8.3870e-01  9.0137e-01 -1.7843e+00 -1.0148e+00  2.7300e+00 -6.9039e+00
  8.0413e-01  7.4880e+00  6.1078e+00 -4.2130e+00 -1.5384e-01 -5.4995e+00
  1.0896e+01  3.9278e+00 -1.3601e-01  7.7732e-02  3.2218e+00 -5.8777e+00
  6.1359e-01 -2.4287e+00  6.2820e+00  1.3461e+01  4.3236e+00  2.4266e+00
 -2.6512e+00  1.1577e+00  5.0848e+00 -1.7058e+00  3.3824e+00  3.2850e+00
  1.0969e+00 -8.3711e+00 -1.5554e+00  2.0296e+00 -2.6796e+00 -6.9195e+00
 -2.3386e+00 -1.9916e+00 -3.0450e+00  2

## Sentense embedding

*   [defaults] tokens.vector is an average of the token vectors. not capture semantic
*   Use pre-trained transformer models (E.g. universal_sentence_encoder, sentence-transformers) to convert sentense to embedding



In [8]:
import numpy as np
# l-valued meaning representation. Defaults to an average of the token vectors.
tokens = nlp("dog cat banana afskfsd")

matrixArr = np.array([tokens[0].vector,tokens[1].vector,tokens[2].vector,tokens[3].vector])
avg = np.average(matrixArr, axis=0)
print(avg)

print(tokens.vector)

[ 1.285995    1.51985    -3.1519876  -4.857275    0.40372053 -0.702725
 -1.97505    -1.9329001  -0.79143     0.99263746  3.560485    1.390425
  0.26564184  2.01145     3.3977425  -3.612475   -0.15815374 -2.1185076
  1.435475   -1.710825   -2.4027236   2.909375   -2.1509075  -2.2286
 -0.668355   -0.9713     -2.6473498   3.782715   -2.5905025  -0.33405
 -0.61644995 -0.599235   -1.24345    -0.14730498  0.490825   -4.184225
  1.0886575   1.9182426   2.1102002  -2.239075   -0.19210999 -2.6021075
  5.2194247   2.7733      1.3173975   0.5136955   1.3593975  -1.86975
 -0.20521674 -1.4796726   2.3111901   5.665       2.3114748   0.7079749
 -0.90067494  1.17948     2.5487623   0.68675     1.7658175   1.3378
  0.59345746 -3.6535451   0.527775    1.3896024  -2.6922002  -3.325725
 -1.3890749  -0.874045    0.09935001  0.8764      2.7730901   1.0204074
  0.6353925  -0.146353    1.56624    -1.063715   -1.2923775  -0.5483975
  0.75505    -1.590275   -2.441175    1.866395   -0.2400275   1.5825524
 -0.49

In [9]:
import tensorflow_hub as hub
import numpy as np

module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)
print ("module %s loaded" % module_url)


module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [10]:
def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

sentences = ["I ate dinner.",
       "We had a three-course meal.",
       "Brad came to dinner with us.",
       "He loves fish tacos.",
       "In the end, we all felt like we ate too much.",
       "We all agreed; it was a magnificent evening."]

sentence_embeddings = model(sentences)
query = "I had pizza and pasta"
query_vec = model([query])[0]

for sent in sentences:
  sim = cosine(query_vec, model([sent])[0])
  print("Sentence = ", sent, "; similarity = ", sim)

Sentence =  I ate dinner. ; similarity =  0.46866417
Sentence =  We had a three-course meal. ; similarity =  0.35643065
Sentence =  Brad came to dinner with us. ; similarity =  0.20338944
Sentence =  He loves fish tacos. ; similarity =  0.16515437
Sentence =  In the end, we all felt like we ate too much. ; similarity =  0.14987424
Sentence =  We all agreed; it was a magnificent evening. ; similarity =  0.058435917
