In [4]:
import numpy as np
import pandas as pd
import spacy

In [5]:
# Load the 'en_core_web_sm' pipeline once; the resulting `nlp` callable is reused by the cells below.
nlp = spacy.load('en_core_web_sm')

# Learning Natural Language Processing (NLP)

In [6]:
# Run the pipeline over a sample sentence to obtain a processed document.
doc = nlp(u"Tesla is looking at buying U.S. startup for around $6 million")


In [7]:
# One line per token: surface text, part-of-speech tag and dependency label.
for tok in doc:
    print(f"{tok.text} {tok.pos_} {tok.dep_}")

Tesla NOUN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
around ADP quantmod
$ SYM quantmod
6 NUM compound
million NUM pobj


In [8]:
# List the names of the components in the loaded pipeline.
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


## Tokenizer 
Tokenization is used to split paragraphs and sentences into smaller units that can be more easily assigned meaning. The first step of the NLP process is gathering the data (a sentence) and breaking it into understandable parts (words).

In [9]:
# The contraction "isn't" and the run of extra spaces both surface as separate tokens.
doc2 = nlp(u"Tesla isn't looking into startups   anymore")
for tok in doc2:
    print(f"{tok.text} {tok.pos_} {tok.dep_}")

Tesla NOUN nsubj
is AUX aux
n't PART neg
looking VERB ROOT
into ADP prep
startups NOUN compound
   SPACE dep
anymore ADV advmod


In [10]:
# Inspect the attributes of the first token of doc2, one per line.
first_token = doc2[0]
print(first_token.pos_)      # coarse part-of-speech tag
print(first_token.dep_)      # syntactic dependency label
print(first_token.tag_)      # fine-grained part-of-speech tag
print(first_token.lemma_)    # base form of the word
print(first_token.is_alpha)  # does the token consist of alphabetic characters?
print(first_token.is_stop)   # is it a stop word, i.e. one of the most common words in the language?

NOUN
nsubj
NN
tesla
True
False


## spans

A span is a slice of a document, written as `doc[start:stop]`.

In [11]:
# Sample text for the span example in the next cell (the misspellings are part of the input string).
doc = nlp(u"Although commanly attributed to Jhon Lennon from his song 'Beautiful' the phrase 'Life is what happens to while we are making other plans' was writtten by a cartoon artist Allen Saunders")


In [12]:
# Slicing a document with doc[start:stop] yields a Span rather than a plain list.
# NOTE(review): per the printed output, [15:30] runs past the closing quote and also
# picks up "was writtten by" — confirm whether a smaller stop index was intended.
life_quote = doc[15:30]
print(life_quote)
type(life_quote) # last expression of the cell, so the type of the slice is displayed

Life is what happens to while we are making other plans' was writtten by


spacy.tokens.span.Span

In [13]:
doc4 = nlp("This is first sentance.This is another sentence.This is last sentence")

# `sents` yields the detected sentences one at a time.
for sent in doc4.sents:
    print(sent)

# Per the output below, token 10 starts a sentence while token 8 does not.
for idx in (10, 8):
    print(doc4[idx].is_sent_start)

This is first sentance.
This is another sentence.
This is last sentence
True
False


In [14]:
# For a sentence break, the period must be followed by a space ('. ')
# or directly by a capital letter ('.<Caps>').
doc5 = nlp("This is first sentance. this is another sentence.This is last sentence")

for sent in doc5.sents:
    print(sent)


This is first sentance.
this is another sentence.
This is last sentence


## Tokenization

 * prefix: character(s) at the beginning, e.g. `$`, `(`
 * suffix: character(s) at the end, e.g. `km`, `m`
 * infix: character(s) in between, e.g. `--`, `/`, `...`
 * exception: a special-case rule that splits a string into several tokens, or prevents a token from being split when punctuation rules are applied, e.g. `U.S.`

In [15]:
mystr = '"We\'re moving to L.A.!"'
print(mystr)
doc = nlp(mystr)
# The quotes and the "!" become their own tokens, while "L.A." stays in one piece.
for tok in doc:
    print(tok.text)

"We're moving to L.A.!"
"
We
're
moving
to
L.A.
!
"


In [16]:
# The e-mail address and the URL each come out as a single token.
doc2 = nlp(u"We\'re here to help! Send us email, email support@oursite.com or visit http://www.oursite.com")
for tok in doc2:
    print(tok)

We
're
here
to
help
!
Send
us
email
,
email
support@oursite.com
or
visit
http://www.oursite.com


In [17]:
doc3 = nlp("a 5km NYC to St.Louis church U.S. cab ride costs $10.30")
# Print all tokens on one line, each followed by a " | " separator.
for tok in doc3:
    print(tok, end=" | ")

# len() of the document is its token count.
print("Token length", len(doc3))
# Size of the vocabulary attached to the document.
print(len(doc3.vocab))
# A slice of the document covering tokens 3 to 6.
print(doc3[3:7])
# Documents do not support item assignment: doc3[0] = "test" will not work.

a | 5 | km | NYC | to | St. | Louis | church | U.S. | cab | ride | costs | $ | 10.30 | Token length 14
835
NYC to St.Louis


## Named Entities

In [18]:
# Sample sentence for the named-entity examples; show its tokens first.
docent = nlp("Intel company to build a factory in India worth INR 8crore")
for tok in docent:
    print(tok.text, end=" | ")

Intel | company | to | build | a | factory | in | India | worth | INR | 8crore | 

In [19]:
# `ents` holds the named entities detected in the document.
for ent in docent.ents:
    print(ent, end=" | ")

Intel | India | INR | 

In [20]:
# For each entity: its text, its label, and spaCy's description of that label.
for ent in docent.ents:
    description = spacy.explain(ent.label_)
    print(f"{ent} {ent.label_} {description}", end=' | ')

Intel ORG Companies, agencies, institutions, etc. | India GPE Countries, cities, states | INR ORG Companies, agencies, institutions, etc. | 

### Chunks: Noun chunks 

In [21]:
docent1 = nlp("Autonomous cars shift insurance liabilities to software manufactures")
# Print each noun chunk found in the sentence.
for noun_chunk in docent1.noun_chunks:
    print(noun_chunk)

Autonomous cars
insurance liabilities
software manufactures


### Token Visualization

In [22]:
from spacy import displacy

In [23]:
# Sentence rendered by the dependency visualisation in the next cell.
doc = nlp("Intel is going to build a factory to build only GPU's in Banglore for $6 million")

In [24]:
# Render the syntactic dependency view of `doc` inline in the notebook.
displacy.render(doc,style='dep',jupyter=True,options={'distance':100}) # syntactic dependency

In [25]:
# Sentence rendered by the named-entity visualisation in the next cell.
doc1 = nlp("Over the last few quarters Apple sold nearly 50 thousand iPods for a profit of around $6 million.")

In [26]:
# Render the named-entity view of `doc1` inline in the notebook.
displacy.render(doc1,style='ent',jupyter=True) # see the displaCy documentation for further options

### Stemming

When searching for certain keywords, it often helps if the search also returns variations of the word.

* For instance, searching for "boat" might also return "boats" and "boating". Here "boat" would be the stem for [boating, boater, boats]. spaCy does not include a stemmer; instead it provides lemmatization.

* A stemmer chops off letters until the stem is reached.

### Algorithm 1: Porter's Algorithm
* From a given set of rules only one rule applies based on longest suffix S1.

* More sophisticated phases consider the length and complexity of the word.

### Algorithm 2: Snowball Algorithm
The Snowball stemmer is an improved version of the Porter algorithm, also developed by Porter. It is more refined but still has some flaws.

In [28]:
import nltk 
from nltk.stem.porter import PorterStemmer
# Porter's algorithm, via NLTK's PorterStemmer.
p_stemmer = PorterStemmer()

# Sample vocabulary that the following cells run through the stemmers.
words = [
    'run', 'runner', 'rans', 'easily', 'fairly',
    'categorically', 'basically', 'boating', 'organically', 'fairness',
    'raining', 'generation', 'generate', 'generous', 'generously',
]


In [29]:
# Show each word next to its Porter stem.
for word in words:
    stem = p_stemmer.stem(word)
    print(f"{word}------->{stem}")

run------->run
runner------->runner
rans------->ran
easily------->easili
fairly------->fairli
categorically------->categor
basically------->basic
boating------->boat
organically------->organ
fairness------->fair
raining------->rain
generation------->gener
generate------->gener
generous------->gener
generously------->gener


In [30]:
from nltk.stem.snowball import SnowballStemmer
# Same word list through the English Snowball stemmer, for comparison with the Porter output.
s_stemmer = SnowballStemmer(language='english')
for word in words:
    stem = s_stemmer.stem(word)
    print(f"{word}------->{stem}")

run------->run
runner------->runner
rans------->ran
easily------->easili
fairly------->fair
categorically------->categor
basically------->basic
boating------->boat
organically------->organ
fairness------->fair
raining------->rain
generation------->generat
generate------->generat
generous------->generous
generously------->generous


## Lemmatization

In contrast to stemming, lemmatization looks beyond word reduction and considers a language's full vocabulary to apply morphological analysis to words.

Example: the lemma of 'was' is 'be' and the lemma of 'mice' is 'mouse'; in a similar way, the lemma of 'meeting' might depend on how it is used in the sentence.

In [31]:
import spacy 
# NOTE(review): the same 'en_core_web_sm' model is already loaded into `nlp` near the top
# of the notebook, so this reload looks redundant — confirm before removing.
nlp = spacy.load('en_core_web_sm')


In [32]:
# Sentence with several inflected forms ("am"/"was", "running"/"run", "won") for the lemma table below.
doc1 = nlp("I am runner running in a race as I love to run since I won a race when I was age 10")

In [37]:
def show_lemmas(text):
    """Print one aligned row per token: text, POS tag, lemma hash and lemma string.

    `text` is iterated item by item; each item must expose the attributes
    `text`, `pos_`, `lemma` and `lemma_`. The columns are left-aligned and
    padded to widths 12, 6 and 22; the lemma string comes last. Returns None.
    """
    for tok in text:
        row = f'{tok.text:12} {tok.pos_:6} {tok.lemma:<22} {tok.lemma_} '
        print(row)

# Column legend for the table, then a tab-only spacer line, then the rows themselves.
for heading in ("Text , POS , Lemma-hash, Lemma_Des", "\t"):
    print(heading)
show_lemmas(doc1)


Text , POS , Lemma-hash, Lemma_Des
	
I            PRON   4690420944186131903    I 
am           AUX    10382539506755952630   be 
runner       NOUN   12640964157389618806   runner 
running      VERB   12767647472892411841   run 
in           ADP    3002984154512732771    in 
a            DET    11901859001352538922   a 
race         NOUN   8048469955494714898    race 
as           SCONJ  7437575085468336610    as 
I            PRON   4690420944186131903    I 
love         VERB   3702023516439754181    love 
to           PART   3791531372978436496    to 
run          VERB   12767647472892411841   run 
since        SCONJ  10066841407251338481   since 
I            PRON   4690420944186131903    I 
won          VERB   471204509717844521     win 
a            DET    11901859001352538922   a 
race         NOUN   8048469955494714898    race 
when         SCONJ  15807309897752499399   when 
I            PRON   4690420944186131903    I 
was          AUX    10382539506755952630   be 
age        