In [1]:
import spacy
nlp=spacy.load('en_core_web_sm')

In [2]:
doc=nlp(u'9 99 9999 ghjkl')

In [3]:
doc[0].is_alpha

False

In [4]:
doc[3].is_alpha

True

In [5]:
doc=nlp(u'"We\'re moving to L.A.!"')
print(doc)

"We're moving to L.A.!"


In [6]:
for t in doc:
    print(t.text,end=' | ')

" | We | 're | moving | to | L.A. | ! | " | 

* First split happens on whitespace.
* Prefix Exception - We're split into We & 're
* Suffix Exception - L.A.! split into  L.A. and !

### NER

In [7]:
doc2= nlp(u'Apple to build a Hongkong factory for $6 million')

In [8]:
for t in doc2:
    print(t.text,end=' | ')


print()
print()
for ent in doc2.ents:
    print(ent.text,' >> ',ent.label_, ' >> ', str(spacy.explain(ent.label_)))

Apple | to | build | a | Hongkong | factory | for | $ | 6 | million | 

Apple  >>  ORG  >>  Companies, agencies, institutions, etc.
Hongkong  >>  GPE  >>  Countries, cities, states
$6 million  >>  MONEY  >>  Monetary values, including unit


In [9]:
from spacy import displacy

displacy.render(doc2,style='ent',jupyter=True)

In [10]:
displacy.render(doc2,style='dep',jupyter=True,options={'distance':90})

In [None]:
displacy.serve(doc2,style='dep')

  "__main__", mod_spec)



Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



# semantic similarity

In [None]:
doc1 = nlp('cat')

In [None]:
doc2 = nlp('lion')

In [None]:
doc1.similarity(doc2)

In [None]:
ex1 = nlp('wolf dog cat fish bird')

In [None]:
for token1 in ex1:
    for token2 in ex1:
        print((token1.text,token2.text),'similarity =>',token1.similarity(token2))

In [None]:
mylist = [(token1.text,token2.text,token1.similarity(token2)) for token1 in ex1 for token2 in ex1]

In [None]:
mylist

In [None]:
import pandas as pd

In [None]:
df = pd.DataFrame(mylist)

In [None]:
df.head()

In [None]:
df.columns = ['token1','token2','similarity']

In [None]:
df

# stop words

In [None]:
from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
print(STOP_WORDS)

In [None]:
nlp.vocab['the'].is_stop

In [None]:
mysentense = nlp('This is a sentense about how to use stopwords in natural language processing')

In [None]:
for words in mysentense:
    if words.is_stop == True:
        print(words)

In [None]:
for words in mysentense:
    if words.is_stop == False:
        print(words)

In [None]:
[words for words in mysentense if words.is_stop == False]

#### adding stop words

In [None]:
STOP_WORDS.add('lol')

In [None]:
nlp.vocab['lol']

# Noun Chunks
  
  * noun + word describing the noun
  * noun phrases
  * adnomial
  * root.text


In [None]:
vl = nlp('The man reading the news is very tall')

In [None]:
for token in vl.noun_chunks:
    print(token.text)

In [None]:
for token in vl.noun_chunks:
    print(token.root.text)

In [None]:
for token in vl.noun_chunks:
    print(token.root.text,'connecyer_text =>',token.root.head.text)

## Sentence Segmentation or Boundary Detection
* ##### Deciding where sentences begin and end
===================================================
* a) If it's a period, it ends a sentence.
* (b) If the preceding token is in the hand-compiled list of abbreviations, then it doesn't end a sentence.
* (c) If the next token is capitalized, then it ends a sentence.
* ===================================================
* Default = Uses the Dependency parser
* Custom Rule Based or Manual
* You set boundaries before parsing the doc

In [1]:
# Manual or Custom Based
def mycustom_boundary(docx):
    for token in docx[:-1]:
        if token.text == '...':
            docx[token.i+1].is_sent_start = True
    return docx

In [4]:
nlp.add_pipe(mycustom_boundary,before='parser')

In [5]:
mydoc = nlp(u"This is my first sentence...the last comment was so cuul... what if...? this is the last sentence")

In [6]:
for sentence in mydoc.sents:
    print(sentence.text)

This is my first sentence...
the last comment was so cuul...
what if...
?
this is the last sentence


## Custom rule based

In [7]:
from spacy.lang.en import English
from spacy.pipeline import SentenceSegmenter

In [10]:
def split_on_newlines(doc):
    start = 0
    seen_newline = False
    for word in doc:
        if seen_newline and not word.is_space:
            yield doc[start:word.i]
            start = word.i
            seen_newline = False
        elif word.text == '\t':
            seen_newline = True
    if start < len(doc):
        yield doc[start:len(doc)]

In [11]:
nlp = English()  # just the language with no model
sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)
nlp.add_pipe(sbd)

In [12]:
doc = nlp(u"This is a great sentence\n\nThis is another comment\nAnd more")
for sent in doc.sents:
    print(sent.text)

This is a great sentence

This is another comment
And more


# Intent Classification with Rasa NLU and SpaCy
* A Libary for intent recognition and entity extraction based on SpaCy and Sklearn
* NLP = NLU+NLG+ More
* NLP = understand,process,interprete everyday human language
* NLU = unstructured inputs and convert them into a structured form that a machine can understand and act upon
* Uses
* Chatbot task
* NL understanding
* Intent classification

###### Installation
* pip install rasa_nlu
* python -m rasa_nlu.server &
* sklearn_crfsuite
###### using spacy as backend
* pip install rasa_nlu[spacy]
* python -m spacy download en_core_web_md
* python -m spacy link en_core_web_md en

= = Dataset = =

* demo-rasa.json
* config_spacy.yaml

In [4]:
pip install rasa_nlu


Collecting rasa_nlu
  Downloading https://files.pythonhosted.org/packages/19/c4/c6146c445a17b6ce414d773f93c941c44ca16720609000ae3d01409f9dfb/rasa_nlu-0.15.1-py3-none-any.whl (147kB)
Collecting jsonschema~=2.6 (from rasa_nlu)
  Downloading https://files.pythonhosted.org/packages/77/de/47e35a97b2b05c2fadbec67d44cfcdcd09b8086951b331d82de90d2912da/jsonschema-2.6.0-py2.py3-none-any.whl
Collecting coloredlogs~=10.0 (from rasa_nlu)
  Downloading https://files.pythonhosted.org/packages/08/0f/7877fc42fff0b9d70b6442df62d53b3868d3a6ad1b876bdb54335b30ff23/coloredlogs-10.0-py2.py3-none-any.whl (47kB)
Collecting typing~=3.6 (from rasa_nlu)
  Downloading https://files.pythonhosted.org/packages/fe/2e/b480ee1b75e6d17d2993738670e75c1feeb9ff7f64452153cf018051cc92/typing-3.7.4.1-py3-none-any.whl
Collecting cloudpickle~=0.6.1 (from rasa_nlu)
  Downloading https://files.pythonhosted.org/packages/fc/87/7b7ef3038b4783911e3fdecb5c566e3a817ce3e890e164fc174c088edb1e/cloudpickle-0.6.1-py2.py3-none-any.whl
Collect

ERROR: spyder 3.3.6 requires pyqt5<5.13; python_version >= "3", which is not installed.
ERROR: spyder 3.3.6 requires pyqtwebengine<5.13; python_version >= "3", which is not installed.
ERROR: jupyterlab-server 1.0.6 has requirement jsonschema>=3.0.1, but you'll have jsonschema 2.6.0 which is incompatible.
ERROR: Could not install packages due to an EnvironmentError: [WinError 5] Access is denied: 'c:\\programdata\\anaconda3\\lib\\site-packages\\jsonschema-3.0.2.dist-info\\COPYING'
Consider using the `--user` option or check the permissions.



Note: you may need to restart the kernel to use updated packages.


In [6]:
python -m rasa_nlu.server 

SyntaxError: invalid syntax (<ipython-input-6-da6270326668>, line 1)

In [8]:
pip install sklearn_crfsuite

Collecting sklearn_crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting tabulate (from sklearn_crfsuite)
  Downloading https://files.pythonhosted.org/packages/c4/41/523f6a05e6dc3329a5660f6a81254c6cd87e5cfb5b7482bae3391d86ec3a/tabulate-0.8.6.tar.gz (45kB)
Collecting python-crfsuite>=0.8.3 (from sklearn_crfsuite)
  Downloading https://files.pythonhosted.org/packages/87/07/91b578dabc20e78f77aa51dc2e1570099b9b4cc2f7f437a7007d212be464/python_crfsuite-0.9.6-cp37-cp37m-win_amd64.whl (154kB)
Building wheels for collected packages: tabulate
  Building wheel for tabulate (setup.py): started
  Building wheel for tabulate (setup.py): finished with status 'done'
  Created wheel for tabulate: filename=tabulate-0.8.6-cp37-none-any.whl size=23279 sha256=78f932b93131a13109744f63177cc3eb0b9e7517d1a7e98ef419d6704cf9f973
  Stored in directory: C:\Users\Vimannyu Singh\AppData\

In [None]:
pip install rasa_nlu[spacy]

In [None]:
python -m spacy download en_core_web_md
python -m spacy link en_core_web_md e