In [1]:
import nltk

In [2]:
# Three-sentence sample text fed to the NLTK tokenizers in the cells below.
# Each sentence sits on its own line and the text ends with a newline.
corpus = (
    "Hello Welcome, to my NLP Repository.\n"
    "Hope you are having a good day!\n"
    "I am just trying to learn more about NLP and ML.\n"
)
print(corpus)

Hello Welcome, to my NLP Repository.
Hope you are having a good day!
I am just trying to learn more about NLP and ML.



In [3]:
from nltk.tokenize import sent_tokenize

# Sentence-level split of the corpus. The bare name on the last line makes
# the notebook display the resulting list.
documents = sent_tokenize(text=corpus)
documents

['Hello Welcome, to my NLP Repository.',
 'Hope you are having a good day!',
 'I am just trying to learn more about NLP and ML.']

In [4]:
from nltk.tokenize import word_tokenize

# Word-level split of the whole corpus in one call; punctuation marks come
# back as separate tokens (see the printed list).
words = word_tokenize(text=corpus)
print(words)

['Hello', 'Welcome', ',', 'to', 'my', 'NLP', 'Repository', '.', 'Hope', 'you', 'are', 'having', 'a', 'good', 'day', '!', 'I', 'am', 'just', 'trying', 'to', 'learn', 'more', 'about', 'NLP', 'and', 'ML', '.']


In [5]:
# Same word tokenizer, applied one sentence at a time so that each printed
# list corresponds to exactly one entry of `documents`.
for sentence in documents:
    sentence_tokens = word_tokenize(sentence)
    print(sentence_tokens)

['Hello', 'Welcome', ',', 'to', 'my', 'NLP', 'Repository', '.']
['Hope', 'you', 'are', 'having', 'a', 'good', 'day', '!']
['I', 'am', 'just', 'trying', 'to', 'learn', 'more', 'about', 'NLP', 'and', 'ML', '.']


In [6]:
from nltk.tokenize import wordpunct_tokenize

# Re-tokenize the corpus with the punctuation-splitting tokenizer.
# NOTE: this rebinds `words`, replacing the word_tokenize result from the
# earlier cell.
words = wordpunct_tokenize(text=corpus)
print(words)

['Hello', 'Welcome', ',', 'to', 'my', 'NLP', 'Repository', '.', 'Hope', 'you', 'are', 'having', 'a', 'good', 'day', '!', 'I', 'am', 'just', 'trying', 'to', 'learn', 'more', 'about', 'NLP', 'and', 'ML', '.']


In [7]:
from nltk.tokenize import TreebankWordTokenizer

# Penn-Treebank-style tokenizer. In the recorded output 'Repository.' stays
# a single token while the final '.' after 'ML' is split off, i.e. only the
# period at the very end of the input is separated.
# NOTE(review): this looks like the tokenizer expecting one sentence per
# call -- confirm against the NLTK documentation.
tokenizer = TreebankWordTokenizer()
treebank_tokens = tokenizer.tokenize(corpus)
print(treebank_tokens)

['Hello', 'Welcome', ',', 'to', 'my', 'NLP', 'Repository.', 'Hope', 'you', 'are', 'having', 'a', 'good', 'day', '!', 'I', 'am', 'just', 'trying', 'to', 'learn', 'more', 'about', 'NLP', 'and', 'ML', '.']


In [9]:
import spacy

# Create a blank English pipeline object. The cells below call `nlp(...)` on
# raw text to obtain a tokenized spaCy Doc.
nlp = spacy.blank('en')

In [12]:
# Tokenize one sentence and print each token on its own line.
# The loop variable is deliberately named `token`: it stays bound after the
# loop and a later cell reads it.
sample_sentence = "Dr. Strange loves pav bhaji of mumbai as it costs 2$ per plate"
doc = nlp(sample_sentence)

for token in doc:
    print(token)

Dr.
Strange
loves
pav
bhaji
of
mumbai
as
it
costs
2
$
per
plate


In [14]:
# A Doc supports integer indexing; show the tokens at positions 1 and 10.
(doc[1], doc[10])

(Strange, 2)

In [16]:
# A quoted sentence with a contraction and an abbreviation, to show how the
# tokenizer splits quotes, "'s" and trailing punctuation.
# `token` remains bound after the loop; the type-inspection cell relies on it.
quoted_sentence = '''"Let's go to N.Y!"'''
doc = nlp(quoted_sentence)
for token in doc:
    print(token)

"
Let
's
go
to
N.Y
!
"


In [20]:
# Show the classes behind a Doc, a token, a slice of a Doc and an indexed
# token.
# NOTE: `token` is not assigned in this cell -- it is whatever the most
# recently executed `for token in doc` loop left bound (hidden state).
(
    type(doc),
    type(token),
    type(doc[:2]),
    type(doc[2]),
)

(spacy.tokens.doc.Doc,
 spacy.tokens.token.Token,
 spacy.tokens.span.Span,
 spacy.tokens.token.Token)

In [25]:
# `like_num` is True for the spelled-out number "two" (see recorded output).
doc = nlp("Peter gave two $ to Tony.")
number_word = doc[2]
(number_word, number_word.like_num)

(two, True)

In [26]:
# Token 3 of the sentence above is the "$" sign; show its is_currency flag.
currency_symbol = doc[3]
(currency_symbol, currency_symbol.is_currency)

($, True)

In [27]:
# Print one line per token: the token, an arrow, then label/value pairs for
# a handful of lexical attributes. The labels keep their trailing space, so
# together with print()'s separator each label is followed by two spaces.
for token in doc:
    report = [token, '==>']
    for label, attribute in (
        ("index: ", "i"),
        ("is_alpha: ", "is_alpha"),
        ("is_punct: ", "is_punct"),
        ("like_num: ", "like_num"),
        ("is_currency: ", "is_currency"),
    ):
        report.extend((label, getattr(token, attribute)))
    print(*report)

Peter ==> index:  0 is_alpha:  True is_punct:  False like_num:  False is_currency:  False
gave ==> index:  1 is_alpha:  True is_punct:  False like_num:  False is_currency:  False
two ==> index:  2 is_alpha:  True is_punct:  False like_num:  True is_currency:  False
$ ==> index:  3 is_alpha:  False is_punct:  False like_num:  False is_currency:  True
to ==> index:  4 is_alpha:  True is_punct:  False like_num:  False is_currency:  False
Tony ==> index:  5 is_alpha:  True is_punct:  False like_num:  False is_currency:  False
. ==> index:  6 is_alpha:  False is_punct:  True like_num:  False is_currency:  False


In [28]:
# Print every token of sample_text.txt that looks like an e-mail address.
#
# The encoding is passed explicitly because open() otherwise falls back to
# the platform's locale encoding, which differs between machines.
# NOTE(review): assumes sample_text.txt is UTF-8 encoded -- confirm.
# The file is read in full first so the handle is closed before the
# (potentially slow) tokenization and scan run.
with open("sample_text.txt", "r", encoding="utf-8") as f:
    sample_text = f.read()

doc = nlp(sample_text)
for token in doc:
    if token.like_email:
        print(token)

virat@kohli.com
maria@sharapova.com
serena@williams.com
joe@root.com


In [30]:
# Add the rule-based sentence splitter to the pipeline.
# add_pipe() raises if a component with the same name is already present,
# so guard the call to keep this cell safe to re-run on a live kernel.
if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")

# Display the component, as the bare add_pipe() call did before.
nlp.get_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x27144558780>

In [31]:
# Iterate over sentence spans. This presumably relies on the 'sentencizer'
# component having been added to `nlp` in an earlier cell.
two_sentences = "Dr. Strange loves pav bhaji of mumbai. Hulk loves chaat of delhi"
doc = nlp(two_sentences)

for sentence in doc.sents:
    print(sentence)

Dr. Strange loves pav bhaji of mumbai.
Hulk loves chaat of delhi


In [32]:
# Scan a hard-wrapped paragraph for URL-like tokens.
# Two artefacts visible in the recorded output come from the text itself:
#   * "http://www.science." / "gov/" is broken across two lines, so only
#     "http://www.science" is reported for that address;
#   * URLs that end a sentence keep the sentence's trailing period.
survey_text = '''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''
doc = nlp(survey_text)

for token in doc:
    if not token.like_url:
        continue
    print(token)

http://www.data.gov/
http://www.science
http://data.gov.uk/.
http://www3.norc.org/gss+website/
http://www.europeansocialsurvey.org/.


In [33]:
# Print "<amount> <currency symbol>" pairs, taking the token immediately
# before each currency symbol as the amount (as in the example sentence).
doc = nlp("Tony gave two $ to Peter, Bruce gave 500 € to Steve")

for token in doc:
    # For a currency symbol at position 0, token.i - 1 is -1 and doc[-1] is
    # the LAST token of the document, not a preceding amount. Only look
    # back when a preceding token actually exists.
    if token.is_currency and token.i > 0:
        print(doc[token.i - 1], token)

two $
500 €
