In [29]:
import spacy

In [30]:
# Build a blank English pipeline and run one sample sentence through it.
nlp = spacy.blank('en')

sample_sentence = 'Dr. Strange lived in Hanoi and he loves this place so much.'
doc = nlp(sample_sentence)
# Iterating over a Doc yields its tokens; show one token per line.
print(*doc, sep='\n')

Dr.
Strange
lived
in
Hanoi
and
he
loves
this
place
so
much
.


In [31]:
# A Doc supports integer indexing; position 2 is the third token of the sentence.
doc[2]

lived

In [32]:
# Tokenize a new sentence; `doc` is rebound, so the cells below inspect this text.
doc = nlp('Tony gave two $ to Peter')

In [33]:
# List every attribute and method name exposed by the Doc object.
dir(doc)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '_bulk_merge',
 '_context',
 '_get_array_attrs',
 '_realloc',
 '_vector',
 '_vector_norm',
 'cats',
 'char_span',
 'copy',
 'count_by',
 'doc',
 'ents',
 'extend_tensor',
 'from_array',
 'from_bytes',
 'from_dict',
 'from_disk',
 'from_docs',
 'from_json',
 'get_extension',
 'get_lca_matrix',
 'has_annotation',
 'has_extension',
 'has_unknown_spaces',
 'has_vector',
 'is_nered',
 'is_parsed',
 'is_sentenced',
 'is_tagged',
 'lang',
 'lang_',
 'mem',
 'noun_chunks',
 'noun_chunks_iterator',
 'remove_extension',
 'retokenize',
 'sentiment',
 'sents',
 'set

In [34]:
# Keep the first token of the sentence; the bare name on the last line displays it.
token0 = doc[0]
token0

Tony

In [35]:
# Elements of a Doc are spaCy Token objects, not plain strings.
type(token0)

spacy.tokens.token.Token

In [36]:
# Ask the token whether it is alphabetic; for 'Tony' the answer is True.
token0.is_alpha

True

In [37]:
# doc[2] is the word 'two'; like_num is True for it, so a spelled-out
# number is treated as number-like just as digits would be.
token2 = doc[2]
token2.like_num

True

In [38]:
# Read the student roster as a list of lines (each line keeps its trailing '\n').
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding.
with open('./data/students.txt', encoding='utf-8') as f:
    text = f.readlines()
text

['Dayton high school, 8th grade students information\n',
 '\n',
 'Name\tbirth day   \temail\n',
 '-----\t------------\t------\n',
 'Virat   5 June, 1882    virat@kohli.com\n',
 'Maria\t12 April, 2001  maria@sharapova.com\n',
 'Serena  24 June, 1998   serena@williams.com \n',
 'Joe      1 May, 1997    joe@root.com']

In [39]:
# Collapse the list of lines into one string so it can be tokenized in one go.
# The isinstance guard keeps this cell idempotent: without it, running the cell
# a second time would join the *characters* of the already-joined string with
# spaces and silently corrupt `text`.
if not isinstance(text, str):
    text = ' '.join(text)
text



In [40]:
# Tokenize the roster and keep only the tokens flagged as e-mail addresses.
doc = nlp(text)
emails = [tok.text for tok in doc if tok.like_email]
emails

['virat@kohli.com',
 'maria@sharapova.com',
 'serena@williams.com',
 'joe@root.com']

### Customize tokenizer

In [41]:
from spacy.symbols import ORTH
# Baseline: with a new blank pipeline, 'gimme' comes out as a single token.
nlp = spacy.blank('en')
doc = nlp('gimme double cheese extra large healthy pizza')
tokens = [word.text for word in doc]
tokens

['gimme', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [42]:
# Register a tokenizer exception that splits 'gimme' into the pieces 'gim' + 'me',
# then tokenize the same sentence again to see the effect.
gimme_pieces = [{ORTH: 'gim'}, {ORTH: 'me'}]
nlp.tokenizer.add_special_case('gimme', gimme_pieces)

doc = nlp('gimme double cheese extra large healthy pizza')
tokens = [word.text for word in doc]
tokens

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [43]:
# The pipeline has no component that sets sentence boundaries yet, so reading
# `doc.sents` raises ValueError [E030]. That error is the point of this cell,
# so it is caught and printed instead of aborting a "Restart & Run All".
doc = nlp('DR. Strange loves banh mi in Hanoi. Thor loves restroom in Nam Dinh')
try:
    for sentence in doc.sents:
        print(sentence)
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

ValueError: [E030] Sentence boundaries unset. You can add the 'sentencizer' component to the pipeline with: `nlp.add_pipe('sentencizer')`. Alternatively, add the dependency parser or sentence recognizer, or set sentence boundaries by setting `doc[i].is_sent_start`.

In [44]:
# Show the pipeline's components; it is empty here, so nothing has set sentence boundaries.
nlp.pipeline

[]

In [45]:
# Add the rule-based sentencizer so that `doc.sents` becomes available.
# The has_pipe guard makes the cell safe to re-run: adding a component whose
# name is already registered in the pipeline raises an error.
if not nlp.has_pipe('sentencizer'):
    nlp.add_pipe('sentencizer')

doc = nlp('Dr. Strange loves banh mi in Hanoi. Thor loves restroom in Nam Dinh')
for sentence in doc.sents:
    print(sentence)

Dr. Strange loves banh mi in Hanoi.
Thor loves restroom in Nam Dinh


# Exercise
(1) Think Stats is a free book for studying statistics (https://greenteapress.com/thinkstats2/thinkstats2.pdf)

This book references many websites where you can download free datasets. You are an NLP engineer working for a company, and you want to collect all the dataset websites mentioned in this book. To keep the exercise simple, you are given one paragraph from the book, and you want to grab all the URLs from this paragraph using spaCy.

In [47]:
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''
# `token.like_url` is the token attribute that flags URL-shaped tokens.
# The matches are collected exactly as tokenized: a URL wrapped across two
# lines in the paragraph is cut at the line break, and a sentence-final '.'
# can stay attached to the URL.
nlp = spacy.blank('en')
doc = nlp(text)
urls = [tok.text for tok in doc if tok.like_url]
urls

['http://www.data.gov/',
 'http://www.science',
 'http://data.gov.uk/.',
 'http://www3.norc.org/gss+website/',
 'http://www.europeansocialsurvey.org/.']

(2) Extract all money transactions from the sentence below, along with their currency. The output should be:

two $

500 €

In [78]:
# Print every "<amount> <currency>" pair: a number-like token that is
# immediately followed by a currency-symbol token.
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"
doc = nlp(transactions)
for token in doc:
    # Look ahead only when a next token exists; a number-like token at the very
    # end of the text would otherwise make doc[token.i + 1] raise IndexError.
    if token.like_num and token.i + 1 < len(doc):
        following = doc[token.i + 1]
        if following.is_currency:
            print(token.text, following)

two $
500 €
