In [1]:
import spacy

In [2]:
# Create a blank English pipeline. It starts with no pipeline components
# (nlp.pipe_names is empty at this point) but can already tokenize text.
nlp = spacy.blank("en")

In [7]:
# Tokenize a quoted sentence that contains a contraction and an abbreviation,
# then show the result one token per line.
quoted_sentence = '''"Let's go to N.Y.!"'''
doc = nlp(quoted_sentence)

for token in doc:
    print(token)

"
Let
's
go
to
N.Y.
!
"


In [8]:
# Tokenize a two-sentence example containing a title, a currency symbol and a number.
# `doc` and `token` stay bound after this cell; the following cells inspect them.
mortgage_text = "Mr.David has a very big house. He took it out of a mortgage loan for $500 only"
doc = nlp(mortgage_text)

for token in doc:
    print(token)

Mr.
David
has
a
very
big
house
.
He
took
it
out
of
a
mortgage
loan
for
$
500
only


In [9]:
# Inspect the class of the pipeline object created by spacy.blank("en")
type(nlp)

spacy.lang.en.English

In [10]:
# Inspect the class of the object returned by calling nlp on a string
type(doc)

spacy.tokens.doc.Doc

In [11]:
# `token` is the loop variable left over from the last `for token in doc` loop,
# so this cell only works after a cell containing such a loop has been run.
type(token)

spacy.tokens.token.Token

In [14]:
# Slice the Doc by token index: tokens 1 through 7 (the end index 8 is exclusive)
doc[1:8]

David has a very big house.

In [19]:
# For every token print its text and index, followed by a few lexical flags.
# Note: the ";is_num:" column reports token.like_num.
for token in doc:
    flag_columns = (
        ';is_aplha:', token.is_alpha,
        ';is_punct:', token.is_punct,
        ';is_num:', token.like_num,
        ';is_currency:', token.is_currency,
    )
    print(token.text, '==> index:', token.i, *flag_columns)

Mr. ==> index: 0 ;is_aplha: False ;is_punct: False ;is_num: False ;is_currency: False
David ==> index: 1 ;is_aplha: True ;is_punct: False ;is_num: False ;is_currency: False
has ==> index: 2 ;is_aplha: True ;is_punct: False ;is_num: False ;is_currency: False
a ==> index: 3 ;is_aplha: True ;is_punct: False ;is_num: False ;is_currency: False
very ==> index: 4 ;is_aplha: True ;is_punct: False ;is_num: False ;is_currency: False
big ==> index: 5 ;is_aplha: True ;is_punct: False ;is_num: False ;is_currency: False
house ==> index: 6 ;is_aplha: True ;is_punct: False ;is_num: False ;is_currency: False
. ==> index: 7 ;is_aplha: False ;is_punct: True ;is_num: False ;is_currency: False
He ==> index: 8 ;is_aplha: True ;is_punct: False ;is_num: False ;is_currency: False
took ==> index: 9 ;is_aplha: True ;is_punct: False ;is_num: False ;is_currency: False
it ==> index: 10 ;is_aplha: True ;is_punct: False ;is_num: False ;is_currency: False
out ==> index: 11 ;is_aplha: True ;is_punct: False ;is_num: False ;is_currency: False

In [22]:
# Load the students list: one string per line of the file, newlines included
with open('students.txt') as students_file:
    text = list(students_file)
text

['Dayton high school, 8th grade students information\n',
 '\n',
 'Name\tbirth day   \temail\n',
 '-----\t------------\t------\n',
 'Virat   5 June, 1882    virat@kohli.com\n',
 'Maria\t12 April, 2001  maria@sharapova.com\n',
 'Serena  24 June, 1998   serena@williams.com \n',
 'Joe      1 May, 1997    joe@root.com\n',
 '\n',
 '\n',
 '\n']

In [23]:
# Join the list of lines into a single string, separated by spaces.
# NOTE: this rebinds `text` from a list to a str, so re-running this cell alone
# would join individual characters instead; re-run the loading cell first.
text = " ".join(text)
text




In [27]:
# Extract the students' emails: keep the text of every token whose
# like_email flag is set.
doc = nlp(text)
emails = [token.text for token in doc if token.like_email]
emails

['virat@kohli.com',
 'maria@sharapova.com',
 'serena@williams.com',
 'joe@root.com']

In [29]:
# Customizing the tokenizer, step 1: baseline.
# With the default rules "gimme" comes out as a single token.
doc = nlp('gimme a ride to bahamas sir')
tokens = [tok.text for tok in doc]
tokens

['gimme', 'a', 'ride', 'to', 'bahamas', 'sir']

In [34]:
# Customizing the tokenizer, step 2: register a special case so that
# "gimme" is split into the two tokens 'gim' and 'me'.
from spacy.symbols import ORTH

gimme_pieces = [{ORTH: 'gim'}, {ORTH: 'me'}]
nlp.tokenizer.add_special_case('gimme', gimme_pieces)

doc = nlp('gimme a ride to bahamas sir')
tokens = [tok.text for tok in doc]
tokens



['gim', 'me', 'a', 'ride', 'to', 'bahamas', 'sir']

In [35]:
#splitting sentences
# The blank pipeline has nothing that sets sentence boundaries yet, so iterating
# doc.sents raises ValueError [E030]. That error is what this cell demonstrates,
# so it is caught and displayed instead of aborting a "Restart & Run All".
doc = nlp("Mr.David has a very big house. He took it out of a mortgage loan for $500 only")
try:
    for sentence in doc.sents:
        print(sentence)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: [E030] Sentence boundaries unset. You can add the 'sentencizer' component to the pipeline with: `nlp.add_pipe('sentencizer')`. Alternatively, add the dependency parser or sentence recognizer, or set sentence boundaries by setting `doc[i].is_sent_start`.

In [38]:
# The error indicates that no pipeline component is setting sentence boundaries.
# Check which components the pipeline currently contains.
nlp.pipe_names


[]

In [39]:
# The pipeline is empty, so add the 'sentencizer' component suggested by the
# error message. add_pipe raises if a component with that name is already in
# the pipeline, so the call is guarded to keep this cell safe to re-run.

if 'sentencizer' not in nlp.pipe_names:
    nlp.add_pipe('sentencizer')
nlp.pipe_names

['sentencizer']

In [40]:
# With the sentencizer in the pipeline, doc.sents can now be iterated:
# print each detected sentence on its own line.
two_sentence_text = "Mr.David has a very big house. He took it out of a mortgage loan for $500 only"
doc = nlp(two_sentence_text)
for sent in doc.sents:
    print(sent)

Mr.David has a very big house.
He took it out of a mortgage loan for $500 only


In [41]:
# Extract the URLs from this text.
# The sample is kept verbatim, including the URL that is broken across two lines.
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''

# Keep the text of every token whose like_url flag is set.
# Tokens are taken as-is, so a URL directly followed by a sentence-ending
# period keeps that period (e.g. 'http://data.gov.uk/.').
doc = nlp(text)
url = [token.text for token in doc if token.like_url]
url
        

['http://www.data.gov/',
 'http://www.science',
 'http://data.gov.uk/.',
 'http://www3.norc.org/gss+website/',
 'http://www.europeansocialsurvey.org/.']

In [57]:
#extract financial transactions along with currency
# Strategy: find a number-like token and check that the token right after it
# is a currency symbol, then print the pair.
line = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"
doc = nlp(line)
last_index = len(doc) - 1
for token in doc:
    # A number-like token in final position has no successor; without the
    # bounds check doc[token.i+1] would raise IndexError.
    if token.like_num and token.i < last_index and doc[token.i+1].is_currency:
        print(token.text, doc[token.i+1].text)

two $
500 €


In [None]:
#alternatively
# Strategy: find a currency symbol and look back one token for the amount.
line = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"
doc = nlp(line)
for token in doc:
    # token.i > 0: for a currency symbol in first position doc[token.i-1] would
    # be doc[-1], i.e. silently wrap around to the last token.
    # like_num: only report the pair when the preceding token is actually an
    # amount, matching the number-then-currency check used in the first approach.
    if token.is_currency and token.i > 0 and doc[token.i-1].like_num:
        print(doc[token.i-1],token)
        

        

