In [3]:
import spacy

In [4]:
nlp = spacy.blank('en')
doc = nlp('Dr Strange loved Pav Bhaji in Mumbai as it costs only 2$ a plate')
for token in doc:
    print(token)

Dr
Strange
loved
Pav
Bhaji
in
Mumbai
as
it
costs
only
2
$
a
plate


In [6]:
type(nlp)

spacy.lang.en.English

In [7]:
type(doc)

spacy.tokens.doc.Doc

In [9]:
print(doc[1:5])
print(type(doc[1:5]))

Strange loved Pav Bhaji
<class 'spacy.tokens.span.Span'>


In [10]:
with open('list.txt') as f:
    text = f.readlines()
text

['Name        BirthDay        Email\n',
 'Virat       5 June, 1992    virat@kohli.com\n',
 'Maria       12 April, 2001  maria@sharapova.com\n',
 'Serena      24 June, 1998   serena@williams.com\n',
 'Joe         1 May, 1897     joe@rogan.com']

In [11]:
text = ' '.join(text)
text

'Name        BirthDay        Email\n Virat       5 June, 1992    virat@kohli.com\n Maria       12 April, 2001  maria@sharapova.com\n Serena      24 June, 1998   serena@williams.com\n Joe         1 May, 1897     joe@rogan.com'

In [13]:
doc = nlp(text)
emails = []
for token in doc:
    if token.like_email:
        emails.append(token.text)
print(emails)

['virat@kohli.com', 'maria@sharapova.com', 'serena@williams.com', 'joe@rogan.com']


In [14]:
tokens = [token.text for token in doc]
tokens

['Name',
 '       ',
 'BirthDay',
 '       ',
 'Email',
 '\n ',
 'Virat',
 '      ',
 '5',
 'June',
 ',',
 '1992',
 '   ',
 'virat@kohli.com',
 '\n ',
 'Maria',
 '      ',
 '12',
 'April',
 ',',
 '2001',
 ' ',
 'maria@sharapova.com',
 '\n ',
 'Serena',
 '     ',
 '24',
 'June',
 ',',
 '1998',
 '  ',
 'serena@williams.com',
 '\n ',
 'Joe',
 '        ',
 '1',
 'May',
 ',',
 '1897',
 '    ',
 'joe@rogan.com']

In [17]:
#handling special cases
from spacy.symbols import ORTH

nlp.tokenizer.add_special_case('gimme',[
    {ORTH: 'gim'},
    {ORTH: 'me'}
])

doc = nlp('give me a double cheeseburger with extra healthy pizza and a diet coke')

tokens = [token.text for token in doc]
tokens

['give',
 'me',
 'a',
 'double',
 'cheeseburger',
 'with',
 'extra',
 'healthy',
 'pizza',
 'and',
 'a',
 'diet',
 'coke']

In [34]:
#adding a pipeline component manually

#nlp.add_pipe('sentencizer')


ValueError: [E007] 'sentencizer' already exists in pipeline. Existing names: ['sentencizer']

In [None]:
nlp.pipe_names

['sentencizer']

In [35]:
doc = nlp("Dr. Strange is kinda Strange. Hulk is a big green dude")
for sentence in doc.sents:
    print(sentence)

Dr. Strange is kinda Strange.
Hulk is a big green dude


In [36]:
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''
nlp.remove_pipe('sentencizer')
nlp.pipe_names

[]

In [39]:
doc = nlp(text)
tokens = [token for token in doc]
tokens


[,
 Look,
 for,
 data,
 to,
 help,
 you,
 address,
 the,
 question,
 .,
 Governments,
 are,
 good,
 ,
 sources,
 because,
 data,
 from,
 public,
 research,
 is,
 often,
 freely,
 available,
 .,
 Good,
 ,
 places,
 to,
 start,
 include,
 http://www.data.gov/,
 ,,
 and,
 http://www.science,
 .,
 ,
 gov/,
 ,,
 and,
 in,
 the,
 United,
 Kingdom,
 ,,
 http://data.gov.uk/.,
 ,
 Two,
 of,
 my,
 favorite,
 data,
 sets,
 are,
 the,
 General,
 Social,
 Survey,
 at,
 http://www3.norc.org/gss+website/,
 ,,
 ,
 and,
 the,
 European,
 Social,
 Survey,
 at,
 http://www.europeansocialsurvey.org/.,
 ]

In [40]:
urls = []
for token in tokens:
    if token.like_url:
        urls.append(token.text)
urls


['http://www.data.gov/',
 'http://www.science',
 'http://data.gov.uk/.',
 'http://www3.norc.org/gss+website/',
 'http://www.europeansocialsurvey.org/.']

In [48]:
text = 'Tony gave two $ to Peter, Bruce gave 500 $ to Steve'
doc = nlp(text)
print(doc)

Tony gave two $ to Peter, Bruce gave 500$ to Steve


In [49]:
for token in doc:
    if token.like_num and doc[token.i+1].is_currency:
        print(token.text , doc[token.i+1].text)

two $
500 $
