In [1]:
!pip install spacy



In [2]:
import spacy

In [3]:
nlp = spacy.blank("en")

In [4]:
doc = nlp("I was born in India in 1996, I am 27 years old")

In [5]:
# Iterating over the document yields its tokens one at a time; print each one.
# NOTE: the loop variable `token` remains bound to the last token after the loop finishes.
for token in doc:
    print(token)

I
was
born
in
India
in
1996
,
I
am
27
years
old


### Grab tokens using index

In [6]:
token0 = doc[0]
token0

I

In [7]:
token0.is_alpha

True

In [8]:
token6 = doc[6]
token6

1996

In [9]:
# Inspect the type of the token grabbed by index, instead of relying on whatever
# a loop variable from an earlier cell happens to be bound to.
type(token0)

spacy.tokens.token.Token

In [10]:
# List the attributes available on a token, using the token grabbed by index
# rather than a loop variable left over from an earlier cell.
dir(token0)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang

### Span object

In [11]:
span = doc[0:7]
span

I was born in India in 1996

In [12]:
type(span)

spacy.tokens.span.Span

### Token attribute

In [13]:
# Print the alphabetic flag and then the digit flag of token6, one per line.
print(token6.is_alpha, token6.is_digit, sep="\n")

False
True


In [14]:
doc = nlp("MacbookAir costs $100 for its M2 Series. ")

In [15]:
token2=doc[2]

In [16]:
token2

$

In [17]:
token2.is_currency

True

In [18]:
# Boolean token attributes to report, in the order they are printed.
flag_names = ("is_alpha", "is_punct", "is_digit", "is_currency")

# One line per token: the token, its index in the document, then each flag as "name: value".
for token in doc:
    flag_report = [f"{name}: {getattr(token, name)}" for name in flag_names]
    print(token, "==>", "index:", token.i, *flag_report)

MacbookAir ==> index: 0 is_alpha: True is_punct: False is_digit: False is_currency: False
costs ==> index: 1 is_alpha: True is_punct: False is_digit: False is_currency: False
$ ==> index: 2 is_alpha: False is_punct: False is_digit: False is_currency: True
100 ==> index: 3 is_alpha: False is_punct: False is_digit: True is_currency: False
for ==> index: 4 is_alpha: True is_punct: False is_digit: False is_currency: False
its ==> index: 5 is_alpha: True is_punct: False is_digit: False is_currency: False
M2 ==> index: 6 is_alpha: False is_punct: False is_digit: False is_currency: False
Series ==> index: 7 is_alpha: True is_punct: False is_digit: False is_currency: False
. ==> index: 8 is_alpha: False is_punct: True is_digit: False is_currency: False


### Collecting Emails from student info

In [19]:
# Read the raw student records as a list of lines (newline characters are kept).
# The encoding is stated explicitly so the result does not depend on the platform default.
with open('students.txt', encoding='utf-8') as f:
    text = f.readlines()
text

['Dayton high school, 8th grade students information\n',
 '\n',
 'Name\tbirth day   \temail\n',
 '-----\t------------\t------\n',
 'Virat   5 June, 1882    virat@kohli.com\n',
 'Maria\t12 April, 2001  maria@sharapova.com\n',
 'Serena  24 June, 1998   serena@williams.com \n',
 'Joe      1 May, 1997    joe@root.com\n',
 '\n',
 '\n',
 '\n']

In [20]:
# Merge the list of lines into a single string for the tokenizer.
# The guard keeps this cell safe to re-run: joining an already-joined string
# would insert a space between every character.
if not isinstance(text, str):
    text = " ".join(text)

In [21]:
text



In [22]:
doc = nlp(text)

In [23]:
emails=[]

In [24]:
# Build the list in one step instead of appending to a list created in another cell,
# so re-running this cell does not add the same addresses a second time.
emails = [token.text for token in doc if token.like_email]
emails

['virat@kohli.com',
 'maria@sharapova.com',
 'serena@williams.com',
 'joe@root.com']

### Other Languages

In [25]:
nlp = spacy.blank('ml')

In [26]:
doc = nlp('മാക്ബുക്ക്എയർ അതിന്റെ എംടു സീരീസിന് $100 ചിലവാകുന്നു.')

In [27]:
# Show each token of the sentence next to its `is_alpha` flag.
for word in doc:
    print(word, word.is_alpha)

മാക്ബുക്ക്എയർ False
അതിന്റെ False
എംടു False
സീരീസിന് False
$ False
100 False
ചിലവാകുന്നു. False


In [28]:
# Show each token of the sentence next to its `is_currency` flag.
for word in doc:
    print(word, word.is_currency)

മാക്ബുക്ക്എയർ False
അതിന്റെ False
എംടു False
സീരീസിന് False
$ True
100 False
ചിലവാകുന്നു. False


In [29]:
# Show each token of the sentence next to its `like_num` flag.
for word in doc:
    print(word, word.like_num)

മാക്ബുക്ക്എയർ False
അതിന്റെ False
എംടു False
സീരീസിന് False
$ False
100 True
ചിലവാകുന്നു. False


### Customizing tokenizer

In [30]:
from spacy.symbols import ORTH

nlp = spacy.blank("en")
doc = nlp("gimme double cheese extra large healthy pizza")
tokens = [token.text for token in doc]
tokens

['gimme', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [31]:
# Teach the tokenizer to split "gimme" into two tokens. A special case can only cut the
# original text into pieces; it cannot substitute different words (e.g. "give" + "me").
gimme_pieces = [{ORTH: "gim"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("gimme", gimme_pieces)

# Re-tokenize the same sentence to see the effect of the new rule.
doc = nlp("gimme double cheese extra large healthy pizza")
tokens = [piece.text for piece in doc]
tokens

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

## Sentence tokenization (Segmentation)

In [32]:
# Add the rule-based sentence splitter to the pipeline. Adding a component whose name
# is already in the pipeline raises an error, so reuse the existing one when this
# cell is re-run; either way the component is displayed as the cell output.
sentencizer = (
    nlp.get_pipe('sentencizer')
    if nlp.has_pipe('sentencizer')
    else nlp.add_pipe('sentencizer')
)
sentencizer

<spacy.pipeline.sentencizer.Sentencizer at 0x11f3d6690>

In [33]:
doc = nlp('Natural language processing (NLP) is a branch of artificial intelligence (AI). Itenables computers to comprehend, generate, and manipulate human language. ')

In [34]:
# Print every sentence span found in the document, one per line.
for sent in doc.sents:
    print(sent)

Natural language processing (NLP) is a branch of artificial intelligence (AI).
Itenables computers to comprehend, generate, and manipulate human language.


## Exercise

#### 1. Extracting URLs

In [38]:
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''
#Hint: token has an attribute that can be used to detect a url


In [40]:
# Tokenize the exercise text and keep the text of every token flagged as URL-like.
doc = nlp(text)
url = []
for candidate in doc:
    if not candidate.like_url:
        continue  # not a URL, move on to the next token
    url.append(candidate.text)
url

['http://www.data.gov/',
 'http://www.science',
 'http://data.gov.uk/.',
 'http://www3.norc.org/gss+website/',
 'http://www.europeansocialsurvey.org/.']

In [45]:
# alternative solution
doc = nlp(text)
data_websites = [token.text for token in doc if token.like_url ] 
data_websites

['http://www.data.gov/',
 'http://www.science',
 'http://data.gov.uk/.',
 'http://www3.norc.org/gss+website/',
 'http://www.europeansocialsurvey.org/.']

#### 2. Extract all money transactions, together with their currency, from the sentence below.
The output should be:
two $ ,
500 €

In [47]:
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"


In [48]:
doc = nlp(transactions)

In [49]:
# Walk over adjacent token pairs and print "<amount> <currency>" whenever a number-like
# token is immediately followed by a currency symbol. Pairing with `zip` stops before the
# last token, so a number-like token at the very end of the text cannot cause an
# out-of-range lookup of a non-existent successor.
for amount, currency in zip(doc, doc[1:]):
    if amount.like_num and currency.is_currency:
        print(amount.text, currency.text)
        

two $
500 €
