In [1]:
import spacy

In [3]:
nlp = spacy.blank("en")

doc = nlp("Hi i am Tahmid nice to meet you.")

for token in doc:
    print(token)

Hi
i
am
Tahmid
nice
to
meet
you
.


<h3>Using index to grab tokens</h3>

In [7]:

doc[0]

Hi

In [8]:
doc[:4]

Hi i am Tahmid

In [9]:
type(doc[:4])

spacy.tokens.span.Span

In [10]:
token = doc[1]
token.text

'i'

In [11]:
dir(token)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang

In [12]:
token.is_alpha

True

In [14]:
token.is_currency

False

In [16]:
for token in doc:
    print(token, "==>", "index: ", token.i, "is_alpha:", token.is_alpha, 
          "is_punct:", token.is_punct, 
          "like_num:", token.like_num,
          "is_currency:", token.is_currency,
         )

Hi ==> index:  0 is_alpha: True is_punct: False like_num: False is_currency: False
i ==> index:  1 is_alpha: True is_punct: False like_num: False is_currency: False
am ==> index:  2 is_alpha: True is_punct: False like_num: False is_currency: False
Tahmid ==> index:  3 is_alpha: True is_punct: False like_num: False is_currency: False
nice ==> index:  4 is_alpha: True is_punct: False like_num: False is_currency: False
to ==> index:  5 is_alpha: True is_punct: False like_num: False is_currency: False
meet ==> index:  6 is_alpha: True is_punct: False like_num: False is_currency: False
you ==> index:  7 is_alpha: True is_punct: False like_num: False is_currency: False
. ==> index:  8 is_alpha: False is_punct: True like_num: False is_currency: False


<b>Collecting email ids of students from students information sheet</b>

In [18]:
with open("Nlp-Notebook/Tokenization/students.txt") as f:
    text = f.readlines()
text


['Dayton high school, 8th grade students information\n',
 '\n',
 'Name\tbirth day   \temail\n',
 '-----\t------------\t------\n',
 'Virat   5 June, 1882    virat@kohli.com\n',
 'Maria\t12 April, 2001  maria@sharapova.com\n',
 'Serena  24 June, 1998   serena@williams.com \n',
 'Joe      1 May, 1997    joe@root.com\n',
 'Tahmid   20 jan,1900    xyz@gmail.com\n',
 '\n',
 '\n']

In [19]:
text = " ".join(text)
text



In [22]:
doc=nlp(text)

In [23]:
doc

Dayton high school, 8th grade students information
 
 Name	birth day   	email
 -----	------------	------
 Virat   5 June, 1882    virat@kohli.com
 Maria	12 April, 2001  maria@sharapova.com
 Serena  24 June, 1998   serena@williams.com 
 Joe      1 May, 1997    joe@root.com
 Tahmid   20 jan,1900    xyz@gmail.com
 
 

In [29]:
emails=[]
for token in doc:
    if token.like_email:
        emails.append(token)

In [30]:
emails

[virat@kohli.com,
 maria@sharapova.com,
 serena@williams.com,
 joe@root.com,
 xyz@gmail.com]

<b>Supports other Language (Example Bangla)</b>

In [32]:
nlp = spacy.blank("bn")
doc = nlp(u'আমি বাংলায় গান গাই। তুমি কি গাও?')

In [33]:
doc

আমি বাংলায় গান গাই। তুমি কি গাও?

In [34]:
doc[0]

আমি

<b>Custom Tokkenization</b>

In [37]:
from spacy.symbols import ORTH

nlp = spacy.blank("en")
doc = nlp("gimme double cheese extra large healthy pizza")
tokens = [token.text for token in doc]
tokens


['gimme', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [40]:
nlp.tokenizer.add_special_case("gimme", [
    {ORTH: "gi"},
    {ORTH: "mme"},
])
doc = nlp("gimme double cheese extra large healthy pizza")
tokens = [token.text for token in doc]
tokens

['gi', 'mme', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

<h3>Sentence Tokenization or Segmentation</h3>


In [41]:
doc = nlp("Dr. Strange loves pav bhaji of mumbai. Hulk loves chat of delhi")
for sentence in doc.sents:
    print(sentence)

ValueError: [E030] Sentence boundaries unset. You can add the 'sentencizer' component to the pipeline with: `nlp.add_pipe('sentencizer')`. Alternatively, add the dependency parser or sentence recognizer, or set sentence boundaries by setting `doc[i].is_sent_start`.

In [42]:
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x17e4c2f2850>

In [43]:
doc = nlp("Dr. Strange loves pav bhaji of mumbai. Hulk loves chat of delhi")
for sentence in doc.sents:
    print(sentence)

Dr. Strange loves pav bhaji of mumbai.
Hulk loves chat of delhi


In [44]:
nlp.pipeline

[('sentencizer', <spacy.pipeline.sentencizer.Sentencizer at 0x17e4c2f2850>)]

<h3>Collecting dataset websites from a book paragraph</h3>

In [67]:
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''

# TODO: Write code here
# Hint: token has an attribute that can be used to detect a url

In [68]:
doc=nlp(text)

In [69]:
doc


Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.

In [74]:
url=[token for token in doc if token.like_url]

In [75]:
url

[http://www.data.gov/,
 http://www.science,
 http://data.gov.uk/.,
 http://www3.norc.org/gss+website/,
 http://www.europeansocialsurvey.org/.]

In [76]:
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"

# TODO: Write code here
# Hint: Use token.i for the index of a token and token.is_currency for currency symbol detection

In [78]:
doc=nlp(transactions)

In [79]:
doc

Tony gave two $ to Peter, Bruce gave 500 € to Steve

In [87]:
for i in range(len(doc)-1):
    if(doc[i].like_num and doc[i+1].is_currency):
        print(doc[i],doc[i+1])
    

two $
500 €
