In [1]:
!pip install spacy



In [2]:
import spacy

In [3]:
nlp = spacy.blank('en')
doc = nlp("Dr. Strange loves pav bhaji of mumbai as it costs only 2$ per plate. My name is Raghu")

In [4]:
for sentence in doc:
    print(sentence)

Dr.
Strange
loves
pav
bhaji
of
mumbai
as
it
costs
only
2
$
per
plate
.
My
name
is
Raghu


In [5]:
type(sentence)

spacy.tokens.token.Token

In [6]:
type(doc)

spacy.tokens.doc.Doc

In [7]:
type(nlp)

spacy.lang.en.English

In [8]:
doc

Dr. Strange loves pav bhaji of mumbai as it costs only 2$ per plate. My name is Raghu

In [9]:
doc[0]

Dr.

In [10]:
token0 = doc[0]

In [11]:
token0.is_alpha

False

In [12]:
token0.like_num

False

In [13]:
token0.is_currency

False

### Collecting email ids of students from students information sheet


In [14]:

with open("coc.txt") as f:
    text = f.readlines()
text

['Greetings Snegha,\n',
 '\n',
 'I hope this email finds you well. I wanted to follow up regarding the second round presentation interview that I had with Lenovo.\n',
 'Interview panel(Dhijith) requested a quick 10-minute call after the interview, but unfortunately, I was unable to join due to network issues. \n',
 'I apologize for any inconvenience this may have caused. xyz@gmail.com\n',
 '\n',
 'I am now available to schedule a call during this week, if needed. Please let me know a convenient time and date for you, and I will make sure to be available.\n',
 '\n',
 'abc@gmail.com\n',
 'Best regards,\n',
 'Raghu M\n',
 '\n',
 '\n',
 '\n',
 '\n']

In [15]:
text = ' '.join(text)
text

'Greetings Snegha,\n \n I hope this email finds you well. I wanted to follow up regarding the second round presentation interview that I had with Lenovo.\n Interview panel(Dhijith) requested a quick 10-minute call after the interview, but unfortunately, I was unable to join due to network issues. \n I apologize for any inconvenience this may have caused. xyz@gmail.com\n \n I am now available to schedule a call during this week, if needed. Please let me know a convenient time and date for you, and I will make sure to be available.\n \n abc@gmail.com\n Best regards,\n Raghu M\n \n \n \n \n'

In [16]:
doc = nlp(text)
doc

Greetings Snegha,
 
 I hope this email finds you well. I wanted to follow up regarding the second round presentation interview that I had with Lenovo.
 Interview panel(Dhijith) requested a quick 10-minute call after the interview, but unfortunately, I was unable to join due to network issues. 
 I apologize for any inconvenience this may have caused. xyz@gmail.com
 
 I am now available to schedule a call during this week, if needed. Please let me know a convenient time and date for you, and I will make sure to be available.
 
 abc@gmail.com
 Best regards,
 Raghu M
 
 
 
 

In [17]:
email = []
for token in doc:
    if token.like_email:
        email.append(token.text)
print(email)

['xyz@gmail.com', 'abc@gmail.com']


### Support in other languages


In [18]:
nlp = spacy.blank("hi")
doc = nlp("भैया जी! 5000 ₹ उधार थे वो वापस देदो")
for token in doc:
    print(token, token.is_currency)

भैया False
जी False
! False
5000 False
₹ True
उधार False
थे False
वो False
वापस False
देदो False


### Customizing tokenizer


In [19]:
from spacy.symbols import ORTH

nlp = spacy.blank("en")
doc = nlp("gimme double cheese extra large healthy pizza")
tokens = [token.text for token in doc]
tokens

['gimme', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [20]:
nlp.tokenizer.add_special_case("gimme", [
    {ORTH: "gim"},
    {ORTH: "me"},
])
doc = nlp("gimme double cheese extra large healthy pizza")
tokens = [token.text for token in doc]
tokens

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

### Sentence Tokenization or Segmentation

In [21]:
doc = nlp("Dr. Strange loves pav bhaji of mumbai. Hulk loves chat of delhi")
for sentence in doc.sents:
    print(sentence)

ValueError: [E030] Sentence boundaries unset. You can add the 'sentencizer' component to the pipeline with: `nlp.add_pipe('sentencizer')`. Alternatively, add the dependency parser or sentence recognizer, or set sentence boundaries by setting `doc[i].is_sent_start`.

In [22]:
nlp.pipeline

[]

In [23]:
nlp.add_pipe('sentencizer')


<spacy.pipeline.sentencizer.Sentencizer at 0x1cb61a241c0>

In [None]:
nlp.pipeline

In [24]:
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''

In [25]:
doc = nlp(text)
doc


Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.

In [26]:
for sentence in doc.sents:
    print(sentence)


Look for data to help you address the question.
Governments are good
sources because data from public research is often freely available.
Good
places to start include http://www.data.gov/, and http://www.science.

gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.



In [27]:
for tok in doc:
    print(tok)



Look
for
data
to
help
you
address
the
question
.
Governments
are
good


sources
because
data
from
public
research
is
often
freely
available
.
Good


places
to
start
include
http://www.data.gov/
,
and
http://www.science
.


gov/
,
and
in
the
United
Kingdom
,
http://data.gov.uk/.


Two
of
my
favorite
data
sets
are
the
General
Social
Survey
at
http://www3.norc.org/gss+website/
,


and
the
European
Social
Survey
at
http://www.europeansocialsurvey.org/.




#### extracting URL

In [28]:
url = []
for tok in doc:
    if tok.like_url:
        url.append(tok.text)

In [29]:
url

['http://www.data.gov/',
 'http://www.science',
 'http://data.gov.uk/.',
 'http://www3.norc.org/gss+website/',
 'http://www.europeansocialsurvey.org/.']