## Get Started With spaCy

In [2]:
import spacy
# Load the "en_core_web_sm" pipeline once; `nlp` is then called on raw text to produce a parsed document.
nlp = spacy.load("en_core_web_sm")

### Tokenization

In [3]:
# Sample sentence reused by the small examples in this notebook.
text01 = "Apple is looking at buying U.K. startup for $1 billion"
# Calling the pipeline on the string yields `doc`, which can be iterated token by token.
doc = nlp(text01)
# Print the text of each token on its own line.
for token in doc:
    print(token.text)

Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion


In [4]:
# Integer indexing into `doc` gives the token at that position; 0 is the first one.
doc[0]

Apple

In [5]:
import requests
# Source corpus: a text file of TripAdvisor hotel reviews (going by the file name in the URL).
url = "https://s3-ap-south-1.amazonaws.com/av-blog-media/wp-content/uploads/2017/04/04080929/Tripadvisor_hotelreviews_Shivambansal.txt"
# Bound the wait: requests applies no timeout by default, so a stalled
# connection would otherwise hang the kernel indefinitely.
r = requests.get(url, timeout=30)
# Fail here on a 4xx/5xx response instead of passing an error body on to the
# parsing cell as if it were the corpus.
r.raise_for_status()

In [6]:
from bs4 import BeautifulSoup
# Parse the response body with BeautifulSoup using the lxml parser.
# NOTE(review): the URL ends in ".txt", so the body is presumably plain text rather than HTML — confirm that parsing it as markup is intended.
soup = BeautifulSoup(r.text, 'lxml')

In [16]:
# Take the text of the first <p> element in the parsed document.
first_paragraph = soup.find('p')
if first_paragraph is None:
    # find() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on `.text`.
    raise ValueError("no <p> element found in the downloaded document")
text02 = first_paragraph.text
# Run the spaCy pipeline over the extracted text.
doc01 = nlp(text02)

In [21]:
# Materialise the sentences of doc01 into a list so they can be counted
sent_doc01 = [*doc01.sents]
print("No. of sentences:", len(sent_doc01))

No. of sentences: 3655


In [22]:
# Collect the text of every token in doc01
tokens02 = [token.text for token in doc01]

In [25]:
# Report how many token texts were collected from doc01.
print("No. of tokens:",len(tokens02))

No. of tokens: 61466


### POS

In [35]:
# One tab-separated row per token: text, character offset, part-of-speech tag, dependency label
for token in doc:
    fields = (token.text, token.idx, token.pos_, token.dep_)
    print("{0}\t{1}\t{2}\t{3}".format(*fields))

Apple	0	PROPN	nsubj
is	6	VERB	aux
looking	9	VERB	ROOT
at	17	ADP	prep
buying	20	VERB	pcomp
U.K.	27	PROPN	compound
startup	32	NOUN	dobj
for	40	ADP	prep
$	44	SYM	quantmod
1	45	NUM	compound
billion	47	NUM	pobj


In [36]:
# Per-token attributes of doc01, kept as parallel lists (same order as the tokens)
text02_idx = [token.idx for token in doc01]
text02_pos = [token.pos_ for token in doc01]
text02_dep = [token.dep_ for token in doc01]

In [38]:
import pandas as pd

# Assemble the parallel per-token lists into a single table, one row per token
text02_summary = pd.DataFrame(
    data={
        'tokens': tokens02,
        'index': text02_idx,
        'pos': text02_pos,
        'dep': text02_dep,
    }
)


### Lemmas, fine-grained tags and other token attributes

In [42]:
# One tab-separated row per token: text, lemma, tag, shape, is_alpha, is_stop
for token in doc:
    attributes = (
        token.text, token.lemma_, token.tag_,
        token.shape_, token.is_alpha, token.is_stop,
    )
    print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}".format(*attributes))

Apple	apple	NNP	Xxxxx	True	False
is	be	VBZ	xx	True	True
looking	look	VBG	xxxx	True	False
at	at	IN	xx	True	True
buying	buy	VBG	xxxx	True	False
U.K.	u.k.	NNP	X.X.	False	False
startup	startup	NN	xxxx	True	False
for	for	IN	xxx	True	True
$	$	$	$	False	False
1	1	CD	d	False	False
billion	billion	CD	xxxx	True	False


In [44]:
# Ask spaCy for the description of the "VBZ" tag (the tag printed for "is" above).
spacy.explain("VBZ")

'verb, 3rd person singular present'

### Named Entities

In [None]:
# One tab-separated row per entity: text, start character offset, end character offset, label
for ent in doc.ents:
    span_info = (ent.text, ent.start_char, ent.end_char, ent.label_)
    print("{0}\t{1}\t{2}\t{3}".format(*span_info))