In [None]:
import spacy

# Load the `en_core_web_sm` English pipeline once; the cells below call `nlp` to parse text.
nlp = spacy.load("en_core_web_sm")

#### We create a `doc` from the text — a container for a document and all of its annotations — and then iterate through it to see what spaCy has parsed.

In [None]:
# Attributes printed for each token:
#   token.text    = the token's text as it appears in the sentence
#   token.lemma_  = root (base) form of the token
#   token.pos_    = part-of-speech tag
#   token.is_stop = True if the token is a stop word
# NOTE(review): "wih" in the sample looks like a typo for "with"; it is tagged VERB in the
# recorded output, so correcting it would change the parse shown below.
text = "Balaji goes wide off the off-stump and Astle punishes him wih a cracking square cut"
doc = nlp(text)

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.is_stop)

Balaji Balaji PROPN False
goes go VERB False
wide wide ADV False
off off ADP True
the the DET True
off off ADJ True
- - PUNCT False
stump stump NOUN False
and and CCONJ True
Astle Astle PROPN False
punishes punish VERB False
him -PRON- PRON True
wih wih VERB False
a a DET True
cracking crack VERB False
square square NOUN False
cut cut NOUN False


#### reformat the spaCy parse of the sentence as a pandas dataframe

In [None]:
import pandas as pd

# Column labels for the per-token summary table.
cols = ("text", "lemma", "POS", "explain", "stopword")

# One record per token, in document order; spacy.explain() expands the POS tag
# into its long-form description.
rows = [
    [t.text, t.lemma_, t.pos_, spacy.explain(t.pos_), t.is_stop]
    for t in doc
]

df = pd.DataFrame(rows, columns=cols)

df

Unnamed: 0,text,lemma,POS,explain,stopword
0,Balaji,Balaji,PROPN,proper noun,False
1,goes,go,VERB,verb,False
2,wide,wide,ADV,adverb,False
3,off,off,ADP,adposition,True
4,the,the,DET,determiner,True
5,off,off,ADJ,adjective,True
6,-,-,PUNCT,punctuation,False
7,stump,stump,NOUN,noun,False
8,and,and,CCONJ,coordinating conjunction,True
9,Astle,Astle,PROPN,proper noun,False


#### text – the raw token text
#### lemma – the root form of the word
#### POS – part of speech
#### explain – long-form description of the POS tag
#### stopword – a flag for whether the word is a stopword, i.e., a common word that may be filtered out

#### using the displaCy library to visualize the parse tree

In [None]:
# How is each word related to the others? For example: "goes" - who? "Balaji"; "goes" - what? "wide" / "off".
from spacy import displacy
# style="dep" draws the dependency parse (each token's relationship with its parent)
displacy.render(doc, style="dep", jupyter=True)

In [None]:
# parse tree
import spacy
from nltk import Tree

# Load the pipeline by its full package name. The 'en' shortcut link only resolves
# where such a link was created under spaCy 2.x and is rejected by spaCy 3, whereas
# "en_core_web_sm" (the same package loaded at the top of this notebook) works on both.
en_nlp = spacy.load("en_core_web_sm")

doc = en_nlp("Balaji goes wide off the off-stump and Astle punishes him wih a cracking square cut")

def to_nltk_tree(node):
    """Recursively convert a parsed token and its dependents into an nltk ``Tree``.

    Args:
        node: A token-like object exposing ``n_lefts``, ``n_rights``, ``orth_``
            and ``children``.

    Returns:
        A ``Tree`` labelled with ``node.orth_`` whose children are the converted
        dependents, or the plain string ``node.orth_`` when the node has no
        dependents (a leaf).
    """
    has_dependents = (node.n_lefts + node.n_rights) > 0
    if not has_dependents:
        # Leaves are represented by their text alone.
        return node.orth_

    subtrees = [to_nltk_tree(child) for child in node.children]
    return Tree(node.orth_, subtrees)


# Print one tree per sentence. A plain loop is used instead of a list comprehension
# so the cell does not echo a list of None values as its result.
for sent in doc.sents:
    tree = to_nltk_tree(sent.root)
    if isinstance(tree, Tree):
        tree.pretty_print()
    else:
        # A root with no dependents comes back as a plain string, which has no
        # pretty_print(); print it directly instead of raising AttributeError.
        print(tree)

                goes                
   ______________|_____              
  |     |             off           
  |     |              |             
  |     |            stump          
  |     |     _________|_________    
Balaji wide the off    -   and Astle

    punishes    
  _____|______   
him          wih

      cut          
  _____|_______     
 a  cracking square



[None, None, None]

#### sentence boundary detection (SBD) – also known as sentence segmentation

In [None]:
# For this input the recorded output shows a single sentence boundary, placed just before "but".
text = "Slip and silly point, wallace on strike, three quarter length, good thinking, gets wallace on the pad/ Next ball, the three quarter length again, the flipper, perfect prescription, flies off the splice and wide of gully, robin reacting a shade late, that was a catch but he fields it on the bounce and one to the batsman."

doc = nlp(text)

# doc.sents yields the detected sentences in order.
for sent in doc.sents:
    print(">", sent)

> Slip and silly point, wallace on strike, three quarter length, good thinking, gets wallace on the pad/ Next ball, the three quarter length again, the flipper, perfect prescription, flies off the splice and wide of gully, robin reacting a shade late, that was a catch
> but he fields it on the bounce and one to the batsman.


When spaCy creates a document, it uses a principle of non-destructive tokenization, meaning that the tokens, sentences, etc., are simply indexes into a long array. It doesn't carve the text stream into little pieces. So each sentence is a span with a start and an end index into the document array.

In [None]:
# sent.start / sent.end are token indices into doc (not character counts);
# each sentence's end equals the next sentence's start, so end is exclusive.
for sent in doc.sents:
    print(">", sent.start, sent.end)

> 0 55
> 55 68


We can index into the document array to pull out a span of tokens — here, six tokens from inside the first sentence.

In [None]:
# Tokens 48-53: a slice that lies inside the first sentence (tokens 0-54).
doc[48:54]

shade late, that was a

We can also index into a specific token — for example, the token at position 51 ("that") near the end of the first sentence.

In [None]:
# Pick a single token by its index in the document and print its text, lemma,
# POS tag, the long-form explanation of that tag, and its stop-word flag.
token = doc[51]
print(token.text, token.lemma_, token.pos_, spacy.explain(token.pos_), token.is_stop)

that that DET determiner True
