<h1 style="text-align:center">Text Processing using spaCy</h1>

In [1]:
import spacy

In [2]:
# Load the large English spaCy pipeline once; later cells call `nlp` on the text.
nlp = spacy.load("en_core_web_lg")

In [3]:
# Read the whole corpus and flatten it onto a single line by turning every
# newline into a space.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default codec.
# assumes Text.txt is UTF-8 encoded — TODO confirm against the actual file
with open("Text.txt", "r", encoding="utf-8") as file:
    data = file.read().replace("\n", " ")

In [4]:
# Peek at the first 20 characters to confirm the file was read.
data[:20]

'And Eurypylus, son o'

In [5]:
# Lowercase the whole text before it is handed to the pipeline.
# NOTE(review): this strips capitalisation before tagging; the POS output later
# in this notebook tags the name "hypsenor" as NN rather than NNP. Consider
# parsing the original-case text and lowercasing tokens afterwards — TODO confirm.
data = data.lower()

In [6]:
# Run the loaded pipeline over the text; `doc` is the processed document that
# the following cells iterate for sentences and tokens.
doc = nlp(data)

## Tokens

#### Sentence tokens

In [7]:
# Materialise the sentence spans, then show the first four tokens of each one.
sentences = list(doc.sents)
for sent in sentences:
    print(sent[:4])

and eurypylus, son
eurypylus gave him chase
the bloody hand fell
thus furiously did the
as for the son
he rushed across the
were the dense phalanxes


#### Word tokens

In [8]:
# Print the first ten tokens of the document, one per line.
for tok in doc[:10]:
    print(tok)

and
eurypylus
,
son
of
euaemon
,
killed
hypsenor
,


## Remove punctuation

In [9]:
def remove_punc(text):
    """Drop punctuation tokens from a token sequence.

    Args:
        text: Iterable of token-like objects exposing an ``is_punct``
            attribute (this notebook passes the spaCy ``doc``).

    Returns:
        list: The tokens whose ``is_punct`` is falsy, in their original order.
    """
    kept = []
    for tok in text:
        if tok.is_punct:
            continue  # skip punctuation marks
        kept.append(tok)
    return kept
    

In [10]:
# Tokens of the document with punctuation marks filtered out.
token_words = remove_punc(doc)

In [11]:
# Preview the first ten punctuation-free tokens.
token_words[:10]

[and, eurypylus, son, of, euaemon, killed, hypsenor, the, son, of]

## Remove stopwords

In [12]:
# Import the stop-word set explicitly instead of reaching through
# `spacy.lang.en.stop_words` attributes, which only resolve when that submodule
# has already been imported as a side effect of something else.
from spacy.lang.en.stop_words import STOP_WORDS

# STOP_WORDS is a set, so sort it to get a list with a stable order between runs.
# NOTE(review): nothing visible in this notebook reads `stop_words`; the
# stop-word filter function relies on `token.is_stop` instead.
stop_words = sorted(STOP_WORDS)

In [13]:
def remove_stopwords(text):
    """Drop stop-word tokens from a token sequence.

    Args:
        text: Iterable of token-like objects exposing an ``is_stop``
            attribute (this notebook passes the punctuation-free token list).

    Returns:
        list: The tokens whose ``is_stop`` is falsy, in their original order.
    """
    return list(filter(lambda tok: not tok.is_stop, text))

In [14]:
# Punctuation-free tokens with stop words filtered out as well.
token_words_sw = remove_stopwords(token_words)

In [15]:
# Preview the first ten tokens that survive both filters.
token_words_sw[:10]

[eurypylus,
 son,
 euaemon,
 killed,
 hypsenor,
 son,
 noble,
 dolopion,
 priest,
 river]

## Lemmatization

In [16]:
def lemmatization(text):
    """Collect the lemma of every token as a plain string.

    Args:
        text: Iterable of token-like objects exposing a ``lemma_`` attribute.

    Returns:
        list: ``str(token.lemma_)`` for each token, in input order.
    """
    return [str(tok.lemma_) for tok in text]

In [17]:
# Lemmatise the filtered tokens and preview the first ten lemma strings.
lemma_text = lemmatization(token_words_sw)
lemma_text[:10]

['eurypylus',
 'son',
 'euaemon',
 'kill',
 'hypsenor',
 'son',
 'noble',
 'dolopion',
 'priest',
 'river']

## Part-of-speech tagging

In [18]:
# For the first five filtered tokens, print the text, the fine-grained tag
# (left-aligned in a 10-character column), the coarse POS, and spaCy's
# explanation of the tag.
for tok in token_words_sw[:5]:
    tag = str(tok.tag_)
    report_lines = (
        f"Token:{tok.text}",
        f"    Tag:{tag:10} Pos:{tok.pos_!s}",
        f"    Explanation: {spacy.explain(tok.tag_)}",
    )
    print("\n".join(report_lines))

Token:eurypylus
    Tag:NNP        Pos:PROPN
    Explanation: noun, proper singular
Token:son
    Tag:NN         Pos:NOUN
    Explanation: noun, singular or mass
Token:euaemon
    Tag:NNP        Pos:PROPN
    Explanation: noun, proper singular
Token:killed
    Tag:VBN        Pos:VERB
    Explanation: verb, past participle
Token:hypsenor
    Tag:NN         Pos:NOUN
    Explanation: noun, singular or mass


In [19]:
from spacy import displacy

In [20]:
# Draw the dependency parse ("dep" style) of the first ten tokens, with
# notebook rendering requested explicitly via `jupyter=True`.
displacy.render(doc[:10], style="dep", jupyter=True)