<h1 style="text-align:center"> Text Processing using spaCy </h1>

In [1]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
# Load the `en_core_web_lg` pipeline once; it is used below to parse the text.
# The model package must already be installed in the kernel's environment.
nlp = spacy.load('en_core_web_lg')

In [3]:
# Read the whole file and flatten it to a single line by turning every
# newline into a space. The encoding is pinned so decoding does not depend on
# the platform's locale default.
# NOTE(review): assumes Text.txt is UTF-8 encoded — confirm.
with open("Text.txt", "r", encoding="utf-8") as file:
    data = file.read().replace("\n", " ")

In [4]:
# Peek at the first 20 characters to confirm the file was read.
data[:20]

'Now is the winter of'

In [5]:
# Lower-case the whole text before it is parsed.
# NOTE(review): lower-casing before running the pipeline may hurt tagging and
# sentence splitting if the model expects cased text — confirm this is intended.
data = data.lower()

In [6]:
# Run the loaded pipeline (`nlp`) over the text; the result is the parsed
# document that the following cells iterate over.
doc = nlp(data)

## Tokens

#### sentence tokens

In [7]:
# Materialise the sentence spans, then show the first four tokens of each one.
sentences = list(doc.sents)
for sentence in sentences:
    print(sentence[:4])

now is the winter
now are our brows
grim-visaged war
but i, that


#### word tokens

In [8]:
# Iterating over a slice of the document yields individual tokens; show the
# first ten, one per line.
for token in doc[:10]:
    print(token)

now
is
the
winter
of
our
discontent
made
glorious
summer


## Remove punctuation

In [9]:
def remove_punc(text):
    """Return the tokens of ``text`` that are not flagged as punctuation.

    Args:
        text: Iterable of token-like objects exposing a boolean ``is_punct``
            attribute (for example a parsed spaCy document).

    Returns:
        A new list holding the non-punctuation tokens in their original order.
    """
    return [tok for tok in text if not tok.is_punct]
    

In [10]:
# Drop punctuation tokens from the parsed document.
token_words = remove_punc(doc)

In [11]:
# First ten tokens that remain once punctuation is removed.
token_words[:10]

[now, is, the, winter, of, our, discontent, made, glorious, summer]

## Remove stopwords

In [12]:
# English stop-word list built from the explicitly imported STOP_WORDS set.
# Reaching it through `spacy.lang.en.stop_words` as attributes only works once
# that submodule has been imported somewhere else, so the direct import is
# safer. Sorting gives a reproducible order between kernel restarts, which
# plain set iteration does not.
# NOTE(review): `stop_words` is not referenced by the visible cells;
# remove_stopwords relies on each token's `is_stop` flag instead.
stop_words = sorted(STOP_WORDS)

In [13]:
def remove_stopwords(text):
    """Return the tokens of ``text`` that are not flagged as stop words.

    Args:
        text: Iterable of token-like objects exposing a boolean ``is_stop``
            attribute.

    Returns:
        A new list holding the non-stop-word tokens in their original order.
    """
    return [tok for tok in text if not tok.is_stop]

In [14]:
# Filter stop words out of the punctuation-free token list.
token_words_sw = remove_stopwords(token_words)

In [15]:
# First ten tokens that survive stop-word removal.
token_words_sw[:10]

[winter, discontent, glorious, summer, sun, york, clouds, lour'd, house, deep]

## Lemmatization

In [16]:
def lemmatization(text):
    """Return the lemma of every token in ``text`` as a plain string.

    Args:
        text: Iterable of token-like objects exposing a ``lemma_`` attribute.

    Returns:
        A list with ``str(token.lemma_)`` for each token, in input order.
    """
    # A comprehension replaces the manual append loop, matching the style of
    # the other filtering helpers in this notebook.
    return [str(token.lemma_) for token in text]

In [17]:
# Lemmatize the filtered tokens and preview the first ten lemmas.
lemma_text = lemmatization(token_words_sw)
lemma_text[:10]

['winter',
 'discontent',
 'glorious',
 'summer',
 'sun',
 'york',
 'cloud',
 "lour'd",
 'house',
 'deep']

## Part of speech tagging

In [18]:
# For the first five filtered tokens, print the fine-grained tag (padded to
# ten characters), the coarse part of speech, and spaCy's explanation of the tag.
for token in token_words_sw[:5]:
    print(
        f"Token:{token.text}\n"
        f"    Tag:{str(token.tag_):10} Pos:{str(token.pos_)}\n"
        f"    Explanation: {spacy.explain(token.tag_)}"
    )

Token:winter
    Tag:NN         Pos:NOUN
    Explanation: noun, singular or mass
Token:discontent
    Tag:NN         Pos:NOUN
    Explanation: noun, singular or mass
Token:glorious
    Tag:JJ         Pos:ADJ
    Explanation: adjective (English), other noun-modifier (Chinese)
Token:summer
    Tag:NN         Pos:NOUN
    Explanation: noun, singular or mass
Token:sun
    Tag:NN         Pos:NOUN
    Explanation: noun, singular or mass


In [19]:
from spacy import displacy

In [20]:
# Draw the dependency parse (style="dep") for the first ten tokens, rendered
# inline in the notebook via jupyter=True.
displacy.render(doc[:10], style="dep", jupyter=True)