<h1 style="text-align:center"> Text Processing </h1>

In [1]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
# Read the entire file into one string, replacing each newline with a space
# so the text sits on a single line.
# NOTE(review): no encoding is passed to open(), so the platform default is
# used — confirm it matches the file's actual encoding.
with open("Text.txt", "r") as file:
    data = file.read().replace("\n"," ")

In [3]:
data

"Now is the winter of our discontent Made glorious summer by this sun of York; And all the clouds that lour'd upon our house In the deep bosom of the ocean buried. Now are our brows bound with victorious wreaths; Our bruised arms hung up for monuments; Our stern alarums changed to merry meetings, Our dreadful marches to delightful measures. Grim-visaged war hath smooth'd his wrinkled front; And now, instead of mounting barded steeds To fright the souls of fearful adversaries, He capers nimbly in a lady's chamber To the lascivious pleasing of a lute. But I, that am not shaped for sportive tricks, Nor made to court an amorous looking-glass; I, that am rudely stamp'd, and want love's majesty To strut before a wanton ambling nymph; I, that am curtail'd of this fair proportion,"

## Convert text to lowercase

In [4]:
# Lowercase the whole text. `data` is rebound here, so the original casing is
# no longer available in later cells.
data = data.lower()
# Preview the first 20 characters.
data[:20]

'now is the winter of'

## Remove punctuation

In [5]:
import string

In [6]:
def remove_punctuation(text):
    """Return `text` with every character found in ``string.punctuation`` deleted.

    Nothing is inserted in place of a removed character, so the letters on
    either side are joined together (for example "lour'd" becomes "lourd").
    All other characters, including whitespace, are kept in their original order.
    """
    punctuation_chars = frozenset(string.punctuation)
    kept = []
    for char in text:
        if char in punctuation_chars:
            continue
        kept.append(char)
    return "".join(kept)

In [7]:
# Build a punctuation-free copy of the text; `data` itself is left unchanged.
clean_data = remove_punctuation(data)

In [8]:
clean_data[:100]

'now is the winter of our discontent made glorious summer by this sun of york and all the clouds that'

## Remove leading and trailing whitespace

In [9]:
# str.strip() trims whitespace only at the two ends of the string;
# spaces between words are kept.
clean_data = clean_data.strip()

In [10]:
clean_data[:100]

'now is the winter of our discontent made glorious summer by this sun of york and all the clouds that'

## Tokenization

#### Sentence tokenization

In [11]:
# Split the text into sentences and print one per line.
# The input is `data` (punctuation still present), not `clean_data`;
# presumably because sentence boundaries rely on the punctuation — confirm.
sentences = sent_tokenize(data)
for sentence in sentences:
    print(sentence)

now is the winter of our discontent made glorious summer by this sun of york; and all the clouds that lour'd upon our house in the deep bosom of the ocean buried.
now are our brows bound with victorious wreaths; our bruised arms hung up for monuments; our stern alarums changed to merry meetings, our dreadful marches to delightful measures.
grim-visaged war hath smooth'd his wrinkled front; and now, instead of mounting barded steeds to fright the souls of fearful adversaries, he capers nimbly in a lady's chamber to the lascivious pleasing of a lute.
but i, that am not shaped for sportive tricks, nor made to court an amorous looking-glass; i, that am rudely stamp'd, and want love's majesty to strut before a wanton ambling nymph; i, that am curtail'd of this fair proportion,


#### Word tokenization

In [12]:
# Tokenize the punctuation-free text into words.
words = word_tokenize(clean_data)
# Print the first ten tokens, one per line.
for word in words[:10]:
    print(word)

now
is
the
winter
of
our
discontent
made
glorious
summer


## Remove stopwords 

In [13]:
from nltk.corpus import stopwords

In [14]:
# Hold the English stopword list as a set for membership tests.
# NOTE(review): assumes the NLTK "stopwords" corpus is already installed —
# nothing in the visible cells downloads it.
stop_words = set(stopwords.words("english"))

In [15]:
def remove_stopwords(text, stopword_set=None):
    """Return the tokens of `text` that are not stopwords, in their original order.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter, e.g. the list produced by ``word_tokenize``.
    stopword_set : collection of str, optional
        Tokens to drop. When omitted, the notebook-level ``stop_words`` set
        is used.

    Returns
    -------
    list of str
        A new list; `text` is not modified.
    """
    if stopword_set is None:
        # Default to the notebook-level set of English stopwords.
        stopword_set = stop_words
    # Iterate over the argument, not the notebook-level `words`, so the
    # function filters whatever token list it is given.
    return [token for token in text if token not in stopword_set]

In [16]:
# Filter stopwords out of the word tokens.
clean_data_sw = remove_stopwords(words)
# Preview the first ten tokens that remain.
clean_data_sw[:10]

['winter',
 'discontent',
 'made',
 'glorious',
 'summer',
 'sun',
 'york',
 'clouds',
 'lourd',
 'upon']

## Stemming

In [17]:
from nltk import PorterStemmer

In [18]:
def stemming(text):
    """Stem every token in `text` with NLTK's ``PorterStemmer``.

    A fresh stemmer is created on each call. Returns a new list holding the
    stemmed tokens in the same order as the input.
    """
    porter = PorterStemmer()
    return list(map(porter.stem, text))

In [19]:
# Stem the stopword-filtered tokens.
stem_text = stemming(clean_data_sw)
# Preview the first ten stems.
stem_text[:10]

['winter',
 'discont',
 'made',
 'gloriou',
 'summer',
 'sun',
 'york',
 'cloud',
 'lourd',
 'upon']

## Lemmatization

In [20]:
from nltk.stem import WordNetLemmatizer

In [21]:
def lemmatization(text):
    """Lemmatize every token in `text` with NLTK's ``WordNetLemmatizer``.

    Each token is passed to ``lemmatize`` on its own, with no part-of-speech
    argument. Returns a new list of lemmas in the same order as the input.
    """
    wordnet = WordNetLemmatizer()
    lemmas = []
    for token in text:
        lemmas.append(wordnet.lemmatize(token))
    return lemmas

In [22]:
# Lemmatize the same stopword-filtered tokens that were stemmed (not the stems).
lem_text = lemmatization(clean_data_sw)
# Preview the first ten lemmas.
lem_text[:10]

['winter',
 'discontent',
 'made',
 'glorious',
 'summer',
 'sun',
 'york',
 'cloud',
 'lourd',
 'upon']

## Part of speech tagging

In [23]:
# Tag each remaining token with a part-of-speech label.
# NOTE(review): the tagger receives lowercased tokens with punctuation and
# stopwords already removed; the tags may differ from tagging the original
# sentences — confirm this is intended.
pos = nltk.pos_tag(clean_data_sw)
# Preview the first ten (token, tag) pairs.
pos[:10]

[('winter', 'NN'),
 ('discontent', 'NN'),
 ('made', 'VBD'),
 ('glorious', 'JJ'),
 ('summer', 'NN'),
 ('sun', 'NN'),
 ('york', 'NN'),
 ('clouds', 'NN'),
 ('lourd', 'VBZ'),
 ('upon', 'IN')]