<h1 style="text-align:center">Text Processing</h1>

In [1]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
# Read the whole corpus with an explicit encoding so the result does not depend
# on the platform's default encoding, then flatten it to a single line by
# turning every newline into a space.
with open("Text.txt", "r", encoding="utf-8") as file:
    data = file.read().replace("\n", " ")

In [3]:
data[:100]

'And Eurypylus, son of Euaemon, killed Hypsenor, the son of noble Dolopion, who had been made priest '

## Convert text to lowercase

In [4]:
# Lower-case the entire text. This rebinds `data`, so every later cell that
# reads `data` sees the lower-cased version, not the text as loaded.
data = data.lower()
data[:20]

'and eurypylus, son o'

## Remove punctuation

In [5]:
import string

In [6]:
def remove_punctuation(text):
    """Return ``text`` with every character found in ``string.punctuation`` removed.

    All other characters (letters, digits, whitespace, and any symbol that is
    not listed in ``string.punctuation``) are kept in their original order.
    """
    punctuation_chars = frozenset(string.punctuation)
    kept_chars = []
    for char in text:
        if char in punctuation_chars:
            continue
        kept_chars.append(char)
    return "".join(kept_chars)

In [7]:
clean_data = remove_punctuation(data)

In [8]:
clean_data[:100]

'and eurypylus son of euaemon killed hypsenor the son of noble dolopion who had been made priest of t'

## Remove whitespace

In [9]:
# Collapse every run of whitespace to a single space and drop leading/trailing
# whitespace. A bare strip() only trims the two ends and would leave interior
# runs untouched (e.g. the consecutive spaces left behind where blank lines
# were replaced by spaces when the file was loaded).
clean_data = " ".join(clean_data.split())

In [10]:
clean_data[:100]

'and eurypylus son of euaemon killed hypsenor the son of noble dolopion who had been made priest of t'

## Tokenization

#### Sentence tokenization

In [14]:
# Split the text into sentences and print the first two. This uses `data`
# (punctuation still present) rather than `clean_data`.
# NOTE(review): `data` was lower-cased in an earlier cell; sentence splitters
# commonly use capitalisation as a boundary cue, so tokenising before
# lower-casing may give better sentence boundaries — confirm.
sentences = sent_tokenize(data)
for sentence in sentences[:2]:
    print(sentence)

and eurypylus, son of euaemon, killed hypsenor, the son of noble dolopion, who had been made priest of the river scamander, and was honoured among the people as though he were a god.
eurypylus gave him chase as he was flying before him, smote him with his sword upon the arm, and lopped his strong hand from off it.


#### Word tokenization

In [15]:
# Tokenise the punctuation-free text into words and print the first ten.
words = word_tokenize(clean_data)
for word in words[:10]:
    print(word)

and
eurypylus
son
of
euaemon
killed
hypsenor
the
son
of


## Remove stopwords 

In [16]:
from nltk.corpus import stopwords

In [17]:
# Build the English stop-word collection once, as a set, for the membership
# test performed when filtering tokens.
stop_words = set(stopwords.words("english"))

In [18]:
def remove_stopwords(text, ignore=None):
    """Return the tokens of ``text`` that are not stop words.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter (for example the output of ``word_tokenize``).
    ignore : collection of str, optional
        Words to drop. When omitted, the notebook-level ``stop_words`` set
        is used.

    Returns
    -------
    list of str
        The tokens of ``text``, in their original order, that are not in the
        stop-word collection.
    """
    if ignore is None:
        ignore = stop_words
    # Filter the argument itself rather than any notebook-level variable, so
    # the function gives the right answer for whatever token list it is given.
    return [token for token in text if token not in ignore]

In [19]:
clean_data_sw = remove_stopwords(words)
clean_data_sw[:10]

['eurypylus',
 'son',
 'euaemon',
 'killed',
 'hypsenor',
 'son',
 'noble',
 'dolopion',
 'made',
 'priest']

## Stemming

In [20]:
from nltk import PorterStemmer

In [21]:
def stemming(text):
    """Return a list holding the Porter stem of each token in ``text``.

    ``text`` is iterated once; the result has one entry per token, in the
    same order.
    """
    porter = PorterStemmer()
    return list(map(porter.stem, text))

In [22]:
stem_text = stemming(clean_data_sw)
stem_text[:10]

['eurypylu',
 'son',
 'euaemon',
 'kill',
 'hypsenor',
 'son',
 'nobl',
 'dolopion',
 'made',
 'priest']

## Lemmatization

In [23]:
from nltk.stem import WordNetLemmatizer

In [24]:
def lemmatization(text):
    """Return a list holding the WordNet lemma of each token in ``text``.

    Each token is passed to ``lemmatize`` on its own, with no part-of-speech
    argument, so the lemmatizer's default part of speech applies
    (presumably noun — TODO confirm against the NLTK documentation).
    """
    wordnet = WordNetLemmatizer()
    lemmas = []
    for token in text:
        lemmas.append(wordnet.lemmatize(token))
    return lemmas

In [25]:
lem_text = lemmatization(clean_data_sw)
lem_text[:10]

['eurypylus',
 'son',
 'euaemon',
 'killed',
 'hypsenor',
 'son',
 'noble',
 'dolopion',
 'made',
 'priest']

## Part-of-speech tagging

In [26]:
# Tag each remaining token with a part-of-speech label and preview the first ten.
# NOTE(review): the tagger receives lower-cased tokens with stop words already
# removed; taggers generally rely on surrounding sentence context, so tagging
# the original token sequence may be more accurate — confirm.
pos = nltk.pos_tag(clean_data_sw)
pos[:10]

[('eurypylus', 'JJ'),
 ('son', 'NN'),
 ('euaemon', 'NN'),
 ('killed', 'VBD'),
 ('hypsenor', 'JJ'),
 ('son', 'NN'),
 ('noble', 'JJ'),
 ('dolopion', 'NN'),
 ('made', 'VBD'),
 ('priest', 'JJS')]