In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Fetch every NLTK resource the cells below rely on.
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the resource ids that
# NLTK 3.9+ looks up for word_tokenize / pos_tag; the older ids are kept so
# the notebook still runs on earlier NLTK releases.
# quiet=True keeps the download log out of the notebook; nltk.download
# returns False on failure, so failures are collected and reported instead.
failed_downloads = [
    resource
    for resource in (
        'punkt',
        'punkt_tab',
        'stopwords',
        'wordnet',
        'averaged_perceptron_tagger',
        'averaged_perceptron_tagger_eng',
    )
    if not nltk.download(resource, quiet=True)
]
if failed_downloads:
    print(f"Failed to download NLTK resources: {failed_downloads}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [2]:
# Function for Part-of-Speech (POS) tagging
def perform_pos_tagging(text):
    """Tag every token of ``text`` with its part of speech.

    Args:
        text: Raw input string; it is tokenized with ``nltk.word_tokenize``.

    Returns:
        The result of ``nltk.pos_tag`` for those tokens. The notebook
        iterates over it as ``(word, tag)`` pairs.
    """
    return nltk.pos_tag(nltk.word_tokenize(text))

In [3]:
# Function for tokenization
def tokenize_text(text):
    """Split ``text`` into tokens.

    Thin wrapper around NLTK's ``word_tokenize``; its result is returned
    unchanged.

    Args:
        text: Raw input string.

    Returns:
        The token sequence produced by ``word_tokenize``.
    """
    return word_tokenize(text)

In [4]:
# Function for lemmatization
def lemmatize_text(text):
    """Lemmatize every token of ``text`` and return the re-joined string.

    The text is tokenized with ``word_tokenize``, each token is passed to
    ``WordNetLemmatizer.lemmatize`` with no part-of-speech argument, and the
    results are joined with single spaces (so punctuation tokens end up
    separated from the preceding word).

    NOTE(review): because no POS is supplied, the lemmatizer's default is
    used, which presumably treats every token as a noun; verbs such as
    "is" would then pass through unchanged. Confirm before relying on this
    for verb normalisation.

    Args:
        text: Raw input string.

    Returns:
        A single space-separated string of lemmatized tokens.
    """
    tokens = word_tokenize(text)
    wordnet_lemmatizer = WordNetLemmatizer()
    return ' '.join(map(wordnet_lemmatizer.lemmatize, tokens))

In [5]:
# Function for stop word removal
def remove_stopwords(text):
    """Drop English stop words from ``text`` and return the remaining tokens.

    Tokens are compared against the NLTK English stop-word list in lower
    case, but survivors keep their original casing. The kept tokens are
    joined with single spaces.

    Args:
        text: Raw input string.

    Returns:
        A space-separated string of the tokens that are not stop words.
    """
    tokens = word_tokenize(text)
    english_stopwords = set(stopwords.words('english'))
    kept_tokens = []
    for token in tokens:
        if token.lower() in english_stopwords:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)


In [6]:
# Function for stemming
def perform_stemming(text):
    """Stem every token of ``text`` with a Porter stemmer.

    The text is tokenized with ``word_tokenize``, each token is run through
    ``PorterStemmer.stem``, and the stems are joined with single spaces.

    Args:
        text: Raw input string.

    Returns:
        A space-separated string of stemmed tokens.
    """
    tokens = word_tokenize(text)
    porter = PorterStemmer()
    return ' '.join(porter.stem(token) for token in tokens)

In [7]:
# Sample input: two short sentences that each preprocessing step below
# (stop-word removal, stemming, tokenization, lemmatization, POS tagging)
# is applied to independently.
input_text = "This is an example of a document for tokenization. This is an example document for POS tagging and stemming."


In [8]:
# Strip English stop words from the sample text
stopword_removed_text = remove_stopwords(text=input_text)


In [9]:
# Reduce every token of the sample text to its Porter stem
stemmed_text = perform_stemming(text=input_text)

In [10]:
# Split the sample text into tokens
tokenized_text = tokenize_text(text=input_text)

In [11]:
# Run the WordNet lemmatizer over the sample text
lemmatized_text = lemmatize_text(text=input_text)


In [12]:
# Tag each token of the sample text with its part of speech
pos_tags_result = perform_pos_tagging(text=input_text)

In [13]:
# Show the untouched input first so the processed variants can be compared to it
print("Original Text:", input_text, sep="\n")

Original Text:
This is an example of a document for tokenization. This is an example document for POS tagging and stemming.


In [14]:
# Heading on one line, stop-word-filtered text on the next
print("After Stopword Removal:", stopword_removed_text, sep="\n")

After Stopword Removal:
example document tokenization . example document POS tagging stemming .


In [15]:
# Heading on one line, stemmed text on the next
print("After Stemming:", stemmed_text, sep="\n")

After Stemming:
thi is an exampl of a document for token . thi is an exampl document for po tag and stem .


In [16]:
# Heading on one line, the token list's printed form on the next
print("Tokenized Text:", tokenized_text, sep="\n")

Tokenized Text:
['This', 'is', 'an', 'example', 'of', 'a', 'document', 'for', 'tokenization', '.', 'This', 'is', 'an', 'example', 'document', 'for', 'POS', 'tagging', 'and', 'stemming', '.']


In [18]:
# Heading on one line, lemmatized text on the next
print("Lemmatized Text:", lemmatized_text, sep="\n")

Lemmatized Text:
This is an example of a document for tokenization . This is an example document for POS tagging and stemming .


In [19]:
# One "token: TAG" line per tagged token, under a heading
print("POS Tagging Results:")
for word, pos_tag in pos_tags_result:
    print(word, pos_tag, sep=": ")


POS Tagging Results:
This: DT
is: VBZ
an: DT
example: NN
of: IN
a: DT
document: NN
for: IN
tokenization: NN
.: .
This: DT
is: VBZ
an: DT
example: NN
document: NN
for: IN
POS: NNP
tagging: NN
and: CC
stemming: NN
.: .
