In [1]:
import nltk

# Fetch the NLTK resources used below. NLTK 3.9+ loads the tokenizer and
# tagger models under the new names 'punkt_tab' and
# 'averaged_perceptron_tagger_eng'; the legacy names are kept so older NLTK
# releases keep working.
for _resource in (
    'punkt',
    'punkt_tab',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'stopwords',
):
    nltk.download(_resource)
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample document
document = "The quick brown fox jumps over the lazy dog. The dog barks loudly."

# Tokenization: split the raw text into word and punctuation tokens
tokens = word_tokenize(document)

# Stop words removal: case-insensitive lookup in the English stop-word set.
# Only stop words are dropped here; the '.' tokens pass through unchanged.
stop_words = set(stopwords.words('english'))
filtered_tokens = list(
    filter(lambda word: word.lower() not in stop_words, tokens)
)

# POS Tagging: runs on the full token list, not on the filtered one
pos_tags = pos_tag(tokens)

# Stemming of the filtered tokens with the Porter algorithm
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))

# Lemmatization of the filtered tokens; no POS argument is passed, so the
# lemmatizer's default part of speech applies to every token
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = list(map(lemmatizer.lemmatize, filtered_tokens))

# Print the results, one labelled line per pipeline stage
for _label, _stage_output in (
    ("Tokenization: ", tokens),
    ("POS Tagging: ", pos_tags),
    ("Stop Words Removal: ", filtered_tokens),
    ("Stemming: ", stemmed_tokens),
    ("Lemmatization: ", lemmatized_tokens),
):
    print(_label, _stage_output)

# TF-IDF representation
# NOTE: the corpus holds a single document, so every term gets the same IDF
# weight and the scores below only reflect (normalized) term frequency.
corpus = [document]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer the replacement and fall back only on installs that predate it.
try:
    feature_names = vectorizer.get_feature_names_out()
except AttributeError:
    feature_names = vectorizer.get_feature_names()

# Print TF-IDF values for every non-zero (document, term) entry
print("TF-IDF Representation:")
for i, j in zip(*tfidf_matrix.nonzero()):
    print("Token: '{}' - TF-IDF: {}".format(feature_names[j], tfidf_matrix[i, j]))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Tokenization:  ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.', 'The', 'dog', 'barks', 'loudly', '.']
POS Tagging:  [('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.'), ('The', 'DT'), ('dog', 'NN'), ('barks', 'VBZ'), ('loudly', 'RB'), ('.', '.')]
Stop Words Removal:  ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog', '.', 'dog', 'barks', 'loudly', '.']
Stemming:  ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog', '.', 'dog', 'bark', 'loudli', '.']
Lemmatization:  ['quick', 'brown', 'fox', 'jump', 'lazy', 'dog', '.', 'dog', 'bark', 'loudly', '.']
TF-IDF Representation:
Token: 'loudly' - TF-IDF: 0.2182178902359924
Token: 'barks' - TF-IDF: 0.2182178902359924
Token: 'dog' - TF-IDF: 0.4364357804719848
Token: 'lazy' - TF-IDF: 0.2182178902359924
Token: 'over' - TF-IDF: 0.2182178902359924
Token: 'jumps' - TF-IDF: 0.2182178902359924
Token: 'fox' - TF-IDF: 0.2182178902

