In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK data packages used below. nltk.download() skips packages
# that are already up to date, so re-running this cell is cheap.
# Recent NLTK releases load 'punkt_tab' and 'averaged_perceptron_tagger_eng'
# in place of the legacy pickled 'punkt' / 'averaged_perceptron_tagger', so
# both generations are requested to keep the cell working across versions.
for resource in (
    'punkt',
    'punkt_tab',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'stopwords',
):
    nltk.download(resource)

# Text that every preprocessing step below operates on.
document = "The quick brown fox jumps over the lazy dog. The dog barks loudly."

# Split the raw text into word and punctuation tokens.
tokens = word_tokenize(document)

# Attach a part-of-speech tag to every token.
pos_tags = pos_tag(tokens)

# Drop English stop words; the comparison is case-insensitive, but the
# surviving tokens keep their original casing.
stop_words = set(stopwords.words('english'))
filtered_tokens = list(filter(lambda tok: tok.lower() not in stop_words, tokens))

# Reduce each remaining token to its Porter stem.
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))

# Reduce each remaining token to its WordNet lemma (lemmatizer defaults apply).
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = list(map(lemmatizer.lemmatize, filtered_tokens))

# Report every intermediate result, one labelled line per step.
for label, result in (
    ("Tokenization: ", tokens),
    ("POS Tagging: ", pos_tags),
    ("Stop Words Removal: ", filtered_tokens),
    ("Stemming: ", stemmed_tokens),
    ("Lemmatization: ", lemmatized_tokens),
):
    print(label, result)

# TF-IDF representation
# The corpus holds a single document, so every term has the same document
# frequency: the IDF factor is identical for all terms and the scores below
# are effectively L2-normalised term frequencies.
corpus = [document]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and returns the vocabulary in column order.
feature_names = vectorizer.get_feature_names_out()

# Print TF-IDF values
# tfidf_matrix is sparse; nonzero() yields the (row, column) index pairs of
# the stored entries, so only terms that occur in the document are printed.
print("TF-IDF Representation:")
for i, j in zip(*tfidf_matrix.nonzero()):
    print("Token: '{}' - TF-IDF: {}".format(feature_names[j], tfidf_matrix[i, j]))


In [None]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample documents
documents = [
    "The quick brown fox jumps over the lazy dog.",
    "The dog barks loudly.",
    "The cat sleeps peacefully."
]

# Tokenization and stop words removal: every document becomes a list of
# lower-cased tokens with English stop words dropped.
stop_words = set(stopwords.words('english'))
tokenized_documents = [nltk.word_tokenize(doc) for doc in documents]
filtered_documents = [
    [token.lower() for token in doc if token.lower() not in stop_words]
    for doc in tokenized_documents
]

# The vectorizers take raw strings, so the filtered tokens are joined back
# into one space-separated string per document. Built once and shared by both
# vectorizers below.
preprocessed_texts = [" ".join(doc) for doc in filtered_documents]

# Calculate Term Frequency (TF) representation
# use_idf=False switches off the IDF reweighting; the vectorizer's default
# L2 normalisation still applies, so each row holds normalised term
# frequencies rather than raw counts.
tf_vectorizer = TfidfVectorizer(use_idf=False)
tf_matrix = tf_vectorizer.fit_transform(preprocessed_texts)
# get_feature_names() was removed in scikit-learn 1.2; use the replacement.
tf_feature_names = tf_vectorizer.get_feature_names_out()

# Calculate Inverse Document Frequency (IDF) representation
# idf_matrix holds the combined TF-IDF weight of every term in every document;
# the per-term IDF factors themselves are exposed as idf_vectorizer.idf_.
idf_vectorizer = TfidfVectorizer(use_idf=True)
idf_matrix = idf_vectorizer.fit_transform(preprocessed_texts)
idf_feature_names = idf_vectorizer.get_feature_names_out()

# Print Term Frequency (TF) representation
# One section per document, listing the TF value of every vocabulary term
# (including terms that do not occur in that document).
print("Term Frequency (TF) Representation:")
for doc_number, tf_row in enumerate(tf_matrix.toarray(), start=1):
    print(f"Document {doc_number}:")
    for token, tf_value in zip(tf_feature_names, tf_row):
        print(f"Token: '{token}' - TF: {tf_value}")
    print()

# Print Inverse Document Frequency (IDF) representation
# IDF is a property of a term across the whole corpus, not of one document,
# so it is printed once per token from the fitted vectorizer's idf_ vector.
# (Printing the rows of idf_matrix here would show TF-IDF weights mislabelled
# as IDF.)
print("Inverse Document Frequency (IDF) Representation:")
for token, idf_value in zip(idf_feature_names, idf_vectorizer.idf_):
    print(f"Token: '{token}' - IDF: {idf_value}")
print()
