In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import string

In [None]:
# Fetch the NLTK data packages used by the cells below. Each call logs its
# progress and returns a status value; the last call's value is what the cell
# displays.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead — confirm against the installed version.
nltk.download('punkt')  # tokenizer models (unzipped under tokenizers/ in the log)
nltk.download('averaged_perceptron_tagger')  # POS tagger model (unzipped under taggers/)
nltk.download('stopwords')  # stop-word lists (unzipped under corpora/)
nltk.download('wordnet')  # WordNet data; presumably what WordNetLemmatizer reads

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Two-sentence example text used as input for tokenization and for the TF-IDF
# step. The triple-quoted literal starts and ends with a newline.
sample_document = """
This is a sample document. It contains several sentences that need to be preprocessed.
We will tokenize the text, remove stopwords, apply stemming and lemmatization, and calculate TF-IDF.
"""

# Echo the raw string so the embedded newlines are visible.
sample_document

'\nThis is a sample document. It contains several sentences that need to be preprocessed. \nWe will tokenize the text, remove stopwords, apply stemming and lemmatization, and calculate TF-IDF.\n'

In [None]:
# Split the raw text into tokens. As the output shows, punctuation marks come
# back as separate tokens and the hyphenated 'TF-IDF' stays in one piece.
tokens = word_tokenize(sample_document)
tokens

['This',
 'is',
 'a',
 'sample',
 'document',
 '.',
 'It',
 'contains',
 'several',
 'sentences',
 'that',
 'need',
 'to',
 'be',
 'preprocessed',
 '.',
 'We',
 'will',
 'tokenize',
 'the',
 'text',
 ',',
 'remove',
 'stopwords',
 ',',
 'apply',
 'stemming',
 'and',
 'lemmatization',
 ',',
 'and',
 'calculate',
 'TF-IDF',
 '.']

In [None]:
# Tag every token with its part of speech; the result is a list of
# (token, tag) pairs such as ('document', 'NN').
# NOTE(review): pos_tags is not referenced by any later cell shown here.
pos_tags = nltk.pos_tag(tokens)
pos_tags

[('This', 'DT'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('sample', 'JJ'),
 ('document', 'NN'),
 ('.', '.'),
 ('It', 'PRP'),
 ('contains', 'VBZ'),
 ('several', 'JJ'),
 ('sentences', 'NNS'),
 ('that', 'WDT'),
 ('need', 'VBP'),
 ('to', 'TO'),
 ('be', 'VB'),
 ('preprocessed', 'VBN'),
 ('.', '.'),
 ('We', 'PRP'),
 ('will', 'MD'),
 ('tokenize', 'VB'),
 ('the', 'DT'),
 ('text', 'NN'),
 (',', ','),
 ('remove', 'VB'),
 ('stopwords', 'NNS'),
 (',', ','),
 ('apply', 'VB'),
 ('stemming', 'VBG'),
 ('and', 'CC'),
 ('lemmatization', 'NN'),
 (',', ','),
 ('and', 'CC'),
 ('calculate', 'VB'),
 ('TF-IDF', 'NNP'),
 ('.', '.')]

In [None]:
# Hold the English stop-word list in a set so each lookup is a hash check.
stop_words = set(stopwords.words('english'))

# Membership is tested on the lowercased token, but the token itself is kept in
# its original casing. Punctuation tokens are not removed by this step.
filtered_tokens = [
    token
    for token in tokens
    if token.lower() not in stop_words
]
filtered_tokens

['sample',
 'document',
 '.',
 'contains',
 'several',
 'sentences',
 'need',
 'preprocessed',
 '.',
 'tokenize',
 'text',
 ',',
 'remove',
 'stopwords',
 ',',
 'apply',
 'stemming',
 'lemmatization',
 ',',
 'calculate',
 'TF-IDF',
 '.']

In [None]:
# Reduce every remaining token to its Porter stem. Stems are truncated forms
# rather than dictionary words (e.g. 'sample' becomes 'sampl' in the output).
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))
stemmed_tokens

['sampl',
 'document',
 '.',
 'contain',
 'sever',
 'sentenc',
 'need',
 'preprocess',
 '.',
 'token',
 'text',
 ',',
 'remov',
 'stopword',
 ',',
 'appli',
 'stem',
 'lemmat',
 ',',
 'calcul',
 'tf-idf',
 '.']

In [None]:
# Map every remaining token to its WordNet lemma.
# NOTE(review): no part of speech is passed to lemmatize(), and verb forms such
# as 'contains' come back unchanged in the output — consider supplying a POS
# (e.g. derived from nltk.pos_tag) if verb lemmas are wanted.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = list(map(lemmatizer.lemmatize, filtered_tokens))
lemmatized_tokens

['sample',
 'document',
 '.',
 'contains',
 'several',
 'sentence',
 'need',
 'preprocessed',
 '.',
 'tokenize',
 'text',
 ',',
 'remove',
 'stopwords',
 ',',
 'apply',
 'stemming',
 'lemmatization',
 ',',
 'calculate',
 'TF-IDF',
 '.']

In [None]:
def calculate_tf_idf(documents, **vectorizer_kwargs):
    """Fit a TF-IDF model on ``documents`` and return the weights and vocabulary.

    Parameters
    ----------
    documents : iterable of str
        Raw text documents, one string per document.
    **vectorizer_kwargs
        Optional keyword arguments forwarded unchanged to ``TfidfVectorizer``
        (for example ``stop_words='english'``). When none are given the
        vectorizer is built with its defaults, exactly as before.

    Returns
    -------
    tfidf_matrix
        Result of ``fit_transform`` on ``documents``; call ``.toarray()`` on
        it for a dense view.
    feature_names
        Vocabulary terms as returned by ``get_feature_names_out()``.
    """
    vectorizer = TfidfVectorizer(**vectorizer_kwargs)
    tfidf_matrix = vectorizer.fit_transform(documents)
    # The vocabulary is only available after fitting, so read it last.
    return tfidf_matrix, vectorizer.get_feature_names_out()

# Calculate TF-IDF
# The corpus is a single document, the raw sample text; the filtered, stemmed
# and lemmatized token lists built in earlier cells are not used here. Every
# term appears in that one document, so the weights in the output differ only
# by in-document count ('and' occurs twice, everything else once).
documents = [sample_document]
tfidf_matrix, feature_names = calculate_tf_idf(documents)

# Show the dense weight matrix together with the vocabulary terms
tfidf_matrix.toarray(), feature_names


(array([[0.36514837, 0.18257419, 0.18257419, 0.18257419, 0.18257419,
         0.18257419, 0.18257419, 0.18257419, 0.18257419, 0.18257419,
         0.18257419, 0.18257419, 0.18257419, 0.18257419, 0.18257419,
         0.18257419, 0.18257419, 0.18257419, 0.18257419, 0.18257419,
         0.18257419, 0.18257419, 0.18257419, 0.18257419, 0.18257419,
         0.18257419, 0.18257419]]),
 array(['and', 'apply', 'be', 'calculate', 'contains', 'document', 'idf',
        'is', 'it', 'lemmatization', 'need', 'preprocessed', 'remove',
        'sample', 'sentences', 'several', 'stemming', 'stopwords', 'text',
        'tf', 'that', 'the', 'this', 'to', 'tokenize', 'we', 'will'],
       dtype=object))