In [None]:
# Imports for the preprocessing walkthrough.
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages this notebook needs. The download order is
# the order of the tuple; a package that is already present is reported as
# up to date rather than downloaded again.
for resource in ('punkt', 'wordnet', 'averaged_perceptron_tagger', 'stopwords'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Sample sentence that the preprocessing cells below operate on.
document = "This is an example document that we will use to demonstrate document preprocessing."

In [None]:
# Tokenization: split the sentence into word and punctuation tokens.
tokens = word_tokenize(document)

In [None]:
# Inspect the tokens; the final '.' comes out as a token of its own.
tokens

['This',
 'is',
 'an',
 'example',
 'document',
 'that',
 'we',
 'will',
 'use',
 'to',
 'demonstrate',
 'document',
 'preprocessing',
 '.']

In [None]:
# POS tagging: pair every token with a part-of-speech tag,
# producing a list of (token, tag) tuples.
pos_tags = pos_tag(tokens)

In [None]:
# Inspect the (token, tag) pairs.
pos_tags

[('This', 'DT'),
 ('is', 'VBZ'),
 ('an', 'DT'),
 ('example', 'NN'),
 ('document', 'NN'),
 ('that', 'IN'),
 ('we', 'PRP'),
 ('will', 'MD'),
 ('use', 'VB'),
 ('to', 'TO'),
 ('demonstrate', 'VB'),
 ('document', 'NN'),
 ('preprocessing', 'NN'),
 ('.', '.')]

In [None]:
# Stopword removal: keep only the tokens whose lowercase form is not in the
# English stopword list. Punctuation is not filtered here, so '.' survives.
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

In [None]:
# Inspect what survived the stopword filter.
filtered_tokens

['example', 'document', 'use', 'demonstrate', 'document', 'preprocessing', '.']

In [None]:
# Stemming: run the Porter stemmer over every token that survived the
# stopword filter.
ps = PorterStemmer()
stemmed_tokens = list(map(ps.stem, filtered_tokens))

In [None]:
# Inspect the stems; they need not be dictionary words (e.g. 'exampl').
stemmed_tokens

['exampl', 'document', 'use', 'demonstr', 'document', 'preprocess', '.']

In [None]:
# Lemmatization: look up the WordNet lemma of every filtered token.
# NOTE(review): no part-of-speech is passed to lemmatize(), so the library
# default is applied to every token — confirm that is intended, given that
# POS tags are computed earlier in this notebook but not used here.
wnl = WordNetLemmatizer()
lemmatized_tokens = list(map(wnl.lemmatize, filtered_tokens))

In [None]:
# Inspect the lemmas.
lemmatized_tokens

['example', 'document', 'use', 'demonstrate', 'document', 'preprocessing', '.']

In [None]:
# Part 2: TF-IDF

In [None]:
import math
from collections import Counter

In [None]:
# Toy corpus for the TF-IDF walkthrough: three short sentences that share
# some of their words.
corpus = [
    'The quick brown fox jumps over the lazy dog',
    'The brown fox is quick',
    'The lazy dog is sleeping',
]

In [None]:
# Lowercase each sentence and split it on whitespace to get its tokens.
tokenized_docs = [sentence.lower().split() for sentence in corpus]

In [None]:
# Term frequency: one Counter per document, mapping each token to the
# number of times it occurs in that document (raw counts, not normalized).
tf_docs = list(map(Counter, tokenized_docs))

In [None]:
# Inverse document frequency for every term in the corpus.
#   doc_freq[t] = number of documents containing t (set() makes each
#                 document count at most once per term)
#   idf[t]      = ln(n_docs / doc_freq[t])
# A term that appears in every document therefore gets an IDF of 0.
# Comprehensions are used so that no loop variable is left behind in the
# notebook namespace: a plain `for tokens in tokenized_docs` loop here would
# overwrite the `tokens` list created by the preprocessing cells above.
n_docs = len(corpus)
doc_freq = Counter(
    term
    for doc_terms in tokenized_docs
    for term in set(doc_terms)
)
idf = {term: math.log(n_docs / count) for term, count in doc_freq.items()}


In [None]:
# TF-IDF weights: for each document, multiply every term's count in that
# document by the term's corpus-wide IDF. Produces one dict per document,
# in the same order as tf_docs.
tfidf_docs = [
    {term: count * idf[term] for term, count in term_counts.items()}
    for term_counts in tf_docs
]

In [None]:
# Print the resulting TF-IDF representation for each document.
# Documents are numbered from 1 in the printed output (enumerate is 0-based).
for i, tfidf_doc in enumerate(tfidf_docs):
  print(f"Document {i+1}: {tfidf_doc}")

Document 1: {'the': 0.0, 'quick': 0.4054651081081644, 'brown': 0.4054651081081644, 'fox': 0.4054651081081644, 'jumps': 1.0986122886681098, 'over': 1.0986122886681098, 'lazy': 0.4054651081081644, 'dog': 0.4054651081081644}
Document 2: {'the': 0.0, 'brown': 0.4054651081081644, 'fox': 0.4054651081081644, 'is': 0.4054651081081644, 'quick': 0.4054651081081644}
Document 3: {'the': 0.0, 'lazy': 0.4054651081081644, 'dog': 0.4054651081081644, 'is': 0.4054651081081644, 'sleeping': 1.0986122886681098}
