In [81]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer


In [82]:
with open("amg.txt", "r")as file:
    document = file.read()
# document

In [83]:
cleaned_document = re.sub(r'[^\w\s]', '', document)
# cleaned_document
# Explanation: The regular expression [^\w\s] matches any character that is not a word character (alphanumeric or underscore) or whitespace. The re.sub() function replaces these characters with an empty string, effectively removing them.


In [84]:
# tokens
tokens = word_tokenize(cleaned_document)
# Explanation: Tokenization is the process of splitting the cleaned document into individual words or tokens. The word_tokenize() function from NLTK is used here.

In [85]:
pos_tags = pos_tag(tokens)
# pos_tags
# Explanation: POS tagging assigns a part-of-speech tag to each token in the document. The pos_tag() function from NLTK performs this task.

In [86]:
# Stopwords removal
stopwords = set(stopwords.words("english"))
filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
# Explanation: Stopwords are common words (e.g., "is", "a", "the") that do not carry significant meaning. The stopwords.words("english") function returns a set of English stopwords. The list comprehension filters out the tokens that are not stopwords.


In [87]:
# Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
# Explanation: Stemming reduces words to their base or root form. The PorterStemmer from NLTK is used to perform stemming on the filtered tokens.


In [88]:
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
# Explanation: Lemmatization aims to reduce words to their base form (lemma) while considering the context. The WordNetLemmatizer from NLTK is used for lemmatizing the filtered tokens.


In [89]:
preprocessed_document = ' '.join(lemmatized_tokens)
documents = [preprocessed_document]
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
# Get the feature names (terms)
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_values = tfidf_matrix.toarray()[0]

print("\nTerm Frequency-Inverse Document Frequency (TF-IDF) Representation:")
for term, value in zip(feature_names, tfidf_values):
    print(f"{term}: {value}")


Term Frequency-Inverse Document Frequency (TF-IDF) Representation:
13: 0.06286946134619315
15: 0.06286946134619315
40liter: 0.06286946134619315
44: 0.06286946134619315
4matic: 0.06286946134619315
577: 0.06286946134619315
60: 0.06286946134619315
627: 0.06286946134619315
accelerate: 0.06286946134619315
additionally: 0.06286946134619315
advanced: 0.18860838403857944
allows: 0.06286946134619315
allwheeldrive: 0.1257389226923863
also: 0.06286946134619315
amg: 0.1257389226923863
astonishing: 0.06286946134619315
automatic: 0.06286946134619315
barrier: 0.06286946134619315
best: 0.06286946134619315
buyer: 0.06286946134619315
cabin: 0.18860838403857944
capability: 0.1257389226923863
capable: 0.1257389226923863
choice: 0.06286946134619315
city: 0.06286946134619315
class: 0.06286946134619315
combining: 0.06286946134619315
comfortable: 0.06286946134619315
deep: 0.06286946134619315
delivers: 0.06286946134619315
differential: 0.06286946134619315
downside: 0.06286946134619315
ease: 0.0628694613461931