In [12]:
!pip install nltk



In [14]:
# Step 1: Import and download NLTK resources
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from collections import Counter
import math

# Fetch the NLTK data packages this notebook relies on (only needed once
# per machine; later runs just report the packages as up to date).
for resource_name in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(resource_name)

# Step 2: The sentence that the rest of the pipeline analyses
text = "Natural Language Processing is amazing!"

# Step 3: Split the sentence into word and punctuation tokens
tokens = word_tokenize(text)
print("Tokens:", tokens)

# Step 4: Keep purely alphabetic tokens that are not English stop words
# (the isalpha() test is what discards punctuation such as '!')
stop_words = set(stopwords.words('english'))
filtered = [
    token
    for token in tokens
    if token.isalpha() and token.lower() not in stop_words
]
print("After Stopword Removal:", filtered)

# Step 5: Reduce each kept token to its Porter stem
stemmer = PorterStemmer()
stemmed = list(map(stemmer.stem, filtered))
print("Stemmed Words:", stemmed)

# Step 6: Lemmatize the lowercased tokens with the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()
lemmatized = list(map(lemmatizer.lemmatize, map(str.lower, filtered)))
print("Lemmatized Words:", lemmatized)

# Step 7: Term Frequency (TF)
# TF(term) = occurrences of the term / total number of terms in the document
tf_counts = Counter(lemmatized)
total_terms = len(lemmatized)  # same value as summing every per-term count
tf = {
    term: occurrences / total_terms
    for term, occurrences in tf_counts.items()
}
print("TF:", tf)

# Step 8: Inverse Document Frequency (IDF)
# Example: a small corpus with 3 documents, each already tokenized
documents = [
    ['natural', 'language', 'processing', 'amazing'],
    ['machine', 'learning', 'is', 'fun'],
    ['natural', 'language', 'processing', 'applications']
]

# IDF(term) = ln(N / df), where N is the number of corpus documents and df
# is how many of those documents contain the term.
N = len(documents)
idf = {}
for word in tf:
    doc_count = sum(1 for doc in documents if word in doc)
    # A term that appears in no corpus document has df == 0, and N / 0 would
    # raise ZeroDivisionError. IDF is undefined in that case, so such a term
    # is scored 0.0 (it then contributes nothing to TF-IDF) rather than
    # crashing the run. Terms present in the corpus keep the exact ln(N / df).
    idf[word] = math.log(N / doc_count) if doc_count else 0.0
print("IDF:", idf)

# Step 9: TF-IDF Calculation
# Weight each term's frequency in the document by its rarity in the corpus.
tf_idf = {term: frequency * idf[term] for term, frequency in tf.items()}
print("TF-IDF:", tf_idf)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\omsaw\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\omsaw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\omsaw\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\omsaw\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


Tokens: ['Natural', 'Language', 'Processing', 'is', 'amazing', '!']
After Stopword Removal: ['Natural', 'Language', 'Processing', 'amazing']
Stemmed Words: ['natur', 'languag', 'process', 'amaz']
Lemmatized Words: ['natural', 'language', 'processing', 'amazing']
TF: {'natural': 0.25, 'language': 0.25, 'processing': 0.25, 'amazing': 0.25}
IDF: {'natural': 0.4054651081081644, 'language': 0.4054651081081644, 'processing': 0.4054651081081644, 'amazing': 1.0986122886681098}
TF-IDF: {'natural': 0.1013662770270411, 'language': 0.1013662770270411, 'processing': 0.1013662770270411, 'amazing': 0.27465307216702745}
