# In [None]:
import os
import sys
import warnings
# Ignore every Python warning for the rest of the run.
warnings.simplefilter(action='ignore')
# Set before the third-party imports that follow. NOTE(review): presumably
# meant to quiet TensorFlow's native logging if it gets loaded indirectly --
# confirm it is still needed.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import spacy
from nltk.stem import SnowballStemmer

# Load the small English spaCy pipeline. spacy.load raises OSError when the
# model package is not installed; catching only that keeps Ctrl-C and
# unrelated failures from being misreported as a missing model.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("SpaCy model 'en_core_web_sm' not found. Install it using:")
    print("python -m spacy download en_core_web_sm")
    sys.exit(1)

# English Snowball stemmer used for the stemming side of the comparison.
stemmer = SnowballStemmer(language='english')

# Ask for the input file; the name is resolved relative to sys.path[0].
filename = input("Enter text file name: ")

filepath = os.path.join(sys.path[0], filename)

# Open directly instead of pre-checking with os.path.exists: that check also
# passes for directories (e.g. when the name is left empty) and for unreadable
# files, which would then fail inside open() with an unhandled traceback.
try:
    with open(filepath, "r", encoding="utf-8") as file:
        content = file.read()
except FileNotFoundError:
    print(f"Error: File '{filename}' not found.")
    sys.exit(1)
except (OSError, UnicodeDecodeError) as err:
    # Directory, permission problem, or a file that is not valid UTF-8.
    print(f"Error: Could not read file '{filename}': {err}")
    sys.exit(1)

# Preview the first 300 characters of the input text.
print()
print("Original Text Sample:")
print(content[:300], end="\n\n")

# Run the spaCy pipeline and keep every token that is not whitespace,
# collecting both the token objects and their surface text.
doc = nlp(content)
tokens = []
token_texts = []
for tok in doc:
    if tok.is_space:
        continue
    tokens.append(tok)
    token_texts.append(tok.text)

print(f"Total Tokens Count: {len(tokens)}", end="\n\n")

# Lemma of every kept token, read from the spaCy token's lemma_ attribute.
lemmas = [tok.lemma_ for tok in tokens]

print("=== Lemmatized Sample (First 20 tokens) ===")
print(lemmas[:20], end="\n\n")

# Word-by-word listing for the first 30 tokens.
print("Word --> Lemma")
lemma_pairs = zip(token_texts[:30], lemmas[:30])
for surface, base_form in lemma_pairs:
    print(f"{surface} --> {base_form}")
print()

# Stem of every kept token; the text is lower-cased before stemming.
stems = [stemmer.stem(text.lower()) for text in token_texts]

print("=== Stemmed Sample (First 20 tokens) ===")
print(stems[:20], end="\n\n")

# Word-by-word listing for the first 30 tokens.
print("Word --> Stem")
stem_pairs = zip(token_texts[:30], stems[:30])
for surface, stemmed in stem_pairs:
    print(f"{surface} --> {stemmed}")
print()

# Three-column, tab-separated table for the first 30 tokens:
# surface form, lemma, stem.
print("=== Comparison: Lemmatization vs Stemming ===")
print("Word\t\tLemma\t\tStem")
print("------------------------------------------")

first_thirty = slice(30)
comparison_rows = zip(
    token_texts[first_thirty],
    lemmas[first_thirty],
    stems[first_thirty],
)
for surface, lemma_form, stem_form in comparison_rows:
    print(f"{surface}\t\t{lemma_form}\t\t{stem_form}")
print()

# Closing remark on the trade-off demonstrated by the output above.
conclusion = (
    "Lemmatization produces dictionary-based meaningful root words, "
    "while stemming may distort words by chopping suffixes. "
    "For NLP tasks like search, topic modeling, and information retrieval, "
    "lemmatization gives better and cleaner output."
)
print("Conclusion:", conclusion, sep="\n")
