# In [None]:
import os,sys
import warnings
import pandas as pd

# Quiet the run. TF_CPP_MIN_LOG_LEVEL is presumably consumed by TensorFlow's
# native logging (nothing visible here imports TensorFlow -- confirm it is
# still needed); the simplefilter call ignores every Python warning.
os.environ.update(TF_CPP_MIN_LOG_LEVEL="3")
warnings.simplefilter("ignore")

# Ask for the input file and resolve it against sys.path[0] (the running
# script's directory according to the Python docs; it may be something else
# in an interactive/notebook session).
filename = input("Enter text file name: ")

file_path = os.path.join(sys.path[0], filename)
# isfile() rather than exists(): an empty answer or a directory name resolves
# to an existing directory, which passes exists() and then crashes in open().
if not os.path.isfile(file_path):
    print(f"Error: File '{filename}' not found.")
    sys.exit(1)

# Read the whole file into memory as UTF-8 text.
with open(file_path, "r", encoding="utf-8") as f:
    content = f.read()

# Show a short preview so the user can confirm the right file was loaded.
print("=== Original Text Sample (First 300 chars) ===")
print(content[:300])
print()

# Import spaCy and load the small English pipeline. The two failure modes are
# handled separately so the hint matches the real problem, and so that
# KeyboardInterrupt / SystemExit are no longer swallowed by a bare except.
try:
    import spacy
except ImportError:
    print("SpaCy is not installed.")
    print("Install using: pip install spacy")
    sys.exit(1)

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError when the named model package cannot be found.
    print("SpaCy model 'en_core_web_sm' not found.")
    print("Install using: python -m spacy download en_core_web_sm")
    sys.exit(1)

# Run the spaCy pipeline over the full text, then collect the lowercased text
# of every token that is alphabetic (is_alpha) and not a stop word (is_stop).
doc = nlp(content)
cleaned_tokens = []
for token in doc:
    if token.is_stop or not token.is_alpha:
        continue
    cleaned_tokens.append(token.text.lower())

# Preview at most the first 50 surviving tokens, space-separated.
print("=== Cleaned Text Sample ===")
preview = cleaned_tokens[:50]
print(" ".join(preview))
print()

from sklearn.feature_extraction.text import TfidfVectorizer

# The whole cleaned text is joined into ONE document, so the corpus has a
# single row.
# NOTE(review): with only one document every term presumably gets the same
# IDF -- confirm whether a per-sentence or per-line corpus was intended.
documents = [" ".join(cleaned_tokens)]
vectorizer = TfidfVectorizer()
try:
    tfidf_matrix = vectorizer.fit_transform(documents)
except ValueError as err:
    # scikit-learn raises ValueError when no usable term survives (e.g. the
    # cleaned text is empty); report it like the script's other errors
    # instead of ending in a traceback.
    print(f"Error: could not build TF-IDF features: {err}")
    sys.exit(1)

# Newer scikit-learn exposes get_feature_names_out(); older releases only have
# get_feature_names(). Check for the method explicitly rather than hiding
# every possible exception behind a bare except.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

print("=== TF-IDF Features ===")
print(list(feature_names))
print()

# One line per feature: the term left-aligned in a 20-character column,
# followed by its entry from vectorizer.idf_ to four decimals.
print("=== IDF Values ===")
idf_pairs = zip(feature_names, vectorizer.idf_)
for term, idf in idf_pairs:
    print(f"{term:<20} : {idf:.4f}")
print()

# Densify the TF-IDF matrix into a DataFrame with one column per feature and
# print it rounded to four decimals.
print("=== TF-IDF Matrix ===")
dense_scores = tfidf_matrix.toarray()
df = pd.DataFrame(data=dense_scores, columns=feature_names)
rounded = df.round(decimals=4)
print(rounded)