### ***Main.py***

In [None]:
import os, sys
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import spacy
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer


def main():
    """Print a per-document TF-IDF breakdown for the lines of ``Sample.txt``.

    ``Sample.txt`` is looked up in ``sys.path[0]`` and every non-blank line is
    treated as one document. Each document is lower-cased, run through the
    spaCy ``en_core_web_sm`` pipeline, and reduced to the tokens that are
    alphabetic and not stop words. The cleaned documents are vectorised with
    a default ``TfidfVectorizer`` and, for every document, each term with a
    positive weight is printed together with its TF, IDF and TF-IDF values.

    Returns:
        None. All results are written to standard output.
    """
    nlp = spacy.load("en_core_web_sm")
    fn = "Sample.txt"
    fp = os.path.join(sys.path[0], fn)

    with open(fp, "r", encoding="utf-8") as file:
        documents = [line.strip() for line in file if line.strip()]

    clean_docs = []
    for text in documents:
        doc = nlp(text.lower())
        tokens = [
            token.text
            for token in doc
            if token.is_alpha and not token.is_stop
        ]
        clean_docs.append(" ".join(tokens))

    # Nothing survived preprocessing (empty file, or only stop words and
    # non-alphabetic tokens): there is no vocabulary to fit, so report it
    # instead of letting fit_transform fail.
    if not any(clean_docs):
        print(f"No usable text found in {fn}; nothing to vectorize.")
        return

    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(clean_docs)

    # get_feature_names() was removed from scikit-learn in 1.2;
    # get_feature_names_out() is the supported replacement.
    features = tfidf_vectorizer.get_feature_names_out()
    idf_values = tfidf_vectorizer.idf_

    print("\nTF-IDF Breakdown (Multiple Documents):")

    for doc_idx, doc_vector in enumerate(tfidf_matrix.toarray()):
        print(f"\nDocument {doc_idx + 1}:")
        for word_idx, tfidf in enumerate(doc_vector):
            if tfidf > 0:
                word = features[word_idx]
                idf = idf_values[word_idx]
                # Back out the term-frequency factor from the stored weight.
                # NOTE(review): TfidfVectorizer L2-normalises rows by default,
                # so this is a normalised TF, not a raw count - confirm that
                # is the value intended for display.
                tf = tfidf / idf
                print(
                    f"{word:12s} | "
                    f"TF: {tf:.4f} | "
                    f"IDF: {idf:.4f} | "
                    f"TF-IDF: {tfidf:.4f}"
                )

    print("\nKey Observations & Interpretations:")
    print("1. Common words across documents have lower IDF values.")
    print("2. Rare words receive higher TF-IDF scores.")
    print("3. TF-IDF highlights important document-specific terms.")
    print("4. It improves document comparison over simple BoW.")

# Run the TF-IDF demo only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    main()



### ***NLP_MOdules.py***

In [None]:
import spacy
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load("en_core_web_sm")                           
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

def read_from_file(filename):
    """Return the non-blank lines of a UTF-8 text file as a list of documents.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one entry per line that is non-empty after stripping
        surrounding whitespace; each entry is the stripped line.
    """
    documents = []
    with open(filename, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped:
                documents.append(stripped)
    return documents

def read_from_sentence(sentence):
    """Clean a single sentence and return it wrapped in a one-element list.

    The sentence is lower-cased and passed through the module-level spaCy
    pipeline ``nlp``; only tokens that are alphabetic and not stop words are
    kept.

    Args:
        sentence: The raw text to clean.

    Returns:
        A list containing one string: the kept tokens joined by single spaces.
    """
    parsed = nlp(sentence.lower())
    kept = []
    for tok in parsed:
        if tok.is_alpha and not tok.is_stop:
            kept.append(tok.text)
    return [" ".join(kept)]

def preprocess_using_spaCy(documents):
    """Clean every document with the module-level spaCy pipeline ``nlp``.

    Args:
        documents: Iterable of raw text strings.

    Returns:
        A list, in input order, where each entry is the document's alphabetic,
        non-stop-word tokens (taken from the lower-cased text) joined by
        single spaces.
    """

    def _clean(text):
        # Keep only alphabetic tokens that spaCy does not flag as stop words.
        return " ".join(
            tok.text
            for tok in nlp(text.lower())
            if tok.is_alpha and not tok.is_stop
        )

    return [_clean(text) for text in documents]


def convert_bow_counter(clean_docs):
    """Fit a default ``CountVectorizer`` on the documents.

    Args:
        clean_docs: Iterable of preprocessed document strings.

    Returns:
        A ``(vectorizer, bow_matrix)`` tuple: the fitted ``CountVectorizer``
        and the matrix returned by its ``fit_transform`` call.
    """
    vectorizer = CountVectorizer()
    return vectorizer, vectorizer.fit_transform(clean_docs)

def prepare_output_text(vectorizer, bow_matrix):
    """Print the vocabulary mapping and the total count of every term.

    Args:
        vectorizer: Object exposing a ``vocabulary_`` mapping of term to
            column index.
        bow_matrix: Object exposing ``toarray()``; the resulting array is
            summed over axis 0 to obtain one total per column.

    Returns:
        None. Output is written to standard output, with terms listed in
        ascending column-index order.
    """
    term_to_column = vectorizer.vocabulary_
    dense_counts = bow_matrix.toarray()
    column_totals = np.sum(dense_counts, axis=0)

    print("\nVocabulary:", term_to_column)
    print("\nFrequencies:")
    ordered_terms = sorted(
        vectorizer.vocabulary_.items(), key=lambda pair: pair[1]
    )
    for term, column in ordered_terms:
        print(f"{term} : {column_totals[column]}")

def convert_bow_tfidf(clean_docs):
    """Fit a default ``TfidfVectorizer`` on the documents.

    Args:
        clean_docs: Iterable of preprocessed document strings.

    Returns:
        A ``(features, idf_values, tfidf_matrix)`` tuple: the vectorizer's
        ``get_feature_names_out()`` result, its ``idf_`` attribute, and the
        matrix returned by ``fit_transform``.
    """
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(clean_docs)
    return vectorizer.get_feature_names_out(), vectorizer.idf_, matrix

def print_tfidf_result(features, idf_values, tfidf_matrix):
    """Print, per document, every term that has a positive TF-IDF weight.

    Args:
        features: Sequence of terms, indexed by matrix column.
        idf_values: Sequence of IDF weights, indexed by matrix column.
        tfidf_matrix: Object exposing ``toarray()``; each row of the result
            is treated as one document's weights.

    Returns:
        None. One line per positive weight is written to standard output,
        showing the term, ``weight / idf`` as "TF", the IDF, and the weight.
    """
    print("TF-IDF Breakdown (Multiple Documents):\n")
    for doc_number, weights in enumerate(tfidf_matrix.toarray(), start=1):
        print(f"\n Document {doc_number}:")
        for column, score in enumerate(weights):
            # Skip terms that carry no positive weight in this document.
            if not score > 0:
                continue
            idf = idf_values[column]
            tf = score / idf
            term = features[column]
            print(f"{term:12s} | TF: {tf:.4f} | IDF: {idf:.4f} | TF-IDF: {score:.4f}")

import spacy
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load("en_core_web_sm")                           
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

def read_from_file(filename):
    """Return the non-blank lines of a UTF-8 text file as a list of documents.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one entry per line that is non-empty after stripping
        surrounding whitespace; each entry is the stripped line.
    """
    documents = []
    with open(filename, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped:
                documents.append(stripped)
    return documents

def read_from_sentence(sentence):
    """Clean a single sentence and return it wrapped in a one-element list.

    The sentence is lower-cased and passed through the module-level spaCy
    pipeline ``nlp``; only tokens that are alphabetic and not stop words are
    kept.

    Args:
        sentence: The raw text to clean.

    Returns:
        A list containing one string: the kept tokens joined by single spaces.
    """
    parsed = nlp(sentence.lower())
    kept = []
    for tok in parsed:
        if tok.is_alpha and not tok.is_stop:
            kept.append(tok.text)
    return [" ".join(kept)]

def preprocess_using_spaCy(documents):
    """Clean every document with the module-level spaCy pipeline ``nlp``.

    Args:
        documents: Iterable of raw text strings.

    Returns:
        A list, in input order, where each entry is the document's alphabetic,
        non-stop-word tokens (taken from the lower-cased text) joined by
        single spaces.
    """

    def _clean(text):
        # Keep only alphabetic tokens that spaCy does not flag as stop words.
        return " ".join(
            tok.text
            for tok in nlp(text.lower())
            if tok.is_alpha and not tok.is_stop
        )

    return [_clean(text) for text in documents]


def convert_bow_counter(clean_docs):
    """Fit a default ``CountVectorizer`` on the documents.

    Args:
        clean_docs: Iterable of preprocessed document strings.

    Returns:
        A ``(vectorizer, bow_matrix)`` tuple: the fitted ``CountVectorizer``
        and the matrix returned by its ``fit_transform`` call.
    """
    vectorizer = CountVectorizer()
    return vectorizer, vectorizer.fit_transform(clean_docs)

def prepare_output_text(vectorizer, bow_matrix):
    """Print the vocabulary mapping and the total count of every term.

    Args:
        vectorizer: Object exposing a ``vocabulary_`` mapping of term to
            column index.
        bow_matrix: Object exposing ``toarray()``; the resulting array is
            summed over axis 0 to obtain one total per column.

    Returns:
        None. Output is written to standard output, with terms listed in
        ascending column-index order.
    """
    term_to_column = vectorizer.vocabulary_
    dense_counts = bow_matrix.toarray()
    column_totals = np.sum(dense_counts, axis=0)

    print("\nVocabulary:", term_to_column)
    print("\nFrequencies:")
    ordered_terms = sorted(
        vectorizer.vocabulary_.items(), key=lambda pair: pair[1]
    )
    for term, column in ordered_terms:
        print(f"{term} : {column_totals[column]}")

def convert_bow_tfidf(clean_docs):
    """Fit a default ``TfidfVectorizer`` on the documents.

    Args:
        clean_docs: Iterable of preprocessed document strings.

    Returns:
        A ``(features, idf_values, tfidf_matrix)`` tuple: the vectorizer's
        ``get_feature_names_out()`` result, its ``idf_`` attribute, and the
        matrix returned by ``fit_transform``.
    """
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(clean_docs)
    return vectorizer.get_feature_names_out(), vectorizer.idf_, matrix

def print_tfidf_result(features, idf_values, tfidf_matrix):
    """Print, per document, every term that has a positive TF-IDF weight.

    Args:
        features: Sequence of terms, indexed by matrix column.
        idf_values: Sequence of IDF weights, indexed by matrix column.
        tfidf_matrix: Object exposing ``toarray()``; each row of the result
            is treated as one document's weights.

    Returns:
        None. One line per positive weight is written to standard output,
        showing the term, ``weight / idf`` as "TF", the IDF, and the weight.
    """
    print("TF-IDF Breakdown (Multiple Documents):\n")
    for doc_number, weights in enumerate(tfidf_matrix.toarray(), start=1):
        print(f"\n Document {doc_number}:")
        for column, score in enumerate(weights):
            # Skip terms that carry no positive weight in this document.
            if not score > 0:
                continue
            idf = idf_values[column]
            tf = score / idf
            term = features[column]
            print(f"{term:12s} | TF: {tf:.4f} | IDF: {idf:.4f} | TF-IDF: {score:.4f}")