# In [None]:
import os
import sys
import warnings
import numpy as np
import pandas as pd

# Quieten noisy output before the heavier NLP imports below are executed.
# NOTE(review): TF_CPP_MIN_LOG_LEVEL is presumably read by TensorFlow's native
# logging if a dependency pulls TensorFlow in — confirm it is still needed.
os.environ.update(TF_CPP_MIN_LOG_LEVEL="3")
# Ignore every warning category for the rest of the process.
warnings.simplefilter("ignore")

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the small English spaCy pipeline once at import time and expose it as
# the module-level ``nlp`` object used by main().
try:
    import spacy
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # The model package is missing: tell the user how to install it and stop
    # with a non-zero exit status instead of showing a traceback.
    for message in (
        "spaCy model 'en_core_web_sm' not found.",
        "Install it using: python -m spacy download en_core_web_sm",
    ):
        print(message)
    raise SystemExit(1)


def main():
    """Run the text-vectorization demo on a user-supplied sports news file.

    Prompts for a file name (resolved against ``sys.path[0]``), splits the
    file's contents into documents on ``---``, removes stop words,
    punctuation and whitespace tokens with spaCy, and prints Bag-of-Words,
    TF-IDF and spaCy document-vector representations together with their
    pairwise cosine similarities.

    Exits with status 1 (after printing an error message) when the file is
    missing, contains no documents, or yields no usable vocabulary.
    """
    filename = input("Enter sports news text file name: ")

    # The file is looked up relative to the first entry of sys.path.
    filepath = os.path.join(sys.path[0], filename)

    if not os.path.isfile(filepath):
        print(f"Error: File '{filename}' not found.")
        sys.exit(1)

    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    print("=== Original Text Sample (First 300 chars) ===")
    print(content[:300])
    print()

    # Documents are separated by '---'; blank chunks are discarded.
    documents = [doc.strip() for doc in content.split('---') if doc.strip()]

    if not documents:
        # Without this guard the vectorizers below fail with an opaque
        # "empty vocabulary" ValueError.
        print(f"Error: File '{filename}' contains no documents.")
        sys.exit(1)

    cleaned_docs = []
    all_tokens = []

    for doc in documents:
        spacy_doc = nlp(doc.lower())
        tokens = [
            token.text
            for token in spacy_doc
            if not token.is_stop and not token.is_punct and not token.is_space
        ]
        all_tokens.extend(tokens)
        cleaned_docs.append(" ".join(tokens))

    print("=== Cleaned Text Sample ===")
    print(all_tokens[:50])
    print()

    bow_vectorizer = CountVectorizer()
    try:
        bow_matrix = bow_vectorizer.fit_transform(cleaned_docs)
    except ValueError:
        # Raised by scikit-learn when no token survives cleaning and
        # tokenization (e.g. the text consists solely of stop words).
        print(f"Error: File '{filename}' has no usable words after cleaning.")
        sys.exit(1)

    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_docs)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement API and fall back only on releases that predate it.
    if hasattr(tfidf_vectorizer, "get_feature_names_out"):
        raw_feature_names = tfidf_vectorizer.get_feature_names_out()
    else:
        raw_feature_names = tfidf_vectorizer.get_feature_names()
    # Normalise to plain Python strings so the printed list looks the same
    # regardless of which API produced it.
    feature_names = [str(name) for name in raw_feature_names]
    idf_values = tfidf_vectorizer.idf_

    print("=== TF-IDF Features ===")
    print(list(feature_names))
    print()

    print("=== IDF Values ===")
    for word, idf in zip(feature_names, idf_values):
        print(f"{word} : {idf:.4f}")
    print()

    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
    print("=== TF-IDF Matrix ===")
    print(tfidf_df.round(4))
    print()

    # One spaCy document vector per cleaned document, stacked row-wise.
    embeddings = np.array([nlp(doc).vector for doc in cleaned_docs])
    print("=== Word Embedding Vectors ===")
    print(embeddings)
    print()

    print("=== Vector Shapes ===")
    print(f"BoW shape: {bow_matrix.shape}")
    print(f"TF-IDF shape: {tfidf_matrix.shape}")
    print(f"Embedding shape: {embeddings.shape}")
    print()

    print("=== Cosine Similarity (BoW) ===")
    print(cosine_similarity(bow_matrix))
    print()

    print("=== Cosine Similarity (TF-IDF) ===")
    print(cosine_similarity(tfidf_matrix))
    print()

    print("=== Cosine Similarity (Embeddings) ===")
    print(cosine_similarity(embeddings))
    print()

    print("=== Observations ===")
    print("1. Bag-of-Words considers only word frequency.")
    print("2. TF-IDF highlights important words across documents.")
    print("3. Word embeddings capture semantic meaning and context.")
    print("4. Embedding similarity reflects deeper relationships between sports news articles.")


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
