# In [None]:  (Jupyter notebook cell marker left over from export)
import os
# Set before the third-party imports below so it is already in the environment
# when they load. NOTE(review): presumably intended to silence TensorFlow's
# native (C++) log output in case a dependency pulls TensorFlow in -- confirm
# that it is actually needed for this script.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import sys
import warnings
# Suppresses every Python warning for the rest of the run (including
# deprecation warnings from spaCy / scikit-learn), keeping the output clean.
warnings.filterwarnings('ignore')

import spacy
from sklearn.feature_extraction.text import CountVectorizer

# --- Load the corpus: one document per non-blank line of the input file ---
# The name is resolved relative to sys.path[0], which is normally the
# directory of the running script, so the text file is expected to sit next
# to it. Surrounding whitespace typed at the prompt is discarded.
filename = input("Enter sports news text file name: ").strip()
filepath = os.path.join(sys.path[0], filename)

try:
    with open(filepath, 'r', encoding='utf-8') as file:
        # Skip blank lines: they would otherwise become empty documents
        # (all-zero rows in the bag-of-words matrix).
        documents = [line for line in file if line.strip()]
except OSError as err:
    # Missing file, a directory, permission problems, ...: fail with a clear
    # message instead of a raw traceback.
    sys.exit(f"Could not read '{filepath}': {err}")

if not documents:
    # Nothing to analyse; indexing documents[0] below would raise IndexError.
    sys.exit(f"No text found in '{filepath}'.")

print("\n=== Original Text Sample ===")
print(documents[0][:300])
print()

# --- Preprocess: lowercase, then drop stop words, punctuation, whitespace ---
nlp = spacy.load("en_core_web_sm")

# nlp.pipe streams the texts through the pipeline in batches and yields the
# resulting Doc objects in input order.
cleaned_docs = []
for doc in nlp.pipe(text.lower() for text in documents):
    # Whitespace tokens (e.g. the trailing "\n" kept by readlines()) are
    # neither stop words nor punctuation, so they must be filtered explicitly
    # to keep stray newlines out of the cleaned text.
    kept_tokens = [
        token.text
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    cleaned_docs.append(" ".join(kept_tokens))

print("=== Preprocessed Text Sample ===")
print(cleaned_docs[0][:300])
print()

# --- Bag-of-words: one row per document, one column per vocabulary word ---
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(cleaned_docs)

# Densify once and reuse the array for both the printout and the totals.
dense_counts = bow_matrix.toarray()

print("=== Bag-of-Words Matrix ===")
print(dense_counts)
print()

print("=== Word Frequencies ===")
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back to the old name only
# on installations that predate the new method.
if hasattr(vectorizer, "get_feature_names_out"):
    words = vectorizer.get_feature_names_out()
else:
    words = vectorizer.get_feature_names()

# Column sums give the total count of each word across all documents.
frequencies = dense_counts.sum(axis=0)

# Most frequent first; the sort is stable, so ties keep vocabulary order.
word_freq = sorted(zip(words, frequencies), key=lambda pair: pair[1], reverse=True)

for word, freq in word_freq:
    print(f"{word}: {freq}")
