In [1]:
import sys
from pathlib import Path

# Absolute, symlink-resolved path of the current working directory, placed at
# the front of the import search path so modules located here are found first.
ROOT = Path.cwd().resolve()
sys.path.insert(0, str(ROOT))


In [2]:
from src.config import Config
from src.dataio import load_dataset, make_or_load_splits, build_splits
from src.clean import batch_clean

# Project settings object; the fields read in this cell are dataset_csv,
# splits_dir, seed and test_size.
cfg = Config()

# Load the dataset from the configured CSV path (handed over as a plain string).
df = load_dataset(str(cfg.dataset_csv))

# Train/test indices controlled by the configured seed and test size.
# NOTE(review): the helper's name suggests the split is stored under
# cfg.splits_dir and reused on later runs — confirm in src.dataio.
train_idx, test_idx = make_or_load_splits(
    df,
    cfg.splits_dir,
    seed=cfg.seed,
    test_size=cfg.test_size,
)

splits = build_splits(df, train_idx, test_idx)

# This cell keeps only the training portion: cleaned texts, their targets, and
# the class names exposed by the label encoder attached to the splits.
X_train = batch_clean(splits.X_train)
y_train = splits.y_train
labels = [*splits.label_encoder.classes_]

# Bare trailing expression so the notebook renders the class names.
labels


['business', 'entertainment', 'politics', 'sport', 'tech']

In [3]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def top_terms_by_class(X, y, feature_names, top_n=15):
    """Return the highest-scoring feature names for every class found in ``y``.

    For each distinct value in ``y`` the rows of ``X`` belonging to that class
    are summed column-wise, and the ``top_n`` columns with the largest totals
    are reported, ordered from highest to lowest total.

    Parameters
    ----------
    X : matrix-like
        Document-term matrix. Must support row selection with an integer
        index array and ``.sum(axis=0)``.
    y : array-like
        One class label per row of ``X``. Labels must be convertible with
        ``int()`` because they are used as integer dictionary keys.
    feature_names : sequence
        Name of each column of ``X``, indexable by column position.
    top_n : int, default 15
        Maximum number of terms to keep per class.

    Returns
    -------
    dict[int, list]
        Maps ``int(class_label)`` to its list of top feature names.
    """
    ranked = {}
    for cls in np.unique(y):
        # Positions of the rows that carry this class label.
        member_rows = np.nonzero(y == cls)[0]
        # Column totals over those rows; flattened to 1-D because a sparse
        # matrix sum may come back as a 2-D (1, n_features) result.
        column_totals = np.asarray(X[member_rows].sum(axis=0)).ravel()
        # Ascending argsort reversed -> descending order, then truncate.
        best_columns = np.argsort(column_totals)[::-1][:top_n]
        ranked[int(cls)] = [feature_names[col] for col in best_columns]
    return ranked

# Bag-of-words (raw counts) representation of the cleaned training texts,
# with the vocabulary capped at 5000 features.
bow = CountVectorizer(max_features=5000)
X_bow = bow.fit_transform(X_train)

# TF-IDF representation of the same texts with the same vocabulary cap.
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(X_train)

# Top-20 terms per class under each weighting scheme.
bow_terms = top_terms_by_class(
    X_bow, y_train, bow.get_feature_names_out(), top_n=20
)
tfidf_terms = top_terms_by_class(
    X_tfidf, y_train, tfidf.get_feature_names_out(), top_n=20
)

# Show the first ten of each list side by side; position i in `labels` is used
# as the class key of the term dictionaries.
print("BoW / TF-IDF key words")
for i, label in enumerate(labels):
    print(f"\n=== {label.upper()} ===")
    print(f"BoW   : {bow_terms[i][:10]}")
    print(f"TF-IDF: {tfidf_terms[i][:10]}")


BoW / TF-IDF key words

=== BUSINESS ===
BoW   : ['said', 'year', 'mr', 'market', 'growth', 'government', 'new', 'economy', 'company', 'bank']
TF-IDF: ['said', 'growth', 'economy', 'bank', 'sales', 'year', 'market', 'company', 'mr', 'shares']

=== ENTERTAINMENT ===
BoW   : ['said', 'film', 'best', 'music', 'year', 'new', 'awards', 'number', 'uk', 'band']
TF-IDF: ['film', 'best', 'music', 'band', 'said', 'awards', 'award', 'album', 'year', 'festival']

=== POLITICS ===
BoW   : ['said', 'mr', 'labour', 'government', 'blair', 'election', 'people', 'party', 'minister', 'new']
TF-IDF: ['mr', 'said', 'labour', 'blair', 'election', 'party', 'government', 'brown', 'minister', 'people']

=== SPORT ===
BoW   : ['said', 'year', 'game', 'england', 'time', 'win', 'world', 'players', 'cup', 'play']
TF-IDF: ['england', 'game', 'win', 'said', 'cup', 'injury', 'match', 'club', 'players', 'world']

=== TECH ===
BoW   : ['said', 'people', 'new', 'technology', 'mr', 'mobile', 'games', 'software', 'net', '

In [4]:
def jaccard(a, b):
    """Jaccard similarity of two collections, treated as sets.

    Both arguments are converted to sets, so duplicates and ordering are
    ignored. The result is ``|A ∩ B| / |A ∪ B|``; when both inputs are empty
    (empty union) the function returns ``0.0`` instead of dividing by zero.
    """
    left = set(a)
    right = set(b)
    union = left.union(right)
    if not union:
        return 0.0
    return len(left.intersection(right)) / len(union)

# Agreement between the BoW and TF-IDF top-term lists for each class
# (0 = no shared terms, 1 = identical term sets).
print("Jaccard overlap (top-20) per class:")
for i, label in enumerate(labels):
    jac = jaccard(bow_terms[i], tfidf_terms[i])
    # Label padded to 12 characters, score shown with three decimals.
    print("{:12s}: {:.3f}".format(label, jac))


Jaccard overlap (top-20) per class:
business    : 0.667
entertainment: 0.481
politics    : 0.600
sport       : 0.667
tech        : 0.667


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Variant 1 — sublinear term-frequency scaling: each raw term count tf is
# replaced by 1 + log(tf) before IDF weighting. Vocabulary cap stays at 5000.
tfidf_log = TfidfVectorizer(max_features=5000, sublinear_tf=True)
X_log = tfidf_log.fit_transform(X_train)
log_terms = top_terms_by_class(
    X_log, y_train, tfidf_log.get_feature_names_out(), top_n=20
)

# Variant 2 — unigrams plus bigrams (ngram_range=(1, 2)); the vocabulary cap is
# raised to 10000 for this run.
tfidf_ngram = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_ng = tfidf_ngram.fit_transform(X_train)
ng_terms = top_terms_by_class(
    X_ng, y_train, tfidf_ngram.get_feature_names_out(), top_n=20
)


In [6]:
print("\nExample: SPORT top terms")

# Position of "sport" in the label list doubles as its class key.
# When the label is absent this falls back to class 0, so the lines below
# would then show a different class under the SPORT heading.
if "sport" in labels:
    sport_id = labels.index("sport")
else:
    sport_id = 0

print(f"TF-IDF default: {tfidf_terms[sport_id][:10]}")
print(f"TF-IDF sublinear: {log_terms[sport_id][:10]}")
print(f"TF-IDF 1-2gram: {ng_terms[sport_id][:10]}")


Example: SPORT top terms
TF-IDF default: ['england', 'game', 'win', 'said', 'cup', 'injury', 'match', 'club', 'players', 'world']
TF-IDF sublinear: ['game', 'win', 'cup', 'said', 'match', 'injury', 'england', 'players', 'team', 'play']
TF-IDF 1-2gram: ['england', 'game', 'win', 'said', 'cup', 'injury', 'match', 'club', 'players', 'play']


In [7]:
# Side-by-side view of the first ten top terms per class for the three TF-IDF
# configurations (default, sublinear tf, unigram+bigram).
print("\n=== TF-IDF Top Terms Comparison by Class ===")

for i, label in enumerate(labels):
    print(
        f"\n### {label.upper()} ###",
        f"TF-IDF default   : {tfidf_terms[i][:10]}",
        f"TF-IDF sublinear : {log_terms[i][:10]}",
        f"TF-IDF 1-2gram   : {ng_terms[i][:10]}",
        sep="\n",
    )



=== TF-IDF Top Terms Comparison by Class ===

### BUSINESS ###
TF-IDF default   : ['said', 'growth', 'economy', 'bank', 'sales', 'year', 'market', 'company', 'mr', 'shares']
TF-IDF sublinear : ['said', 'growth', 'market', 'economy', 'year', 'company', 'bank', 'shares', 'firm', '2004']
TF-IDF 1-2gram   : ['said', 'growth', 'economy', 'bank', 'sales', 'year', 'market', 'company', 'shares', 'mr']

### ENTERTAINMENT ###
TF-IDF default   : ['film', 'best', 'music', 'band', 'said', 'awards', 'award', 'album', 'year', 'festival']
TF-IDF sublinear : ['film', 'best', 'music', 'awards', 'award', 'said', 'band', 'star', 'year', 'album']
TF-IDF 1-2gram   : ['film', 'best', 'music', 'band', 'awards', 'said', 'album', 'award', 'year', 'festival']

### POLITICS ###
TF-IDF default   : ['mr', 'said', 'labour', 'blair', 'election', 'party', 'government', 'brown', 'minister', 'people']
TF-IDF sublinear : ['mr', 'labour', 'election', 'blair', 'said', 'government', 'party', 'minister', 'people', 'tory']
T