In [1]:

from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from scipy.sparse import vstack
import numpy as np
import re

# --- Load data ---
# Restrict the corpus to three newsgroups, so this is a 3-class problem.
categories = ['rec.sport.baseball', 'sci.space', 'talk.politics.mideast']

# subset='all' requests both the train and test portions of the dataset;
# this notebook creates its own splits further down.
newsgroups = fetch_20newsgroups(
    subset='all',
    categories=categories,
    shuffle=True,
)

# --- Helper function ---
def extract_body(text):
    """Return everything after the first blank line in ``text``.

    A "blank line" is a newline, optional whitespace, and another newline.
    For the newsgroup posts this is presumably the boundary between the
    header block and the message body.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The text following the first blank line, or ``text`` unchanged when
        it contains no blank line.
    """
    separator = re.search(r'\n\s*\n', text)
    if separator is None:
        # No blank line found: nothing to strip.
        return text
    return text[separator.end():]

# Apply extract_body to every post before any splitting/vectorization.
cleaned_data = list(map(extract_body, newsgroups.data))
labels = newsgroups.target

# --- Split data ---
# Hold out 10% of the documents as the final test set.
X_train, X_test, y_train, y_test = train_test_split(
    cleaned_data,
    labels,
    test_size=0.1,
    random_state=42,
)

# Split training data into labeled (25%) and unlabeled (75%).
# y_dummy holds the true labels of the "unlabeled" portion; it is only used
# later to report debug accuracy, never to fit the model.
(
    X_train_labeled,
    X_train_unlabeled,
    y_train_labeled,
    y_dummy,
) = train_test_split(X_train, y_train, test_size=0.75, random_state=42)

# --- Vectorization ---
# The vocabulary is learned from the labeled documents only; the unlabeled
# and test documents are projected onto that same vocabulary.
count_vectorizer = CountVectorizer(stop_words='english', max_features=1000)
X_train_labeled_vec = count_vectorizer.fit_transform(X_train_labeled)
X_train_unlabeled_vec, X_test_vec = (
    count_vectorizer.transform(documents)
    for documents in (X_train_unlabeled, X_test)
)

# --- Initial model ---
# Fit on the labeled portion only. MultinomialNB() (imported above) is an
# alternative estimator that can be swapped in here.
classifier = LogisticRegression(max_iter=1000).fit(
    X_train_labeled_vec, y_train_labeled
)

# --- Self-training loop ---
# Each round: pseudo-label the unlabeled documents with the current model,
# then refit on labeled + pseudo-labeled data. Stop once the fraction of
# pseudo-labels that changed since the previous round drops below `threshold`.
threshold = 0.05
MAX_ITERATIONS = 99  # upper bound on self-training rounds
y_train_unlabeled_prev = None

# The combined feature matrix never changes between rounds (only the
# pseudo-labels do), so build it once instead of re-stacking every iteration.
X_combined = vstack([X_train_labeled_vec, X_train_unlabeled_vec])

for i in range(1, MAX_ITERATIONS + 1):
    y_train_unlabeled = classifier.predict(X_train_unlabeled_vec)

    # y_dummy holds the true labels of the "unlabeled" split; it is used for
    # this debug printout only and is never fed to the model.
    acc_unlabeled = accuracy_score(y_dummy, y_train_unlabeled)
    print(f"\nIteration {i}: Accuracy on unlabeled (debug only): {100 * acc_unlabeled:.2f}%")

    if y_train_unlabeled_prev is not None:
        # Fraction of pseudo-labels identical to the previous round; computed
        # once and reused for both the report and the stopping test.
        agreement = accuracy_score(y_train_unlabeled, y_train_unlabeled_prev)
        label_change = 100 * (1 - agreement)
        print(f"Label change since last iteration: {label_change:.2f}%\n")
        if agreement > (1 - threshold):
            print("Change below threshold, stopping self-training.")
            break

    y_train_unlabeled_prev = y_train_unlabeled

    # Combine true labels with the current pseudo-labels
    y_combined = np.concatenate([y_train_labeled, y_train_unlabeled])

    # Retrain model
    classifier.fit(X_combined, y_combined)

# --- Final Evaluation ---
# Score the self-trained classifier on the held-out test split and report
# the accuracy as a percentage.
y_test_pred = classifier.predict(X_test_vec)
test_acc = 100 * accuracy_score(y_test, y_test_pred)
print(f"\nTesting:\nAccuracy on test data: {test_acc:.2f}%")



Iteration 1: Accuracy on unlabeled (debug only): 93.30%

Iteration 2: Accuracy on unlabeled (debug only): 93.30%
Label change since last iteration: 0.00%

Change below threshold, stopping self-training.

Testing:
Accuracy on test data: 93.17%
