In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

# ==============================
# Step 1: Load 3 classes
# ==============================
categories = ['comp.graphics', 'sci.space', 'rec.sport.baseball']
data = fetch_20newsgroups(subset='all', categories=categories,
                          remove=('headers', 'footers', 'quotes'))

X, y = data.data, data.target

# Vectorize text
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(X)

# ==============================
# Step 2: Train/Test split (10% test)
# ==============================
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42
)

# ==============================
# Step 3: Split labeled (25%) and unlabeled (75%)
# ==============================
X_labeled, X_unlabeled, y_labeled, y_unlabeled = train_test_split(
    X_train_full, y_train_full, test_size=0.75, stratify=y_train_full, random_state=42
)

# ==============================
# Step 4: Self-training loop
# ==============================
prev_pseudo_labels = None
tolerance = 0.01   # stop if <1% of labels change
max_iter = 10

for iteration in range(max_iter):
    print(f"\nIteration {iteration+1}")

    # Train model on current labeled set
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_labeled, y_labeled)

    # Predict on unlabeled
    pseudo_labels = clf.predict(X_unlabeled)

    # Check convergence
    if prev_pseudo_labels is not None:
        changes = np.mean(pseudo_labels != prev_pseudo_labels)
        print(f"Label change ratio: {changes:.4f}")
        if changes < tolerance:
            print("Converged. Stopping self-training.")
            break
    prev_pseudo_labels = pseudo_labels.copy()

    # Combine labeled + pseudo-labeled
    X_labeled = np.vstack([X_labeled.toarray(), X_unlabeled.toarray()])
    y_labeled = np.concatenate([y_labeled, pseudo_labels])

# ==============================
# Step 5: Final evaluation
# ==============================
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"\nFinal Test Accuracy: {acc:.4f}")



Iteration 1

Iteration 2
Label change ratio: 0.0035
Converged. Stopping self-training.

Final Test Accuracy: 0.8378
