In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np
import torch, torch.nn as nn, torch.optim as optim


# Load Data

categories = ['comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware']
data = fetch_20newsgroups(subset='train', categories=categories, remove=('headers','footers','quotes'))

vectorizer = CountVectorizer(binary=True, stop_words='english', max_features=5000)
X = vectorizer.fit_transform(data.data)
vocab = vectorizer.get_feature_names_out()
y_true = data.target   # only for evaluation


# Define Labeled Features

labeled_features = {
    "ibm": 0,
    "dos": 0,
    "apple": 1,
    "mac": 1
}
lf_indices = {vectorizer.vocabulary_[w]: c for w,c in labeled_features.items() if w in vectorizer.vocabulary_}


# Baseline A: Feature Voting

def feature_voting(X):
    preds = []
    for i in range(X.shape[0]):
        indices = X[i].nonzero()[1]
        votes = [lf_indices[j] for j in indices if j in lf_indices]
        if len(votes) == 0:
            preds.append(0)  # default to class 0
        else:
            preds.append(max(set(votes), key=votes.count))
    return np.array(preds)

preds_voting = feature_voting(X)
acc_voting = (preds_voting == y_true).mean()
print("Baseline A (Feature Voting) Accuracy:", acc_voting)


# Baseline B: Pseudo-Labeling + Logistic Regression

pseudo_labels = []
pseudo_X = []
for i in range(X.shape[0]):
    indices = X[i].nonzero()[1]
    votes = [lf_indices[j] for j in indices if j in lf_indices]
    if len(votes) > 0:
        pseudo_labels.append(max(set(votes), key=votes.count))
        pseudo_X.append(X[i].toarray()[0])

if len(pseudo_X) > 0:
    clf = LogisticRegression(max_iter=500)
    clf.fit(pseudo_X, pseudo_labels)
    preds_pseudo = clf.predict(X.toarray())
    acc_pseudo = (preds_pseudo == y_true).mean()
else:
    acc_pseudo = 0.0

print("Baseline B (Pseudo-Labeling) Accuracy:", acc_pseudo)


# GE-FL (Generalized Expectation)
num_classes = 2
num_features = X.shape[1]
X_tensor = torch.tensor(X.toarray(), dtype=torch.float32)

W = nn.Parameter(torch.zeros(num_features, num_classes))
optimizer = optim.LBFGS([W], lr=0.1, max_iter=50)

def model(X):
    logits = X @ W
    return torch.softmax(logits, dim=1)

def ge_objective():
    probs = model(X_tensor)
    loss = torch.tensor(0.0)
    for idx, cls in lf_indices.items():
        mask = X_tensor[:, idx] > 0
        if mask.sum() == 0:
            continue
        dist = probs[mask].mean(0)
        ref = torch.ones(num_classes) * 0.1/(num_classes-1)
        ref[cls] = 0.9
        ref = ref / ref.sum()
        kl = torch.sum(ref * torch.log((ref + 1e-9)/(dist + 1e-9)))
        loss += kl
    loss += 0.5 * torch.sum(W**2)
    return loss

def closure():
    optimizer.zero_grad()
    loss = ge_objective()
    loss.backward()
    return loss

optimizer.step(closure)

# Evaluate GE-FL
probs = model(X_tensor).detach().numpy()
preds_ge = probs.argmax(1)
acc_ge = (preds_ge == y_true).mean()
print("GE-FL Accuracy:", acc_ge)


Baseline A (Feature Voting) Accuracy: 0.690068493150685
Baseline B (Pseudo-Labeling) Accuracy: 0.6558219178082192


Consider using tensor.detach() first. (Triggered internally at /pytorch/torch/csrc/autograd/generated/python_variable_methods.cpp:835.)
  loss = float(closure())


GE-FL Accuracy: 0.7842465753424658
