In [None]:
import numpy as np
from sklearn.datasets import fetch_openml

# Load the dataset
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X, y = mnist.data, mnist.target

# Split data into training and test sets
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

y_train_large = (y_train >= '7')            # True if digit is 7, 8, or 9
y_train_odd = (y_train.astype('int8') % 2 == 1)  # True if digit is odd
y_multilabel = np.c_[y_train_large, y_train_odd] # Shape: (n_samples, 2)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_multilabel)


In [None]:
# Select a data point to predict on
some_digit = X_train[0]

In [None]:
display(knn_clf.predict([some_digit]))
# Example output: array([[False, True]])

array([[False,  True]])

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import f1_score

y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3)


0.9778357403921755

In [None]:
f1_score(y_multilabel, y_train_knn_pred, average="macro")

0.9764102655606048

In [None]:
f1_score(y_multilabel, y_train_knn_pred, average="weighted")

0.9778357403921755

In [None]:
from sklearn.multioutput import ClassifierChain
from sklearn.svm import SVC

chain_clf = ClassifierChain(SVC(), cv=3, random_state=42)


In [None]:
chain_clf.fit(X_train[:2000], y_multilabel[:2000])


In [None]:
chain_clf.predict([some_digit])

array([[0., 1.]])

In [16]:
from sklearn.datasets import fetch_20newsgroups

# You can use 'all' for full dataset, or 'train'/'test' for split subsets
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))  # removes metadata for realism
X = newsgroups.data        # List of news articles (raw text)
y = newsgroups.target      # Integer category label for each article
label_names = newsgroups.target_names  # List of topic/category names
print("Number of articles:", len(X))
print("Sample label names:", label_names)

Number of articles: 18846
Sample label names: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [17]:
import numpy as np

# Build mapping from category name to its integer index
cat_idx = {name: idx for idx, name in enumerate(label_names)}

# Define your custom multilabel mapping
tech_cats = [cat_idx[c] for c in label_names if c.startswith("comp.") or c.startswith("sci.")]
religion_cats = [cat_idx[c] for c in ['alt.atheism', 'soc.religion.christian', 'talk.religion.misc']]
sports_cats = [cat_idx[c] for c in label_names if c.startswith('rec.sport.')]
politics_cats = [cat_idx[c] for c in label_names if c.startswith('talk.politics.')]
forsale_cats = [cat_idx['misc.forsale']]

# Create multilabel array
y_multilabel = np.c_[
    np.isin(y, tech_cats),        # Tech
    np.isin(y, religion_cats),    # Religion
    np.isin(y, sports_cats),      # Sports
    np.isin(y, politics_cats),    # Politics
    np.isin(y, forsale_cats)      # For Sale
]

print("Example multilabels for first 10 articles:\n", y_multilabel[:10])


Example multilabels for first 10 articles:
 [[False False  True False False]
 [ True False False False False]
 [False False False  True False]
 [ True False False False False]
 [ True False False False False]
 [ True False False False False]
 [ True False False False False]
 [False False  True False False]
 [False False  True False False]
 [False  True False False False]]


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=10000)  # Limit feature size for speed
X_vec = vectorizer.fit_transform(X)
print("TF-IDF vectorized shape:", X_vec.shape)


TF-IDF vectorized shape: (18846, 10000)


In [19]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Split data for fast experiments (adjust test_size as needed)
X_train, X_test, y_train, y_test = train_test_split(X_vec, y_multilabel, test_size=0.2, random_state=42)

knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)


In [20]:
from sklearn.metrics import f1_score, classification_report

# Predict multilabels on the test set
y_pred = knn_clf.predict(X_test)

# Macro F1: average score treating all labels equally
f1_macro = f1_score(y_test, y_pred, average="macro")
print("Macro F1-score:", f1_macro)

# Weighted F1: average weighted by support (label frequency)
f1_weighted = f1_score(y_test, y_pred, average="weighted")
print("Weighted F1-score:", f1_weighted)

# Detailed report by label
print("\nClassification report:\n", classification_report(y_test, y_pred, target_names=['Tech','Religion','Sports','Politics','For Sale']))


Macro F1-score: 0.2077226485340702
Weighted F1-score: 0.33510272681321396

Classification report:
               precision    recall  f1-score   support

        Tech       0.54      0.47      0.50      1786
    Religion       0.57      0.12      0.20       489
      Sports       0.44      0.07      0.11       409
    Politics       0.63      0.10      0.18       529
    For Sale       0.71      0.03      0.05       193

   micro avg       0.54      0.29      0.38      3406
   macro avg       0.58      0.16      0.21      3406
weighted avg       0.56      0.29      0.34      3406
 samples avg       0.26      0.26      0.26      3406



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

clf = OneVsRestClassifier(LogisticRegression())
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=['Tech','Religion','Sports','Politics','For Sale']))


              precision    recall  f1-score   support

        Tech       0.90      0.87      0.89      1786
    Religion       0.94      0.56      0.70       489
      Sports       0.99      0.58      0.73       409
    Politics       0.92      0.48      0.63       529
    For Sale       0.92      0.44      0.59       193

   micro avg       0.92      0.71      0.80      3406
   macro avg       0.93      0.59      0.71      3406
weighted avg       0.92      0.71      0.79      3406
 samples avg       0.64      0.64      0.64      3406



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [24]:
from sklearn.multioutput import ClassifierChain
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

# Use LogisticRegression as the base estimator for the ClassifierChain
chain_clf = ClassifierChain(LogisticRegression(), cv=3, random_state=42)

# Train the ClassifierChain model
chain_clf.fit(X_train, y_train)

# Predict on the test set
y_pred_chain = chain_clf.predict(X_test)

# Print the classification report
print("Classification report for ClassifierChain:\n", classification_report(y_test, y_pred_chain, target_names=['Tech','Religion','Sports','Politics','For Sale']))

Classification report for ClassifierChain:
               precision    recall  f1-score   support

        Tech       0.91      0.88      0.89      1786
    Religion       0.95      0.59      0.73       489
      Sports       0.99      0.63      0.77       409
    Politics       0.91      0.60      0.72       529
    For Sale       0.93      0.39      0.55       193

   micro avg       0.92      0.73      0.82      3406
   macro avg       0.94      0.62      0.73      3406
weighted avg       0.93      0.73      0.81      3406
 samples avg       0.66      0.66      0.66      3406



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
