In [None]:
from time import time

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

categories = [
    "alt.atheism",
    "talk.religion.misc",
    "comp.graphics",
    "sci.space",
]


def size_mb(docs):
    return sum(len(s.encode("utf-8")) for s in docs) / 1e6


def load_dataset(verbose=False, remove=()):
    """Load and vectorize the 20 newsgroups dataset."""

    data_train = fetch_20newsgroups(
        subset="train",
        categories=categories,
        shuffle=True,
        random_state=42,
        remove=remove,
    )

    data_test = fetch_20newsgroups(
        subset="test",
        categories=categories,
        shuffle=True,
        random_state=42,
        remove=remove,
    )

    # order of labels in `target_names` can be different from `categories`
    target_names = data_train.target_names

    # split target in a training set and a test set
    y_train, y_test = data_train.target, data_test.target

    # Extracting features from the training data using a sparse vectorizer
    t0 = time()
    vectorizer = TfidfVectorizer(
        sublinear_tf=True, max_df=0.5, min_df=5, stop_words="english"
    )
    X_train = vectorizer.fit_transform(data_train.data)
    duration_train = time() - t0

    # Extracting features from the test data using the same vectorizer
    t0 = time()
    X_test = vectorizer.transform(data_test.data)
    duration_test = time() - t0

    feature_names = vectorizer.get_feature_names_out()

    if verbose:
        # compute size of loaded data
        data_train_size_mb = size_mb(data_train.data)
        data_test_size_mb = size_mb(data_test.data)

        print(
            f"{len(data_train.data)} documents - "
            f"{data_train_size_mb:.2f}MB (training set)"
        )
        print(f"{len(data_test.data)} documents - {data_test_size_mb:.2f}MB (test set)")
        print(f"{len(target_names)} categories")
        print(
            f"vectorize training done in {duration_train:.3f}s "
            f"at {data_train_size_mb / duration_train:.3f}MB/s"
        )
        print(f"n_samples: {X_train.shape[0]}, n_features: {X_train.shape[1]}")
        print(
            f"vectorize testing done in {duration_test:.3f}s "
            f"at {data_test_size_mb / duration_test:.3f}MB/s"
        )
        print(f"n_samples: {X_test.shape[0]}, n_features: {X_test.shape[1]}")

    return X_train, X_test, y_train, y_test, feature_names, target_names

X_train, X_test, y_train, y_test, feature_names, target_names = load_dataset(
    verbose=True
)

2034 documents - 3.98MB (training set)
1353 documents - 2.87MB (test set)
4 categories
vectorize training done in 0.414s at 9.608MB/s
n_samples: 2034, n_features: 7831
vectorize testing done in 0.370s at 7.750MB/s
n_samples: 1353, n_features: 7831


In [9]:
# print some data
print(X_train[0])
print(y_train[0])
print(feature_names[0])
print(target_names[0])
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 49 stored elements and shape (1, 7831)>
  Coords	Values
  (0, 2517)	0.18882559857168102
  (0, 453)	0.14810906626622972
  (0, 7330)	0.13783664104130097
  (0, 254)	0.34404631756778353
  (0, 2276)	0.07033982375105988
  (0, 7070)	0.2944436581666543
  (0, 6173)	0.17819540353245403
  (0, 166)	0.08658058224839943
  (0, 3457)	0.09233043938439052
  (0, 7504)	0.06709283290470455
  (0, 4906)	0.13276131063021313
  (0, 6245)	0.17819540353245403
  (0, 4650)	0.10645464205726671
  (0, 4406)	0.1270798399040038
  (0, 5370)	0.13150813563613234
  (0, 1353)	0.12609338927283967
  (0, 2927)	0.24787945867395397
  (0, 3223)	0.08031114792141571
  (0, 2133)	0.1369871763301335
  (0, 5453)	0.09571658744266878
  (0, 5091)	0.23464700210776418
  (0, 5454)	0.12336340331196224
  (0, 2395)	0.09339764418615404
  (0, 4048)	0.05087600635720216
  (0, 3720)	0.1320941585100957
  (0, 6777)	0.13858629940851536
  (0, 2777)	0.12336340331196224
  (0, 6209)	0.06755182000

In [3]:
from sklearn import metrics
from sklearn.utils.extmath import density


def benchmark(clf, custom_name=False):
    print("_" * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print(f"train time: {train_time:.3}s")

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print(f"test time:  {test_time:.3}s")

    score = metrics.accuracy_score(y_test, pred)
    print(f"accuracy:   {score:.3}")

    if hasattr(clf, "coef_"):
        print(f"dimensionality: {clf.coef_.shape[1]}")
        print(f"density: {density(clf.coef_)}")
        print()

    print()
    if custom_name:
        clf_descr = str(custom_name)
    else:
        clf_descr = clf.__class__.__name__
    return clf_descr, score, train_time, test_time

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

results = []
for clf, name in (
    (LogisticRegression(C=5, max_iter=1000), "Logistic Regression"),
    (KNeighborsClassifier(n_neighbors=100), "kNN"),
    (ComplementNB(alpha=0.1), "Complement naive Bayes"),

):
    print("=" * 80)
    print(name)
    results.append(benchmark(clf, name))

Logistic Regression
________________________________________________________________________________
Training: 
LogisticRegression(C=5, max_iter=1000)
train time: 1.08s
test time:  0.001s
accuracy:   0.892
dimensionality: 7831
density: 1.0


kNN
________________________________________________________________________________
Training: 
KNeighborsClassifier(n_neighbors=100)
train time: 0.000999s
test time:  0.111s
accuracy:   0.864

Complement naive Bayes
________________________________________________________________________________
Training: 
ComplementNB(alpha=0.1)
train time: 0.002s
test time:  0.00101s
accuracy:   0.898



In [12]:
from sklearn.datasets import fetch_20newsgroups

categories = [
    "alt.atheism",
    "comp.graphics",
    "comp.sys.ibm.pc.hardware",
    "misc.forsale",
    "rec.autos",
    "sci.space",
    "talk.religion.misc",
]

print("Loading 20 newsgroups training data")
raw_data, _ = fetch_20newsgroups(subset="train", categories=categories, return_X_y=True)
data_size_mb = sum(len(s.encode("utf-8")) for s in raw_data) / 1e6
print(f"{len(raw_data)} documents - {data_size_mb:.3f}MB")

Loading 20 newsgroups training data
3803 documents - 6.245MB


In [10]:
import re

def tokenize(doc):
    """Extract tokens from doc.

    This uses a simple regex that matches word characters to break strings
    into tokens. For a more principled approach, see CountVectorizer or
    TfidfVectorizer.
    """
    return (tok.lower() for tok in re.findall(r"\w+", doc))

In [11]:
from collections import defaultdict

def token_freqs(doc):
    """Extract a dict mapping tokens from doc to their occurrences."""

    freq = defaultdict(int)
    for tok in tokenize(doc):
        freq[tok] += 1
    return freq

In [15]:
from time import time
from sklearn.feature_extraction import DictVectorizer

dict_count_vectorizers = defaultdict(list)

t0 = time()
vectorizer = DictVectorizer()
vectorizer.fit_transform(token_freqs(d) for d in raw_data)
duration = time() - t0
dict_count_vectorizers["vectorizer"].append(
    vectorizer.__class__.__name__ + "\non freq dicts"
)
dict_count_vectorizers["speed"].append(data_size_mb / duration)
print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")
print(f"Found {len(vectorizer.get_feature_names_out())} unique terms")

# print some data
print(token_freqs(raw_data[0]))
print(vectorizer.transform(token_freqs(raw_data[0])))
print(vectorizer.transform(token_freqs(raw_data[0])).toarray().shape)


done in 1.173 s at 5.3 MB/s
Found 47928 unique terms
defaultdict(<class 'int'>, {'subject': 1, 're': 1, 'christian': 1, 'daemons': 1, 'biblical': 1, 'demons': 1, 'the': 17, 'u': 1, 'from': 3, 'stigaard': 4, 'mhd': 2, 'moorhead': 6, 'msus': 3, 'edu': 3, 'reply': 1, 'to': 4, 'organization': 1, 'state': 2, 'university': 2, 'mn': 1, 'nntp': 1, 'posting': 1, 'host': 1, '134': 1, '29': 1, '97': 1, '2': 1, 'lines': 1, '23': 1, '667': 4, 'neighbor': 4, 'of': 5, 'beast': 7, 'no': 2, 'is': 7, 'across': 3, 'street': 2, '664': 1, 'and': 3, '668': 2, 'are': 2, 'neighbors': 1, 'i': 2, 'think': 2, 'some': 1, 'people': 1, 'still': 2, 'not': 2, 'clear': 1, 'on': 1, 'this': 3, 'but': 1, 'rather': 1, 'it': 2, 'in': 1, 'fact': 1, 'which': 1, 'sheesh': 1, 'didn': 1, 't': 2, 'you': 2, 'know': 1, '666': 1, 's': 1, 'apartment': 1, 'hall': 1, 'his': 1, 'along': 1, 'with': 2, 'rest': 1, '6th': 1, 'floor': 1, 'justin': 1, 'trying': 1, 'figure': 1, 'out': 1, 'what': 1, 'has': 1, 'do': 1, 'alt': 1, 'discordia': 1,

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())
print()
print(X.toarray())
vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(1, 3))
X2 = vectorizer2.fit_transform(corpus)
print()
print(vectorizer2.get_feature_names_out())

['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']

[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]

['and' 'and this' 'and this is' 'document' 'document is' 'document is the'
 'first' 'first document' 'is' 'is the' 'is the first' 'is the second'
 'is the third' 'is this' 'is this the' 'one' 'second' 'second document'
 'the' 'the first' 'the first document' 'the second' 'the second document'
 'the third' 'the third one' 'third' 'third one' 'this' 'this document'
 'this document is' 'this is' 'this is the' 'this the' 'this the first']


In [35]:
from sklearn.feature_extraction import DictVectorizer
v = DictVectorizer(sparse=False)
D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
X = v.fit_transform(D)
print(v.get_feature_names_out())
print(X)

['bar' 'baz' 'foo']
[[2. 0. 1.]
 [0. 1. 3.]]


In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())
print(vectorizer.idf_)

['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [40]:
# One Hot Encoding with example text
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())




[array([0, 1]), array([0, 1, 2]), array([0, 1, 2, 3])]
[[1. 0. 1. 0. 0. 0. 0. 0. 1.]
 [0. 1. 0. 1. 0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 1. 0. 1. 0. 0.]
 [0. 1. 1. 0. 0. 0. 0. 1. 0.]]


In [51]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Define the vocabulary and document
vocabulary = ["document", "this", "here", "one", "is", "yet", "another", "third", "second", "and", "the", "first"]
document = ["this", "document", "is", "the", "second", "cool", "document"]

# Create a OneHotEncoder with a fixed vocabulary
encoder = OneHotEncoder(sparse_output=False)

# Transform the document into a 2D array for encoding
# Each word should be treated as a separate "feature" for the encoder
document_array = np.array(document).reshape(-1, 1)

# Apply One-Hot-Encoding
encoded_document = encoder.fit_transform(document_array)
print("One-Hot Encoded Matrix:")
print(encoded_document)

# Sum over rows to create a single vector representation for the document
document_vector = encoded_document.sum(axis=0)
print("\nDocument One-Hot Vector Representation:")
print(document_vector)

# Show feature names corresponding to the vector
print("\nFeature Names:")
print(encoder.get_feature_names_out())


One-Hot Encoded Matrix:
[[0. 0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]]

Document One-Hot Vector Representation:
[1. 2. 1. 1. 1. 1.]

Feature Names:
['x0_cool' 'x0_document' 'x0_is' 'x0_second' 'x0_the' 'x0_this']
