In [163]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Define the vocabulary and document
vocabulary = ["document", "this", "here", "one", "is", "yet", "another", "third", "second", "and", "the", "first", "cool"]
document = ["this", "document", "is", "the", "second", "cool", "document"]

encoder = OneHotEncoder(categories=[vocabulary], sparse_output=False)

# Transform the document into a 2D array for encoding
# Each word should be treated as a separate "feature" for the encoder
document_array = np.array(document).reshape(-1, 1)

# Apply One-Hot-Encoding
encoded_document = encoder.fit_transform(document_array)
print("One-Hot Encoded Matrix:")
print(encoded_document)

# Show feature names corresponding to the vector
print("\nFeature Names:")
print(encoder.get_feature_names_out())

document_vector = (encoded_document.sum(axis=0) > 0).astype(int)
print("\nDocument Vector:")
print(document_vector)


One-Hot Encoded Matrix:
[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

Feature Names:
['x0_document' 'x0_this' 'x0_here' 'x0_one' 'x0_is' 'x0_yet' 'x0_another'
 'x0_third' 'x0_second' 'x0_and' 'x0_the' 'x0_first' 'x0_cool']

Document Vector:
[1 1 0 0 1 0 0 0 1 0 1 0 1]


In [141]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())
print()
print(X.toarray())
vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(1, 3))
X2 = vectorizer2.fit_transform(corpus)
print()
print(vectorizer2.get_feature_names_out())

['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']

[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]

['and' 'and this' 'and this is' 'document' 'document is' 'document is the'
 'first' 'first document' 'is' 'is the' 'is the first' 'is the second'
 'is the third' 'is this' 'is this the' 'one' 'second' 'second document'
 'the' 'the first' 'the first document' 'the second' 'the second document'
 'the third' 'the third one' 'third' 'third one' 'this' 'this document'
 'this document is' 'this is' 'this is the' 'this the' 'this the first']


In [142]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())
print(vectorizer.idf_)

['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [143]:
from sklearn.datasets import fetch_20newsgroups

categories = [
    'rec.sport.hockey',
    'talk.politics.misc',
    'talk.politics.guns',
    "misc.forsale",
]

print("Loading 20 newsgroups training data")
raw_data, target = fetch_20newsgroups(subset="train", categories=categories, return_X_y=True)
data_size_mb = sum(len(s.encode("utf-8")) for s in raw_data) / 1e6
print(f"{len(raw_data)} documents - {data_size_mb:.3f}MB")

Loading 20 newsgroups training data
2196 documents - 4.433MB


In [144]:
print(raw_data[9])

#print category of first document
print(categories[target[9]])

From: arc@cco.caltech.edu (Aaron Ray Clements)
Subject: Re: the usual
Organization: California Institute of Technology, Pasadena
Lines: 30
NNTP-Posting-Host: sandman.caltech.edu

kendall@lds.loral.com (Colin Kendall 6842) writes:

>I just heard some anti-gun-control people giving the usual arguments:
>It's everyone's right to bear arms, and the way to solve the problem
>of people getting killed by guns is better law enforcement.

>It strikes me that this argument could be logically extended as follows:

>A nuclear weapon is an "arm", hence anyone has a right to have 
>nuclear weapons. And if someone uses his nuclear weapons to blow
>up New York, L.A., and Chicago, that's okay as long as we have a
>good police force capable of finding him and putting him in jail, 
>which will serve as a deterrent to others.

>Do any anti-gun-control people disagree with this, and if so,  why?

Yes, I am pro-gun, and yes, I do disagree with this statement.
Nuclear weapons in and of themselves are dangero

In [164]:
from time import time

t0 = time()
vectorizer = CountVectorizer(stop_words="english")
X=vectorizer.fit_transform(raw_data)
duration = time() - t0
print(f"done in {duration:.3f} s")
print(f"Found {len(vectorizer.get_feature_names_out())} unique terms")
#most frequent words
print(X.toarray().sum(axis=0).argsort()[::-1][:10])
print(np.array(vectorizer.get_feature_names_out())[X.toarray().sum(axis=0).argsort()[::-1][:10]])

done in 0.568 s
Found 36122 unique terms
[13502 31432 10268 20915 24579 35452  6628 25356 12931 33720]
['edu' 'subject' 'com' 'lines' 'organization' 'writes' 'article' 'people'
 'don' 'university']


In [146]:
import re

def tokenize(doc):
    """Extract tokens from doc using a regex."""
    # Tokenize and ensure a list of lowercase words
    return [tok.lower() for tok in re.findall(r"\w+", doc)]

def count_vectorize(corpus):
    # Step 1: Tokenize each document
    tokenized_corpus = [tokenize(doc) for doc in corpus]

    # Step 2: Build a vocabulary (sorted list of unique words)
    vocabulary = sorted(set(word for tokens in tokenized_corpus for word in tokens))

    # Step 3: Count occurrences of each word in each document
    count_matrix = []
    for tokens in tokenized_corpus:
        row = [tokens.count(word) for word in vocabulary]
        count_matrix.append(row)
    
    return vocabulary, count_matrix

# Example usage
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# Unigram vectorization
vocabulary, count_matrix = count_vectorize(corpus)

print("Vocabulary:", vocabulary)
print("Count Matrix:")
for row in count_matrix:
    print(row)

Vocabulary: ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
Count Matrix:
[0, 1, 1, 1, 0, 0, 1, 0, 1]
[0, 2, 0, 1, 0, 1, 1, 0, 1]
[1, 0, 0, 1, 1, 0, 1, 1, 1]
[0, 1, 1, 1, 0, 0, 1, 0, 1]


In [147]:
t0 = time()
vocabulary, count_matrix = count_vectorize(raw_data)
duration = time() - t0
print(f"done in {duration//60:.0f}m {duration%60:.3f}s")

print("Vocabulary:", vocabulary[:5])
print("Count Matrix:")
for i in range(5):
    print(count_matrix[i])

done in 5m 1.671s
Vocabulary: ['0', '00', '000', '000007', '000152']
Count Matrix:
[2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [172]:
def size_mb(docs):
    return sum(len(s.encode("utf-8")) for s in docs) / 1e6


def load_dataset(verbose=False, remove=()):
    """Load and vectorize the 20 newsgroups dataset."""

    data_train = fetch_20newsgroups(
        subset="train",
        categories=categories,
        shuffle=True,
        random_state=42,
        remove=remove,
    )

    data_test = fetch_20newsgroups(
        subset="test",
        categories=categories,
        shuffle=True,
        random_state=42,
        remove=remove,
    )

    # order of labels in `target_names` can be different from `categories`
    target_names = data_train.target_names

    # split target in a training set and a test set
    y_train, y_test = data_train.target, data_test.target

    # Extracting features from the training data
    vectorizer = CountVectorizer(stop_words="english")
    X_train = vectorizer.fit_transform(data_train.data)

    # Extracting features from the test data using the same vectorizer
    X_test = vectorizer.transform(data_test.data)

    feature_names = vectorizer.get_feature_names_out()

    return X_train, X_test, y_train, y_test, feature_names, target_names

X_train, X_test, y_train, y_test, feature_names, target_names = load_dataset(
    verbose=True
)

In [173]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

In [174]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics

def plot_feature_effects():
    # learned coefficients weighted by frequency of appearance
    average_feature_effects = clf.coef_ * np.asarray(X_train.mean(axis=0)).ravel()

    for i, label in enumerate(target_names):
        top5 = np.argsort(average_feature_effects[i])[-5:][::-1]
        if i == 0:
            top = pd.DataFrame(feature_names[top5], columns=[label])
            top_indices = top5
        else:
            top[label] = feature_names[top5]
            top_indices = np.concatenate((top_indices, top5), axis=None)
    top_indices = np.unique(top_indices)
    predictive_words = feature_names[top_indices]

    ## plot feature effects
    #bar_size = 0.25
    #padding = 0.75
    #y_locs = np.arange(len(top_indices)) * (4 * bar_size + padding)
#
    #fig, ax = plt.subplots(figsize=(10, 8))
    #for i, label in enumerate(target_names):
    #    ax.barh(
    #        y_locs + (i - 2) * bar_size,
    #        average_feature_effects[i, top_indices],
    #        height=bar_size,
    #        label=label,
    #    )
    #ax.set(
    #    yticks=y_locs,
    #    yticklabels=predictive_words,
    #    ylim=[
    #        0 - 4 * bar_size,
    #        len(top_indices) * (4 * bar_size + padding) - 4 * bar_size,
    #    ],
    #)
    #ax.legend(loc="lower right")

    score = metrics.accuracy_score(y_test, pred)
    print(f"accuracy:   {score:.3}")
    
    print()
    print("top 5 keywords per class:")
    print(top)

    #return ax


_ = plot_feature_effects()#.set_title("Average feature effect on the original data")

accuracy:   0.894

top 5 keywords per class:
   misc.forsale rec.sport.hockey talk.politics.guns talk.politics.misc
0          sale           hockey                gun             writes
1           edu             team                com            article
2         lines             game               guns                com
3  organization               ca                edu             people
4           new     organization            subject            clinton


In [151]:
from sklearn import metrics
from sklearn.utils.extmath import density

def benchmark(clf, custom_name=False):
    print("_" * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print(f"train time: {train_time:.3}s")

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print(f"test time:  {test_time:.3}s")

    score = metrics.accuracy_score(y_test, pred)
    print(f"accuracy:   {score:.3}")

    print()
    if custom_name:
        clf_descr = str(custom_name)
    else:
        clf_descr = clf.__class__.__name__
    return clf_descr, score, train_time, test_time

In [175]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import ComplementNB
from sklearn.neighbors import KNeighborsClassifier

results = []
for clf, name in (
    (LogisticRegression(C=5, max_iter=1000), "Logistic Regression"),
    (KNeighborsClassifier(n_neighbors=100), "kNN"),
    (ComplementNB(alpha=0.1), "Complement naive Bayes"),

):
    print("=" * 80)
    print(name)
    results.append(benchmark(clf, name))

Logistic Regression
________________________________________________________________________________
Training: 
LogisticRegression(C=5, max_iter=1000)
train time: 0.765s
test time:  0.002s
accuracy:   0.893

kNN
________________________________________________________________________________
Training: 
KNeighborsClassifier(n_neighbors=100)
train time: 0.000999s
test time:  0.277s
accuracy:   0.394

Complement naive Bayes
________________________________________________________________________________
Training: 
ComplementNB(alpha=0.1)
train time: 0.006s
test time:  0.001s
accuracy:   0.915

