### 20 Newsgroups Experiment:

- Vector + BOW features vs. Pure BOW features
- Minimal pre-processing

# Load Dataset

In [1]:
import sklearn.datasets

# Every newsgroup name used in this experiment, one per line.
all_categories = """
    alt.atheism
    comp.graphics
    comp.os.ms-windows.misc
    comp.sys.ibm.pc.hardware
    comp.sys.mac.hardware
    comp.windows.x
    misc.forsale
    rec.autos
    rec.motorcycles
    rec.sport.baseball
    rec.sport.hockey
    sci.crypt
    sci.electronics
    sci.med
    sci.space
    soc.religion.christian
    talk.politics.guns
    talk.politics.mideast
    talk.politics.misc
    talk.religion.misc
""".split()

# The full label set is used by default. For a quicker run, swap in a subset:
# categories = ["alt.atheism", "comp.graphics", "rec.autos", "sci.space", "talk.religion.misc"]
categories = all_categories

# Disabled alternative: load a local copy of the corpus from "20news_noDup".
# newsgroups_full = sklearn.datasets.load_files("20news_noDup", encoding="ANSI")

# Both splits are fetched with identical settings; only `subset` differs.
fetch_settings = dict(remove=("headers",), categories=categories, shuffle=False)
newsgroups_train = sklearn.datasets.fetch_20newsgroups(subset="train", **fetch_settings)
newsgroups_test = sklearn.datasets.fetch_20newsgroups(subset="test", **fetch_settings)

In [2]:
from tqdm.auto import tqdm

In [3]:
import numpy as np

In [4]:
import pandas as pd

def _bunch_to_frame(bunch):
    """Build a two-column frame ("text", "target") from a fetched newsgroups bunch.

    The columns are constructed directly with their final dtypes instead of
    stacking the two sequences into a 2 x N object-dtype frame and transposing it.

    Args:
        bunch: object exposing `.data` (sequence of document strings) and
            `.target` (array of integer class ids), as returned by
            `fetch_20newsgroups`.

    Returns:
        DataFrame with a "string"-dtype `text` column and an integer `target`
        column shifted up by one.
    """
    return pd.DataFrame(
        {
            "text": pd.Series(bunch.data, dtype="string"),
            # VW multiclass: Target cannot be 0, so shift the 0-based ids to 1-based.
            "target": [label + 1 for label in bunch.target],
        }
    )


train_df = _bunch_to_frame(newsgroups_train)
test_df = _bunch_to_frame(newsgroups_test)

In [22]:
# Show the first 100 training rows.
# NOTE(review): the saved output of this cell was captured at execution count 22,
# i.e. after the `vectorised` column had been added further down the notebook.
train_df.head(100)

Unnamed: 0,text,target,vectorised
0,gajarsky@pilot.njin.net writes: morgan and g...,10,"[(▁gaj, [0.321606, 0.108409, -0.240523, -0.208..."
1,"Well, I just got my Centris 610 yesterday. ...",5,"[(▁well, [0.024367, -0.0046, 0.135297, 0.00219..."
2,Archive-name: cryptography-faq/part10 Last-mod...,12,"[(▁archive, [0.312358, -0.565265, 0.338044, -0..."
3,> ATTENTION: Mac Quadra owners: Many storage i...,5,"[(▁, [-0.643406, -0.245415, -0.161094, -0.0333..."
4,bobbe@vice.ICO.TEK.COM (Robert Beauchaine) wri...,1,"[(▁bob, [-0.039117, -0.57734, 0.397217, -0.034..."
...,...,...,...
95,smith@pell.anu.edu.au (Michael Smith) writes: ...,5,"[(▁smith, [0.05963, -0.0852, 0.530824, -0.2427..."
96,In article <1qkqrhINNobc@matt.ksu.ksu.edu> ken...,3,"[(▁in, [-0.121403, -0.19953, -0.167232, -0.238..."
97,In article <C5L5x0.KJ7@vcd.hp.com> johne@vcd.h...,13,"[(▁in, [-0.121403, -0.19953, -0.167232, -0.238..."
98,>Les Bartel's comments: >>>>Sorry I can't help...,8,"[(▁, [-0.643406, -0.245415, -0.161094, -0.0333..."


# Add Token Vectors

In [6]:
from bpemb import BPEmb

# English BPEmb has no 1,000,000-piece vocabulary: requesting vs=1000000 only
# triggered the library's fallback ("en from vocab size 1000000 to 200000").
# Ask for the vocabulary size that is actually loaded so the cell says what it does.
multibpemb = BPEmb(lang="en", vs=200000, dim=300)

BPEmb fallback: en from vocab size 1000000 to 200000




In [7]:
import pickle

In [8]:
def _vectorise(text):
    """Return a list of (subword, vector) pairs for one document string."""
    # Collapse every run of whitespace into a single space before tokenising.
    normalised = " ".join(text.split())

    pieces = multibpemb.encode(normalised)
    # embed() receives the same string; its rows are paired positionally with
    # the subwords above (zip stops at the shorter of the two).
    embeddings = multibpemb.embed(normalised)
    return list(zip(pieces, embeddings))


# Attach the per-document (subword, vector) lists to both frames.
for frame in (train_df, test_df):
    frame["vectorised"] = [_vectorise(text) for text in tqdm(frame["text"])]

  0%|          | 0/11314 [00:00<?, ?it/s]

  0%|          | 0/7532 [00:00<?, ?it/s]

In [9]:
# import numpy as np

# # Vector normalisation sample:
# # The first token in training doc 7033, normalised
# vector = train_df["vectorised"][7033][0][1]
# norm = np.sqrt(vector.dot(vector))

# normalised = (vector / np.linalg.norm(vector) + 1) / 2
# norm_2 = (vector / norm + 1) / 2
# print(min(vector), max(vector))
# print(min(normalised), max(normalised))
# print(min(norm_2), max(norm_2))

# Train/Test Split

In [10]:
# import sklearn.model_selection

# df_train, df_test = sklearn.model_selection.train_test_split(
#     pd_df, test_size=0.25, random_state=1
# )

# No manual split is needed: fetch_20newsgroups() already provides separate "train" and "test" subsets.

# VW

In [11]:
from vowpalwabbit import pyvw

In [12]:
# Options shared by every Vowpal Wabbit model in this notebook (the baseline setup).
# Option groups with nothing set (input, example manipulation, holdout,
# feature namespaces, other) are intentionally left at VW's defaults.
vw_opts = dict(
    random_seed=1,  # general options
    progress=100,  # output options
    loss_function="logistic",  # update rule options
    bit_precision=28,  # weight options
    oaa=len(categories),  # multiclass options: one class per entry in `categories`
)

In [13]:
# Two independent learners built from the same options, so that the vector+BOW
# and the pure-BOW variants can be trained and evaluated side by side.
vector_model, bow_model = (pyvw.vw(**vw_opts) for _ in range(2))

In [14]:
def doc2namespaces(doc, dimensions=300):
    """
    Takes a list of (<text>, <vector>), returns the vector and bow feature strings in VW format

    Pre-processing applied to each (<text>, <vector>) pair, in order:
      (1) The VW special characters ":" and "|" are removed from the text and
          surrounding whitespace is stripped.
      (2) The pair is dropped if the cleaned text is empty, was already seen in
          this document, or contains no alphabetic character.
      (3) The vector is scaled to unit length and shifted into the range [0, 1].
          A vector whose norm is not strictly positive (all zeros, or containing
          NaN) has no usable direction and is mapped to the midpoint 0.5 in
          every component instead of producing NaN feature values.

    Args:
        doc: iterable of (text, vector) pairs, where each vector is a 1-D numpy
            array with at least `dimensions` components.
        dimensions: number of leading vector components to emit; one VW
            namespace ("|vector_d<i>") is written per component. Defaults to
            300, the value previously hard-coded here.

    Returns:
        (vector_ns, bow_ns): two lists of strings meant to be joined with
        spaces. `vector_ns` holds, for each dimension, the namespace marker
        followed by one "<text>:<value>" feature per kept token. `bow_ns` holds
        "|bow" followed by the kept token texts.
    """
    seen_tokens = set()

    # This is the final list of (<text>, <vector>) we will use
    clean_vectors = []
    for text, vector in doc:
        # ":" separates a feature from its value and "|" starts a namespace in
        # VW's input format, so neither may appear inside a feature name.
        text = text.replace(":", "").replace("|", "").strip()

        if not text or text in seen_tokens or not any(char.isalpha() for char in text):
            continue
        seen_tokens.add(text)

        norm = np.sqrt(vector.dot(vector))
        # Guard the division: a zero norm would otherwise yield NaN components.
        unit = vector / norm if norm > 0 else np.zeros_like(vector)

        # Unit components lie in [-1, 1]; shift and halve to land in [0, 1].
        clean_vectors.append((text, ((unit + 1) / 2).astype("float64")))

    # Vector namespaces: one per dimension, each listing every kept token
    # weighted by that token's value in the dimension.
    vector_ns = []
    for dimension in range(dimensions):
        vector_ns.append(f"|vector_d{dimension}")
        vector_ns += [f"{text}:{vector[dimension]}" for text, vector in clean_vectors]

    # BOW namespace: the kept tokens as plain (unweighted) features.
    bow_ns = ["|bow"] + [text for text, _ in clean_vectors]

    return vector_ns, bow_ns

In [15]:
# Training: one pass over the training rows, updating both models on each row.
for example in tqdm(train_df.itertuples(), total=len(train_df)):
    vector_ns, bow_ns = doc2namespaces(example.vectorised)
    label = f"{example.target}"

    # Vector model: label, then the per-dimension namespaces, then the BOW namespace.
    vector_model.learn(" ".join([label, *vector_ns, *bow_ns]))

    # BOW model: the same label with the BOW namespace only.
    bow_model.learn(" ".join([label, *bow_ns]))

  0%|          | 0/11314 [00:00<?, ?it/s]

In [16]:
# Testing: collect one prediction per test row from each model.
test_labels = test_df["target"]
vector_predict, bow_predict = [], []

for example in tqdm(test_df.itertuples(), total=len(test_df)):
    vector_ns, bow_ns = doc2namespaces(example.vectorised)

    # Vector model: unlabelled example with the vector namespaces followed by BOW.
    vw_string = " ".join(vector_ns + bow_ns)
    vector_predict.append(vector_model.predict(vw_string))

    # BOW model: unlabelled example with the BOW namespace only.
    # `vw_string` deliberately keeps this value once the loop has finished.
    vw_string = " ".join(bow_ns)
    bow_predict.append(bow_model.predict(vw_string))

  0%|          | 0/7532 [00:00<?, ?it/s]

# Evaluation

In [17]:
import sklearn.metrics

In [18]:
# Score against only as many gold labels as there are vector-model predictions.
vector_truth = test_labels[: len(vector_predict)]

print("Classification report (Vector):")
print(sklearn.metrics.classification_report(vector_truth, vector_predict))
print("Accuracy score (Vector):")
print(sklearn.metrics.accuracy_score(vector_truth, vector_predict))

Classification report (Vector):
              precision    recall  f1-score   support

           1       0.72      0.72      0.72       319
           2       0.64      0.74      0.68       389
           3       0.77      0.61      0.68       394
           4       0.75      0.62      0.68       392
           5       0.74      0.78      0.76       385
           6       0.83      0.80      0.81       395
           7       0.79      0.86      0.82       390
           8       0.87      0.81      0.84       396
           9       0.84      0.92      0.88       398
          10       0.90      0.90      0.90       397
          11       0.92      0.95      0.93       399
          12       0.86      0.86      0.86       396
          13       0.68      0.68      0.68       393
          14       0.84      0.77      0.80       396
          15       0.79      0.87      0.83       394
          16       0.80      0.88      0.84       398
          17       0.75      0.82      0.78      

In [19]:
# Score against only as many gold labels as there are BOW-model predictions.
bow_truth = test_labels[: len(bow_predict)]

print("Classification report (BOW):")
print(sklearn.metrics.classification_report(bow_truth, bow_predict))
print("Accuracy score (BOW):")
print(sklearn.metrics.accuracy_score(bow_truth, bow_predict))

Classification report (BOW):
              precision    recall  f1-score   support

           1       0.77      0.72      0.75       319
           2       0.65      0.76      0.70       389
           3       0.80      0.62      0.70       394
           4       0.70      0.68      0.69       392
           5       0.77      0.78      0.77       385
           6       0.86      0.81      0.83       395
           7       0.70      0.91      0.79       390
           8       0.85      0.82      0.84       396
           9       0.86      0.92      0.89       398
          10       0.88      0.89      0.88       397
          11       0.95      0.93      0.94       399
          12       0.87      0.87      0.87       396
          13       0.66      0.70      0.68       393
          14       0.85      0.77      0.81       396
          15       0.82      0.87      0.84       394
          16       0.78      0.89      0.83       398
          17       0.71      0.84      0.77       36

In [20]:
# Inspect the BOW model's weight count; the recorded output (268435456) equals
# 2**28, consistent with the "bit_precision": 28 option the model was built with.
bow_model.num_weights()

268435456

In [21]:
# Spot check: re-predict the example left in `vw_string` by the testing loop
# (the BOW string of the last test row). Depends on that cell having been run.
bow_model.predict(vw_string)

8

----------
## Vectors not normalised, BOW features included, duplicates and tokens with no alphabetic chars removed

```
Classification report (Vector):
              precision    recall  f1-score   support

           1       0.89      0.86      0.88       209
           2       0.78      0.76      0.77       253
           3       0.79      0.84      0.81       250
           4       0.81      0.74      0.77       238
           5       0.85      0.87      0.86       233
           6       0.89      0.87      0.88       237
           7       0.85      0.83      0.84       245
           8       0.88      0.88      0.88       241
           9       0.94      0.91      0.92       244
          10       0.96      0.93      0.94       247
          11       0.98      0.93      0.95       246
          12       0.96      0.89      0.92       240
          13       0.80      0.89      0.84       223
          14       0.87      0.90      0.88       248
          15       0.88      0.93      0.91       258
          16       0.88      0.92      0.90       250
          17       0.91      0.87      0.89       224
          18       0.97      0.88      0.92       255
          19       0.90      0.87      0.88       200
          20       0.68      0.86      0.76       166

    accuracy                           0.87      4707
   macro avg       0.87      0.87      0.87      4707
weighted avg       0.88      0.87      0.87      4707

Accuracy score (Vector):
0.8721053749734438
-----
Classification report (BOW):
              precision    recall  f1-score   support

           1       0.92      0.88      0.90       209
           2       0.79      0.82      0.80       253
           3       0.82      0.86      0.84       250
           4       0.84      0.77      0.80       238
           5       0.89      0.88      0.89       233
           6       0.91      0.90      0.90       237
           7       0.80      0.89      0.84       245
           8       0.93      0.91      0.92       241
           9       0.95      0.93      0.94       244
          10       0.97      0.96      0.96       247
          11       0.98      0.96      0.97       246
          12       0.95      0.93      0.94       240
          13       0.85      0.87      0.86       223
          14       0.93      0.94      0.94       248
          15       0.93      0.95      0.94       258
          16       0.92      0.95      0.94       250
          17       0.94      0.92      0.93       224
          18       0.98      0.94      0.96       255
          19       0.94      0.88      0.91       200
          20       0.81      0.83      0.82       166

    accuracy                           0.90      4707
   macro avg       0.90      0.90      0.90      4707
weighted avg       0.90      0.90      0.90      4707

Accuracy score (BOW):
0.9009985128531973
```

## Normalised vectors

```
Classification report (Vector):
              precision    recall  f1-score   support

           1       0.87      0.89      0.88       209
           2       0.79      0.84      0.81       253
           3       0.89      0.84      0.86       250
           4       0.82      0.78      0.80       238
           5       0.92      0.88      0.90       233
           6       0.89      0.90      0.90       237
           7       0.84      0.89      0.87       245
           8       0.92      0.90      0.91       241
           9       0.96      0.93      0.95       244
          10       0.98      0.97      0.98       247
          11       0.98      0.97      0.98       246
          12       0.97      0.93      0.95       240
          13       0.80      0.89      0.84       223
          14       0.91      0.93      0.92       248
          15       0.95      0.96      0.95       258
          16       0.90      0.96      0.93       250
          17       0.94      0.93      0.93       224
          18       0.98      0.94      0.96       255
          19       0.91      0.87      0.89       200
          20       0.85      0.84      0.85       166

    accuracy                           0.90      4707
   macro avg       0.90      0.90      0.90      4707
weighted avg       0.91      0.90      0.90      4707

Accuracy score (Vector):
0.9041852560016996
```