### 20 Newsgroups Experiment:

- Vector + BOW features vs. Pure BOW features
- Minimal pre-processing

# Load Dataset

In [1]:
import spacy

# Load the spaCy pipeline that supplies the token vectors used as features below
spacy_nlp = spacy.load("en_core_web_md")

In [2]:
import sklearn.datasets

# Decode as Latin-1 rather than "ANSI": "ANSI" is only registered as a codec
# alias on Windows builds of Python and raises LookupError on other platforms.
# Latin-1 is available everywhere and maps every byte value, so decoding can
# never fail on stray 8-bit characters.
# NOTE(review): on Windows, bytes 0x80-0x9F may decode to different characters
# than under the system code page -- confirm this does not matter for the corpus.
newsgroups = sklearn.datasets.load_files("20news_noDup", encoding="latin-1")

In [3]:
from tqdm.auto import tqdm

In [4]:
import numpy as np

In [5]:
import pandas as pd

# Build the frame column-wise so each column gets its own dtype straight away,
# instead of transposing a two-row object frame and fixing the dtypes afterwards.
pd_df = pd.DataFrame(
    {
        "text": pd.Series(newsgroups.data, dtype="string"),
        # VW multiclass: Target cannot be 0, so shift every label up by one
        "target": newsgroups.target + 1,
    }
)

In [6]:
# Show the first 100 rows as a sanity check of the loaded texts and labels
pd_df.head(100)

Unnamed: 0,text,target
0,"From: haston@utkvx.utk.edu (Haston, Donald Way...",2
1,From: jr0930@eve.albany.edu (REGAN JAMES P) S...,2
2,From: cfaehl@vesta.unm.edu (Chris Faehl) Subj...,1
3,From: sera@zuma.UUCP (Serdar Argic) Subject: ...,18
4,From: boyle@cactus.org (Craig Boyle) Subject:...,8
...,...,...
95,"From: kcochran@nyx.cs.du.edu (Keith ""Justified...",20
96,From: jyaruss@hamp.hampshire.edu Subject: Mis...,9
97,From: mse@cc.bellcore.com (25836-michael evenc...,10
98,From: rogerh@Autodesk.COM (Roger Hupfauer) Su...,8


# Add Token Vectors

In [7]:
import pickle

In [8]:
# Uncomment and run this cell once to build vectorised_docs.pkl (slow);
# later runs only need the load cell below.

# vectorised_docs = []
# with spacy_nlp.select_pipes(enable=["tok2vec"]):
#     for doc in tqdm(spacy_nlp.pipe(pd_df["text"]), total=len(pd_df)):
#         vectorised_docs.append(doc)

# with open("vectorised_docs.pkl", "wb") as fp:
#     pickle.dump(vectorised_docs, fp)

In [9]:
# Use pre-saved vectorised spacy Docs when the cache file exists; otherwise
# build the cache so the notebook also runs end-to-end on a fresh checkout.
# NOTE: pickle.load can execute arbitrary code -- only load a cache file that
# was written locally by this notebook.
import os

VECTORISED_DOCS_PATH = "vectorised_docs.pkl"

if os.path.exists(VECTORISED_DOCS_PATH):
    with open(VECTORISED_DOCS_PATH, "rb") as fp:
        vectorised_docs = pickle.load(fp)
else:
    # Only the tok2vec component is enabled while vectorising the texts
    with spacy_nlp.select_pipes(enable=["tok2vec"]):
        vectorised_docs = list(
            tqdm(spacy_nlp.pipe(pd_df["text"]), total=len(pd_df))
        )
    with open(VECTORISED_DOCS_PATH, "wb") as fp:
        pickle.dump(vectorised_docs, fp)

In [10]:
# Add to DataFrame: one vectorised Doc per row.
# NOTE(review): assumes the cached Docs are in the same row order as pd_df -- confirm
# whenever the cache file is regenerated or the dataset changes.
pd_df["vectorised"] = vectorised_docs

In [128]:
# Vector normalisation sample:
# "From", the first token in doc 7033, normalised
# Read the row from pd_df: df_train is only created in the Train/Test Split
# section further down, so referencing it here fails on Restart & Run All.
# Row label 7033 identifies the same document in either frame.
sample_token = pd_df["vectorised"][7033][0]
normalised = ((sample_token.vector / sample_token.vector_norm) + 1) / 2
print(min(normalised), max(normalised))

0.41479465 0.7338455


# Train/Test Split

In [12]:
import sklearn.model_selection

# Hold out 25% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
df_train, df_test = sklearn.model_selection.train_test_split(
    pd_df, test_size=0.25, random_state=1
)

# VW

In [129]:
from vowpalwabbit import pyvw

In [130]:
# Baseline VW options, shared by both models so that the feature set is the
# only difference between them.
vw_opts = dict(
    random_seed=1,  # General: fixed seed for repeatable runs
    progress=50,  # Output: progress update frequency
    loss_function="logistic",  # Update rule: logistic loss
    bit_precision=28,  # Weights: number of bits in the feature table
    oaa=20,  # Multiclass: one-against-all over the 20 newsgroup labels
)

In [131]:
# Two independent learners built from identical options; both are trained in
# the same pass over the data further down.
vector_model, bow_model = pyvw.vw(**vw_opts), pyvw.vw(**vw_opts)

In [132]:
def doc2namespaces(doc, n_dimensions=None):
    """
    Takes a spacy Doc, returns the vector and bow feature strings in VW format

    Args:
        doc: Iterable of tokens exposing ``text``, ``vector`` and
            ``vector_norm`` (e.g. a spacy Doc).
        n_dimensions: Number of vector dimensions to emit, one VW namespace
            per dimension. When omitted it is taken from the first kept
            token's vector, falling back to 300 for a document that has no
            usable tokens.

    Returns:
        Tuple ``(vector_ns, bow_ns)`` of string lists. ``vector_ns`` holds,
        for each dimension ``d``, the marker ``|vector_d<d>`` followed by one
        ``<token>:<value>`` feature per kept token. ``bow_ns`` holds ``|bow``
        followed by the kept token texts. Joining either list with spaces
        yields VW input text.
    """
    # Dimension count used when the document yields no usable token at all
    default_dimensions = 300

    # Pre-processing -- Remove:
    # (1) Empty tokens and vectors
    # (2) Non-alphabetic tokens
    # (3) VW special characters
    # (4) Duplicate tokens
    seen_tokens = set()

    # This is a list of (<text>, <vector>)
    vector_tuples = []
    for token in doc:
        # ":" and "|" are VW separators, so they must not appear in a feature name
        text = token.text.replace(":", "").replace("|", "").strip()

        # Only keep non-empty alphabetic strings seen for the first time
        if not text or not text.isalpha() or text in seen_tokens:
            continue
        # An all-zero vector carries no information (and has a zero norm)
        if not np.any(token.vector):
            continue
        seen_tokens.add(text)

        # Unit-normalise, then shift/scale from [-1,1] into the range [0,1]
        vector = ((token.vector / token.vector_norm) + 1) / 2
        vector_tuples.append((text, vector.astype("float64")))

    if n_dimensions is None:
        # Follow the actual vector width instead of assuming a fixed size
        if vector_tuples:
            n_dimensions = len(vector_tuples[0][1])
        else:
            n_dimensions = default_dimensions

    # Vector namespaces: one namespace per dimension, one feature per token
    vector_ns = []
    for dimension in range(n_dimensions):
        vector_ns.append(f"|vector_d{dimension}")
        vector_ns.extend(
            f"{text}:{vector[dimension]}" for text, vector in vector_tuples
        )

    # BOW namespace
    bow_ns = ["|bow"] + [text for text, _ in vector_tuples]

    return vector_ns, bow_ns

In [133]:
# Training: one pass over the training rows, feeding the same example to both
# models. The BOW model simply leaves out the vector namespaces.
for row in tqdm(df_train.itertuples(), total=len(df_train)):
    vector_ns, bow_ns = doc2namespaces(row.vectorised)
    label = f"{row.target}"

    # Vector model: label, per-dimension vector namespaces, then BOW namespace
    vector_model.learn(" ".join([label, *vector_ns, *bow_ns]))

    # BOW model: label and BOW namespace only
    bow_model.learn(" ".join([label, *bow_ns]))

  0%|          | 0/14121 [00:00<?, ?it/s]

In [135]:
# Testing: collect each model's prediction for every held-out row
test_labels = df_test["target"]
vector_predict = []
bow_predict = []

for row in tqdm(df_test.itertuples(), total=len(df_test)):
    vector_ns, bow_ns = doc2namespaces(row.vectorised)

    # Vector model sees vector + BOW namespaces (no label when predicting)
    vector_predict.append(vector_model.predict(" ".join(vector_ns + bow_ns)))

    # BOW model sees the BOW namespace alone
    bow_predict.append(bow_model.predict(" ".join(bow_ns)))

  0%|          | 0/4707 [00:00<?, ?it/s]

# Evaluation

In [136]:
import sklearn.metrics

In [137]:
# True labels, trimmed to the number of predictions that were collected
vector_truth = test_labels[: len(vector_predict)]

print("Classification report (Vector):")
print(sklearn.metrics.classification_report(vector_truth, vector_predict))
print("Accuracy score (Vector):")
print(sklearn.metrics.accuracy_score(vector_truth, vector_predict))

Classification report (Vector):
              precision    recall  f1-score   support

           1       0.82      0.84      0.83       209
           2       0.83      0.85      0.84       253
           3       0.85      0.82      0.83       250
           4       0.83      0.76      0.79       238
           5       0.87      0.87      0.87       233
           6       0.93      0.89      0.91       237
           7       0.80      0.91      0.85       245
           8       0.91      0.94      0.93       241
           9       0.94      0.92      0.93       244
          10       0.98      0.97      0.97       247
          11       0.98      0.97      0.98       246
          12       0.98      0.90      0.94       240
          13       0.84      0.88      0.86       223
          14       0.91      0.93      0.92       248
          15       0.95      0.95      0.95       258
          16       0.88      0.95      0.91       250
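          17       0.93      0.94      0.94       224
          18       0.97      0.94      0.95       255
          19       0.91      0.87      0.89       200
          20       0.81      0.79      0.80       166

    accuracy                           0.90      4707
   macro avg       0.90      0.89      0.90      4707
weighted avg       0.90      0.90      0.90      4707

Accuracy score (Vector):
0.8973868706182282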

In [138]:
# True labels, trimmed to the number of predictions that were collected
bow_truth = test_labels[: len(bow_predict)]

print("Classification report (BOW):")
print(sklearn.metrics.classification_report(bow_truth, bow_predict))
print("Accuracy score (BOW):")
print(sklearn.metrics.accuracy_score(bow_truth, bow_predict))

Classification report (BOW):
              precision    recall  f1-score   support

           1       0.90      0.84      0.87       209
           2       0.77      0.79      0.78       253
           3       0.77      0.82      0.80       250
           4       0.79      0.74      0.76       238
           5       0.82      0.85      0.83       233
           6       0.90      0.86      0.88       237
           7       0.75      0.90      0.82       245
           8       0.95      0.92      0.93       241
           9       0.93      0.93      0.93       244
          10       0.95      0.96      0.95       247
          11       0.98      0.97      0.97       246
          12       0.95      0.90      0.93       240
          13       0.86      0.85      0.86       223
          14       0.94      0.94      0.94       248
          15       0.94      0.93      0.94       258
          16       0.88      0.94      0.91       250
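          17       0.94      0.93      0.93       224
          18       0.98      0.92      0.95       255
          19       0.92      0.85      0.89       200
          20       0.78      0.75      0.77       166

    accuracy                           0.88      4707
   macro avg       0.88      0.88      0.88      4707
weighted avg       0.89      0.88      0.88      4707

Accuracy score (BOW):
0.8831527512215849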

----------
## Vectors with normalisation into [0,1], BOW features included, duplicates and non-alphabetic tokens removed

### (5000 train samples, 500 test samples)

```
Classification report (Vector):
              precision    recall  f1-score   support

           1       0.85      0.81      0.83        21
           2       0.63      0.77      0.70        31
           3       0.90      0.84      0.87        32
           4       0.68      0.76      0.72        25
           5       0.95      0.86      0.90        21
           6       0.84      0.67      0.74        24
           7       0.96      0.92      0.94        24
           8       0.76      0.96      0.85        27
           9       0.83      0.79      0.81        19
          10       1.00      0.94      0.97        31
          11       0.91      0.97      0.94        32
          12       1.00      1.00      1.00        25
          13       0.89      0.89      0.89        27
          14       0.84      0.90      0.87        29
          15       0.96      0.86      0.91        28
          16       0.88      0.92      0.90        24
          17       0.83      0.96      0.89        25
          18       0.80      0.84      0.82        19
          19       0.95      0.73      0.83        26
          20       1.00      0.60      0.75        10

    accuracy                           0.86       500
   macro avg       0.87      0.85      0.86       500
weighted avg       0.87      0.86      0.86       500

Accuracy score (Vector):
0.86
-----
Classification report (BOW):
              precision    recall  f1-score   support

           1       0.78      0.86      0.82        21
           2       0.67      0.77      0.72        31
           3       0.81      0.81      0.81        32
           4       0.78      0.72      0.75        25
           5       0.86      0.86      0.86        21
           6       0.78      0.88      0.82        24
           7       0.81      0.88      0.84        24
           8       0.74      0.96      0.84        27
           9       0.82      0.74      0.78        19
          10       0.97      0.90      0.93        31
          11       0.91      0.91      0.91        32
          12       0.93      1.00      0.96        25
          13       0.88      0.78      0.82        27
          14       0.86      0.83      0.84        29
          15       0.96      0.79      0.86        28
          16       0.85      0.92      0.88        24
          17       0.86      1.00      0.93        25
          18       0.87      0.68      0.76        19
          19       0.86      0.73      0.79        26
          20       1.00      0.50      0.67        10

    accuracy                           0.84       500
   macro avg       0.85      0.83      0.83       500
weighted avg       0.85      0.84      0.84       500

Accuracy score (BOW):
0.838
```

### (Full train/test set: 14121 train samples, 4707 test samples)

```
Classification report (Vector):
              precision    recall  f1-score   support

           1       0.82      0.84      0.83       209
           2       0.83      0.85      0.84       253
           3       0.85      0.82      0.83       250
           4       0.83      0.76      0.79       238
           5       0.87      0.87      0.87       233
           6       0.93      0.89      0.91       237
           7       0.80      0.91      0.85       245
           8       0.91      0.94      0.93       241
           9       0.94      0.92      0.93       244
          10       0.98      0.97      0.97       247
          11       0.98      0.97      0.98       246
          12       0.98      0.90      0.94       240
          13       0.84      0.88      0.86       223
          14       0.91      0.93      0.92       248
          15       0.95      0.95      0.95       258
          16       0.88      0.95      0.91       250
          17       0.93      0.94      0.94       224
          18       0.97      0.94      0.95       255
          19       0.91      0.87      0.89       200
          20       0.81      0.79      0.80       166

    accuracy                           0.90      4707
   macro avg       0.90      0.89      0.90      4707
weighted avg       0.90      0.90      0.90      4707

Accuracy score (Vector):
0.8973868706182282
-----
Classification report (BOW):
              precision    recall  f1-score   support

           1       0.90      0.84      0.87       209
           2       0.77      0.79      0.78       253
           3       0.77      0.82      0.80       250
           4       0.79      0.74      0.76       238
           5       0.82      0.85      0.83       233
           6       0.90      0.86      0.88       237
           7       0.75      0.90      0.82       245
           8       0.95      0.92      0.93       241
           9       0.93      0.93      0.93       244
          10       0.95      0.96      0.95       247
          11       0.98      0.97      0.97       246
          12       0.95      0.90      0.93       240
          13       0.86      0.85      0.86       223
          14       0.94      0.94      0.94       248
          15       0.94      0.93      0.94       258
          16       0.88      0.94      0.91       250
          17       0.94      0.93      0.93       224
          18       0.98      0.92      0.95       255
          19       0.92      0.85      0.89       200
          20       0.78      0.75      0.77       166

    accuracy                           0.88      4707
   macro avg       0.88      0.88      0.88      4707
weighted avg       0.89      0.88      0.88      4707

Accuracy score (BOW):
0.8831527512215849
```