In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

from datasets import load_dataset
from sklearn.metrics import classification_report

from npc_gzip.compressors.base import BaseCompressor
from npc_gzip.compressors.gzip_compressor import GZipCompressor
from npc_gzip.knn_classifier import KnnClassifier



# An interesting result
![Red is where (bigger, more complex) models lose to a silly approach](../static/gzip-performance.png "Results Table")


[A recent paper](https://aclanthology.org/2023.findings-acl.426.pdf) suggests that many simple text classification tasks don't need big, complicated machine learning models: a gzip compressor combined with a k-nearest-neighbour classifier can be enough. We're going to test that claim in this notebook.

## Download Dataset

In [2]:
dataset = load_dataset("ag_news")

# Pull the text and label columns of each split into NumPy arrays so they can
# be fancy-indexed later on.
X_train, y_train = (np.array(dataset["train"][column]) for column in ("text", "label"))  # type: ignore
X_test, y_test = (np.array(dataset["test"][column]) for column in ("text", "label"))  # type: ignore

# Integer class id -> human-readable class name, taken from the dataset metadata.
label_map = dict(enumerate(dataset["train"].features["label"].names))  # type: ignore
print(f"All labels: {label_map}", end="\n\n")

All labels: {0: 'World', 1: 'Sports', 2: 'Business', 3: 'Sci/Tech'}



In [3]:
# Peek at the first training example together with its human-readable class name.
first_text, first_label = X_train[0], y_train[0]
print("Training sample: ", first_text)
print(f"Label: {first_label} ({label_map[first_label]})")

Training sample:  Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.
Label: 2 (Business)


## "Train" model

This model is interesting in that it doesn't need "training": it works by comparing each new input against the training samples at inference time.

In [4]:
# The compressor is kept in its own variable so it can be inspected or reused.
compressor = GZipCompressor()

# There is no fitting step here: the classifier is simply handed the training
# texts and labels (as plain Python lists) at construction time.
model = KnnClassifier(
    distance_metric="ncd",  # presumably "normalized compression distance" -- confirm in npc_gzip
    compressor=compressor,
    training_labels=y_train.tolist(),
    training_inputs=X_train.tolist(),
)

## Test outputs

In [5]:
# Evaluate on a random subset of the test split rather than all of it, because
# inference is slow for this model.
EVAL_SEED = 42
N_EVAL_SAMPLES = 250

# A seeded Generator makes the sampled subset -- and therefore the report
# printed below -- identical on every Restart & Run All.
rng = np.random.default_rng(EVAL_SEED)

# Sampling without replacement raises if more items are requested than exist,
# so cap the request at the size of the test split.
n_eval = min(N_EVAL_SAMPLES, X_test.shape[0])
random_indicies = rng.choice(X_test.shape[0], n_eval, replace=False)

# NOTE(review): the positional `1` is presumably the number of neighbours (k)
# and `sampling_percentage` the fraction of training data compared against --
# confirm against KnnClassifier.predict.
(distances, labels, similar_samples) = model.predict(
    X_test[random_indicies].tolist(), 1, sampling_percentage=0.05
)

# Flatten the predicted labels so they line up one-to-one with the true labels.
report = classification_report(y_test[random_indicies].tolist(), labels.reshape(-1))

print(report)

Compressing input...: 100%|██████████| 250/250 [01:21<00:00,  3.07it/s]


              precision    recall  f1-score   support

           0       0.78      0.77      0.77        60
           1       0.92      0.87      0.89        76
           2       0.58      0.73      0.65        52
           3       0.72      0.63      0.67        62

    accuracy                           0.76       250
   macro avg       0.75      0.75      0.75       250
weighted avg       0.77      0.76      0.76       250



In [6]:
# A single hand-picked headline to classify on its own.
test_sample = "Socialites unite dolphin groups Dolphin groups, or 'pods', rely on socialites to keep them from collapsing, scientists claim."

# The sample is wrapped in a one-element list, mirroring the batched call used
# for the evaluation subset. The first returned value is not needed here.
_, test_label_list, similar_samples = model.predict(
    [test_sample],
    1,
    sampling_percentage=0.2,
)

# Only one input was submitted, so its prediction is the first entry.
test_label = test_label_list[0]

Compressing input...: 100%|██████████| 1/1 [00:01<00:00,  1.21s/it]


In [7]:
# Translate the predicted class id into its name before printing.
predicted_name = label_map[test_label]
print(f"Label: {test_label} ({predicted_name})")

Label: 0 (World)


In [8]:
# Show the third value returned by `model.predict` for the single test sample.
similar_samples

array([["Pigeons 'sense magnetic field' Homing pigeons use the Earth's magnetic field to navigate their way home over long distances, scientists claim."]],
      dtype='<U1012')

# Findings

This model is super cool! It's slow at inference time, but our use case is one-at-a-time inference. Let's try to put it into production!