# data

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
df_train = pd.read_csv("/content/drive/MyDrive/r8/r8-train-stemmed.csv")
df_test = pd.read_csv("/content/drive/MyDrive/r8/r8-test-stemmed.csv")

In [3]:
uniq = list(set(df_train["intent"].values))
labels = {j:i for i,j in zip(range(len(uniq)), uniq)}
labels

{'interest': 0,
 'earn': 1,
 'acq': 2,
 'ship': 3,
 'trade': 4,
 'money-fx': 5,
 'crude': 6,
 'grain': 7}

In [4]:
df_train["label"] = df_train["intent"].apply(lambda x: labels[x])
df_test["label"] = df_test["intent"].apply(lambda x: labels[x])

# raw text : vanilla KNN

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Vectorize the text using TF-IDF
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(df_train['text'])
X_test = vectorizer.transform(df_test['text'])

# Encode the labels
le = LabelEncoder()
y_train = le.fit_transform(df_train['label'])
y_test = le.transform(df_test['label'])

# Train a KNN classifier
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train, y_train)

# Predict the classes for the test set
predicted_classes = knn.predict(X_test)

# Calculate accuracy
print("Accuracy:", accuracy_score(y_test, predicted_classes))


Accuracy: 0.7912288716308817


# raw text : knn_ncd(gzip) - ***Less is More***


In [None]:
from tqdm import tqdm
from collections import Counter
import gzip

k = 2

predicted_classes = []

for row_test in tqdm(df_test.iterrows(), total=df_test.shape[0]):
    test_text = row_test[1]["text"]
    test_label = row_test[1]["label"]
    c_test_text = len(gzip.compress(test_text.encode()))
    distance_from_test_instance = []

    for row_train in df_train.iterrows():
        train_text = row_train[1]["text"]
        train_label = row_train[1]["label"]
        c_train_text = len(gzip.compress(train_text.encode()))

        train_plus_test = " ".join([test_text, train_text])
        c_train_plus_test = len(gzip.compress(train_plus_test.encode()))

        ncd = ( (c_train_plus_test - min(c_train_text, c_test_text))
                / max(c_test_text, c_train_text) )
        distance_from_test_instance.append(ncd)

    sorted_idx = np.argsort(np.array(distance_from_test_instance))
    top_k_class = np.array(df_train["label"])[sorted_idx[:k]]
    predicted_class = Counter(top_k_class).most_common()[0][0]

    predicted_classes.append(predicted_class)

print("Accuracy:", np.mean(np.array(predicted_classes) == df_test["label"].values))

100%|██████████| 2189/2189 [24:20<00:00,  1.50it/s]

Accuracy: 0.912745545911375






* KNN with NCD is really good on this r8 dataset.


* We'll use this as the replica of the University of Waterloo paper.


*  I'll have to investigate later what the core differences are between the r8 dataset and the webkb dataset that allowed for such different accuracies.
  * In my webkb dataset, I preprocess the strings to remove the html/div tags. I'm going to try to see if that was a bad assumption by testing a model without any pre-processing.
  





# raw text : knn_ncd(zstandard)

In [None]:
!pip install zstandard


Collecting zstandard
  Downloading zstandard-0.21.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.7 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/2.7 MB[0m [31m1.8 MB/s[0m eta [36m0:00:02[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.7/2.7 MB[0m [31m41.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: zstandard
Successfully installed zstandard-0.21.0


In [None]:
from tqdm import tqdm
import numpy as np
from collections import Counter

cctx = zstd.ZstdCompressor()

k = 2
predicted_classes = []

for row_test in tqdm(df_test.iterrows(), total=df_test.shape[0]):
    test_text = row_test[1]["text"]
    test_label = row_test[1]["label"]
    c_test_text = len(cctx.compress(test_text.encode()))
    distance_from_test_instance = []

    for row_train in df_train.iterrows():
        train_text = row_train[1]["text"]
        train_label = row_train[1]["label"]
        c_train_text = len(cctx.compress(train_text.encode()))

        train_plus_test = " ".join([test_text, train_text])
        c_train_plus_test = len(cctx.compress(train_plus_test.encode()))

        ncd = ((c_train_plus_test - min(c_train_text, c_test_text))
               / max(c_test_text, c_train_text))
        distance_from_test_instance.append(ncd)

    sorted_idx = np.argsort(np.array(distance_from_test_instance))
    top_k_class = np.array(df_train["label"])[sorted_idx[:k]]
    predicted_class = Counter(top_k_class).most_common()[0][0]

    predicted_classes.append(predicted_class)

print("Accuracy:", np.mean(np.array(predicted_classes) == df_test["label"].values))


100%|██████████| 2189/2189 [11:42<00:00,  3.12it/s]

Accuracy: 0.8072179077204202





# raw text : knn_ncd (zstandard compression dictionary)

In [None]:
import zstandard as zstd
import numpy as np
from collections import Counter
from tqdm import tqdm

# Train a dictionary
training_data = df_train["text"].apply(lambda x: x.encode()).tolist()
dictionary = zstd.train_dictionary(131072, training_data)  # 131072 is the size of the dictionary

# Create a zstandard compression context using the dictionary
cctx = zstd.ZstdCompressor(dict_data=dictionary)

# Testing KNN NCD model using zstandard with dictionary
k = 2
predicted_classes = []

for row_test in tqdm(df_test.iterrows(), total=df_test.shape[0]):
    test_text = row_test[1]["text"].encode()
    test_label = row_test[1]["label"]
    c_test_text = len(cctx.compress(test_text))
    distance_from_test_instance = []

    for row_train in df_train.iterrows():
        train_text = row_train[1]["text"].encode()
        train_label = row_train[1]["label"]
        c_train_text = len(cctx.compress(train_text))

        train_plus_test = b" ".join([test_text, train_text])
        c_train_plus_test = len(cctx.compress(train_plus_test))

        ncd = ((c_train_plus_test - min(c_train_text, c_test_text))
                / max(c_test_text, c_train_text))
        distance_from_test_instance.append(ncd)

    sorted_idx = np.argsort(np.array(distance_from_test_instance))
    top_k_class = np.array(df_train["label"])[sorted_idx[:k]]
    predicted_class = Counter(top_k_class).most_common()[0][0]

    predicted_classes.append(predicted_class)

print("Accuracy:", np.mean(np.array(predicted_classes) == df_test["label"].values))


100%|██████████| 2189/2189 [10:18<00:00,  3.54it/s]

Accuracy: 0.6477843764275925





# embeddings : BERT

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m48.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m107.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m72.7 MB/s[0m eta [36m0:00:

In [None]:
import pandas as pd
df_train = pd.read_csv("/content/drive/MyDrive/r8/r8-train-stemmed.csv")
df_test = pd.read_csv("/content/drive/MyDrive/r8/r8-test-stemmed.csv")

In [None]:
from transformers import BertTokenizerFast, TFBertForSequenceClassification
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import SparseCategoricalAccuracy
import numpy as np

# Load tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
def preprocess_function(texts):
    encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=128, return_tensors='tf')
    return encodings['input_ids'], encodings['attention_mask']

In [None]:
X_train = preprocess_function(df_train['text'].tolist())
X_test = preprocess_function(df_test['text'].tolist())

In [None]:
le = LabelEncoder()
y_train = le.fit_transform(df_train['intent'])
y_test = le.transform(df_test['intent'])

In [None]:
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(np.unique(y_train)))

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model.compile(optimizer=Adam(learning_rate=5e-5), loss=SparseCategoricalCrossentropy(from_logits=True), metrics=[SparseCategoricalAccuracy()])

In [None]:
model.fit(X_train, y_train, epochs=3, batch_size=16)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f47203ca590>

In [None]:
y_pred = model.predict(X_test).logits.argmax(-1)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.9529465509365007


The results are as good as the paper, with the KNN_NCD model with compressed outputs scoring withing 10% accuracy.

In [None]:
# So, at this point, we've recreated the ingenuity of the Waterloo folks who showed that run of the mill compression algorithms can
# provide for models that can perform within reason of LLM's

# We've also tried to use alternate forms of compression and approaches using zstd and compression dictionaries.

# GZIP on raw text outperforms all other approachs on the KNN NCD model

# embeddings prove to lose relevant information when compressed using gzip

# What we want to experiment with is compressing embeddings while maintaing it's nuance.

# The first thing we'll do is extract BERT vectors that match our input space.

# extract BERT embeddings


In [7]:
pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m43.4 MB/s[0m eta [36m0:00:0

In [8]:
from transformers import BertTokenizerFast, TFBertForSequenceClassification
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import SparseCategoricalAccuracy
import numpy as np

# Load tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [9]:
le = LabelEncoder()
y_train = le.fit_transform(df_train['intent'])
y_test = le.transform(df_test['intent'])

In [10]:
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(np.unique(y_train)))

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
import tensorflow as tf

def preprocess_function(texts):
    encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=128, return_tensors='tf')
    return list(zip(encodings['input_ids'], encodings['attention_mask']))

def get_embeddings_from_tokenized_data(X_data, batch_size=32):
    embeddings_list = []
    num_batches = int(np.ceil(len(X_data) / batch_size))
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(X_data))
        batch_input_ids, batch_attention_mask = zip(*X_data[start_idx:end_idx])
        with tf.GradientTape(persistent=False):
            input_ids = tf.convert_to_tensor(batch_input_ids)
            attention_mask = tf.convert_to_tensor(batch_attention_mask)
            outputs = bert_model.bert(input_ids, attention_mask=attention_mask)
            embeddings = outputs[0]
            embeddings = tf.reduce_mean(embeddings, axis=1)
            embeddings_list.append(embeddings.numpy())
    return np.concatenate(embeddings_list)

X_train = preprocess_function(df_train["text"].tolist())
X_test = preprocess_function(df_test["text"].tolist())


X_train_embeddings = get_embeddings_from_tokenized_data(X_train, batch_size=32)

In [12]:
X_test_embeddings = get_embeddings_from_tokenized_data(X_test, batch_size=32)

In [13]:
df_train['embedding'] = X_train_embeddings.tolist()
df_test['embedding'] = X_test_embeddings.tolist()

In [14]:
df_train.head(5)

Unnamed: 0,text,edge,intent,label,embedding
0,champion product approv stock split champion p...,champion product approv stock split champion p...,earn,1,"[-0.04421480745077133, -0.19200418889522552, 0..."
1,comput termin system cpml complet sale comput ...,comput termin system cpml complet sale comput ...,acq,2,"[-0.4854443073272705, -0.37823349237442017, 0...."
2,cobanco inc cbco year net shr ct dlr net asset...,cobanco inc cbco year net shr ct dlr net asset...,earn,1,"[-0.20690399408340454, -0.27663424611091614, 0..."
3,intern inc qtr jan oper shr loss two ct profit...,intern inc qtr jan oper shr loss two ct profit...,earn,1,"[-0.16384631395339966, -0.21608641743659973, 0..."
4,brown forman inc bfd qtr net shr dlr ct net ml...,brown forman inc bfd qtr net shr dlr ct net ml...,earn,1,"[-0.006501348689198494, -0.39689308404922485, ..."


# embeddings : vanilla KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
import numpy as np

df_train['embedding'] = df_train['embedding'].apply(lambda x: np.array(x))
df_test['embedding'] = df_test['embedding'].apply(lambda x: np.array(x))

X_train = np.vstack(df_train['embedding'].values)
X_test = np.vstack(df_test['embedding'].values)

le = LabelEncoder()
y_train = le.fit_transform(df_train['label'])
y_test = le.transform(df_test['label'])

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
predicted_labels = knn.predict(X_test)

accuracy = np.mean(predicted_labels == y_test)
print(f'Accuracy: {accuracy}')


Accuracy: 0.8990406578346277


# embeddings : KNN_NCD

In [None]:
from joblib import Parallel, delayed
import multiprocessing as mp
from collections import Counter
from tqdm import tqdm
import numpy as np
import gzip

# Process a subset of the dataset
def process_dataset_subset(df_train_subset, test_embedding, c_test_embedding):
    distances_to_test = []
    for _, row_train in df_train_subset.iterrows():
        train_embedding = np.array(row_train["embedding"], dtype=np.float32)
        c_train_embedding = len(gzip.compress(train_embedding.tobytes()))

        train_plus_test = np.concatenate([test_embedding, train_embedding])
        c_train_plus_test = len(gzip.compress(train_plus_test.tobytes()))

        ncd = ((c_train_plus_test - min(c_train_embedding, c_test_embedding)) / max(c_test_embedding, c_train_embedding))
        distances_to_test.append(ncd)
    return distances_to_test

# Divide the dataset into num_processes chunks
def divide_range_into_chunks(start, end, num_chunks):
    chunk_size = (end - start) // num_chunks
    ranges = [(i, i + chunk_size) for i in range(start, end, chunk_size)]
    ranges[-1] = (ranges[-1][0], end)  # Ensure the last chunk includes the end
    return ranges

num_processes = mp.cpu_count()
k = 2
ranges = divide_range_into_chunks(0, df_train.shape[0], num_processes)
predicted_classes = []

for _, row_test in tqdm(df_test.iterrows(), total=df_test.shape[0]):
    test_embedding = np.array(row_test["embedding"], dtype=np.float32)
    c_test_embedding = len(gzip.compress(test_embedding.tobytes()))
    all_train_distances_to_test = []

    with Parallel(n_jobs=num_processes, backend="loky") as parallel:
        results = parallel(
            delayed(process_dataset_subset)(df_train[range_start:range_end], test_embedding, c_test_embedding)
            for range_start, range_end in ranges
        )
        for distances in results:
            all_train_distances_to_test.extend(distances)

    sorted_idx = np.argsort(np.array(all_train_distances_to_test))
    top_k_labels = df_train.iloc[sorted_idx[:k]]["label"].values
    predicted_class = Counter(top_k_labels).most_common(1)[0][0]
    predicted_classes.append(predicted_class)

print("Accuracy:", np.mean(np.array(predicted_classes) == df_test["label"].values))


100%|██████████| 2189/2189 [43:40<00:00,  1.20s/it]

Accuracy: 0.3682046596619461





# gzipped_embeddings : vanilla KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Preprocess the compressed embeddings
train_compressed = [len(gzip.compress(np.array(row['embedding'], dtype=np.float32).tobytes())) for _, row in df_train.iterrows()]
test_compressed = [len(gzip.compress(np.array(row['embedding'], dtype=np.float32).tobytes())) for _, row in df_test.iterrows()]

train_compressed = np.array(train_compressed).reshape(-1, 1)
test_compressed = np.array(test_compressed).reshape(-1, 1)

scaler = StandardScaler()
train_compressed = scaler.fit_transform(train_compressed)
test_compressed = scaler.transform(test_compressed)


knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(train_compressed, df_train['label'].values)

predicted_classes = knn.predict(test_compressed)
print("Accuracy:", accuracy_score(df_test['label'].values, predicted_classes))


Accuracy: 0.30242119689355873


# gzipped_embeddings : KNN_NCD

In [None]:
uniq = list(set(df_train["intent"].values))
labels = {j:i for i,j in zip(range(len(uniq)), uniq)}
labels

{'acq': 0,
 'crude': 1,
 'trade': 2,
 'ship': 3,
 'grain': 4,
 'earn': 5,
 'money-fx': 6,
 'interest': 7}

In [None]:
df_train["label"] = df_train["intent"].apply(lambda x: labels[x])
df_test["label"] = df_test["intent"].apply(lambda x: labels[x])

In [None]:
from joblib import Parallel, delayed
import numpy as np
import gzip

def process_dataset_subset(df_train_subset, test_embedding_bytes, c_test_embedding):
    distances_to_test = []
    for row_train in df_train_subset.iterrows():
        train_embedding = np.array(row_train[1]['embedding'], dtype=np.float32)
        train_embedding_bytes = train_embedding.tobytes()
        c_train_embedding = len(gzip.compress(train_embedding_bytes))

        train_plus_test_embedding = np.concatenate((test_embedding, train_embedding))
        train_plus_test_embedding_bytes = train_plus_test_embedding.tobytes()
        c_train_plus_test_embedding = len(gzip.compress(train_plus_test_embedding_bytes))

        ncd = ((c_train_plus_test_embedding - min(c_train_embedding, c_test_embedding))
               / max(c_test_embedding, c_train_embedding))

        distances_to_test.append(ncd)

    return distances_to_test


def divide_range_into_chunks(start, end, num_chunks):
    chunk_size = (end - start) // num_chunks
    ranges = [(i, i + chunk_size) for i in range(start, end, chunk_size)]
    ranges[-1] = (ranges[-1][0], end)  # Ensure the last chunk includes the end
    return ranges

In [None]:
from collections import Counter
import gzip
import multiprocessing as mp
import os.path as op

from joblib import Parallel, delayed
import numpy as np
import pandas as pd
from tqdm import tqdm

num_processes = mp.cpu_count()
k = 2
predicted_classes = []

# Pre-calculate and cache the compressions for the training set
d = [len(gzip.compress(np.array(row['embedding'], dtype=np.float32).tobytes())) for _, row in df_train.iterrows()]

# Divide the dataset into num_processes chunks
ranges = divide_range_into_chunks(0, df_train.shape[0], num_processes)

predicted_classes = []

for row_test in tqdm(df_test.iterrows(), total=df_test.shape[0]):
    test_embedding = np.array(row_test[1]['embedding'], dtype=np.float32)
    test_label = row_test[1]['label']
    test_embedding_bytes = test_embedding.tobytes()
    c_test_embedding = len(gzip.compress(test_embedding_bytes))
    all_train_distances_to_test = []

    with Parallel(n_jobs=num_processes, backend="loky") as parallel:
        results = parallel(
            delayed(process_dataset_subset)(df_train[range_start:range_end], test_embedding_bytes, c_test_embedding)
            for range_start, range_end in ranges
        )
        for p in results:
            all_train_distances_to_test.extend(p)

    sorted_idx = np.argsort(np.array(all_train_distances_to_test))
    top_k_class = np.array(df_train["label"])[sorted_idx[:k]]
    predicted_class = Counter(top_k_class).most_common()[0][0]

    predicted_classes.append(predicted_class)

print("Accuracy:", np.mean(np.array(predicted_classes) == df_test["label"].values))


100%|██████████| 2189/2189 [1:24:30<00:00,  2.32s/it]

Accuracy: 0.3682046596619461





# PCA'ed embeddings : vanilla KNN

In [20]:
!pip install zstd

Collecting zstd
  Downloading zstd-1.5.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.8 MB[0m [31m3.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━[0m [32m1.1/1.8 MB[0m [31m15.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: zstd
Successfully installed zstd-1.5.5.1


In [None]:
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from scipy.spatial.distance import euclidean
from sklearn.metrics import accuracy_score
import zstd

# Extracting embeddings and labels from the training and test sets
X_train = list(df_train['embedding'])
y_train = df_train['label']
X_test = list(df_test['embedding'])
y_test = df_test['label']

# Applying PCA to reduce dimensionality
pca = PCA(n_components=100) # You can adjust the number of components
X_train_reduced = pca.fit_transform(X_train)
X_test_reduced = pca.transform(X_test)

# Vanilla KNN
knn = KNeighborsClassifier()
knn.fit(X_train_reduced, y_train)
y_pred_knn = knn.predict(X_test_reduced)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f'Vanilla KNN Accuracy: {accuracy_knn}')

In [21]:
# KNN NCD
def ncd(x, y):
    combined = np.concatenate((x, y))
    compressed_combined = len(zstd.compress(combined))
    compressed_x = len(zstd.compress(x))
    compressed_y = len(zstd.compress(y))
    return (compressed_combined - min(compressed_x, compressed_y)) / max(compressed_x, compressed_y)

knn_ncd = KNeighborsClassifier(metric=ncd)
knn_ncd.fit(X_train_reduced, y_train)
y_pred_knn_ncd = knn_ncd.predict(X_test_reduced)
accuracy_knn_ncd = accuracy_score(y_test, y_pred_knn_ncd)
print(f'KNN NCD Accuracy: {accuracy_knn_ncd}')


Vanilla KNN Accuracy: 0.9081772498857926
KNN NCD Accuracy: 0.4540886249428963


In [22]:
# KNN NCD using gzip
def ncd(x, y):
    compressed_x = gzip.compress(x.tobytes())
    compressed_y = gzip.compress(y.tobytes())
    combined = np.concatenate((x, y))
    compressed_combined = gzip.compress(combined.tobytes())
    return (len(compressed_combined) - min(len(compressed_x), len(compressed_y))) / max(len(compressed_x), len(compressed_y))

knn_ncd = KNeighborsClassifier(metric=ncd)
knn_ncd.fit(X_train_reduced, y_train)
y_pred_knn_ncd = knn_ncd.predict(X_test_reduced)
accuracy_knn_ncd = accuracy_score(y_test, y_pred_knn_ncd)
print(f'KNN NCD Accuracy: {accuracy_knn_ncd}')

Vanilla KNN Accuracy: 0.9086340794883508
KNN NCD Accuracy: 0.4577432617633623
