In [1]:
import gzip
import pickle
import numpy as np
from joblib import Parallel, delayed
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# only open dataset files that come from a trusted source.
with open("sentiment-dataset-500.pickle", "rb") as f:
    train_x, train_y, test_x, test_y = pickle.load(f)
# Pre-encode every text to bytes once, up front, so the compression helpers
# below can be handed bytes directly (thank you Ken Schutte for this trick).
train_x = [s.encode() for s in train_x]
test_x = [s.encode() for s in test_x]

def compressed_len(s: bytes) -> int:
    """Return the number of bytes ``s`` occupies after gzip compression."""
    packed = gzip.compress(s)
    return len(packed)

def ncd_cdist_parallel(x1: list[bytes], x2: list[bytes]) -> np.ndarray:
    """Compute the pairwise normalized compression distance (NCD) matrix.

    Entry ``[i, j]`` of the result is
    ``(C(x1[i] + b' ' + x2[j]) - min(C(x1[i]), C(x2[j]))) / max(C(x1[i]), C(x2[j]))``
    where ``C`` is ``compressed_len``.

    Args:
        x1: byte strings that index the rows of the result.
        x2: byte strings that index the columns of the result.

    Returns:
        Array of shape ``(len(x1), len(x2))`` holding the distances.
    """
    rows, cols = len(x1), len(x2)

    # Compress each individual string once; these lengths are reused by every
    # pair for the min/max normalization.
    sizes_1 = Parallel(n_jobs=-1)(delayed(compressed_len)(item) for item in x1)
    sizes_2 = Parallel(n_jobs=-1)(delayed(compressed_len)(item) for item in x2)

    def pair_distance(i: int, j: int) -> float:
        """NCD between ``x1[i]`` and ``x2[j]`` using the cached lengths."""
        size_a, size_b = sizes_1[i], sizes_2[j]
        joint = compressed_len(x1[i] + b' ' + x2[j])
        return (joint - min(size_a, size_b)) / max(size_a, size_b)

    # One task per (i, j) pair, generated row by row so the flat list of
    # results can be reshaped straight into (rows, cols).
    index_pairs = ((i, j) for i in range(rows) for j in range(cols))
    flat = Parallel(n_jobs=-1)(delayed(pair_distance)(i, j) for i, j in index_pairs)
    return np.reshape(flat, (rows, cols))


# Describe every sample by its NCD to each training sample: rows are the
# samples being described, columns are the training samples.
train_ncd = ncd_cdist_parallel(train_x, train_x)
test_ncd = ncd_cdist_parallel(test_x, train_x)
# 7-nearest-neighbour classifier fitted on the training rows.
# NOTE(review): no `metric` is passed, so the rows of the NCD matrix are used
# as feature vectors under the estimator's default metric; confirm whether
# metric="precomputed" was intended here.
neigh = KNeighborsClassifier(n_neighbors=7).fit(train_ncd, train_y)
print("Accuracy:", neigh.score(test_ncd, test_y))

Accuracy: 0.7029702970297029


Time: 8.31 s for the code above,
vs.
time: 39 s for the video's non-parallel code,
vs.
time: 16.3 s for the video's parallel code.