In [5]:
from sentence_transformers import SentenceTransformer 
from datasets import load_dataset

In [6]:
# Load the 'sst' dataset with its 'default' configuration. The result is kept
# in `sst_dataset` and indexed by split name (e.g. 'train') in later cells.
sst_dataset = load_dataset('sst', 'default')

Downloading builder script: 100%|█████████████████████████████████████████████████████████| 9.13k/9.13k [00:00<?, ?B/s]
Downloading readme: 100%|█████████████████████████████████████████████████████████████████| 6.68k/6.68k [00:00<?, ?B/s]
Downloading data: 100%|███████████████████████████████████████████████████████████| 6.37M/6.37M [00:04<00:00, 1.39MB/s]
Downloading data: 100%|██████████████████████████████████████████████████████████████| 790k/790k [00:00<00:00, 840kB/s]
Generating train split: 100%|█████████████████████████████████████████████| 8544/8544 [00:02<00:00, 3927.49 examples/s]
Generating validation split: 100%|█████████████████████████████████████████| 1101/1101 [00:01<00:00, 754.07 examples/s]
Generating test split: 100%|██████████████████████████████████████████████| 2210/2210 [00:01<00:00, 1470.52 examples/s]


In [2]:
# Load the pretrained 'all-MiniLM-L6-v2' sentence-embedding model.
# `model.encode` is what later cells use to turn lists of sentences into
# embedding arrays.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [3]:
# Smoke test: encode a one-element list to confirm the model runs end to end.
embeddings = model.encode(['hi'])

In [4]:
# Inspect the array shape produced for the single test sentence.
embeddings.shape

(1, 384)

In [8]:
# Show every split of the dataset with its feature names and row count.
print(sst_dataset)

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'tokens', 'tree'],
        num_rows: 8544
    })
    validation: Dataset({
        features: ['sentence', 'label', 'tokens', 'tree'],
        num_rows: 1101
    })
    test: Dataset({
        features: ['sentence', 'label', 'tokens', 'tree'],
        num_rows: 2210
    })
})


In [12]:
# Show the feature names and row count of the training split only.
print(sst_dataset['train'])

Dataset({
    features: ['sentence', 'label', 'tokens', 'tree'],
    num_rows: 8544
})


In [14]:
# Peek at one training label to see what kind of value it is.
# Index the row first and the column second: `split[0]['label']` reads a single
# example, whereas `split['label'][0]` first materialises the entire column
# just to take its first element.
print(sst_dataset['train'][0]['label'])

0.6944400072097778


In [347]:
import random

# Tunable parameters for building the target/background text sets.
SAMPLE_SIZE = 1000     # number of training sentences to draw
NEGATIVE_MAX = 0.4     # scores at or below this are labelled negative (0)
POSITIVE_MIN = 0.6     # scores at or above this are labelled positive (1)
SAMPLE_SEED = 0        # fixed seed so the split is reproducible across runs

# Read each column once. Indexing `sst_dataset['train'][column][i]` inside the
# loop would re-read the whole column on every iteration.
train_split = sst_dataset['train']
train_sentences = list(train_split['sentence'])
train_scores = list(train_split['label'])

X_train_target_text = []        # sentences with a clear polarity
y_train_labels = []             # 0 = negative, 1 = positive (parallel to target text)
X_train_background_test = []    # sentences whose score lies strictly between the thresholds

# A private RNG keeps the draw reproducible without touching the global
# `random` state. The sample size is capped at the number of available rows.
sampler = random.Random(SAMPLE_SEED)
sample_count = min(SAMPLE_SIZE, len(train_sentences))

for i in sampler.sample(range(len(train_sentences)), sample_count):
    score = train_scores[i]
    if NEGATIVE_MAX < score < POSITIVE_MIN:
        X_train_background_test.append(train_sentences[i])
    else:
        X_train_target_text.append(train_sentences[i])
        # Compare against POSITIVE_MIN so that a score exactly equal to
        # NEGATIVE_MAX is labelled negative rather than falling through to 1.
        y_train_labels.append(1 if score >= POSITIVE_MIN else 0)

In [348]:
# Embed both sentence collections with the same encoder: the labelled target
# sentences first, then the background sentences.
X_target_train_emb, X_background_train_emb = (
    model.encode(sentence_list)
    for sentence_list in (X_train_target_text, X_train_background_test)
)

In [349]:
import numpy as np

# Print every background embedding row that contains a non-finite value.
# `np.nan in row` can never be True because NaN compares unequal to itself,
# and `np.inf in row` would miss -inf; `np.isfinite` covers NaN, +inf and -inf.
for row in X_background_train_emb:
    if not np.isfinite(row).all():
        print(row)

In [271]:
# Inspect the shape of the target-sentence embedding array.
X_target_train_emb.shape

(815, 384)

In [272]:
# Class balance of the target set: entry k of the printed array is the number
# of labels equal to k.
print(np.bincount(np.asarray(y_train_labels)))

[392 423]


In [273]:
# Inspect the shape of the background-sentence embedding array.
X_background_train_emb.shape

(185, 384)

In [274]:
# True only if every value in the background embeddings is finite
# (no NaN and no +/-inf).
np.isfinite(X_background_train_emb).all()

True

In [275]:
from contrastive import CPCA
from sklearn.decomposition import PCA

In [276]:
# Tiny 3x3 toy inputs: each row is [0, 1, 2]. The two arrays are built
# independently so they do not share memory.
X_target = np.tile(np.arange(3), (3, 1))
X_back = np.tile(np.arange(3), (3, 1))

In [350]:
# Only the leading N_EMBEDDING_DIMS embedding dimensions are used. The value is
# named once here instead of repeating the literal in every slice.
N_EMBEDDING_DIMS = 120
PROJECTION_SEED = 0  # makes the PCA projection reproducible

target_features = X_target_train_emb[:, :N_EMBEDDING_DIMS]
background_features = X_background_train_emb[:, :N_EMBEDDING_DIMS]

# Contrastive PCA: fit on target vs. background and project the target data.
# `fit_transform` returns several 2-D projections at once (the printed shape's
# leading axis); presumably one per contrast strength chosen by the library.
# NOTE(review): confirm against the `contrastive` package docs.
cpca_model = CPCA(n_components=2)
transformed_data = cpca_model.fit_transform(target_features, background_features)
transformed_data = np.asarray(transformed_data)
print(transformed_data.shape)

# Plain PCA on the same target features, as the non-contrastive reference.
# With the default svd_solver='auto' scikit-learn may pick the randomized
# solver for an input of this size, so a fixed random_state is passed to keep
# the projection identical across runs.
pca = PCA(n_components=2, random_state=PROJECTION_SEED)
pca_compress = pca.fit_transform(target_features)
print(pca_compress.shape)

(4, 824, 2)
(824, 2)


In [351]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import sklearn as sk

# Baseline: 5-fold cross-validated accuracy of a default k-NN classifier on the
# first 120 raw embedding dimensions (no dimensionality reduction).
BASELINE_SHUFFLE_SEED = 0  # fixed so the fold assignment is reproducible

X_data, y_data = sk.utils.shuffle(
    X_target_train_emb[:, 0:120], y_train_labels, random_state=BASELINE_SHUFFLE_SEED
)
# The former `train_test_split` call was removed: its outputs were never used
# here and are reassigned by the later evaluation cells. All scoring goes
# through `cross_val_score`.
clf = KNeighborsClassifier()
scores = cross_val_score(clf, X_data, y_data, cv=5)
print("No preprocessing:")
print(f"Accuracy: {round(scores.mean(),3)} +/- {round(scores.std(),3)}")

No preprocessing:
Accuracy: 0.629 +/- 0.039


In [352]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import sklearn as sk

# 5-fold cross-validated accuracy of a default k-NN classifier on the 2-D PCA
# projection of the target embeddings.
# NOTE(review): `pca_compress` was fitted on all target rows before
# cross-validation, so each fold's projection has already seen its test rows.
PCA_SHUFFLE_SEED = 0  # fixed so the fold assignment is reproducible

X_data_pca, y_data_pca = sk.utils.shuffle(
    pca_compress, y_train_labels, random_state=PCA_SHUFFLE_SEED
)
# The former `train_test_split` call was removed: its outputs were never used
# here and are reassigned by the later cPCA evaluation cell.
clf = KNeighborsClassifier()
scores = cross_val_score(clf, X_data_pca, y_data_pca, cv=5)
print("PCA:")
print(f"Accuracy: {round(scores.mean(),3)} +/- {round(scores.std(),3)}")

PCA:
Accuracy: 0.528 +/- 0.029


In [353]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import sklearn as sk

# 5-fold cross-validated k-NN accuracy for every cPCA projection.
# `cross_val_score` is imported here (it was previously only available through
# an earlier cell) so this cell does not depend on that cell having run.
CPCA_EVAL_SEED = 0  # fixed so shuffling and splitting are reproducible

print("cPCA")
# Iterate over the projections themselves rather than a hard-coded count, so
# the loop stays correct if the number of returned projections changes.
for projection in transformed_data:
    X_data, y_data = sk.utils.shuffle(projection, y_train_labels, random_state=CPCA_EVAL_SEED)
    # The hold-out split is not used by the scoring below. It is kept (now
    # seeded) in case cells outside this view read X_train / X_test.
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, shuffle=True, random_state=CPCA_EVAL_SEED
    )
    clf = KNeighborsClassifier()
    scores = cross_val_score(clf, X_data, y_data, cv=5)
    print(f"Accuracy: {round(scores.mean(),3)} +/- {round(scores.std(),3)}")


# TODO: ref. last week, use average of X_data as background noise; use k-NN classifier to get most representative images as background
# TODO: CIFAR-100 (good benchmark), images

cPCA
Accuracy: 0.538 +/- 0.032
Accuracy: 0.563 +/- 0.023
Accuracy: 0.551 +/- 0.033
Accuracy: 0.502 +/- 0.023
