In [1]:
import numpy as np
import plotly.express as px

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from common import get_input_data

In [2]:
X, Y =  get_input_data('prompts/big_dataset/prompts.csv', 'prompts/big_dataset/llama/llama-2-7b-f16-mean-out.bin', 4096)
X.shape, Y.shape

((24800, 4096), (24800, 2))

In [9]:
X = np.load('/Users/cranete/_workspace/_HiPPO/prompts/big_dataset/train_prompts_llama_13b_v1_mean.npz')
a, b, c, d = X['arr_0'], X['arr_1'], X['arr_2'], X['arr_3']

X = np.concatenate((a, b, c, d), axis=0)
Y = np.concatenate((np.zeros(a.shape[0]), np.ones(b.shape[0]), np.ones(c.shape[0]) * 2, np.ones(d.shape[0]) * 3))
X.shape, Y.shape

((9600, 5120), (9600,))

In [3]:
# X_coins, Y_coins = X[np.where(Y[:, 1] == 1)], Y[np.where(Y[:, 1] == 1), 0]
# X_no_coins, Y_no_coins = X[np.where(Y[:, 1] == 0)], Y[np.where(Y[:, 1] == 0), 0]
# X_coins.shape, X_no_coins.shape

((12544, 4096), (12256, 4096))

In [4]:
# Y_coins = Y_coins.reshape(-1, 1)
# Y_no_coins = Y_no_coins.reshape(-1, 1)

# COINS

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, 
                                                            random_state=42, stratify=Y)

scaler = StandardScaler()
X_norm = scaler.fit_transform(X_train)

pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train)

pca.score(X_test)

-1291.270144940027

In [11]:
fig = px.scatter(x=X_train_pca[:, 0], y=X_train_pca[:, 1], color=y_train.reshape(-1))
fig.update_layout(
    title="PCA visualization of the embedded data",
    xaxis_title="First Principal Component",
    yaxis_title="Second Principal Component",
)
fig.show()

In [12]:
perplexity = np.arange(5, 55, 5)
divergence = []

for i in perplexity:
    model = TSNE(n_components=2, init="pca", perplexity=i)
    reduced = model.fit_transform(X_train)
    divergence.append(model.kl_divergence_)
fig = px.line(x=perplexity, y=divergence, markers=True)
fig.update_layout(xaxis_title="Perplexity Values", yaxis_title="Divergence")
fig.update_traces(line_color="red", line_width=1)
fig.show()

In [14]:
perplexity = np.arange(5, 55, 5)
divergence = []

for i in perplexity:
    model = TSNE(n_components=3, init="pca", perplexity=i, metric="cosine")
    reduced = model.fit_transform(X_train)
    divergence.append(model.kl_divergence_)
fig = px.line(x=perplexity, y=divergence, markers=True)
fig.update_layout(xaxis_title="Perplexity Values", yaxis_title="Divergence")
fig.update_traces(line_color="red", line_width=1)
fig.show()

In [20]:
tsne = TSNE(n_components=3, perplexity=10, random_state=42, metric="cosine", init="pca") 
X_train_tsne = tsne.fit_transform(X_train)

tsne.kl_divergence_

0.3606402277946472

In [21]:
fig = px.scatter_3d(x=X_train_tsne[:, 0], y=X_train_tsne[:, 1], z=X_train_tsne[:, 2], color=y_train)
fig.update_layout(
    title="t-SNE visualization of embedding dataset",
    # xaxis_title="First t-SNE",
    # yaxis_title="Second t-SNE",
    # zaxis_title="Third t-SNE",
)
fig.show()

In [22]:
2 ** 11

2048

In [18]:
km4 = KMeans(n_clusters=4, random_state=42)
km4.fit(X_train)

In [19]:
accuracy_score(y_train, km4.labels_)

0.271484375

In [27]:
accuracy_score(y_train, km4.labels_)

0.26150341685649203