In [1]:
%load_ext autoreload
%autoreload 2
%pdb

Automatic pdb calling has been turned ON


In [2]:
from dss.env import DollyEnv
from dss.transformation import SentenceTransformerFeature

import matplotlib.pyplot as plt
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from torch.utils.data import Dataset
from os import path
import torch
import tqdm
import numpy as np
from sklearn.metrics.pairwise import cosine_distances

  from .autonotebook import tqdm as notebook_tqdm


# Try this new env

In [3]:
env = DollyEnv(use_raw=False)

Loading precomputed features...


In [4]:
obs, _ = env.reset()
obs.shape

(768,)

In [5]:
env = DummyVecEnv([lambda: DollyEnv(use_raw=False, cov_metric="dissimilarity")])

Loading precomputed features...


In [6]:
# Train the policy using PPO
retrain = False
agent = PPO("MlpPolicy", env, verbose=1)
if path.exists("playground/ppo_dolly_instruct.zip") and not retrain:
    agent = PPO.load("playground/ppo_dolly_instruct.zip", env=env)
else:
    agent.learn(total_timesteps=int(1e5))
    agent.save("playground/ppo_dolly_instruct")

Using cuda device




# Let us measure diversity

In [None]:
train_dataset = env.envs[0].dataset
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=1, shuffle=False)

X = []
data_values = []
for datum in tqdm.tqdm(train_loader):
    state = datum['feature']
    _, log_likelihoods, _ = agent.policy.evaluate_actions(state.to(0), torch.tensor([0, 1]).to(0))
    
    datum_value = log_likelihoods[1].cpu().item()

    X.append(datum)
    data_values.append(datum_value)

  0%|          | 0/15011 [00:00<?, ?it/s]

 15%|█▍        | 2233/15011 [00:10<00:55, 231.38it/s]

In [None]:
X_ = [x for x, _ in sorted(zip(X, data_values), key=lambda x: x[-1], reverse=True)]
X_ = np.array(X_)

In [11]:
for size in range(1, 101, 5):

    indices = np.arange(int(size/100*len(X_)))
    samples = X_[indices]
    samples = np.array([s['feature'].numpy() for s in samples])
    samples = samples.reshape(samples.shape[0], -1)
    samples = torch.from_numpy(samples)
    #d = torch.diag(torch.cov(samples.T)).sum()
    d = np.mean(cosine_distances(samples.cpu().numpy()))
    print(f"Size: {size}, Covariance: {d}")

Size: 1, Covariance: 0.9719694256782532
Size: 6, Covariance: 0.9806336164474487
Size: 11, Covariance: 0.9769279956817627
Size: 16, Covariance: 0.9724605679512024
Size: 21, Covariance: 0.9680395126342773
Size: 26, Covariance: 0.9637230634689331
Size: 31, Covariance: 0.9596344232559204
Size: 36, Covariance: 0.9561107754707336
Size: 41, Covariance: 0.952923595905304
Size: 46, Covariance: 0.9498034715652466
Size: 51, Covariance: 0.9470767974853516
Size: 56, Covariance: 0.944887638092041
Size: 61, Covariance: 0.9426226019859314
Size: 66, Covariance: 0.9405233263969421
Size: 71, Covariance: 0.9385395646095276
Size: 76, Covariance: 0.9367333650588989
Size: 81, Covariance: 0.9348382949829102
Size: 86, Covariance: 0.9330253601074219
Size: 91, Covariance: 0.931355357170105
Size: 96, Covariance: 0.9298263192176819


In [12]:
for size in range(1, 101, 5):

    indices = -np.arange(int(size/100*len(X_)))
    samples = X_[indices]
    samples = np.array([s['feature'].numpy() for s in samples])
    samples = samples.reshape(samples.shape[0], -1)
    samples = torch.from_numpy(samples)
    #d = torch.diag(torch.cov(samples.T)).sum()
    d = np.mean(cosine_distances(samples.cpu().numpy()))
    print(f"Size: {size}, Covariance: {d}")

Size: 1, Covariance: 0.8834326267242432
Size: 6, Covariance: 0.8833936452865601
Size: 11, Covariance: 0.8862953782081604
Size: 16, Covariance: 0.8891157507896423
Size: 21, Covariance: 0.8910645246505737
Size: 26, Covariance: 0.8927051424980164
Size: 31, Covariance: 0.8949320912361145
Size: 36, Covariance: 0.8967922329902649
Size: 41, Covariance: 0.8986470103263855
Size: 46, Covariance: 0.9008399844169617
Size: 51, Covariance: 0.9025696516036987
Size: 56, Covariance: 0.9038946628570557
Size: 61, Covariance: 0.9059028625488281
Size: 66, Covariance: 0.9076675772666931
Size: 71, Covariance: 0.9097478985786438
Size: 76, Covariance: 0.9118421673774719
Size: 81, Covariance: 0.9143750667572021
Size: 86, Covariance: 0.9172683954238892
Size: 91, Covariance: 0.9207983613014221
Size: 96, Covariance: 0.9248338937759399


In [13]:
for size in range(1, 101, 5):

    indices = np.random.choice(len(X_), int(size/100*len(X_)), replace=False)
    samples = X_[indices]
    samples = np.array([s['feature'].numpy() for s in samples])
    samples = samples.reshape(samples.shape[0], -1)
    samples = torch.from_numpy(samples)
    #d = torch.diag(torch.cov(samples.T)).sum()
    d = np.mean(cosine_distances(samples.cpu().numpy()))
    print(f"Size: {size}, Covariance: {d}")

Size: 1, Covariance: 0.9211809635162354
Size: 6, Covariance: 0.9246454238891602
Size: 11, Covariance: 0.928205668926239
Size: 16, Covariance: 0.9297683238983154
Size: 21, Covariance: 0.927932620048523
Size: 26, Covariance: 0.927312970161438
Size: 31, Covariance: 0.9288177490234375
Size: 36, Covariance: 0.9287832379341125
Size: 41, Covariance: 0.9281821846961975
Size: 46, Covariance: 0.9292672276496887
Size: 51, Covariance: 0.9290285706520081
Size: 56, Covariance: 0.9288315773010254
Size: 61, Covariance: 0.9286008477210999
Size: 66, Covariance: 0.9289616942405701
Size: 71, Covariance: 0.9288148880004883
Size: 76, Covariance: 0.9281294941902161
Size: 81, Covariance: 0.9289678931236267
Size: 86, Covariance: 0.9287742376327515
Size: 91, Covariance: 0.9290452003479004
Size: 96, Covariance: 0.9289625883102417


## Compare it with DPP

In [14]:
from sklearn.metrics.pairwise import rbf_kernel

F_ = [x['feature'].flatten() for x in X_]
F_ = torch.stack(F_)

kernel_matrix = rbf_kernel(F_.cpu().numpy(), gamma=0.5)

In [15]:
from dss.utils import dpp

max_size = int(0.5 * len(X_))
selected_indices_ = dpp(kernel_matrix, max_size)

In [16]:
size = 600
for percentage in range(1, 52, 5):
    size_ = len(X_) * percentage // 100
    samples = F_[selected_indices_[:size_]]
    d = np.mean(cosine_distances(samples.cpu().numpy()))
    print(d) 

0.9862835
0.98327124
0.9802927
0.97673607
0.9731184
0.9693479
0.965731
0.9623316
0.95870185
0.95539033
0.952868


## Save the dataset

In [38]:
to_save = [{k:v for k,v in d.items() if k!="feature"} for d in X_]

In [39]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/llama-2-7b-hf")

to_save = [entry for entry in to_save if tokenizer(entry['context'])["input_ids"][0].__len__() <= 1024]

In [40]:
import json

with open ("playground/dolly_6k_dss.json", "w") as f:
    json.dump(to_save[:6000], f)

In [41]:
import json

random_save = np.array(to_save)
random_save = random_save[np.random.choice(len(random_save), 6000, replace=False)]
with open ("playground/dolly_6k_random.json", "w") as f:
    json.dump(random_save.tolist(), f)