In [9]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

from tools import embedding_pipeline, save_collection, load_collection
from chromadb import Documents, EmbeddingFunction, Embeddings
import chromadb

import pickle

# Semantic Sampling

To support GPT-4 few-shot learning, the labelled examples should cover the semantic space well. This notebook computes a semantic space (an embedding for every headline) from all source data, then samples uniformly from that space to create a label-ready dataset.

In [10]:
# Full source table; its default row labels are what the sampling cells
# later look up with `master_dataset.loc[...]`.
master_dataset = pd.read_csv('./data/headlines.csv')

# One entry per distinct headline text. drop_duplicates() keeps the first
# occurrence and preserves that row's original index label.
rules_dataset = master_dataset.loc[:, 'Headline'].drop_duplicates()

In [11]:
# Columns passed as `subset=` to drop_duplicates() when the sampled rows are
# written out: two rows count as duplicates when both values match.
DUPLICATE_SUBSET = [
    'Publication',
    'Headline',
]

In [12]:
from chromadb.utils import embedding_functions

# On-disk chroma store rooted at ./chromadb
chroma = chromadb.PersistentClient(path="./chromadb")


class MyEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function backed by the project's `embedding_pipeline`.

    The pipeline is built once in the constructor and reused for every call.
    """

    def __init__(self):
        # Building the pipeline here means it is created once per instance.
        self.embedder = embedding_pipeline()

    def __call__(self, input: Documents) -> Embeddings:
        """Embed `input` and return the first element of the pipeline's output.

        NOTE(review): presumably element 0 holds the per-document embedding
        vectors -- confirm against `tools.embedding_pipeline`.
        """
        pipeline_output = self.embedder(input)
        return pipeline_output[0] # long_text_mode="mean" ?


default_ef = embedding_functions.DefaultEmbeddingFunction()
ef         = MyEmbeddingFunction()

# Reuse the "all-headlines" collection if it already exists, otherwise create it.
collection = chroma.get_or_create_collection("all-headlines", embedding_function=ef)

<All keys matched successfully>


In [13]:
# Compute Embedding-DB
# Takes ~7hrs
#
# Every headline is stored under the id "rule-<label>", where <label> is the
# headline's row label in `master_dataset` (drop_duplicates() preserved it on
# `rules_dataset`). The sampling cells parse that number back out of the id
# and use it in `master_dataset.loc[...]`, so the id MUST be the row label.
# A running counter over the de-duplicated / filtered headlines drifts away
# from the row labels after the first duplicate or skipped entry and makes
# those lookups return the wrong rows.
#
# NOTE: a collection populated with counter-based ids should be rebuilt from
# scratch before re-running this cell, otherwise stale ids remain in it.
#
# While embedding, also track the element-wise min / max over all vectors:
# `space_start` / `space_end` are the corners of the embeddings' bounding box.
space_start = None
space_end = None

# Series.items() yields (index label, value); it has no len(), so give tqdm
# the total explicitly to keep the progress bar meaningful.
for row_label, rule in tqdm(rules_dataset.items(), total=len(rules_dataset)):
    # Skip empty strings and non-string entries (e.g. NaN from missing cells).
    if not rule or not isinstance(rule, str):
        continue

    embedding = ef([rule])
    # Coerce to ndarray so the bounds are arrays even after a single row.
    vector = np.asarray(embedding[0])
    if space_start is None or space_end is None:
        space_start = vector
        space_end = vector
    else:
        space_start = np.minimum(space_start, vector)
        space_end = np.maximum(space_end, vector)

    # upsert rather than add: ids that already exist are overwritten instead
    # of chromadb printing a warning for each one (the flood of warnings was
    # crashing VS Code), and no explicit delete is needed first.
    collection.upsert(ids=[f"rule-{row_label}"], documents=[rule], embeddings=embedding)

  0%|                                               | 0/2548095 [00:00<?, ?it/s]

 39%|██████████▉                 | 994651/2548095 [11:40:59<15:26:28, 27.95it/s]

In [None]:
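# No explicit save step is needed: the collection lives in the chromadb PersistentClient store at ./chromadb, which autosaves.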

In [None]:
# Uniform sampling between the corners of the embedding bounding box.
# np.linspace with the two corner vectors yields NUM_SAMPLES evenly spaced
# points on the straight line from `space_start` to `space_end`; for each
# point we keep the stored headline nearest to it.
# Requires `space_start` / `space_end` from the embedding cell.
NUM_SAMPLES = 200
samples = []

# id number -> headline text; re-hitting the same id just overwrites it.
nearest_by_id = {}
for probe in tqdm(np.linspace(space_start, space_end, NUM_SAMPLES)):
    hit = collection.query(
        query_embeddings=[list(probe)],
        include=['documents'],
        n_results = 1
    )
    # Results are nested per query, then per neighbour; ids look like "rule-<n>".
    hit_id = hit['ids'][0][0]
    hit_text = hit['documents'][0][0]
    nearest_by_id[int(hit_id.split('-')[1])] = hit_text

# Build the frame in one go instead of growing it row by row.
sample_df = pd.DataFrame(
    {'rule': list(nearest_by_id.values())},
    index=list(nearest_by_id.keys()),
)
sample_df = sample_df.drop_duplicates()

# Pull the full source rows for the chosen ids and write them out.
sample_df = master_dataset.loc[sample_df.index].drop_duplicates(subset=DUPLICATE_SUBSET)
sample_df.to_csv('sampled_rules_uniform.csv', index=False)

100%|████████████████████████████████████████| 200/200 [00:00<00:00, 961.65it/s]


In [None]:
# sample with nearest-neighbor rejection
#
# For each draw: pick a random point inside the embedding bounding box, find
# its nearest stored headline, push the point one step AWAY from that
# headline, then keep whichever headline is nearest to the displaced point.
# Requires `space_start` / `space_end` (embedding cell) and NUM_SAMPLES.

# Seeded generator so the sampled set is reproducible across runs.
rng = np.random.default_rng(0)

lower = np.asarray(space_start, dtype=float)
upper = np.asarray(space_end, dtype=float)

sample_df = pd.DataFrame(columns=['rule'])
# Step length: the bounding-box diagonal split into NUM_SAMPLES pieces.
step_size = np.linalg.norm(upper - lower) / NUM_SAMPLES

for _ in tqdm(range(NUM_SAMPLES)):
    sample = rng.uniform(lower, upper)

    query = collection.query(
        query_embeddings=[list(sample)],
        include=['documents', 'embeddings'],
        n_results = 1
    )
    embedding = np.asarray(query['embeddings'][0][0], dtype=float)

    # Move away from the nearest neighbor. The direction must point from the
    # neighbor to the sample (sample - embedding); the opposite sign would
    # walk the sample toward the neighbor and simply find it again.
    diff = sample - embedding
    distance = np.linalg.norm(diff)
    if distance > 0:  # direction is undefined if the sample sits on the neighbor
        sample = sample + step_size * diff / distance

    # Find the nearest neighbor of this new sample & append
    query = collection.query(
        query_embeddings=[list(sample)],
        include=['documents'],
        n_results = 1
    )
    # ids look like "rule-<n>"; <n> is used as the row label below.
    rule_idx = query['ids'][0][0]
    rule_text = query['documents'][0][0]
    idx = int(rule_idx.split('-')[1])

    sample_df.loc[idx] = rule_text

sample_df = sample_df.drop_duplicates()
# Pull the full source rows for the chosen ids and write them out.
sample_df = master_dataset.loc[sample_df.index].drop_duplicates(subset=DUPLICATE_SUBSET)
sample_df.to_csv('sampled_rules_nn_rejection.csv', index=False)

 18%|███████▍                                 | 36/200 [00:00<00:00, 354.01it/s]

100%|████████████████████████████████████████| 200/200 [00:00<00:00, 390.69it/s]


In [None]:
# sample with k-means
from sklearn.cluster import KMeans

# Load every stored embedding from the collection.
all_embeddings = collection.get(include=['embeddings'])['embeddings']

# One cluster per desired sample; fixed random_state keeps the clustering
# reproducible. fit() returns the estimator itself, so `k_means` is the
# fitted model either way.
k_means = KMeans(n_clusters=NUM_SAMPLES, random_state=0, n_init="auto")
k_means.fit(all_embeddings)

In [None]:
# For each k-means cluster centre, keep the stored headline nearest to it.
# id number -> headline text; re-hitting the same id just overwrites it.
nearest_by_id = {}

for centre in k_means.cluster_centers_:
    hit = collection.query(
        query_embeddings=[list(centre)],
        include=['documents'],
        n_results = 1
    )
    # Results are nested per query, then per neighbour; ids look like "rule-<n>".
    hit_id = hit['ids'][0][0]
    hit_text = hit['documents'][0][0]
    nearest_by_id[int(hit_id.split('-')[1])] = hit_text

# Build the frame in one go instead of growing it row by row.
sample_df = pd.DataFrame(
    {'rule': list(nearest_by_id.values())},
    index=list(nearest_by_id.keys()),
)
sample_df = sample_df.drop_duplicates()

# Pull the full source rows for the chosen ids and write them out.
sample_df = master_dataset.loc[sample_df.index].drop_duplicates(subset=DUPLICATE_SUBSET)
sample_df.to_csv('sampled_rules_k_means.csv', index=False)