In [9]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

from tools import embedding_pipeline, save_collection, load_collection
from chromadb import Documents, EmbeddingFunction, Embeddings
import chromadb

import pickle

# Semantic Sampling

To support GPT-4 few-shot learning, the examples chosen for labelling should cover the semantic space of the data well. This notebook computes a semantic (embedding) space from all source data, then samples uniformly from that space to create a label-ready dataset.

In [10]:
# Read the complete headline table from disk.
master_dataset = pd.read_csv('./data/headlines.csv')
# Keep one entry per distinct headline text; this Series is what the embedding loop iterates over.
rules_dataset = master_dataset.loc[:, 'Headline'].drop_duplicates()

In [11]:
# NOTE(review): presumably the `subset=` columns for a row-level drop_duplicates;
# no cell visible in this part of the notebook references it — confirm it is still needed.
DUPLICATE_SUBSET = [
    'Publication',
    'Headline',
]

In [12]:
# Chroma client backed by the on-disk store at ./chromadb (path is relative to the kernel's working directory).
chroma = chromadb.PersistentClient(path="./chromadb")

class MyEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function that delegates to the project's ``embedding_pipeline``."""

    def __init__(self):
        # Build the pipeline once so every call reuses the same embedder object.
        self.embedder = embedding_pipeline()

    def __call__(self, input: Documents) -> Embeddings:
        """Run the embedder on ``input`` and return element 0 of its result.

        NOTE(review): the structure of the pipeline's return value is not visible
        in this notebook; element 0 is presumably the batch of embeddings — confirm
        against ``tools.embedding_pipeline``.
        """
        pipeline_output = self.embedder(input)
        # Open question carried over from the original author: long_text_mode="mean" ?
        return pipeline_output[0]
    
from chromadb.utils import embedding_functions

# Chroma's stock embedding function; instantiated here but not passed to the collection below.
default_ef = embedding_functions.DefaultEmbeddingFunction()
# Project-specific embedder: attached to the collection and also called directly by the embedding loop.
ef = MyEmbeddingFunction()
# Open the "all-headlines" collection (creating it if it does not exist yet), wired to the custom embedder.
collection = chroma.get_or_create_collection(
    name="all-headlines",
    embedding_function=ef,
)

<All keys matched successfully>


In [13]:
# Compute Embedding-DB
# Originally estimated at ~7hrs; a recorded full run took 31:49:52 for 2,548,095 headlines.

# Counter kept by hand (rather than enumerate) so tqdm can wrap the Series directly
# and show progress; it also supplies the numeric suffix of each stored id.
i = 0
# Element-wise lower / upper corners of the bounding box around all embeddings seen so far.
space_start = space_end = None
for rule in tqdm(rules_dataset):
    # Skip empty values and anything that is not a string (such as NaN).
    if not (rule and isinstance(rule, str)):
        continue

    embedding = ef([rule])
    if space_start is None or space_end is None:
        # The first usable headline seeds both corners of the box.
        space_start = space_end = embedding[0]
    else:
        # Widen the box so it still contains the new embedding.
        space_start = np.minimum(space_start, embedding[0])
        space_end = np.maximum(space_end, embedding[0])

    # No explicit get/delete of an existing `rule-{i}` entry is done first: upsert
    # overwrites it. Per the original note, chromadb prints a warning here, and with
    # this many items that output was crashing VS Code.
    collection.upsert(ids=[f"rule-{i}"], documents=[rule], embeddings=embedding)
    i += 1

  0%|                                               | 0/2548095 [00:00<?, ?it/s]

100%|██████████████████████████████| 2548095/2548095 [31:49:52<00:00, 22.24it/s]


In [14]:
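# No explicit save step is needed: the collection lives in the PersistentClient store at ./chromadb, which persists writes automatically.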

NameError: name 'pd' is not defined

: 

: 

: 

: 

: 

  0%|                                                   | 0/200 [00:00<?, ?it/s]

100%|█████████████████████████████████████████| 200/200 [00:02<00:00, 69.23it/s]


: 

: 