In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
tqdm.pandas()

from tools import embedding_pipeline, save_collection, load_collection
from chromadb import Documents, EmbeddingFunction, Embeddings
import chromadb

import pickle

In [2]:
# Load the master headlines table. Later cells read its 'Headline' column and
# look rows up by index label with `.loc`.
master_dataset = pd.read_csv('./data/headlines.csv')


In [3]:
# Columns passed to `drop_duplicates(subset=...)` when de-duplicating sampled rows.
DUPLICATE_SUBSET = ['Publication','Headline']
# Size knob shared by the sampling cells: number of probe points, number of
# randomly drawn rows, and number of k-means clusters.
NUM_SAMPLES = 200

In [4]:
# Open the persisted Chroma store and pull every stored embedding into memory,
# tracking the per-dimension minimum (space_start) and maximum (space_end),
# i.e. the two opposite corners of the embeddings' bounding box.
chroma = chromadb.PersistentClient(path="./chromadb")
collection = chroma.get_collection("all-headlines")

all_embeddings = list()
space_start, space_end = None, None

existing_count = collection.count()
batch_size = 50_000
for i in trange(0, existing_count, batch_size):
    batch = collection.get(include=['embeddings'], limit=batch_size, offset=i)
    batch_embeddings = batch['embeddings']
    # Nothing to fold in if the store returned no rows for this page.
    if batch_embeddings is None or len(batch_embeddings) == 0:
        continue
    all_embeddings.extend(batch_embeddings)

    # Reduce the whole page with one vectorised call per bound instead of one
    # np.minimum/np.maximum call per embedding; the resulting bounds are the same.
    batch_matrix = np.asarray(batch_embeddings)
    batch_min = batch_matrix.min(axis=0)
    batch_max = batch_matrix.max(axis=0)
    if space_start is None or space_end is None:
        space_start, space_end = batch_min, batch_max
    else:
        space_start = np.minimum(space_start, batch_min)
        space_end = np.maximum(space_end, batch_max)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 51/51 [07:21<00:00,  8.66s/it]


In [5]:
# Sample entire space
#
# NOTE: np.linspace between two vectors interpolates linearly, so the probes
# are NUM_SAMPLES evenly spaced points on the straight line from the
# per-dimension minimum corner (space_start) to the maximum corner (space_end),
# not a grid over the whole bounding box. For each probe the single nearest
# stored document is kept.

# idx -> document text. A dict keeps first-insertion order and overwrites on a
# repeated idx, which is what row-wise `.loc[idx] = ...` assignment did, without
# growing a DataFrame one row at a time.
nearest_by_idx = {}
for sample in tqdm(np.linspace(space_start, space_end, NUM_SAMPLES)):
    query = collection.query(
        # .tolist() yields plain Python floats rather than NumPy scalars.
        query_embeddings=[sample.tolist()],
        include=['documents'],
        n_results=1
    )
    rule_idx = query['ids'][0][0]
    rule_text = query['documents'][0][0]
    # The integer after the first '-' in the id is used as a master_dataset
    # index label below -- assumes ids look like "<prefix>-<row label>"; TODO confirm.
    idx = int(rule_idx.split('-')[1])
    nearest_by_idx[idx] = rule_text

# Drop probes that resolved to identical text (first occurrence wins), then
# fetch the full rows and de-duplicate on the publication/headline pair.
sampled_rules = pd.Series(nearest_by_idx, dtype=object).drop_duplicates()
sample_df = master_dataset.loc[sampled_rules.index].drop_duplicates(subset=DUPLICATE_SUBSET)
sample_df.to_csv('sampled_rules_uniform.csv', index=False)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 297.75it/s]


In [14]:
# Sample randomly based on articles containing keywords related to data breaches
import re
keywords = {'breach', 'data', 'hack', 'record', 'leak'}
# A keyword matches when it starts the headline or follows whitespace (prefix
# match, so 'hack' also hits 'hackers'). The raw f-string keeps `\s` as a regex
# escape instead of an invalid Python string escape; re.escape guards against
# keywords containing regex metacharacters; sorted() makes the pattern text
# independent of set iteration order.
regex = re.compile(
    '|'.join(rf"(\s|^){re.escape(x)}" for x in sorted(keywords)),
    re.IGNORECASE,
)
# str(s) keeps non-string cells (e.g. NaN) from raising inside regex.search.
keyword_mask = master_dataset['Headline'].map(lambda s: bool(regex.search(str(s))))
keyword_df = master_dataset[keyword_mask].drop_duplicates(subset=DUPLICATE_SUBSET)
# Cap the draw at the number of matching rows (DataFrame.sample raises when asked
# for more rows than exist without replacement) and fix the seed so that
# re-running the notebook writes the same CSV.
sample_df = keyword_df.sample(min(NUM_SAMPLES, len(keyword_df)), random_state=0)
sample_df.to_csv('sampled_rules_keyword_random.csv', index=False)

In [8]:
# Deliberate stop: raising here halts "Run All" before the k-means cells below,
# which the author notes can crash the kernel. Run those cells manually.
raise Exception("the following code crashes the kernel")

Exception: the following code crashes the kernel

In [9]:
# sample with k-means (usually crashes on PC due to no ram)
from sklearn.cluster import KMeans

# Fit on a random subset of at most 200_000 embeddings.
# Only the selected rows are copied into a dense array: converting and shuffling
# the complete embedding list just to keep its first 200_000 rows needs a full
# in-memory copy of every embedding. Drawing row positions without replacement
# selects a uniformly random subset, the same as shuffle-then-slice.
# The generator is seeded so that, together with KMeans(random_state=0), the fit
# is reproducible. `all_embeddings` itself is left untouched.
kmeans_rng = np.random.default_rng(0)
kmeans_fit_size = min(200_000, len(all_embeddings))
kmeans_rows = kmeans_rng.choice(len(all_embeddings), size=kmeans_fit_size, replace=False)
kmeans_input = np.array([all_embeddings[row] for row in kmeans_rows])
k_means = KMeans(n_clusters=NUM_SAMPLES, random_state=0, n_init="auto").fit(kmeans_input)

In [11]:
# For every k-means cluster centre, look up the single nearest stored document
# and export the corresponding master_dataset rows.

# idx -> document text. A dict keeps first-insertion order and overwrites on a
# repeated idx, which is what row-wise `.loc[idx] = ...` assignment did, without
# growing a DataFrame one row at a time.
nearest_by_idx = {}

for cluster in k_means.cluster_centers_:
    query = collection.query(
        # .tolist() yields plain Python floats rather than NumPy scalars.
        query_embeddings=[cluster.tolist()],
        include=['documents'],
        n_results=1
    )
    rule_idx = query['ids'][0][0]
    rule_text = query['documents'][0][0]
    # The integer after the first '-' in the id is used as a master_dataset
    # index label below -- assumes ids look like "<prefix>-<row label>"; TODO confirm.
    idx = int(rule_idx.split('-')[1])

    nearest_by_idx[idx] = rule_text

# Drop centres that resolved to identical text (first occurrence wins), then
# fetch the full rows and de-duplicate on the publication/headline pair.
sampled_rules = pd.Series(nearest_by_idx, dtype=object).drop_duplicates()
sample_df = master_dataset.loc[sampled_rules.index].drop_duplicates(subset=DUPLICATE_SUBSET)
sample_df.to_csv('sampled_rules_k_means.csv', index=False)