In [68]:
import pandas as pd
import numpy as np
from tenacity import retry, wait_random_exponential, stop_after_attempt
import gzip
import json
from collections import defaultdict 
import tqdm
import openai
from openai import OpenAI
import os
# The OpenAI client picks up its key from the OPENAI_API_KEY environment variable.
# Secrets must never be written into the notebook: export the variable in the shell
# (or load it from an untracked local file) before starting the kernel.
# NOTE(review): a literal key used to be assigned on this line; treat that key as
# compromised and revoke it.
if not os.environ.get('OPENAI_API_KEY'):
    raise RuntimeError("OPENAI_API_KEY is not set; export it before running this notebook.")
client = OpenAI()

In [69]:
## Cosine similarity between two embeddings:
# np.dot(embedded1, embedded2)

# All embeddings are already normalized, so the plain dot product is the cosine similarity.
# response = client.embeddings.create(
#     input=["I like apple", "I"],
#     model="text-embedding-3-small"
# )

# embedded1 = response.data[0].embedding
# dimension is 1536

In [80]:
def read_corpus(fname):
    """
    Stream the records of a gzip-compressed file that stores one JSON document per line.

    Parameters
    ----------
    fname : path of the gzip file to read

    Returns
    -------
    A generator that yields one decoded JSON value per line, in file order
    """
    with gzip.open(fname, 'r') as handle:
        # Lines are decoded lazily, so the whole file is never held in memory at once.
        yield from (json.loads(raw_line) for raw_line in handle)
            
def read(fname, text_fields=('title', 'content'), label_field='target_ind', sep=''):
    """
    Load the texts and labels stored in a corpus file.

    Args:
    - fname: path of a gzip JSON-lines file, read through `read_corpus`.
    - text_fields (sequence of str): record keys whose string values are joined, in order,
      to form the text of a document. Different datasets may use different fields.
    - label_field (str): record key that holds the label(s) of a document.
    - sep (str): separator placed between the joined fields. The default '' reproduces
      the plain `title + content` concatenation.

    Returns:
    - (text, labels): two lists of equal length; `text[i]` and `labels[i]` come from
      the i-th record of the file.

    Raises:
    - KeyError: if a record lacks one of the requested fields.
    """
    text = []
    labels = []
    for record in read_corpus(fname):
        text.append(sep.join(record[field] for field in text_fields))
        labels.append(record[label_field])
    return text, labels

def get_communities(labels):
    """
    Group node indices by label and drop repeated groups.

    `labels[i]` is the iterable of labels attached to node i. Every label defines one
    community (the set of nodes carrying it); communities with identical membership are
    reported only once.

    Returns a list of lists, one inner list of node indices per distinct community.
    """
    members_by_label = defaultdict(set)
    for node, node_labels in enumerate(labels):
        for label in node_labels:
            members_by_label[label].add(node)

    # Tuples are hashable, so a set of tuples removes communities with the same members.
    distinct = {tuple(members) for members in members_by_label.values()}

    return [list(members) for members in distinct]

@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(10))
def get_embeddings(texts, model="text-embedding-3-small"):
    """
    Embed one batch of texts with a single call to the embeddings endpoint.

    The call is retried (random exponential wait between 1 and 20, at most 10 attempts)
    by the `retry` decorator whenever it raises.

    Args:
    - texts (list of str): The batch to embed.
    - model (str): Name of the embedding model.

    Returns:
    - list of list: One embedding per entry of the response data, in response order.
    """
    result = client.embeddings.create(input=texts, model=model)
    return [item.embedding for item in result.data]

def embed_texts(text, batch_size = 2000, model='text-embedding-3-small'):
    """
    Embed every string of `text`, sending the strings to the API in consecutive batches.

    Args:
    - text (list of str): The strings to embed.
    - batch_size (int): Number of strings per request. NOTE(review): the endpoint caps
      the number of inputs per request; confirm the chosen value stays below that cap.
    - model (str): Embedding model forwarded to `get_embeddings`; the default is the
      model this function previously hard-coded.

    Returns:
    - list: One embedding per input string, in input order.
    """
    embeddings = []
    # A progress bar over the batch start offsets; the last batch may be shorter.
    for start in tqdm.tqdm(range(0, len(text), batch_size)):
        batch_texts = text[start:start + batch_size]
        embeddings.extend(get_embeddings(batch_texts, model=model))
    return embeddings

In [57]:
# text, labels = read(data_dir)
# communities =  get_communities(labels)
# print("num communities", len(communities))
# Count tokens for each string
# token_counts = [len(string.split()) for string in text]
# print("price: ", np.sum(token_counts)/1000 * 0.00002)
# print("Minute Limit:", np.sum(token_counts)/ 1e6)

In [10]:
def get_vectors_and_labels(train_dir, test_dir):
    """
    Embed the train split followed by the test split and collect their labels.

    For each split a rough cost estimate is printed before embedding; the estimate
    counts whitespace-separated words, not model tokens. After both splits are
    processed the number of distinct communities is printed.

    Returns (all_embeddings, all_labels): two lists aligned by document, train first.
    """
    all_embeddings = []
    all_labels = []
    for split_path in (train_dir, test_dir):
        text, labels = read(split_path)
        all_labels.extend(labels)

        approx_tokens = np.sum([len(doc.split()) for doc in text])
        print("price: ", approx_tokens/1000 * 0.00002)
        print("Minute Limit:", approx_tokens/ 1e6)

        all_embeddings.extend(embed_texts(text))
    print("num communities", len(get_communities(all_labels)))
    return all_embeddings, all_labels

In [13]:
# Export cell: for every dataset, embed both splits, save the embedding matrix as
# <stem>.npy and the communities as <stem>.cmty (one community per line, tab-separated
# node indices) under base_dir.
# NOTE(review): base_dir is an absolute path on one machine; consider a configurable
# data directory.
base_dir = "/Users/sy/Desktop/MIT/clusterer/data"
# Other dataset names (not processed in this run): "WikiSeeAlsoTItles-350K", "Amazon-670K.raw", "Wikipedia-500K.raw"
datasets = ["AmazonTitles-670K"]
for dataset in datasets:
    print(dataset)
    train_dir = f"{base_dir}/{dataset}/trn.json.gz"
    test_dir = f"{base_dir}/{dataset}/tst.json.gz"
    # Datasets whose name contains ".raw" store their splits under a different file name.
    if ".raw" in dataset:
        train_dir = f"{base_dir}/{dataset}/trn.raw.json.gz"
        test_dir = f"{base_dir}/{dataset}/tst.raw.json.gz"
        
    all_embeddings, all_labels = get_vectors_and_labels(train_dir, test_dir)
    all_embeddings = np.array(all_embeddings)
    # Output file stem: the part of the name before the first "-". This rebinds the loop
    # variable, and later cells read `dataset` with this shortened value.
    dataset = dataset.split("-")[0]
    
    print("saving to, ", f'{base_dir}/{dataset}.npy')
    with open(f'{base_dir}/{dataset}.npy', 'wb') as f:
        np.save(f, all_embeddings)
    
    # get_vectors_and_labels does not return the communities, so they are rebuilt here.
    communities = get_communities(all_labels)
    lines_to_write = []
    for cluster_list in communities:
        lines_to_write.append("\t".join(str(x) for x in cluster_list))
    
    print("saving to, ", f'{base_dir}/{dataset}.cmty')
    with open(f'{base_dir}/{dataset}.cmty', 'w') as f:
        f.write('\n'.join(lines_to_write) + '\n')

AmazonTitles-670K
price:  0.06873082000000001
Minute Limit: 3.436541


100%|█████████████████████████████████████████████| 243/243 [17:58<00:00,  4.44s/it]


price:  0.02069852
Minute Limit: 1.034926


100%|███████████████████████████████████████████████| 76/76 [06:33<00:00,  5.18s/it]


num communities 214674
saving to,  /Users/sy/Desktop/MIT/clusterer/data/AmazonTitles.npy


NameError: name 'embeddings' is not defined

In [14]:
# Inspect the current value of `dataset` (the export loop rebinds it to the part before "-").
dataset

'AmazonTitles'

In [17]:
# Display the per-document label lists.
# NOTE(review): this echoes the whole list; a slice such as all_labels[:5] would keep the output short.
all_labels

[[34141, 119299, 126600, 128716, 187372, 218742],
 [465536, 465553, 615429],
 [393828],
 [167614, 223686],
 [480528, 480530, 480532, 485144, 485146, 598793],
 [516790, 567615, 670034],
 [50442, 50516, 50647, 50672, 50680, 662538],
 [493095, 499382, 560691, 589867, 591343, 611615, 619231],
 [590298, 609038],
 [453268, 453277, 510450, 516828, 520780, 542684, 634756],
 [347433, 378114, 381544, 475645, 539551, 563138],
 [308934, 348136, 386033, 386351, 418730, 518745],
 [11612, 26972, 31185, 38093, 139020, 166218, 371547],
 [382611, 558996, 573052, 606643, 655442],
 [271434, 275179, 306807, 408909, 563251, 651693],
 [405605, 405608, 502136, 502144, 528526, 658896],
 [70373, 70699, 70700, 70796, 70834, 70875],
 [484372, 514803, 530258, 588795, 591896, 648321],
 [447907, 447908, 447909, 447926, 447927, 447934, 491792],
 [230910],
 [455650, 455680, 611151, 617428, 647462, 649429],
 [34883, 35129, 35505, 35912, 35978, 37234, 576255],
 [465106],
 [475400, 475401, 475402, 475408, 536984, 624190]

In [23]:
# Sorted unique node indices that appear in at least one community.
np.unique([item for sublist in communities for item in sublist])

array([     0,      1,      2, ..., 636048, 636049, 636050])

In [16]:
# Rebuild the communities and write them to <dataset>.cmty, one community per line with
# tab-separated node indices. This repeats the last step of the export cell and relies
# on `all_labels`, `base_dir` and `dataset` already being defined in the kernel.
communities = get_communities(all_labels)
lines_to_write = []
for cluster_list in communities:
    lines_to_write.append("\t".join(str(x) for x in cluster_list))

print("saving to, ", f'{base_dir}/{dataset}.cmty')
with open(f'{base_dir}/{dataset}.cmty', 'w') as f:
    f.write('\n'.join(lines_to_write) + '\n')

saving to,  /Users/sy/Desktop/MIT/clusterer/data/AmazonTitles.cmty


In [18]:
# Number of distinct communities.
len(communities)

569402

In [19]:
# Number of embedding vectors (one per document).
len(all_embeddings)

636051

In [120]:
# Average number of members per community.
np.mean([len(i) for i in communities])

2.8585017282018317

## Find k-nearest neighbors (KNN)

In [18]:
import pynndescent
import numpy as np
import math
from scipy.spatial.distance import cdist

In [19]:
def bruteforce(normalized_points):
    """
    Exact nearest neighbor of every point under cosine similarity.

    Builds the full pairwise similarity matrix, so memory grows quadratically with the
    number of points; intended for small reference checks only.

    Parameters
    ----------
    normalized_points : 2-D array-like, one point per row

    Returns
    -------
    1-D integer array; entry i is the index of the point most similar to point i
    (excluding i itself whenever another point exists).
    """
    # Compute cosine similarity matrix
    cosine_similarities = 1 - cdist(normalized_points, normalized_points, 'cosine')
    print(cosine_similarities)

    # Mask self-similarity with -inf rather than 0: cosine similarity can be negative,
    # and a 0 on the diagonal would beat every negative entry and return the point itself.
    np.fill_diagonal(cosine_similarities, -np.inf)

    # Find index of nearest neighbor for each point
    nearest_neighbors = np.argmax(cosine_similarities, axis=1)

    return nearest_neighbors

In [2]:
# Load the embedding matrix from disk (same path the export cell writes AmazonTitles.npy to).
# NOTE(review): absolute machine-specific path; consider deriving it from a configurable data directory.
embeddings = np.load("/Users/sy/Desktop/MIT/clusterer/data/AmazonTitles.npy")

In [3]:
# Shape of the loaded matrix: (number of documents, embedding dimension).
embeddings.shape

(636051, 1536)

In [54]:
def write_edges(edges, num_nodes, filename, dataset):
    """
    Write a weighted edge list to a text file.

    The file starts with three '#' header lines (graph name, node and edge counts,
    column names) followed by one line per edge: source, target and weight separated
    by tabs.

    Args:
    - edges: sequence of (i, j, weight) triples.
    - num_nodes: node count reported in the header.
    - filename: path of the file to create or overwrite.
    - dataset: graph name reported in the header.
    """
    header = '''# Undirected weighted graph: {dataset}\n# Nodes: {nodes} Edges: {edges}\n# FromNodeId    ToNodeId    Weight\n'''.format(
        dataset=dataset, nodes=num_nodes, edges=len(edges))

    # The context manager closes the file even if a write fails part-way through.
    with open(filename, 'w') as file:
        file.write(header)
        for i, j, d in edges:
            file.write(str(i) + '\t' + str(j) + '\t' + str(d) + '\n')

def get_knn_graph_cosine(data, n_neighbors=101):
    """
    Build an approximate nearest-neighbor graph of `data` with NNDescent (cosine metric).

    Args:
    - data: 2-D array, one point per row.
    - n_neighbors (int): neighbors stored per point. `get_edges` reads the first k+1
      columns, so the default of 101 leaves room for k up to 100.

    Returns:
    - (knns, dist): the two components of `index.neighbor_graph`; `knns[i]` holds the
      neighbor indices of point i and `dist[i]` the matching cosine distances.
    """
    index = pynndescent.NNDescent(data, metric="cosine",
                                  n_neighbors=n_neighbors,
                                  pruning_degree_multiplier=2,
                                  diversify_prob=0)
    index.prepare()
    knns = index.neighbor_graph[0]
    dist = index.neighbor_graph[1]
    return knns, dist

def get_edges(knns, dist, k):
    """
    Turn a neighbor table into a list of weighted edges, at most `k` per node.

    For node i the first k+1 entries of `knns[i]` are scanned (one extra slot because a
    node is usually listed as its own neighbor). Entries equal to i and negative
    placeholder indices are skipped, and scanning stops once k edges were emitted, so a
    node that is missing from its own list still contributes only k edges. Rows with
    fewer than k+1 columns are used as far as they go.

    Args:
    - knns: 2-D table, `knns[i][j]` is the index of the j-th neighbor of node i.
    - dist: 2-D table of the same shape with the matching distances.
    - k (int): maximum number of neighbors kept per node.

    Returns:
    - list of [i, neighbor, 1 - distance] triples; the weight is a similarity and is
      not rescaled.
    """
    edges = []
    for i in range(len(knns)):
        kept = 0
        for j, d in zip(knns[i][:k + 1], dist[i][:k + 1]):
            if kept >= k:
                break
            # Skip the node itself and slots with no neighbor (negative index).
            if j < 0 or j == i:
                continue
            edges.append([i, j, 1 - d])
            kept += 1
    return edges

In [55]:
# Smoke test on the first 200 embeddings: build one neighbor graph, then write an edge
# file for each k to <base_dir>/test_k<k>.ungraph.txt.
# NOTE(review): `dataset` and `base_dir` are rebound here, overwriting the values set by earlier cells.
base_dir = "/Users/sy/Desktop/MIT/clusterer/data"

dataset = "test"
data = embeddings[:200]
knns, dist = get_knn_graph_cosine(data)
num_nodes = len(data)
for k in [10, 50, 100]:
    edges = get_edges(knns, dist, k)
    write_edges(edges, num_nodes, f'{base_dir}/{dataset}_k{k}.ungraph.txt', f"{dataset}_k{k}")

In [25]:
# Build a standalone NNDescent index so its neighbor graph can be inspected in the cells below.
# NOTE(review): this cell slices embeddings[:10], but the recorded output of the following
# cells shows a (200, 100) neighbor graph, so those outputs appear to come from a run
# with a different slice; re-run to confirm.
index = pynndescent.NNDescent(embeddings[:10], metric="cosine", 
                              n_neighbors=100, 
                              pruning_degree_multiplier=2, 
                              diversify_prob=0)
index.prepare()

In [27]:
# Neighbor index table of the graph (row i lists the neighbors found for point i).
index.neighbor_graph[0]

array([[  0,  51,  59, ...,  86,  21, 104],
       [  1,  22, 179, ...,  14,  39, 143],
       [  2, 145,   8, ..., 124, 167,  35],
       ...,
       [197, 196, 185, ...,  54,  65,  77],
       [198, 185,  84, ...,  92, 118,  68],
       [199,  50, 143, ..., 136,  53,  39]], dtype=int32)

In [26]:
# Shape of the neighbor index table: (number of points, neighbors per point).
index.neighbor_graph[0].shape

(200, 100)

In [23]:
# Cosine distance between points 0 and 51 (1 - dot product; assumes unit-norm embeddings).
1 - np.dot(embeddings[0], embeddings[51])

0.6671844809428699

In [17]:
# Euclidean distance between the same two points, for comparison with the cosine distance.
math.dist(embeddings[0], embeddings[51])

1.155148867076092

In [20]:
# Exact nearest neighbor of each of the first 200 points (presumably a reference for the NNDescent result).
true_nei = bruteforce(embeddings[:200])

[[1.         0.01979014 0.06704203 ... 0.19413969 0.20490331 0.21184945]
 [0.01979014 1.         0.20438195 ... 0.17931427 0.13776467 0.10686553]
 [0.06704203 0.20438195 1.         ... 0.15171931 0.11791393 0.16102659]
 ...
 [0.19413969 0.17931427 0.15171931 ... 1.         0.1573595  0.19118627]
 [0.20490331 0.13776467 0.11791393 ... 0.1573595  1.         0.18619191]
 [0.21184945 0.10686553 0.16102659 ... 0.19118627 0.18619191 1.        ]]


In [22]:
# Entry i is the index of the exact nearest neighbor of point i.
true_nei

array([ 51,  22, 145, 149,  69, 144,  21,  74,  40, 136,  68, 132,  83,
       161, 167, 106,  38,  61, 122, 196, 122,   6,  86, 140,  38, 185,
       177, 180,  64,  14,  73, 185,  29,  83,  90, 185,  64,   5,  57,
       185, 175, 139,  81,  74, 115, 160, 184, 107, 127,  98, 199,  93,
        75, 190, 122, 184, 198, 157, 142, 123, 168, 132, 193, 143,  84,
       103, 114, 112,  71, 130, 186,  68, 185,  30,   7, 177,  77, 126,
       196,  64, 109,  42, 113, 168, 185, 140, 110, 106,  93, 158, 110,
        63, 100,  88, 126, 126, 122, 190,  49, 164,  92, 103, 108, 101,
        60, 155,  87,  47,  88,  80,  86, 142, 162,  82, 135, 172,  75,
       135, 105,  49,  73, 135, 146,  59,  21, 164,  77,  48,  24, 117,
       142,  49,  11,  84,  88, 117,   9, 194, 173,  41, 174,  23,  58,
        63, 106,  40, 122, 155, 106,   3, 154, 117,  15, 175, 150, 147,
       196,  57, 178,  86, 193,  13, 112, 157, 125, 150, 122,  14,  83,
         6, 140, 185, 115, 189, 140, 122, 154,  75, 158,  49,  2

## Truncate texts to the token limit

In [61]:
import tiktoken

In [None]:
def get_vectors_and_labels(train_dir, test_dir, max_docs=5000):
    """
    Embed the train split followed by the test split, truncating over-long texts first.

    Each text is tokenized with the encoding of "text-embedding-3-small"; texts longer
    than `token_limit` tokens are cut to that many tokens (their first 100 characters
    are printed). A cost estimate based on the resulting token count is printed per
    split, and the number of distinct communities is printed at the end.

    Args:
    - train_dir, test_dir: paths of the two corpus files, read through `read`.
    - max_docs (int or None): keep only the first `max_docs` documents of each split.
      The default of 5000 is the debugging cap this function already applied
      (TODO: remove); pass None to process every document. The cap is applied to texts
      AND labels so the two returned lists stay aligned.

    Returns:
    - (all_embeddings, all_labels): two lists aligned by document, train split first.
    """
    token_limit =  8192  # per-text cap applied before embedding; TODO confirm against the model's documented maximum
    encoding = tiktoken.encoding_for_model("text-embedding-3-small")
    all_labels = []
    all_embeddings = []
    for data_dir in [train_dir, test_dir]:
        text, labels = read(data_dir)
        if max_docs is not None:
            # Truncate both lists together; cutting only `text` would leave labels for
            # documents that were never embedded.
            text = text[:max_docs]
            labels = labels[:max_docs]
        all_labels.extend(labels)

        num_tokens = 0
        for i in tqdm.tqdm(range(len(text)), "check_encoding_length"):
            tokens = encoding.encode(text[i])
            if len(tokens) > token_limit:
                print(text[i][:100])
                text[i] = encoding.decode(tokens[:token_limit])
            num_tokens += min(token_limit, len(tokens))

        print("price: ", num_tokens/1000 * 0.00002)
        print("Minute Limit:", num_tokens/ 1e6)
        embeddings = embed_texts(text)
        all_embeddings.extend(embeddings)
    communities =  get_communities(all_labels)
    print("num communities", len(communities))
    return all_embeddings, all_labels

# Driver for the truncating variant of get_vectors_and_labels: resolve the split paths of
# each dataset and embed them. Nothing is written to disk in this cell.
base_dir = "/Users/sy/Desktop/MIT/clusterer/data"
# Other dataset names (not processed in this run): "WikiSeeAlsoTItles-350K", "Wikipedia-500K.raw", "AmazonTitles-670K"
datasets = [ "Amazon-670K.raw"]
for dataset in datasets:
    print(dataset)
    train_dir = f"{base_dir}/{dataset}/trn.json.gz"
    test_dir = f"{base_dir}/{dataset}/tst.json.gz"
    # Datasets whose name contains ".raw" store their splits under a different file name.
    if ".raw" in dataset:
        train_dir = f"{base_dir}/{dataset}/trn.raw.json.gz"
        test_dir = f"{base_dir}/{dataset}/tst.raw.json.gz"
        
    all_embeddings, all_labels = get_vectors_and_labels(train_dir, test_dir)

['a']

## Analyze labels

In [82]:
def get_labels(train_dir, test_dir):
    """
    Collect the labels of the train split followed by those of the test split.

    The texts returned by `read` are ignored; nothing is embedded.

    Returns a single list with one label entry per document, train documents first.
    """
    merged = []
    for split_path in (train_dir, test_dir):
        _, split_labels = read(split_path)
        merged.extend(split_labels)

    return merged

# Driver for the label analysis: resolve the split paths of each dataset and load only
# the labels (no embedding calls).
base_dir = "/Users/sy/Desktop/MIT/clusterer/data"
# Other dataset names (not processed in this run): "WikiSeeAlsoTItles-350K", "Amazon-670K.raw", "Wikipedia-500K.raw"
datasets = ["AmazonTitles-670K"]
for dataset in datasets:
    print(dataset)
    train_dir = f"{base_dir}/{dataset}/trn.json.gz"
    test_dir = f"{base_dir}/{dataset}/tst.json.gz"
    # Datasets whose name contains ".raw" store their splits under a different file name.
    if ".raw" in dataset:
        train_dir = f"{base_dir}/{dataset}/trn.raw.json.gz"
        test_dir = f"{base_dir}/{dataset}/tst.raw.json.gz"
        
    all_labels = get_labels(train_dir, test_dir)


AmazonTitles-670K
num communities 569402


In [83]:
# Display the label lists loaded by get_labels.
# NOTE(review): this echoes the whole list; a slice such as all_labels[:5] would keep the output short.
all_labels

[[34141, 119299, 126600, 128716, 187372, 218742],
 [465536, 465553, 615429],
 [393828],
 [167614, 223686],
 [480528, 480530, 480532, 485144, 485146, 598793],
 [516790, 567615, 670034],
 [50442, 50516, 50647, 50672, 50680, 662538],
 [493095, 499382, 560691, 589867, 591343, 611615, 619231],
 [590298, 609038],
 [453268, 453277, 510450, 516828, 520780, 542684, 634756],
 [347433, 378114, 381544, 475645, 539551, 563138],
 [308934, 348136, 386033, 386351, 418730, 518745],
 [11612, 26972, 31185, 38093, 139020, 166218, 371547],
 [382611, 558996, 573052, 606643, 655442],
 [271434, 275179, 306807, 408909, 563251, 651693],
 [405605, 405608, 502136, 502144, 528526, 658896],
 [70373, 70699, 70700, 70796, 70834, 70875],
 [484372, 514803, 530258, 588795, 591896, 648321],
 [447907, 447908, 447909, 447926, 447927, 447934, 491792],
 [230910],
 [455650, 455680, 611151, 617428, 647462, 649429],
 [34883, 35129, 35505, 35912, 35978, 37234, 576255],
 [465106],
 [475400, 475401, 475402, 475408, 536984, 624190]

In [84]:
# Average number of labels per document.
np.mean(np.array([len(c) for c in all_labels]))

5.3874626405744195