## Initialization

### Imports

In [1]:
from tve.utils import load_data_from_json
import pandas as pd
import numpy as np
import cupy as cp
from sklearn.manifold import TSNE

from tve.documents import FIPSAPILoader, FSLoader
from tve.pipeline import FIPS_API_KEY, DATA_PATH, DOCS_PATH
from tve.utils import CircularTaskGroup, batched, ForgivingTaskGroup
import chromadb
from sentence_transformers import SentenceTransformer
import tqdm
from itertools import compress
import logging
import torch
import umap

from operator import itemgetter 
from itertools import product
import os

### Global Variables

In [2]:

doc_path = DOCS_PATH / "EMB"
res_path = DATA_PATH / "EMB"
os.makedirs(res_path, exist_ok=True)
api = FIPSAPILoader(api_key=FIPS_API_KEY)
loader = FSLoader(doc_path)

chroma_client = chromadb.PersistentClient(path="data/db")

task_names = ["text", "abstract", "claims", "description"]

db = {
    k: chroma_client.get_or_create_collection(name=k, embedding_function=None, metadata={"hnsw:space": "cosine"})
    for k in task_names
}

clusters = await load_data_from_json(DATA_PATH / "ten_clusters.json")
all_docs = sum([sum(i.values(), []) for i in clusters.values()], [])


model = SentenceTransformer(
    "HIT-TMG/KaLM-embedding-multilingual-mini-v1",
)
model.max_seq_length = 131000


chromadb.telemetry.product.posthog - 2025-02-06 19:19:29,223 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
sentence_transformers.SentenceTransformer - 2025-02-06 19:19:29,378 - INFO - Use pytorch device_name: cuda
sentence_transformers.SentenceTransformer - 2025-02-06 19:19:29,380 - INFO - Load pretrained SentenceTransformer: HIT-TMG/KaLM-embedding-multilingual-mini-v1
sentence_transformers.SentenceTransformer - 2025-02-06 19:19:34,818 - INFO - 2 prompts are loaded, with the keys: ['query', 'document']


### Stuff

In [3]:
def id_norm(text):
    text = text.replace("/", "")
    text = text[:2] + text[2:].lstrip("0")
    # text = text[:text.rfind("_")]
    return text

def l1_dist(a, b):
    return cp.sum(cp.abs(a - b), axis=1)
def l2_dist(a, b):
    return cp.linalg.norm(a - b, axis=1)
def cos_dist(a, b):
    return 1 - (cp.dot(b, a) / (cp.linalg.norm(b, axis=1) * cp.linalg.norm(a)))

def query_similar(emb, dataframe, dist_func=cos_dist, n_results=10):
    distances = dist_func(cp.array(emb), cp.vstack(dataframe["embedding"]))
    most_sim = sorted(enumerate(distances.get()), reverse=False, key=itemgetter(1))[:n_results]
    mini_df = dataframe.loc[[i[0] for i in most_sim]]
    return {
        "ids": list(mini_df["id"]),
        "distances": [i[1] for i in most_sim]
    }

logging.getLogger("chromadb.segment.impl.vector.local_persistent_hnsw").setLevel(
    logging.ERROR
)
logging.getLogger("chromadb.segment.impl.metadata.sqlite").setLevel(
    logging.ERROR
)

## Download + Embed

In [51]:
docs_list = list(dict.fromkeys(all_docs))[:]
# docs_list = list(clusters.keys())
bar = tqdm.tqdm(total=len(docs_list))
batch_size = 8


async def find_doc(id_date, timeout=10):
    try:
        doc = await api.get_doc_by_id_date(id_date, timeout=timeout)
    except Exception as e:
        doc = None

    if doc is None:
        try:
            doc = await api.get_doc(id_date, timeout=timeout)
        except Exception as e:
            doc = None

        orig_id = ""
        new_id = ""
        if doc is not None:
            orig_id = id_norm(id_date)
            new_id = id_norm(doc.id_date)

            if orig_id != new_id:
                doc = None
        if doc is None:
            # print(f"Error: doc not found for id_date: {id_date}")
            return None
    return doc


async def download_task(doc_id, retries=5):
    doc = await loader.get_doc(doc_id)
    # return doc
    if doc is not None:
        return doc
    for i in range(retries):
        doc = await find_doc(doc_id)
        if doc is not None:
            return doc
    return None


async def batch_download_task(doc_id_batch, retries=5):
    tasks = []
    async with ForgivingTaskGroup(bar) as tg:
        for doc_id in doc_id_batch:
            tasks.append(tg.create_task(download_task(doc_id, retries)))

    batch = []
    for task in tasks:
        try:
            res = task.result()
            if res is not None:
                res.save_file(doc_path)
                batch.append(res)
        except Exception as e:
            pass

    return batch


async def process_batch(doc_batch):
    for task in task_names:
        # if task != "abstract":
        #     continue
        batch_to_process = {
            doc.id_date: doc.__getattribute__(task)
            for doc in doc_batch
            if len(doc.__getattribute__(task).strip()) > 20
            # and len(db[task].get(doc.id_date)["ids"]) == 0
        }

        items = batch_to_process.items()
        if len(items) > 0:
            ids = [doc_id for doc_id, _ in items]
            texts = [text for _, text in items]

            try:
                embeddings_gpu = model.encode(
                    texts,
                    normalize_embeddings=True,
                    batch_size=batch_size,
                    show_progress_bar=False,
                    convert_to_numpy=False,
                    convert_to_tensor=True,
                )
                embeddings = embeddings_gpu.cpu().numpy()
                del embeddings_gpu
                db[task].add(ids, embeddings)
                del embeddings
            except RuntimeError as ex:
                print("EX!!!: ", ex)
                print(
                    [len(i.__getattribute__(task)) for i in doc_batch],
                    [i.id_date for i in doc_batch],
                )


async def super_task(doc_id_batch):
    batch = await batch_download_task(doc_id_batch)
    await process_batch(batch)
    # return batch


async with CircularTaskGroup(2, exception_handler=None) as super_tg:
    for doc_id_batch in batched(docs_list, batch_size):
        await super_tg.create_task(super_task(doc_id_batch))
        torch.cuda.empty_cache()

bar.close()

100%|██████████| 14/14 [00:05<00:00,  2.33it/s]


### Check number

In [52]:
for task in task_names:
    print(task, len(db[task].get()["ids"]))

text 1474
abstract 1468
claims 612
description 740


## EVALUATION

### Prepare dataset + imports

In [4]:
# Visualization imports

import bokeh.models as bm
import bokeh.plotting as pl
from bokeh.io import output_notebook
import bokeh
import umap

output_notebook()

In [60]:
# determine global variables

# task = "text"
# task = "abstract"
# task = "claims"
task = "description"

# metric = "cosine" 
# metric = "euclidean"
metric = "cityblock"

In [61]:
# preparing dataset

t = {
    "cosine": cos_dist,
    "euclidean": l2_dist,
    "cityblock": l1_dist
    }
dist_func = t[metric]


all_ids = db["text"].get()["ids"]
db_data = db[task].get(include=["embeddings"])

id_to_db_id = {id_norm(k): k for k in all_ids}
id_to_emb = {k: v for k, v in zip(db_data["ids"], db_data["embeddings"])}
norm_id_to_emb = {id_norm(k): v for k, v in id_to_emb.items()}


norm_cluters = {
    id_norm(k): {
        k1: [i for i in list(dict.fromkeys(map(id_norm, v1)))]
        # k1: [i for i in list(dict.fromkeys(map(norm, v1))) if i in id_to_emb]
        for k1, v1 in v.items()
    }
    for k, v in clusters.items() if k in id_to_emb
}
print(len(id_to_db_id), "IDs overall")
print( len(set(id_to_db_id) & set(norm_cluters)), "clusters found of", len(clusters), "clusters overall")
all_colors = "Red, Blue, Yellow, Green, Purple, Orange, Cyan, Magenta, Lime, Teal, Maroon, Navy, Gold, Gray".lower().split(", ")

master_df = pd.DataFrame(columns=["id", "relation", "parent", "embedding", "color"])
# df = df.(["id", "rel", None, 1])
for n, (parent, d) in enumerate(norm_cluters.items()):
    color = all_colors[n]
    emb = norm_id_to_emb.get(parent, None)
    if emb is None:
        print("Error, missing parent:", parent)
        continue
    master_df.loc[len(master_df)] = [parent, "document", parent, emb, color]
    for relation, id_list in d.items():
        for doc_id in id_list:
            emb = norm_id_to_emb.get(doc_id, None)
            if emb is not None:
                master_df.loc[len(master_df)] = [doc_id, relation, parent, emb, color]
master_df = master_df.drop_duplicates(["id"], ignore_index=True)
assert (len(master_df[master_df["relation"] == "document"]) == len(norm_cluters))
master_df.head()

input_embeddings = np.vstack(master_df["embedding"].to_numpy())
print(input_embeddings.shape, "- shape of embeddings")

1474 IDs overall
14 clusters found of 14 clusters overall
(740, 896) - shape of embeddings


###

In [28]:
def draw_vectors(
    x,
    y,
    colors = "blue",
    radius=0.0005,
    alpha=0.5,
    width=600,
    height=400,
    show=True,
    line_width=2,
    dynamic=True,
    **kwargs,
):
    fig = pl.figure(active_scroll="wheel_zoom", width=width, height=height)
    p = fig
    p.title.text = f"{task} embeddings, reduced to 2D with tSNE, metric: {metric}"
    p.title.align = "center"


    parent_to_idx = { d["id"]:n for n, d in master_df[master_df["relation"] == "document"].iterrows()}
    id_to_parent_idx = { d["id"]:parent_to_idx[d["parent"]] for n, d in master_df.iterrows() }
    radiuses = np.array([radius] * len(x))
    radiuses += radiuses * (master_df["relation"] == "document")
    data = {
        "x": x,
        "y": y,
        "id": master_df["id"],
        "parent": master_df["parent"],
        "parent_x": np.array([x[id_to_parent_idx[d["id"]]] for _, d in master_df.iterrows()]),
        "parent_y": np.array([y[id_to_parent_idx[d["id"]]] for _, d in master_df.iterrows()]),
        "radius": radiuses,
        "relation": master_df["relation"],
        "color": colors
    }

    source = bm.ColumnDataSource(data=data)
    parent_source = bm.ColumnDataSource(data={k:[] for k in data if "parent" not in k})

    if not dynamic:
        parent_source = bm.ColumnDataSource(
            {
                k: v[master_df["relation"] == "document"] for k, v in data.items()
            }
        )


    parent_renderers = []
    parent_renderer = p.circle('x', 'y', radius="radius", source=parent_source, fill_color="color", line_color="black", line_width=line_width)
    # parent_renderers.append(parent_renderer)

    child_renderers = []
    child_renderer = p.circle('x', 'y', radius="radius", source=source, color="color")
    child_renderers.append(child_renderer)

    hover = bm.HoverTool(renderers=child_renderers + parent_renderers,
                    tooltips=[("doc_type", "@relation"), 
                              ("id", "@id"), 
                              ("parent", "@parent")
                              ])
    p.add_tools(hover)

    callback = bm.CustomJS(args=dict(source=source, parent_source=parent_source, threshold=radius*2), code="""
        // 'cb_obj' is the event with the mouse coordinates (in data space)
        const x_mouse = cb_obj.x;
        const y_mouse = cb_obj.y;
        const data = source.data;
        let min_dist = Infinity;
        let hovered_index = null;
        // Loop through each child point to find the closest one (using a threshold)
        for (let i = 0; i < data['x'].length; i++) {
            const dx = data['x'][i] - x_mouse;
            const dy = data['y'][i] - y_mouse;
            const dist = Math.sqrt(dx*dx + dy*dy);
            if (dist < min_dist && dist < threshold) {  // threshold may need adjustment
                min_dist = dist;
                hovered_index = i;
            }
        }
        if (hovered_index != null && data['relation'][hovered_index] != "document") {
            // Set the parental dot to the coordinates stored in the data
            parent_source.data['x'] = [data['parent_x'][hovered_index]];
            parent_source.data['y'] = [data['parent_y'][hovered_index]];
            parent_source.data['color'] = [data['color'][hovered_index]];
            parent_source.data['radius'] = [data['radius'][hovered_index]*3];
            parent_source.data['id'] = [data['id'][hovered_index]];
            parent_source.data['relation'] = ["document"];
        } else {
            // Clear the parental dot if no child point is close enough
            parent_source.data['x'] = [];
            parent_source.data['y'] = [];
            parent_source.data['color'] = [];
            parent_source.data['id'] = [];
            parent_source.data['radius'] = [];
        }
        parent_source.change.emit();
    """)
    if dynamic:
        p.js_on_event('mousemove', callback)

    if show:
        pl.show(fig)
    return fig
    # return source

# s = draw_vectors(embeddings[:, 0], embeddings[:, 1], df["color"], radius=0.03, width=700, height=500)


In [62]:
# embeddings = umap.UMAP(n_neighbors=20, metric=metric, output_metric=metric).fit_transform(input_embeddings)
embeddings = TSNE(n_components=2, metric=metric).fit_transform(input_embeddings)

In [63]:
rad = 0.5
draw_vectors(embeddings[:, 0], embeddings[:, 1], master_df["color"], radius=rad, width=800, height=500, line_width=4, show=True)
f = draw_vectors(embeddings[:, 0], embeddings[:, 1], master_df["color"], radius=rad, width=1500, height=1000, line_width=4, show=False)
pl.save(f, filename=res_path / f"{task}_{metric}.html")


f = draw_vectors(embeddings[:, 0], embeddings[:, 1], master_df["color"], radius=rad, width=1500, height=1000, line_width=4, show=False, dynamic=False)
bokeh.io.export_png(f, filename=res_path / f"{task}_{metric}.png")
# bokeh.io.export_png(f, filename="data/tsne.png")
# bokeh.io.__dict__.keys()

  pl.save(f, filename=res_path / f"{task}_{metric}.html")
  pl.save(f, filename=res_path / f"{task}_{metric}.html")


'c:\\prog\\py\\fips\\tve\\data\\data\\EMB\\description_cityblock.png'

In [64]:
info_df = pd.DataFrame(columns=['id', 'relation', 'parent', 'score', 'candidate'])
print(info_df.columns.values)
parent_df = master_df[master_df["relation"] == "document"]
n_results = 20

for n, parent_d in parent_df.iterrows():
    # res = db[task].query(parent_d["embedding"], n_results=n_results)
    res = query_similar(parent_d["embedding"], master_df, dist_func=dist_func, n_results=n_results)
    for doc_id, score in zip(map(id_norm, res["ids"]), res["distances"]):
        # print(doc_id, score)
        child_d = master_df[master_df["id"] == doc_id]
        if len(child_d) == 0:
            continue
        else:
            child_d = child_d.iloc[0]
        if child_d["parent"] == parent_d["id"]:
            rel = child_d["relation"]
            parent = child_d["parent"]
            candidate = True
        else:
            parent = ""
            rel = ""
            candidate = False
        info_df.loc[len(info_df)] = [doc_id, rel, parent, score, candidate]
    info_df.loc[len(info_df)] = ["nan"] * len(info_df.columns)
info_df["score"] = info_df["score"].astype(float).round(5)
# info_df
# info_df["score"] = info_df["score"] * -1 + 1
info_df.to_csv(f"C:\\Users\\Roman\\Desktop\\{task}_{metric}.csv", sep=";", decimal=",")
# info_df[(info_df["candidate"] == False) | (info_df["relation"] == "document")]
info_df.head()

['id' 'relation' 'parent' 'score' 'candidate']


Unnamed: 0,id,relation,parent,score,candidate
0,US7778783B2_20100817,document,US7778783B2_20100817,0.0,True
1,US7152002B2_20061219,analogs,US7778783B2_20100817,4e-05,True
2,US7193711B2_20070320,,,4.36233,False
3,US8706180B2_20140422,,,6.18121,False
4,US8463351B2_20130611,,,6.18121,False


## Debug

In [67]:
doc_id = id_to_db_id[id_norm("US7778783B2_20100817")]
d = db["text"].get(ids=[doc_id], include=["embeddings"])
emb = d["embeddings"][0]
print(doc_id)

US0007778783B2_20100817


In [75]:
from operator import itemgetter
a = cos_dist(np.array(emb), np.vstack(master_df["embedding"]))
most_sim = [sorted(enumerate(a.get()), reverse=True, key=itemgetter(1))]

query_similar(emb, master_df)
    # return [i[0] for i in most_sim[:n_results]]
# master_df[master_df["id"] == id_norm(doc_id)]
# master_df.loc[1832]
    

{'ids': ['US7778783B2_20100817',
  'US2007095157A1_20070503',
  'CA2317738A1_19990617',
  'US7152002B2_20061219',
  'US2005004763A1_20050106',
  'CA2413758A1_20020110',
  'US10086342B2_20181002',
  'US9458451B2_20161004',
  'AU766378B2_20031016',
  'US8260392B2_20120904'],
 'distances': [0.9999999999999998,
  0.9694400857199195,
  0.7284975039670717,
  0.7112778487290318,
  0.6827014164261065,
  0.6153781528889847,
  0.6080877628973724,
  0.6080877628973724,
  0.6067414111298294,
  0.6047783783934403]}

In [73]:
list(master_df.loc[[1,2,3,4]]["id"])

['US20070095157A1_20070503',
 'US2007095157A1_20070503',
 'US20030225531A1_20031204',
 'US7152002B2_20061219']

In [77]:
a = np.array([[1,2], [0,0]])
b = np.array([[1,2], [0,0],[4,5], [6,7]])

ValueError: operands could not be broadcast together with shapes (2, 2) (4, 2)

In [81]:
master_df["embedding"].iloc[0]

array([-6.37576804e-02, -1.34848217e-02, -1.86995976e-02, -2.53216699e-02,
       -1.14438701e-02, -5.25651965e-03, -2.14608181e-02, -4.04433720e-02,
       -1.30426148e-02,  7.31209619e-03,  5.18239220e-04, -1.65248737e-02,
       -1.37642780e-02, -5.87026821e-04, -1.20682837e-02, -1.62567627e-02,
        2.50691921e-02,  1.27622131e-02, -5.78513779e-02,  5.52375317e-02,
        5.50495163e-02, -9.18118097e-03,  1.08068166e-02,  9.46725905e-02,
       -2.48028524e-03,  2.78391093e-02,  1.15477275e-02, -3.93828079e-02,
       -3.27761024e-02, -1.23250037e-02, -1.42227691e-02,  2.61616707e-02,
       -2.90740710e-02, -1.19950268e-02,  2.43369956e-02, -3.67871784e-02,
       -1.67958327e-02, -1.09403278e-03, -4.81399428e-03, -6.66145189e-03,
        1.78800728e-02,  1.10100247e-02, -2.81034559e-02,  1.96128190e-02,
       -1.34123617e-03,  5.95514253e-02, -5.01423143e-02, -6.66220905e-03,
       -5.86063787e-02,  1.65527165e-02,  5.40920254e-03, -3.42389718e-02,
       -2.37151608e-02, -

In [54]:
a = cp.array([1,2])
b = cp.array([[-100,-100],[4,5]])
# l1_dist(a,b)
a = cp.array(master_df.loc[0]["embedding"])
b = cp.vstack(master_df["embedding"])
l1_dist(a, b)
query_similar(a, master_df, l1_dist)

{'ids': ['US7778783B2_20100817',
  'US2007095157A1_20070503',
  'CA2317738A1_19990617',
  'US7152002B2_20061219',
  'US2005004763A1_20050106',
  'CA2413758A1_20020110',
  'US8588882B2_20131119',
  'US8206297B2_20120626',
  'AU766378B2_20031016',
  'US8260392B2_20120904'],
 'distances': [0.0,
  5.894138363353704,
  17.73947816897271,
  18.214324958011275,
  19.204525223111887,
  20.864452632062694,
  20.984150816118927,
  21.081998012887198,
  21.101152957242448,
  21.14392031697207]}

In [97]:
master_df["embedding"]

0       [-0.06375768035650253, -0.013484821654856205, ...
1       [-0.04467740282416344, -0.031102880835533142, ...
2       [-0.0689607709646225, -0.010727322660386562, -...
3       [-0.07692772895097733, -0.01998787187039852, 0...
4       [-0.07537534087896347, -0.0343899242579937, -0...
                              ...                        
1828    [-0.010679994709789753, -0.00810873694717884, ...
1829    [-0.014296182431280613, -0.0036465979646891356...
1830    [-0.004858179949223995, -0.012885243631899357,...
1831    [-0.025193514302372932, -0.01929137483239174, ...
1832    [-0.009783292189240456, -0.008788174018263817,...
Name: embedding, Length: 1833, dtype: object