In [1]:
%load_ext autoreload

import sys
import os
sys.path.append(os.path.abspath("../.."))

from rich.console import Console
from functools import partial

from sentence_transformers import SentenceTransformer

from mypackage.elastic import ElasticDocument, Session
from mypackage.sentence import doc_to_sentences, iterative_merge
from mypackage.clustering import chain_clustering

console = Console()

2025-05-08 16:56:10.725990: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746712570.743050   64601 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746712570.747727   64601 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-08 16:56:10.764308: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
sess = Session("pubmed-index", no_connect=True)
docs_to_retrieve = [1923, 4355, 4166, 3611, 6389, 272, 2635, 2581, 372, 6415]
docs = list(map(partial(ElasticDocument, sess, text_path="article", cache_dir = "../../usercode/cache"), docs_to_retrieve))

In [3]:
import pickle
from collections import namedtuple
ProcessedDocument = namedtuple("ProcessedDocument", ["chains", "labels", "clusters"])

In [None]:
from multiprocessing import Process

ProcessedDocument = namedtuple("ProcessedDocument", ["doc", "chains", "labels", "clusters"])

#We need to process these documents in parallel
#We need to create the chains, as well as cluster them

os.makedirs("pickles", exist_ok=True)

def work(doc: ElasticDocument):
    sentences = doc_to_sentences(doc, model)
    merged = iterative_merge(sentences, threshold=0.6, round_limit=None, pooling_method="average")
    labels, clusters = chain_clustering(merged, n_components=25)
    
    with open(f"pickles/{doc.id}.pkl", "wb") as f:
        pickle.dump(ProcessedDocument(doc, merged, labels, clusters), f)

procs = []

for i, doc in enumerate(docs):
    p = Process(target=work, args=(doc,))
    p.start()
    procs.append(p)

for p in procs:
    p.join()



In [29]:
pkl = []

for fname in map(lambda x: f"pickles/{x}.pkl", docs_to_retrieve):
    with open(fname, "rb") as f:
        pkl.append(pickle.load(f))

print(len(pkl))

10


In [30]:
%autoreload 2

from mypackage.clustering import visualize_clustering

for i, p in enumerate(pkl):
    visualize_clustering(p.chains, p.labels, save_to=f"images/{i:02}_{p.chains[0].doc.id}.png", show=False)



IndexError: tuple index out of range

<Figure size 640x480 with 0 Axes>

In [None]:
from rich.panel import Panel
from rich.console import Console

console = Console()

for cluster in pkl.clusters.items():
    text = "\n\n".join([chain.text for chain in cluster[1]])
    console.print(Panel(text, title = f"Cluster {cluster[0]:02}",border_style="cyan bold"))

In [None]:
print(pkl.clusters[1].kth_most_similar_chain().text)

In [None]:
from mypackage.clustering import cluster_mask

mask = cluster_mask(pkl.clusters)

In [None]:
print(mask)

In [None]:
from matplotlib import pyplot as plt

plt.step(list(range(len(mask))), mask)
plt.show()

We need to evaluate the clustering

In [None]:
from mypackage.clustering.metrics import chain_clustering_silhouette_score

chain_clustering_silhouette_score(pkl.chains, pkl.labels)