# Summarizing by clustering embeddings

### PubMed data

In [1]:
from datasets import load_dataset
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import pandas as pd
import altair as alt
from backend import to_embeddings, summarize

# Fetch the PubMed summarization corpus and keep a single training record
# (index 4); `dataset` ends up bound to that one record, not the whole corpus.
dataset = load_dataset("ccdv/pubmed-summarization")["train"][4]

# The article text is newline-delimited; each line becomes one entry.
sentences = dataset["article"].split("\n")


No config specified, defaulting to: pubmed-summarization/section
Found cached dataset pubmed-summarization (/root/.cache/huggingface/datasets/ccdv___pubmed-summarization/section/1.0.0/f765ec606c790e8c5694b226814a13f1974ba4ea98280989edaffb152ded5e2b)


  0%|          | 0/3 [00:00<?, ?it/s]

Embed question and sentences

In [2]:
question = "What are the symptoms of syncope?"

# Rebuild `sentences` from the article instead of appending in place, so this
# cell is idempotent: re-running it (or editing the question and re-running)
# never leaves duplicate or stale questions in the list.
# The question must stay the LAST entry -- the summarization cell looks up its
# cluster via `clusters[-1]`.
sentences = dataset["article"].split("\n") + [question]

# One embedding per entry of `sentences`, the question included.
# NOTE(review): `to_embeddings` comes from the project-local `backend` module;
# presumably it returns one vector per input string in input order -- confirm.
embeddings = to_embeddings(sentences)

Cluster the embeddings with k-means and project them to 2-D with PCA for plotting

In [3]:
# Fixed seed so that Restart & Run All reproduces the same clusters (and
# therefore the same summary); KMeans initialisation is random otherwise.
SEED = 0
N_CLUSTERS = 7

# 2-D projection of the embeddings, used only for the scatter plot.
pc = PCA(n_components=2, random_state=SEED).fit_transform(embeddings)

# Cluster in the full embedding space (not on the 2-D projection).
# n_init is pinned explicitly because its default differs between
# scikit-learn releases, which would otherwise change the result.
clusters = KMeans(
    n_clusters=N_CLUSTERS, n_init=10, random_state=SEED
).fit_predict(embeddings)


Make data frame for plotting

In [4]:
# One row per embedded text: its 2-D PCA coordinates, the text itself and its
# cluster label, plus a boolean flag marking rows whose text equals the question.
df = pd.DataFrame(
    {"x": pc[:, 0], "y": pc[:, 1], "sentence": sentences, "cluster": clusters}
).assign(is_question=lambda frame: frame["sentence"] == question)


In [5]:
# Scatter plot of the PCA projection: colour encodes the cluster, the question
# row is drawn larger (100 vs 30), and hovering a point shows its sentence.
encodings = {
    "x": 'x:Q',
    "y": 'y:Q',
    "color": 'cluster:N',
    "opacity": alt.value(1),
    "size": alt.condition(alt.datum.is_question, alt.value(100), alt.value(30)),
    "tooltip": 'sentence',
}
alt.Chart(df).mark_circle().encode(**encodings)

## Summarize

In [6]:
# The question was appended last, so its cluster label is the final entry.
question_cluster = clusters[-1]

# Keep the article sentences that landed in the question's cluster.
# The last element (the question itself) is skipped: it always matches its own
# cluster and would otherwise be fed to the summarizer as if it were article text.
relevant_sentences = [
    sentence
    for sentence, label in zip(sentences[:-1], clusters[:-1])
    if label == question_cluster
]
relevant_sentences

['syncope is caused by transient diffuse cerebral hypoperfusion and is characterized by transient loss of consciousness with a rapid onset followed by spontaneous and complete recovery . ',
 ' clinical features of syncope may include myoclonic jerks which are often multifocal and asynchronous , convulsions , and urinary incontinence , making it difficult to differentiate from epileptic seizure by clinical features alone . ',
 ' cough syncope , a rare form of syncope , may be a result of transient failure of the cerebral autoregulatory mechanism to cope with sudden decrease in cerebral blood flow . ',
 ' they were triggered by coughing ( usually a bout of cough ) and were characterized by staring and unresponsiveness as well as stiffening of the body with mild shaking of both upper extremities . ',
 ' all started with a bout of cough when the patient was lying in bed ( in supine or in lateral position ) which was followed by brief ( less than a minute ) distal upper extremity tremor and

In [7]:
# Summarize only the sentences selected from the question's cluster.
# NOTE(review): the recorded output contains "<pad>" and "</s>" markers --
# presumably special tokens that backend.summarize does not strip; confirm there.
summary = summarize(relevant_sentences)
summary



'<pad> cough syncope, a rare form of syncope, may be a result of transient failure of the cerebral autoregulatory mechanism to cope with sudden decrease in cerebral blood flow. this case highlights the fact that cough syncope, a rare form of syncope, may be associated with intracranial mass lesions that indirectly exaggerate the increase in icp in response to cough.</s>'