# Recursive Clustering and Summarization

Plan:
- recursively cluster collections
- create tree of clusters (the HDBSCAN does this anyways but likely not as we want)
- cluster until max-depth is reached or (better) until each leaf only has one "plausible" cluster (based on thresholds or probabilities)
- try summarizing to get "main idea" out of cluster


- cluster on keywords ( randomize all grammar + stop words )
- topic clusters
- context: title, abstract, etc. keywords



## Recursively cluster 

Based on the topic_clustering notebook, we will try with Agglomerative Clustering 

In [10]:
import pandas as pd

df = pd.read_csv("downloads/40k_balanced_pm_acl.csv")#.sample(frac=0.5)

In [11]:
pos = df[df.labels == 1]


sentences = list(pos["text"]) #otherwise key error

In [65]:
pos_h = pos[pos["source"].isin(["Oct1_clinical_studies_pm",'oct3_labels', 'labels_oct7'])]

In [103]:
sents_h = list(pos_h["text"]) #otherwise key error

In [48]:
from sentence_transformers import SentenceTransformer, util

print("Encode the corpus ... get a coffee in the meantime")
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
embeddings = model.encode(sentences, batch_size=64, show_progress_bar=True, convert_to_tensor=True)

Encode the corpus ... get a coffee in the meantime


Batches:   0%|          | 0/342 [00:00<?, ?it/s]

In [49]:
len(embeddings)

21835

In [50]:
from sklearn.cluster import AgglomerativeClustering
import numpy as np

def cluster(embeddings, **kwargs):
    embeddings = embeddings.cpu()
    # Normalize the embeddings to unit length
    corpus_embeddings = embeddings /  np.linalg.norm(embeddings, axis=1, keepdims=True)

    # Perform kmean clustering
    clustering_model = AgglomerativeClustering(**kwargs) #, affinity='cosine', linkage='average', distance_threshold=0.4)
    clustering_model.fit(corpus_embeddings)
   # cluster_assignment = clustering_model.labels_
    return clustering_model

In [51]:
def get_clusters(clustering_model):
    
    clusters = {}
    for sentence_id, cluster_id in enumerate(clustering_model.labels_):
        if cluster_id not in clusters:
            clusters[cluster_id] = []
        try:
            clusters[cluster_id].append(sentences[sentence_id])
        except:
            print(sentence_id, "sentence_id")
    return clusters
   
#     for i, cluster in clustered_sentences.items():
#         print("Cluster ", i+1)
#         print(cluster)
#         print("\n")

NameError: name 'docs' is not defined

In [52]:
sample = embeddings

In [53]:
cluster_model = cluster(sample, n_clusters=None, distance_threshold=1.4)

Cluster Model attributes

   n_clusters_ : int
        The number of clusters found by the algorithm. If
        ``distance_threshold=None``, it will be equal to the given
        ``n_clusters``.

    labels_
    n_leaves_

    n_connected_components_ : The estimated number of connected components in the graph.

    children_ : array-like of shape (n_samples-1, 2)
        The children of each non-leaf node. Values less than `n_samples`
        correspond to leaves of the tree which are the original samples.
        A node `i` greater than or equal to `n_samples` is a non-leaf
        node and has children `children_[i - n_samples]`. Alternatively
        at the i-th iteration, children[i][0] and children[i][1]
        are merged to form node `n_samples + i`

    distances_ : array-like of shape (n_nodes-1,)
        Distances between nodes in the corresponding place in `children_`.
        Only computed if `distance_threshold` is used or `compute_distances`
        is set to `True`.

In [13]:
# import itertools

# ii = itertools.count(sample.shape[0])
# [{'node_id': next(ii), 'left': x[0], 'right':x[1]} for x in cluster_model.children_]

In [None]:
len(get_clusters(cluster_model).keys())

In [None]:
clusters = get_clusters(cluster_model)

In [None]:
import matplotlib.pyplot as plt

for thresh in [1, 1.2, 1.4, 1.8]:
    cluster_model = cluster(sample, n_clusters=None, distance_threshold=thresh)
    plt.hist([len(v) for v in get_clusters(cluster_model).values()], bins='auto', label=str(thresh))
    plt.title("Threshold " + str(thresh))

In [None]:
def collection_to_clusters(texts, model=model,  **kwargs):
    embs = model.encode(texts, batch_size=64, show_progress_bar=True, convert_to_tensor=True)
    cluster_model = cluster(embs, **kwargs)
    return get_clusters(cluster_model).values()

In [None]:
cluster_tree = {}
lens = [len(v) for v in clusters.values()]
for i, v in clusters.items():
    if len(v) > 25:
        cluster_tree[i] = {"parent" : v}
        cluster_tree[i] = {"children" :[*collection_to_clusters(v, n_clusters=None, distance_threshold=0.4)]}
        #get values, embed and sample again with lower threshold
        

In [None]:
for i,v in cluster_tree.items():
    print(i)
    for sent in v["children"]:
        print(sent)
        print("\n\n")
    print("-------------- \n\n")

In [66]:
#dict(enumerate(cluster_model.children_, cluster_model.n_leaves_))

## Summarization

**Tried: Google Pegasus**. Result: Does a terrible job of keeping the important information and doesn't retain the question but guesses at a conclusion


In [35]:
torch.cuda.is_available()

NameError: name 'torch' is not defined

### Pegasus Setup


In [36]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch

model_name = 'google/pegasus-xsum'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)


Downloading:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

In [37]:

def summarize(sentences):
    batch = tokenizer(sentences, truncation=True, padding='longest', return_tensors="pt").to(device)
    translated = model.generate(**batch)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return tgt_text

### T5 Setup

In [39]:
from transformers import pipeline

summarizer = pipeline("summarization")

ARTICLE = """ Background: Trust is a critical component of competency committees given their high-stakes decisions. Research from outside of medicine on group trust has not focused on trust in group decisions, and "group trust" has not been clearly defined. The purpose was twofold: to examine the definition of trust in the context of group decisions and to explore what factors may influence trust from the perspective of those who rely on competency committees through a proposed group trust model. Methods: The authors conducted a literature search of four online databases, seeking articles published on trust in group settings. Reviewers extracted, coded, and analyzed key data including definitions of trust and factors pertaining to group trust. Results: The authors selected 42 articles for full text review. Although reviewers found multiple general definitions of trust, they were unable to find a clear definition of group trust and propose the following: a group-directed willingness to accept vulnerability to actions of the members based on the expectation that members will perform a particular action important to the group, encompassing social exchange, collective perceptions, and interpersonal trust. Additionally, the authors propose a model encompassing individual level factors (trustor and trustee), interpersonal interactions, group level factors (structure and processes), and environmental factors."""
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model = AutoModelForSeq2SeqLM.from_pretrained("snrspeaks/t5-one-line-summary") #snrspeaks/t5-one-line-summary
tokenizer = AutoTokenizer.from_pretrained("snrspeaks/t5-one-line-summary")

# T5 uses a max_length of 512 so we cut the article to 512 tokens.
inputs = tokenizer.encode("summarize: " + ARTICLE, return_tensors="pt", max_length=512, truncation=True)
outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
tokenizer.decode(outputs[0])
def summarize(text):
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0])

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 (https://huggingface.co/sshleifer/distilbart-cnn-12-6)


Downloading:   0%|          | 0.00/1.76k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.14G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

In [40]:
summarize("I have seen a ghost in my shed")

"<pad> Ghosts in sheds: a ghost in a ghost's kiloparsec-parsec-parsec-parsec-parsec-parsec-parsec-parsec</s>"

In [41]:
"I have seen a ghost in my shed"[:5]

'I hav'

### Print Results

In [42]:
for ID, cluster in get_clusters(cluster_model).items():
    sentences = ".".join(cluster)
    print(sentences)
    print( "\n\n", "sum:::", summarize(sentences[:256]), "\n\n\n")
    

The difficulty with this task lies in the fact that prosodic cues are never absolute ; they are relative to individual speakers , gender , dialect , discourse context , local context , phonological environment , and many other factors.Apart from system delay , another current limitation that will influence future interactive speech systems is the unavailability of full prosodic analysis.One shortcoming of LCSeg is that it ignores speaker information when segmenting transcripts.One obvious shortcoming is that some information gets lost in the thresholding that converts posterior probabilities from the prosodic model and the auxiliary LM into binary features.One major time and cost limitation in developing LVCSR systems in Indian languages is the need for large training data.While voice selection mode allows the presenter greater mobility , it has the drawback of allowing the audience to see thumbnails of every slide returned by a content-based query , regardless of whether the presenter

## BERTopic topic modeling + set intersection

In [13]:
from bertopic import BERTopic
topic_model = BERTopic(embedding_model='sentence-transformers/all-mpnet-base-v2', calculate_probabilities=True, verbose=True)

In [67]:
topics, probs = topic_model.fit_transform(sents_h)

Batches:   0%|          | 0/683 [00:00<?, ?it/s]

2021-11-30 00:29:24,647 - BERTopic - Transformed documents to Embeddings
2021-11-30 00:29:38,385 - BERTopic - Reduced dimensionality with UMAP
2021-11-30 00:34:24,509 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [68]:
m = topic_model

In [69]:
sents_h[:5]

['The difficulty with this task lies in the fact that prosodic cues are never absolute ; they are relative to individual speakers , gender , dialect , discourse context , local context , phonological environment , and many other factors',
 'The problem with rich annotations is that they increase the state space of the grammar substantially',
 'As a consequence , when adapting existing methods and techniques to a new domain , researchers and users are faced with the problem of absence of annotated material that could be used for training',
 'The second problem of traditional word alignment approaches is the fact that parameter estimations are usually based on plain text items only',
 'The main drawback of these systems is that they fail to recognize terms which are not included in the dictionary']

In [15]:
topic_model.get_topic(0)  # Select the most frequent topic

[('tuberculosis', 0.05225474070797056),
 ('tb', 0.047865700260262975),
 ('mycobacterium', 0.018176145894951493),
 ('multidrug', 0.01088994709029186),
 ('hiv', 0.005498794299864568),
 ('isoniazid', 0.005431969825081743),
 ('tuberculous', 0.005311402792430819),
 ('rifampicin', 0.003815369257470624),
 ('immunodeficiency', 0.0034702228144216155),
 ('mycobacterial', 0.0031799701599826273)]

In [70]:
topic_model.get_topic(1)  # Select the most frequent topic

[('falls', 0.1253171930689208),
 ('fall', 0.04793925646883541),
 ('falling', 0.03371042500660863),
 ('elderly', 0.02680032690738284),
 ('dizziness', 0.01399673744689186),
 ('slipping', 0.011499171955298289),
 ('sclerosis', 0.008216865222804817),
 ('vertigo', 0.0076661146368655255),
 ('postural', 0.006760039891035532),
 ('aging', 0.006620583495495226)]

In [78]:
d = m.transform(sents_h[:10])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [79]:
d

(array([ -1, 408,  -1, 408, 408,  -1, 408, 408, 407, 408]),
 array([[2.49939067e-004, 3.24441665e-004, 2.41090716e-004, ...,
         3.07923695e-002, 1.49185510e-002, 1.33008220e-002],
        [1.88566710e-004, 2.41623874e-004, 1.81828842e-004, ...,
         5.71791485e-001, 3.73762336e-003, 3.22414919e-003],
        [5.43511442e-011, 6.96912521e-011, 5.34284256e-011, ...,
         4.69269560e-001, 5.90258449e-005, 4.40073601e-005],
        ...,
        [4.16276022e-004, 5.32320834e-004, 4.00122210e-004, ...,
         3.33637472e-001, 7.32416143e-003, 6.48891003e-003],
        [1.14346829e-010, 1.46397550e-010, 1.12165323e-010, ...,
         9.99778763e-001, 1.25969251e-004, 9.51139923e-005],
        [4.77047857e-173, 6.09602091e-173, 4.59459281e-173, ...,
         1.00000000e+000, 8.33672465e-172, 7.26802413e-172]]))

In [82]:
for i,x in enumerate(d[0]):
    print(i, x)

0 -1
1 408
2 -1
3 408
4 408
5 -1
6 408
7 408
8 407
9 408


In [83]:
["qw", "asd"] + ["2332"]

['qw', 'asd', '2332']

In [91]:
{"asd":5}.get("ss")

In [108]:
samp = sentences[1100]

def extract_topics(texts, distance_thresh=0.035, model=topic_model):
    topic_labels = model.transform(texts)[0]
    topic_dict = {}
    for idx, ID in enumerate(topic_labels):
        topics = model.get_topic(ID)
        for t in topics:
            topic = t[0]
            dist = t[1]
            if dist > distance_thresh:
                if not topic_dict.get(topic):
                    topic_dict[topic] = [ texts[idx] ] 
                else:
                    sents = topic_dict[topic] + [texts[idx]]
                    topic_dict[topic] = sents
    return topic_dict

In [109]:
topic_dict = extract_topics(sents_h)

Batches:   0%|          | 0/15 [00:00<?, ?it/s]

In [110]:
import json

with open("topic_dict.json", "w") as f:
    json.dump(topic_dict, f)

In [None]:
#{topic: [sources]}

In [None]:
pos_h["topics"].map()

In [42]:
topic_model.get_topics_freq()

AttributeError: 'BERTopic' object has no attribute 'get_topics_freq'

In [39]:
topic_model.get_topics()

{-1: [('method', 0.0014814478834525645),
  ('cells', 0.001323868143253531),
  ('system', 0.0012572647262012273),
  ('methods', 0.0012512956646694813),
  ('data', 0.0012256677581375015),
  ('cell', 0.0011555938649874535),
  ('technique', 0.0011545544791560533),
  ('results', 0.0011523602907599164),
  ('long', 0.0010836101882556297),
  ('blood', 0.0010690328921589187)],
 0: [('tuberculosis', 0.05225474070797056),
  ('tb', 0.047865700260262975),
  ('mycobacterium', 0.018176145894951493),
  ('multidrug', 0.01088994709029186),
  ('hiv', 0.005498794299864568),
  ('isoniazid', 0.005431969825081743),
  ('tuberculous', 0.005311402792430819),
  ('rifampicin', 0.003815369257470624),
  ('immunodeficiency', 0.0034702228144216155),
  ('mycobacterial', 0.0031799701599826273)],
 1: [('malaria', 0.11684425077345159),
  ('pregnancy', 0.09163146166109488),
  ('placental', 0.06133723498811276),
  ('plasmodium', 0.04343155169315712),
  ('pyrimethamine', 0.03479449356881796),
  ('foetus', 0.0347944935688179