# Recursive Clustering and Summarization

Plan:
- recursively cluster collections
- create a tree of clusters (HDBSCAN does this anyway, but likely not in the way we want)
- cluster until max-depth is reached or (better) until each leaf only has one "plausible" cluster (based on thresholds or probabilities)
- try summarizing to get "main idea" out of cluster


- cluster on keywords ( randomize all grammar + stop words )
- topic clusters
- context: title, abstract, etc. keywords



## Recursively cluster 

Based on the topic_clustering notebook, we will try with Agglomerative Clustering 

In [4]:
import pandas as pd

df = pd.read_csv("downloads/40k_balanced_pm_acl.csv")#.sample(frac=0.5)

In [5]:
!python --version

Python 3.6.13


In [6]:
pos = df[df.labels == 1]


sentences = list(pos["text"]) #otherwise key error

In [7]:
pos_h = pos[pos["source"].isin(["Oct1_clinical_studies_pm",'oct3_labels', 'labels_oct7'])]

In [8]:
sents_h = list(pos_h["text"]) #otherwise key error

In [None]:
from sentence_transformers import SentenceTransformer, util

print("Encode the corpus ... get a coffee in the meantime")
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
embeddings = model.encode(sentences, batch_size=64, show_progress_bar=True, convert_to_tensor=True)

In [None]:
len(embeddings)

In [None]:
from sklearn.cluster import AgglomerativeClustering
import numpy as np

def cluster(embeddings, **kwargs):
    """Fit an agglomerative clustering model on unit-length embeddings.

    Args:
        embeddings: 2-D collection of row vectors. May be a torch tensor
            (on any device) or anything ``np.asarray`` accepts.
        **kwargs: Forwarded unchanged to ``AgglomerativeClustering``
            (e.g. ``n_clusters=None, distance_threshold=1.4``).

    Returns:
        The fitted ``AgglomerativeClustering`` instance.
    """
    # Convert explicitly so the arithmetic below is plain NumPy whatever the
    # input type; tensors are detached and moved to host memory first.
    if hasattr(embeddings, "cpu"):
        embeddings = embeddings.detach().cpu().numpy()
    else:
        embeddings = np.asarray(embeddings)

    # Normalize the embeddings to unit length
    corpus_embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

    # Perform agglomerative clustering
    clustering_model = AgglomerativeClustering(**kwargs)
    clustering_model.fit(corpus_embeddings)
    return clustering_model

In [None]:
def get_clusters(clustering_model, texts=None):
    """Group texts by the cluster label assigned to them.

    Args:
        clustering_model: Fitted model exposing ``labels_``, one label per
            clustered item, in the order the items were embedded.
        texts: The items that were clustered, index-aligned with
            ``labels_``. Defaults to the notebook-level ``sentences`` so
            existing single-argument calls keep working.

    Returns:
        dict mapping each cluster label to the list of its texts.
    """
    if texts is None:
        texts = sentences

    clusters = {}
    for sentence_id, cluster_id in enumerate(clustering_model.labels_):
        members = clusters.setdefault(cluster_id, [])
        try:
            members.append(texts[sentence_id])
        except (IndexError, KeyError):
            # Best effort: report a label that has no matching text and go on.
            print(sentence_id, "sentence_id")
    return clusters
   
#     for i, cluster in clustered_sentences.items():
#         print("Cluster ", i+1)
#         print(cluster)
#         print("\n")

In [None]:
sample = embeddings

In [None]:
cluster_model = cluster(sample, n_clusters=None, distance_threshold=1.4)

Cluster Model attributes

   n_clusters_ : int
        The number of clusters found by the algorithm. If
        ``distance_threshold=None``, it will be equal to the given
        ``n_clusters``.

    labels_
    n_leaves_

    n_connected_components_ : The estimated number of connected components in the graph.

    children_ : array-like of shape (n_samples-1, 2)
        The children of each non-leaf node. Values less than `n_samples`
        correspond to leaves of the tree which are the original samples.
        A node `i` greater than or equal to `n_samples` is a non-leaf
        node and has children `children_[i - n_samples]`. Alternatively
        at the i-th iteration, children[i][0] and children[i][1]
        are merged to form node `n_samples + i`

    distances_ : array-like of shape (n_nodes-1,)
        Distances between nodes in the corresponding place in `children_`.
        Only computed if `distance_threshold` is used or `compute_distances`
        is set to `True`.

In [None]:
# import itertools

# ii = itertools.count(sample.shape[0])
# [{'node_id': next(ii), 'left': x[0], 'right':x[1]} for x in cluster_model.children_]

In [None]:
len(get_clusters(cluster_model).keys())

In [None]:
clusters = get_clusters(cluster_model)

In [None]:
import matplotlib.pyplot as plt

# Compare the cluster-size distribution for several distance thresholds.
# NOTE: `cluster_model` is rebound on every pass, so after this cell it refers
# to the model fitted with the last threshold in the list.
for thresh in [1, 1.2, 1.4, 1.8]:
    cluster_model = cluster(sample, n_clusters=None, distance_threshold=thresh)
    cluster_sizes = [len(v) for v in get_clusters(cluster_model).values()]
    # One figure per threshold; otherwise every histogram is drawn on the
    # same axes and only the last title survives.
    plt.figure()
    plt.hist(cluster_sizes, bins='auto', label=str(thresh))
    plt.title("Threshold " + str(thresh))
    plt.show()

In [None]:
def collection_to_clusters(texts, model=model,  **kwargs):
    """Embed ``texts``, cluster the embeddings and return the grouped texts.

    Args:
        texts: Sequence of strings to cluster.
        model: Encoder exposing ``encode``; defaults to the ``model`` object
            bound at the time this function is defined.
        **kwargs: Forwarded to ``cluster``.

    Returns:
        A view over lists of texts, one list per cluster found.
    """
    embs = model.encode(texts, batch_size=64, show_progress_bar=True, convert_to_tensor=True)
    cluster_model = cluster(embs, **kwargs)

    # Group `texts` themselves by label. ``get_clusters(cluster_model)`` looks
    # items up in the notebook-level ``sentences``, which would return
    # unrelated sentences for any collection other than the full corpus.
    grouped = {}
    for text, label in zip(texts, cluster_model.labels_):
        grouped.setdefault(label, []).append(text)
    return grouped.values()

In [None]:
cluster_tree = {}
lens = [len(v) for v in clusters.values()]
for i, v in clusters.items():
    if len(v) > 25:
        # Re-embed the members of a large cluster and split them again with a
        # lower threshold. Both keys go into one dict so that "parent" is not
        # overwritten by a second assignment to the same entry.
        cluster_tree[i] = {
            "parent": v,
            "children": [*collection_to_clusters(v, n_clusters=None, distance_threshold=0.4)],
        }
        

In [None]:
# Dump every re-clustered parent: its id, then each of its child clusters.
separator = "-------------- \n\n"
for parent_id in cluster_tree:
    print(parent_id)
    for child in cluster_tree[parent_id]["children"]:
        # Same output as print(child) followed by print("\n\n").
        print(child, "\n\n", sep="\n")
    print(separator)

In [None]:
#dict(enumerate(cluster_model.children_, cluster_model.n_leaves_))

## Summarization

**Tried: Google Pegasus**. Result: Does a terrible job of keeping the important information and doesn't retain the question but guesses at a conclusion


In [None]:
# Imported here because this cell comes before the setup cell that imports torch.
import torch

torch.cuda.is_available()

### Pegasus Setup


In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch

# Load the Pegasus XSum checkpoint and move the model to the GPU when one is
# available. NOTE: this rebinds the notebook-level `model` and `tokenizer`.
model_name = 'google/pegasus-xsum'
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)


In [None]:

def summarize(sentences):
    """Summarize a batch of texts with the loaded tokenizer/model pair.

    Args:
        sentences: Text or list of texts handed to ``tokenizer``.

    Returns:
        The list of decoded summaries produced by ``tokenizer.batch_decode``.
    """
    # Tokenize with truncation, pad to the longest item, move to `device`.
    encoded = tokenizer(
        sentences,
        truncation=True,
        padding='longest',
        return_tensors="pt",
    ).to(device)
    generated_ids = model.generate(**encoded)
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

### T5 Setup

In [None]:
from transformers import pipeline

summarizer = pipeline("summarization")

ARTICLE = """ Background: Trust is a critical component of competency committees given their high-stakes decisions. Research from outside of medicine on group trust has not focused on trust in group decisions, and "group trust" has not been clearly defined. The purpose was twofold: to examine the definition of trust in the context of group decisions and to explore what factors may influence trust from the perspective of those who rely on competency committees through a proposed group trust model. Methods: The authors conducted a literature search of four online databases, seeking articles published on trust in group settings. Reviewers extracted, coded, and analyzed key data including definitions of trust and factors pertaining to group trust. Results: The authors selected 42 articles for full text review. Although reviewers found multiple general definitions of trust, they were unable to find a clear definition of group trust and propose the following: a group-directed willingness to accept vulnerability to actions of the members based on the expectation that members will perform a particular action important to the group, encompassing social exchange, collective perceptions, and interpersonal trust. Additionally, the authors propose a model encompassing individual level factors (trustor and trustee), interpersonal interactions, group level factors (structure and processes), and environmental factors."""
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the one-line-summary T5 checkpoint. NOTE: the notebook-level `model`
# and `tokenizer` names are rebound here.
t5_checkpoint = "snrspeaks/t5-one-line-summary"
model = AutoModelForSeq2SeqLM.from_pretrained(t5_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(t5_checkpoint)

# T5 uses a max_length of 512 so we cut the article to 512 tokens.
prompt = "summarize: " + ARTICLE
inputs = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
outputs = model.generate(
    inputs,
    max_length=150,
    min_length=40,
    length_penalty=2.0,
    num_beams=4,
    early_stopping=True,
)
tokenizer.decode(outputs[0])
def summarize(text):
    """Return a generated summary of ``text`` as a single string.

    The text is prefixed with ``"summarize: "`` and truncated to 512 tokens
    before generation (beam search with 4 beams, 40-150 output tokens).

    Args:
        text: The string to summarize.

    Returns:
        The decoded first generated sequence, without special tokens.
    """
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    # Drop the tokenizer's special tokens so callers get clean text, the same
    # way the Pegasus `summarize` decodes its output.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
summarize("I have seen a ghost in my shed")

In [None]:
"I have seen a ghost in my shed"[:5]

### Print Results

In [None]:
# Print each cluster's text followed by a summary of its first 256 characters.
# The loop names are chosen so they do not overwrite the notebook-level
# `cluster` function or the `sentences` list that `get_clusters` reads.
for cluster_id, members in get_clusters(cluster_model).items():
    joined = ".".join(members)
    print(joined)
    print( "\n\n", "sum:::", summarize(joined[:256]), "\n\n\n")
    

In [None]:
!pip bertopic --version

In [None]:
!pip uninstall bertopic --yes

In [None]:
pip uninstall umap
pip install umap-learn

In [5]:
!pip uninstall bertopic

[33mDEPRECATION: Python 2.7 reached the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 is no longer maintained. pip 21.0 will drop support for Python 2.7 in January 2021. More details about Python 2 support in pip can be found at https://pip.pypa.io/en/latest/development/release-process/#python-2-support pip 21.0 will remove support for this functionality.[0m


In [7]:
!pip uninstall numpy --yes

Found existing installation: numpy 1.19.5
Uninstalling numpy-1.19.5:
  Successfully uninstalled numpy-1.19.5


In [10]:
!pip install bertopic

Collecting bertopic
  Using cached bertopic-0.9.3-py2.py3-none-any.whl (57 kB)
  Using cached bertopic-0.9.2-py2.py3-none-any.whl (57 kB)
  Using cached bertopic-0.9.1-py2.py3-none-any.whl (55 kB)
  Using cached bertopic-0.9.0-py2.py3-none-any.whl (55 kB)
  Using cached bertopic-0.8.1-py2.py3-none-any.whl (53 kB)
  Using cached bertopic-0.8.0-py2.py3-none-any.whl (53 kB)
  Using cached bertopic-0.7.0-py2.py3-none-any.whl (40 kB)
  Using cached bertopic-0.6.0-py2.py3-none-any.whl (25 kB)
Collecting torch>=1.4.0
  Using cached torch-1.10.0-cp36-cp36m-manylinux1_x86_64.whl (881.9 MB)
Collecting umap-learn>=0.5.0
  Using cached umap_learn-0.5.2-py3-none-any.whl
Collecting hdbscan>=0.8.27
  Using cached hdbscan-0.8.27-cp36-cp36m-linux_x86_64.whl
Collecting sentence-transformers>=0.4.1
  Using cached sentence_transformers-2.1.0-py3-none-any.whl
[31mERROR: Could not install packages due to an OSError: [Errno 2] No such file or directory: '/home/ec2-user/anaconda3/envs/python3/lib/python3.6/s

In [5]:
!pip install bertopic==0.9.2

Collecting bertopic==0.9.2
  Using cached bertopic-0.9.2-py2.py3-none-any.whl (57 kB)
Collecting plotly<4.14.3,>=4.7.0
  Using cached plotly-4.14.2-py2.py3-none-any.whl (13.2 MB)
[31mERROR: Could not find a version that satisfies the requirement numpy>=1.20.0 (from bertopic) (from versions: 1.3.0, 1.4.1, 1.5.0, 1.5.1, 1.6.0, 1.6.1, 1.6.2, 1.7.0, 1.7.1, 1.7.2, 1.8.0, 1.8.1, 1.8.2, 1.9.0, 1.9.1, 1.9.2, 1.9.3, 1.10.0.post2, 1.10.1, 1.10.2, 1.10.4, 1.11.0, 1.11.1, 1.11.2, 1.11.3, 1.12.0, 1.12.1, 1.13.0rc1, 1.13.0rc2, 1.13.0, 1.13.1, 1.13.3, 1.14.0rc1, 1.14.0, 1.14.1, 1.14.2, 1.14.3, 1.14.4, 1.14.5, 1.14.6, 1.15.0rc1, 1.15.0rc2, 1.15.0, 1.15.1, 1.15.2, 1.15.3, 1.15.4, 1.16.0rc1, 1.16.0rc2, 1.16.0, 1.16.1, 1.16.2, 1.16.3, 1.16.4, 1.16.5, 1.16.6, 1.17.0rc1, 1.17.0rc2, 1.17.0, 1.17.1, 1.17.2, 1.17.3, 1.17.4, 1.17.5, 1.18.0rc1, 1.18.0, 1.18.1, 1.18.2, 1.18.3, 1.18.4, 1.18.5, 1.19.0rc1, 1.19.0rc2, 1.19.0, 1.19.1, 1.19.2, 1.19.3, 1.19.4, 1.19.5)[0m
[31mERROR: No matching distribution found for

## BERTopic topic modeling + set intersection

In [19]:
from bertopic import BERTopic
m = BERTopic(embedding_model='sentence-transformers/all-mpnet-base-v2', calculate_probabilities=True, verbose=True)

In [10]:
topics, probs = m.fit_transform(sents_h)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/15 [00:00<?, ?it/s]

2021-11-30 14:45:52,908 - BERTopic - Transformed documents to Embeddings
2021-11-30 14:46:00,986 - BERTopic - Reduced dimensionality with UMAP
2021-11-30 14:46:01,026 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [11]:
m.get_topic(0)  # Select the most frequent topic

[('health', 0.06385706474143353),
 ('eating', 0.05910121405216769),
 ('children', 0.055809648558593584),
 ('diabetes', 0.04518032493648633),
 ('overweight', 0.04229698871174154),
 ('adherence', 0.03940080936811179),
 ('obesity', 0.03940080936811179),
 ('medication', 0.03940080936811179),
 ('adolescents', 0.034449758078397896),
 ('social', 0.03155357873476815)]

In [12]:
m.get_topic(1)  # Select the second most frequent topic

[('diabetes', 0.0727039711621619),
 ('disease', 0.05182890475539249),
 ('diabetic', 0.05151163782348641),
 ('cardiovascular', 0.04751690769434193),
 ('myocardial', 0.0407594579670122),
 ('hypertension', 0.029170337042580374),
 ('ischemia', 0.029170337042580374),
 ('atherosclerotic', 0.029170337042580374),
 ('diseases', 0.026037067587527916),
 ('hospital', 0.025755818911743205)]

In [17]:
samp = sentences[1100]

def extract_topics(texts, distance_thresh=0.035, model=m):
    """Map topic keywords to the texts whose assigned topic scores them highly.

    Each text gets a topic label from ``model.transform``. For every keyword
    of that topic whose score exceeds ``distance_thresh``, the text is
    appended to that keyword's list.

    Args:
        texts: Indexable sequence of documents.
        distance_thresh: Minimum keyword score (the second element of the
            pairs returned by ``model.get_topic``) for a keyword to be kept.
        model: Fitted topic model exposing ``transform`` and ``get_topic``.

    Returns:
        dict mapping keyword -> list of texts, in input order.
    """
    topic_labels = model.transform(texts)[0]
    topic_dict = {}
    for idx, ID in enumerate(topic_labels):
        # NOTE(review): `get_topic` is assumed to return a falsy value for a
        # label it has no keywords for; such labels contribute nothing.
        topics = model.get_topic(ID) or ()
        for topic, score in topics:
            if score > distance_thresh:
                # Append in place rather than rebuilding the list each time.
                topic_dict.setdefault(topic, []).append(texts[idx])
    return topic_dict

In [20]:
td = extract_topics(sents_h)

Batches:   0%|          | 0/15 [00:00<?, ?it/s]

In [21]:
td

{'disease': ['Idiopathic guttate hypomelanosis (IGH) is a common hypopigmentation affecting a large amount of older population.',
  'Survival rates of out-of-hospital cardiac arrest remain poor.',
  'Patients with unresectable esophageal cancer require palliation for dysphagia.',
  'An otherwise successfully performed endoscopic thoracic sympathectomy (ETS) to treat palmar hyperhidrosis (PH) often has a serious side effect: compensatory sweating (CS).',
  'Treatment of benign familial pemphigus or Hailey-Hailey disease (HHD), a rare inherited condition associated with a significant impairment of quality of life, is often challenging and disappointing with frequent relapses and infectious complications.',
  'Neuropathy is a common diabetic complication that can result in significant disability.',
  'High blood pressure is related to cardiovascular diseases.',
  'Gastrointestinal symptoms seem to affect more women, due to hormonal and emotional issues, impacting the quality of life.',
  

In [None]:
import json

# `topic_dict` exists only as a local inside `extract_topics`; the mapping it
# returned is held in `td`.
with open("topic_dict.json", "w") as f:
    json.dump(td, f)

## Hierarchy by source text overlap


In [22]:
#take td
#if t1 and t2 overlap, t1 is the parent when t2 has fewer sources and 80%+ of t2's sources are in t1 (otherwise they are duplicates or not that related)

In [58]:
def sources_relation(s1, s2, min_overlap=0.8, size_ratio=7/10):
    """Classify how two collections of sources relate, based on their overlap.

    The overlap is the share of the *smaller* collection that also occurs in
    the other one. Duplicates inside a collection are ignored.

    Args:
        s1: Iterable of hashable sources for the first topic.
        s2: Iterable of hashable sources for the second topic.
        min_overlap: Minimum overlap (0..1) for the two to count as related.
        size_ratio: One side counts as markedly smaller when its size divided
            by the other side's size is below this value.

    Returns:
        "different"  - overlap below ``min_overlap``, or either side is empty
        "subset"     - related, and ``s2`` is markedly smaller than ``s1``
        "superset"   - related, and ``s1`` is markedly smaller than ``s2``
        "duplicates" - related, and the two are of comparable size
    """
    s1 = set(s1)
    s2 = set(s2)
    l1 = len(s1)
    l2 = len(s2)

    # Nothing can be shared with an empty collection; this also avoids
    # dividing by zero below.
    if not l1 or not l2:
        return "different"

    common = s1 & s2  # order doesn't matter
    overlap = len(common) / min(l1, l2)

    # Little in common: measuring against the smaller side is enough to decide.
    if overlap < min_overlap:
        return "different"

    # Lots of overlap (the threshold itself included): compare sizes.
    if l2 / l1 < size_ratio:
        return "subset"
    if l1 / l2 < size_ratio:
        return "superset"
    # Similar size and much overlap (synonyms).
    return "duplicates"


In [59]:
from itertools import combinations

# Classify every unordered pair of topics by how their source texts overlap.
topic_pairs = combinations(td, 2)
result = []
for first, second in topic_pairs:
    relation = sources_relation(td[first], td[second])
    result.append((first, second, relation))


In [60]:
result

[('disease', 'inflammatory', 'subset'),
 ('disease', 'gastrointestinal', 'subset'),
 ('disease', 'esophageal', 'subset'),
 ('disease', 'eosinophilic', 'subset'),
 ('disease', 'diseases', 'subset'),
 ('disease', 'disorders', 'subset'),
 ('disease', 'depression', 'different'),
 ('disease', 'mental', 'different'),
 ('disease', 'health', 'different'),
 ('disease', 'suicide', 'different'),
 ('disease', 'stroke', 'different'),
 ('disease', 'disability', 'different'),
 ('disease', 'parkinson', 'different'),
 ('disease', 'pd', 'different'),
 ('disease', 'diabetes', 'different'),
 ('disease', 'diabetic', 'subset'),
 ('disease', 'cardiovascular', 'subset'),
 ('disease', 'myocardial', 'subset'),
 ('disease', 'cancer', 'different'),
 ('disease', 'skin', 'subset'),
 ('disease', 'infection', 'different'),
 ('disease', 'tuberculosis', 'different'),
 ('disease', 'rice', 'different'),
 ('disease', 'nursing', 'different'),
 ('disease', 'students', 'different'),
 ('disease', 'eating', 'different'),
 ('di

In [23]:

for topic, sources in td.items():
    # TODO: every combination gets overlaps
    # A comment alone is not a valid loop body; print the topic name, which
    # is what the output recorded for this cell shows.
    print(topic)

disease
inflammatory
gastrointestinal
esophageal
eosinophilic
diseases
disorders
depression
mental
health
suicide
stroke
disability
parkinson
pd
diabetes
diabetic
cardiovascular
myocardial
cancer
skin
infection
tuberculosis
rice
nursing
students
eating
children
overweight
adherence
obesity
medication
