# Clustering Sentences (non-hierachical)

In [1]:
import pandas as pd

df = pd.read_csv("downloads/40k_balanced_pm_acl.csv").sample(frac=0.5)
sentences = list(df["text"])

## Sentence Transformers

In [3]:
from sentence_transformers import SentenceTransformer, util

print("Encode the corpus ... get a coffee in the meantime")
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
embeddings = model.encode(sentences, batch_size=64, show_progress_bar=True, convert_to_tensor=True)

Encode the corpus ... get a coffee in the meantime


Batches:   0%|          | 0/330 [00:00<?, ?it/s]

In [4]:
len(embeddings)

21070

In [28]:
import time
print("Start clustering")
start_time = time.time()

clusters = util.community_detection(embeddings, min_community_size=25, threshold=0.75)
print("Clustering took {:.2f} sec".format(time.time() - start_time))

Start clustering
Clustering done after 1.12 sec

Cluster 1, #137 Elements 
	 tensor([ 5.3498e-02,  8.9477e-02,  1.4012e-02, -1.5032e-02,  7.1370e-02,
         8.9495e-03,  2.3687e-02,  3.1170e-02,  1.6417e-02, -7.7270e-03,
         4.7722e-02,  3.3254e-02, -9.3222e-03,  4.8507e-03, -1.0017e-02,
         2.8978e-03,  1.5487e-02,  1.8295e-02, -2.1779e-02,  2.0944e-02,
        -5.2399e-02,  1.8726e-03,  7.8426e-03,  1.0363e-02, -3.5122e-02,
         1.3721e-02,  2.5020e-02,  1.5207e-02, -1.3431e-02, -5.8160e-02,
         4.1508e-02, -1.2360e-03, -3.7348e-02, -2.1269e-02,  1.9564e-06,
         8.7001e-03, -2.3176e-02,  3.1739e-02, -3.2843e-02,  1.3695e-02,
         4.6471e-02, -3.9887e-02, -1.4030e-02, -3.0312e-03,  7.2742e-03,
        -2.5850e-02,  8.2416e-02,  2.3422e-02, -7.9729e-03, -1.9996e-02,
         8.4943e-03, -6.6905e-02, -8.1571e-02,  1.3539e-03, -7.6157e-03,
         2.6643e-02,  8.9292e-03,  1.3997e-02, -2.4854e-03, -9.7843e-03,
         1.4260e-02,  2.2840e-02, -9.9288e-03, 

In [31]:
#Print for all clusters the top 3 and bottom 3 elements
for i, cluster in enumerate(clusters):
    print("\nCluster {}, #{} Elements ".format(i+1, len(cluster)))
    for sentence_id in cluster[0:3]:
        print("\t", sentences[sentence_id])
    print("\t", "...")
    for sentence_id in cluster[-3:]:
        print("\t", sentences[sentence_id])


Cluster 1, #137 Elements 
	 Obesity is a serious public health concern with an increasing prevalence worldwide .
	 Obesity is a serious public health problem that is growing alarmingly worldwide .
	 Obesity is a serious public health problem , the prevalence of which is increasing dramatically all over the world .
	 ...
	 The survey findings suggest the majority of the Australian population recognises obesity to be a serious health problem , and support government regulation of the food environment as a population-level preventative strategy .
	 Obesity , a serious public health problem , relates to a chronic low-grade systemic inflammation and is involved in the development of obesity-linked disorders including insulin resistance , type 2 diabetes , cardiovascular diseases , dyslipidemia , and metabolic syndrome .
	 Childhood obesity represents a social burden.

Cluster 2, #132 Elements 
	 In recent years , the increasing number of antibiotic-resistant bacteria has become a serious h

## BERTopic

On AWS sa

In [36]:
from bertopic import BERTopic
topic_model = BERTopic(embedding_model='sentence-transformers/all-mpnet-base-v2', calculate_probabilities=True, verbose=True)

In [37]:
topics, probs = topic_model.fit_transform(sentences)

In [38]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,7142,-1_rrb_lrb_significantly_compared
1,92,524,92_preterm_birth_pregnancy_breastfeeding
2,325,331,325_language_lexical_parse_parsing
3,103,244,103_antibiotic_antibiotics_bacteria_antimicrobial
4,347,228,347_randomized_randomised_placebo_conducted
...,...,...,...
356,94,10,94_coli_escherichia_shiga_toxin
357,352,10,352_55_age_sem_sg
358,283,10,283_foot_knee_brace_braces
359,279,10,279_recruitment_retention_acceptability_interv...


In [48]:
for t in [1,2,3,4,5]:
    print('\n')
    print(topic_model.get_topic(t))



[('chagas', 0.14040831661877815), ('trypanosoma', 0.05764363064256777), ('trypanosomiasis', 0.051785035239295306), ('african', 0.02603955761615896), ('africa', 0.012058647228130544), ('disease', 0.01141601879882132), ('trypanosomes', 0.010201767290954742), ('trypanosoniasis', 0.010201767290954742), ('trypanosomosis', 0.010201767290954742), ('trypanocidal', 0.010201767290954742)]


[('iodine', 0.20204989105710383), ('idd', 0.09070876944995189), ('deficiency', 0.08313450682477758), ('goiter', 0.04023838967351731), ('iccidd', 0.02501776148382541), ('retardation', 0.01814175388999038), ('biochemical', 0.016544633311393005), ('bangladesh', 0.016237172615316573), ('egypt', 0.015954336192441885), ('saudi', 0.015448679501132085)]


[('toxoplasmosis', 0.12927631099306094), ('toxoplasma', 0.08192420343601008), ('protozoan', 0.051555123096351206), ('toxoplasmic', 0.04594877208174565), ('toxocara', 0.04594877208174565), ('toxocariasis', 0.04594877208174565), ('parasite', 0.04166709361569485), ('

In [74]:
topic_model.visualize_topics()

In [64]:
topic_model.visualize_distribution()

TypeError: visualize_distribution() missing 1 required positional argument: 'probabilities'

In [63]:
topic_model.visualize_hierarchy()

AttributeError: 'BERTopic' object has no attribute 'visualize_hierarchy'

## Try above after UMAP to reduce dimensionality ... most cluster methods don't handle dimensionality well

In [52]:
import umap
umap_embeddings = umap.UMAP(n_neighbors=15, 
                            n_components=5, 
                            metric='cosine').fit_transform(embeddings.cpu()) #cuda error

In [None]:
!pip uninstall bertopic --yes

Found existing installation: bertopic 0.6.0
Uninstalling bertopic-0.6.0:
  Successfully uninstalled bertopic-0.6.0


In [None]:
pip install bertopic==0.9.2

In [87]:
pip show bertopic

Name: bertopic
Version: 0.6.0
Summary: BERTopic performs topic Modeling with state-of-the-art transformer models.
Home-page: https://github.com/MaartenGr/BERTopic
Author: Maarten Grootendorst
Author-email: maartengrootendorst@gmail.com
License: UNKNOWN
Location: /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages
Requires: umap-learn, tqdm, numpy, scikit-learn, torch, hdbscan, pandas, sentence-transformers
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [None]:
pip show numpy