# Clustering Sentences (non-hierachical)

In [53]:
import pandas as pd

df = pd.read_csv("downloads/Aug_13_psy_envsci.csv")#.sample(frac=0.6)
df = df[df.text.notna() & df.doi.notna()]

In [54]:
len(df)

38608

In [55]:
df = df[df.fields == "Environmental Science"]

In [56]:
df = df.drop_duplicates(subset=["doi"])
df

Unnamed: 0,text,doi,fields,predicts
2,<p>Mediterranean areas are vulnerable and at h...,10.5194/egusphere-egu2020-18914,Environmental Science,1
3,The sediment quality of Ribeira de Iguape Rive...,10.1007/s10661-017-5971-x,Environmental Science,1
6,Thermal groundwater is of great economic and s...,10.1007/s12517-021-06632-3,Environmental Science,1
7,<p>Soil water storage (SWC) is a major spatio-...,10.5194/egusphere-egu2020-10944,Environmental Science,1
8,The success of variable rate N fertilizer appl...,10.2136/SSSAJ2002.1549,Environmental Science,1
...,...,...,...,...
38597,Although SAGD is a very popular in-situ extrac...,10.2118/196676-ms,Environmental Science,1
38602,Soil carbon management at landscape scale requ...,10.1002/jsfa.5593,Environmental Science,1
38605,Active soil organic matter (ASOM) has a main e...,10.1007/BF02856848,Environmental Science,1
38606,Freshwater availability is changing worldwide.,10.1038/s41586-018-0123-1,Environmental Science,1


In [6]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
     |████████████████████████████████| 85 kB 7.6 MB/s             
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=126546 sha256=6dff74965eb65101a522e7e9d27206654db65503e133855148ba4f754812e48f
  Stored in directory: /home/ec2-user/.cache/pip/wheels/c9/90/11/0e58d454669bc8daf94e04a8da9956aa6f78eb10cddb16dd4e
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.2


In [60]:
from sentence_transformers import SentenceTransformer, util

In [61]:
sentences = list(df.text)

## Sentence Transformers

In [62]:
#12:08

import time
start_time = time.time()

print("Encode the corpus of {}... get a coffee in the meantime".format(len(sentences)))
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
embeddings = model.encode(sentences, batch_size=64, show_progress_bar=True, convert_to_tensor=True)

print("--- %s seconds ---" % (time.time() - start_time))

Encode the corpus of 15929... get a coffee in the meantime


Batches:   0%|          | 0/249 [00:00<?, ?it/s]

--- 40.32659721374512 seconds ---


In [63]:
len(embeddings)

15929

In [64]:
#df.to_csv("downloads/pos_preds_sample_Jun162022.csv", index = False, header=True)

In [65]:
#df = pd.read_csv("downloads/pos_preds_sample_Jun162022.csv")

In [66]:
import torch
#torch.save(embeddings, 'downloads/23june_non_medicine.pt')

In [67]:
embeddings = embeddings[:230000]
len(embeddings)

15929

In [68]:
import numpy as np
embeddings = embeddings.cpu()
# Normalize the embeddings to unit length
corpus_embeddings = embeddings /  np.linalg.norm(embeddings, axis=1, keepdims=True)

### Agglomerative Clustering

In [69]:
!free
#2154789126400 needed
#231729824

             total       used       free     shared    buffers     cached
Mem:     261126736   27455400  233671336      48576    1264084    9279996
-/+ buffers/cache:   16911320  244215416
Swap:            0          0          0


### Fast Clustering (40k + sentences)

In [70]:
import gc
import torch

gc.collect()
torch.cuda.empty_cache()

In [71]:
import time
start_time = time.time()

print("Start clustering at ", start_time)


clusters = util.community_detection(corpus_embeddings, threshold=0.7)

#0.75 -> 1874 clusters
print("Clustering took {:.2f} sec".format(time.time() - start_time))

Start clustering at  1660402253.5010705
Clustering took 1.80 sec


In [72]:
len(clusters)

251

In [73]:
# import json
# with open("downloads/non_med_230k_clusters.json") as f:
#     excl = json.loads(f.read())

In [74]:
def cluster2D_to_center_dict(cluss, print_sample=False):
    #Print for all clusters the top 3 and bottom 3 elements
    clus = {}
    center_keys = {}
    for i, cluster in enumerate(cluss):
        clus[str(i)] = cluster
        center_keys[cluster[0]] = cluster[1:]
        
    if print_sample:     
        for sentence_id in clusters[0:3]:
            print("\t", sentences[sentence_id])
        print("\t", "...")
        for sentence_id in clusters[-3:]:
            print("\t", sentences[sentence_id])
            
    return center_keys

In [75]:
def idxs_to_text_clusters(clus, df):
    try:
        df = df.reset_index()
    except:
        pass
    cl = {}
    for k, vals in clus.items():
        center_text= df.iloc[k].text
#        
        if isinstance(vals, dict):
            cl[center_text] = idxs_to_text_clusters(vals, df)
        else:
            surround = list(df.iloc[vals].text)
            cl[center_text] = surround
    return cl


def idxs_to_details(clus, df):
    try:
        df = df.reset_index()
    except:
        pass
    cl = []
    for k, vals in clus.items():
        details=df.iloc[k].to_dict()
        #doi = details["doi"]
        #center_text= df.iloc[k].text
        details["children"] = []
        for child_idx in vals:
            details["children"] += [df.iloc[child_idx].to_dict()]
#         details["children"] = df.iloc[vals].to_dict()
        cl += [details]
#         cl += []
#         if isinstance(vals, dict):
#             cl[center_text] = idxs_to_text_clusters(vals, df)
#         else:
#             surround = list(df.iloc[vals].text)
#             cl[center_text] = surround
    return cl


In [76]:
center_keys = cluster2D_to_center_dict(clusters)

In [77]:
from IPython.display import JSON


In [78]:
# cll = idxs_to_text_clusters(center_keys, df)
cl2 = idxs_to_details(center_keys,df)


In [79]:
#cl2[0]["children"]

In [80]:
JSON(cl2[:10])

<IPython.core.display.JSON object>

## Recursive Cluster:
1. Take Clusters with more than >50 entries
2. Rerun community detection with different threshold
3. Print New Child Sentences

"cluster_0" -> DOIs of child centers
"cluster_1" -> DOIs of grandchild problem statements

In [81]:
from sklearn.cluster import AgglomerativeClustering
import numpy as np

def clustering(embeddings, **kwargs):
    embeddings = embeddings.cpu()
    # Normalize the embeddings to unit length
    corpus_embeddings = embeddings /  np.linalg.norm(embeddings, axis=1, keepdims=True)

    # Perform kmean clustering
    clustering_model = AgglomerativeClustering(**kwargs) #, affinity='cosine', linkage='average', distance_threshold=0.4)
    clustering_model.fit(corpus_embeddings)
   # cluster_assignment = clustering_model.labels_
    return clustering_model

def get_clusters(clustering_model): 
    clusters = {}
    for sentence_id, cluster_id in enumerate(clustering_model.labels_):
        if cluster_id not in clusters:
            clusters[cluster_id] = []
        try:
            clusters[cluster_id].append(sentence_id)
        except:
            print(sentence_id, "sentence_id")
    return clusters

In [82]:
# e = embeddings[center_keys[32634]]
# clusters2 = clustering(e, n_clusters=None, distance_threshold=1.5)
# clx = get_clusters(clusters2)

In [83]:
def child_clusters(center_keys, max_size, embeddings, thresh=1.5, min_variance=0.3):
    clus = {}

    for k,vs in list(center_keys.items()):
        c_size=len(vs)
        #print(k, c_size)
        embs = embeddings[vs]
        lookup = dict([*enumerate(vs)])

        if c_size > max_size and torch.var(embs, axis=0).sum() > min_variance:
            embs = embs.cpu()

            # Normalize the embeddings to unit length
            embs = embs /  np.linalg.norm(embs, axis=1, keepdims=True)

    #       clusters = util.community_detection(embs, threshold=threshold)
            cl = clustering(embs, n_clusters=None, distance_threshold=thresh)
            clx = get_clusters(cl)

            #get global index for cluster_item
            clusters = {lookup[k]: [lookup[v] for v in vs] for k,vs in clx.items()}
            children_count = len(clusters)
            print("Cluster with {} entries has {} children cluster nodes".format(c_size, children_count))
            
            if children_count > 1:
                #print(idxs_to_text_clusters(clusters, df))
                clus[k] = clusters
            else:
                
                #if just one child then no nesting
                clus[k] = vs
                pass

        else:
            clus[k] = vs #normal
            pass

            
    return clus


In [84]:
len(center_keys)

251

In [85]:
len(df)

15929

In [86]:
torch.var(embeddings, axis=0).sum()

tensor(0.8368)

In [87]:
cclus = child_clusters(center_keys, 30, embeddings, thresh=1.9, min_variance=0.32)

Cluster with 108 entries has 2 children cluster nodes
Cluster with 99 entries has 3 children cluster nodes
Cluster with 93 entries has 3 children cluster nodes
Cluster with 89 entries has 2 children cluster nodes
Cluster with 75 entries has 2 children cluster nodes
Cluster with 74 entries has 1 children cluster nodes
Cluster with 69 entries has 1 children cluster nodes
Cluster with 69 entries has 2 children cluster nodes
Cluster with 60 entries has 2 children cluster nodes
Cluster with 52 entries has 2 children cluster nodes
Cluster with 52 entries has 2 children cluster nodes
Cluster with 48 entries has 1 children cluster nodes
Cluster with 45 entries has 1 children cluster nodes
Cluster with 44 entries has 1 children cluster nodes
Cluster with 42 entries has 1 children cluster nodes
Cluster with 42 entries has 1 children cluster nodes
Cluster with 40 entries has 1 children cluster nodes
Cluster with 40 entries has 1 children cluster nodes
Cluster with 36 entries has 1 children cluste

In [88]:
c_flat = {}

for k,vs in cclus.items():
    outer = []
    inner = []
#     for num_or_list in vs:
        #if it is a dict, get out the child dicts/lists
        #print(type(num_or_list))
    if isinstance(vs, dict):
        for k2, vs2 in vs.items():
            c_flat[k2]=vs2
    else:
        #else keep as is
        c_flat[k] = vs
        pass

In [89]:
len(c_flat), len(cclus)

(264, 251)

In [90]:
JSON(cclus)

<IPython.core.display.JSON object>

In [91]:
flat_map = idxs_to_details(c_flat, df)

In [92]:
def json_norm(fmap):
    m = fmap
    if m.get("predicts"):
        del m["predicts"]
    m["index"] = int(m["index"])
    children =m.get("children")
    if children:
        for c in children:
            c = json_norm(c)
    return m
        
        
        
    
    

In [93]:
#json_norm(flat_map[0])

In [94]:
f_json = [json_norm(c_det) for c_det in flat_map]

In [95]:
JSON(f_json)

<IPython.core.display.JSON object>

In [96]:
# Variance + Size as recursion boolean
# Closest_problem linking [loi] neighbors

In [97]:
len(f_json)

264

In [98]:
import json
with open("downloads/env_clusters.json", "w") as f:
    json.dump(f_json, f)
    #print(data['data'][0])


In [99]:
len(df)


15929

In [100]:
df.to_csv("downloads/env_aug13.csv", index = False, header=True)

In [101]:
centers = df.reset_index().iloc[center_idxs]

NameError: name 'center_idxs' is not defined

In [None]:
centers["problem_id"] = center_idxs

In [None]:
centers

In [None]:
def get_neighbors(problem_id, clus=clus):
    n = []
    for c in clus.values():
        if c[0] == problem_id:
            n = c[1:]
    return n

In [None]:
centers["same_cluster"] = centers["problem_id"].map(get_neighbors)

df=df.reset_index()

In [None]:
centers["DOIs"] = centers["same_cluster"].map(lambda cls: list(df.iloc[cls]["doi"]))

In [None]:
centers['text'].iloc[3]

KEYBERT

In [None]:
!pip install keybert

In [None]:
from keybert import KeyBERT
kw_model = KeyBERT()

In [None]:
def get_kws(text, kw_model=kw_model):
    kws = kw_model.extract_keywords(text, 
                          keyphrase_ngram_range=(1, 2), 
                          stop_words='english',
                          use_mmr=True, 
                          diversity=0.25)
    
    kws.sort(key=lambda x: x[1], reverse=True)
    return kws

In [None]:
import random

#get a sample from the cluster and sum all the keywords
min(len(excl.keys()), 10))

In [None]:
kwc2= {k: get_kws(k)for k in excl.keys()}

In [None]:
JSON(kwc2)

In [None]:
kwc = {k: get_kws(k)for k in excl.keys()}

In [None]:
from IPython.display import JSON
JSON(kwc)

In [None]:
def compare(a, b):
    item1, item2 = a[1], b[1]
    if item1 < item2:
        return -1
    elif item1 > item2:
        return 1
    else:
        return 0
from functools import cmp_to_key
#sorted(mylist, key=cmp_to_key(compare))
def get_top_keywords(text):
    print(text, "--")
    return sorted(kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 3), stop_words='english', nr_candidates=30, top_n=6), key=cmp_to_key(compare), reverse=True)[0][0]

In [None]:
centers

In [None]:
len(df.drop_duplicates(subset=["text"])), len(df)

In [None]:
df.iloc[clus['3']]

In [None]:
#10:05
centers["topic"] = centers["text"].map(get_top_keywords)

In [None]:
df2 = centers[["text", "topic", "DOIs", "doi", "title", "journalName", "fieldsOfStudy"]]

In [None]:
df2.to_csv("downloads/final110k_16052022.csv", index = False, header=True)

## BERTopic

Visualizations have to be done in Colab (python3.8)
On AWS sagemaker Python version is 3.6 and Python3.8 is bad to install because of cuda version...


In [None]:
!pip install hdbscan umap 

In [None]:
!pip uninstall umap-learn

In [None]:
!pip uninstall umap-learn --yes

In [None]:
!pip install umap-learn

In [None]:
!pip3 uninstall umap --yes


In [None]:
from bertopic import BERTopic
topic_model = BERTopic(embedding_model='sentence-transformers/all-mpnet-base-v2', calculate_probabilities=True, verbose=True, nr_topics=4)

In [None]:
topics, probs = topic_model.fit_transform(sentences)

In [None]:
topic_model.get_topic_info()

In [None]:
for t in [1,2,3,4,5]:
    print('\n')
    print(topic_model.get_topic(t))

In [None]:
topic_model.visualize_topics()

In [None]:
#topic_model.visualize_hierarchy()

## Try above after UMAP to reduce dimensionality ... most cluster methods don't handle dimensionality well

In [None]:
import umap
umap_embeddings = umap.UMAP(n_neighbors=15, 
                            n_components=5, 
                            metric='cosine').fit_transform(embeddings.cpu()) #cuda error

In [None]:
!pip uninstall bertopic==0.9.2--yes

In [None]:
!pip install bertopic --no-dependencies

In [None]:
pip show bertopic

## Top2Vec

In [None]:
from top2vec import Top2Vec

model = Top2Vec(sentences, embedding_model='universal-sentence-encoder')

In [None]:
model.get_num_topics()

In [None]:
topic_words, word_scores, topic_nums = model.get_topics(77)


In [None]:
topic_words

In [None]:
topic_words, word_scores, topic_scores, topic_nums = model.search_topics(keywords=["medicine"], num_topics=5)


In [None]:
topic_words

In [None]:
topic_words, word_scores, topic_scores, topic_nums = model.search_topics(keywords=["medicine"], num_topics=5)
for topic in topic_nums:
    model.generate_topic_wordcloud(topic)

Since most clustering methods (like LDA) are based on words and word counts ... our dataset (which was done with extracting rules) makes problems:

In topic_num 38 for example, we can see it decides the topic to be: "serious problem" related
...we could filter these topics. There are more topics than rules, so they might just be a small problem (or the rules infiltrate multiple clusters ... since cluster size is set a prior ...)

In [None]:
documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=38, num_docs=15)
for doc, score, doc_id in zip(documents, document_scores, document_ids):
    print(f"Document: {doc_id}, Score: {score}")
    print("-----------")
    print(doc)
    print("-----------")
    print()

In [None]:
documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=33, num_docs=5)
for doc, score, doc_id in zip(documents, document_scores, document_ids):
    print(f"Document: {doc_id}, Score: {score}")
    print("-----------")
    print(doc)
    print("-----------")
    print()

In [None]:
documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=31, num_docs=5)
for doc, score, doc_id in zip(documents, document_scores, document_ids):
    print(f"Document: {doc_id}, Score: {score}")
    print("-----------")
    print(doc)
    print("-----------")
    print()

In [None]:
documents, document_scores, document_ids = model.search_documents_by_keywords(keywords=["diabetes"], num_docs=20)
for doc, score, doc_id in zip(documents, document_scores, document_ids):
    print(f"Document: {doc_id}, Score: {score}")
    print("-----------")
    print(doc)
    print("-----------")
    print()

In [None]:
words, word_scores = model.similar_words(keywords=["cancer"], keywords_neg=[], num_words=20)
for word, score in zip(words, word_scores):
    print(f"{word} {score}")

Since we know the different sources in our dataset, let's try the clustering AGAIN but without the algorithmically labeled sentences (data programming)

In [None]:
cols = [source for source in df["source"].unique() if "..." not in source] #{...} was the give-away for algorithmic generated
cols

In [None]:
w

In [None]:
df2 = df[df['source'].isin(cols)]
sent2 = list(df2["text"])

In [None]:
len(sent2)

In [None]:
from top2vec import Top2Vec

model2 = Top2Vec(sent2, embedding_model='universal-sentence-encoder')

In [None]:
model2.get_num_topics()

In [None]:
documents, document_scores, document_ids = model2.search_documents_by_topic(topic_num=0, num_docs=5)
for doc, score, doc_id in zip(documents, document_scores, document_ids):
    print(f"Document: {doc_id}, Score: {score}")
    print("-----------")
    print(doc)
    print("-----------")
    print()

In [None]:
from sklearn.cluster import AgglomerativeClustering



# Perform kmean clustering
clustering_model = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5) #, affinity='cosine', linkage='average', distance_threshold=0.4)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

In [None]:
clustered_sentences = {}
for sentence_id, cluster_id in enumerate(cluster_assignment):
    if cluster_id not in clustered_sentences:
        clustered_sentences[cluster_id] = []

    clustered_sentences[cluster_id].append(sentences[sentence_id])

# for i, cluster in clustered_sentences.items():
#     print("Cluster ", i+1)
#     print(cluster)
#     print("\n")