In [1]:
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_samples, silhouette_score
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from tqdm import tqdm
import pickle

model = SentenceTransformer('all-MiniLM-L6-v2')

# Our sentences we like to encode
sentences = []

with open("filtered_predicates.txt", "r") as file:
    for line in file:
        sentences.append(line.rstrip())

# Sentences are encoded by calling model.encode()
sentence_embeddings = model.encode(sentences)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
print(sentences[:20])

['are declared authors', 'have any conflict of interest in publication', 'established as a novel mast cell stabilizer , the capacity preventing of sta-', 'contracted by are contracted', 'was no conflict of interest any of the authors was', 'conceived the performed most experiments Shasha Yin ,', 'declare there no', 'received grant funding from received grant funding from', 'supplied the materials for testing , and the Center for Occupational / Environ- mental Risk', 'has consulted with studies for Grace Co', 'is are employees of director of S Fry', 'declare no competing financial interest', 'is interest document', 'is consultant for has has received speaker fees from', 'performing the assays', 'was supported by a Mary Anne Stock Student Research Award from the Simon Watkins for the use of the electron microscopes at the', 'are employed by Inc ,', 'ticals Conflict of interest', 'is Jianjian Tao', 'was supported in part by a startup fund from The']


In [4]:
def kmeans(kmeans_model, n):
    cluster_labels = kmeans_model.labels_
    silhouette_avg = silhouette_score(sentence_embeddings, cluster_labels)
    sample_silhouette_values = silhouette_samples(sentence_embeddings, cluster_labels)
    count = 0
    for i in range(n):
        cluster_score = np.array(sample_silhouette_values[cluster_labels == i]).mean()
        if cluster_score <= silhouette_avg:
            count += 1
    model_rate = count/n
    return model_rate

In [5]:
models = {}
satisfied_models = []
min_rate = 1
k = -1
optimal_number = -1
cache = {}
for n_cluster in tqdm(range(10, 201, 10)):
    current_model = KMeans(n_clusters=n_cluster, init='k-means++', random_state=42)
    current_model.fit(sentence_embeddings)
    models[n_cluster] = current_model
    failing_rate = kmeans(current_model, n_cluster)
    cache[n_cluster] = failing_rate
    if failing_rate == 0:
        if k != -1:
            k = n_cluster
        print(f'With {n_cluster} clusters, the silhouette constraint is satisfied.')
    else:
        if min_rate == 1:
            min_rate = failing_rate
        else:
            if failing_rate < min_rate:
                min_rate = failing_rate
                optimal_number = n_cluster
        print(f'With {n_cluster} clusters, there are {100*failing_rate}% of the clusters with silhouette score below average score.')
    if k == -1:
        k = optimal_number

# kmeans_models = list(models.values())
# with open('kmeans_models.pkl', 'wb') as file:
#     pickle.dump(kmeans_models, file)

  5%|▌         | 1/20 [00:01<00:35,  1.85s/it]

With 10 clusters, there are 50.0% of the clusters with silhouette score below average score.


 10%|█         | 2/20 [00:03<00:35,  1.99s/it]

With 20 clusters, there are 50.0% of the clusters with silhouette score below average score.


 15%|█▌        | 3/20 [00:06<00:38,  2.28s/it]

With 30 clusters, there are 46.666666666666664% of the clusters with silhouette score below average score.


 20%|██        | 4/20 [00:09<00:42,  2.68s/it]

With 40 clusters, there are 40.0% of the clusters with silhouette score below average score.


 25%|██▌       | 5/20 [00:13<00:44,  2.95s/it]

With 50 clusters, there are 36.0% of the clusters with silhouette score below average score.


 30%|███       | 6/20 [00:16<00:44,  3.19s/it]

With 60 clusters, there are 45.0% of the clusters with silhouette score below average score.


 35%|███▌      | 7/20 [00:24<00:58,  4.52s/it]

With 70 clusters, there are 47.14285714285714% of the clusters with silhouette score below average score.


 40%|████      | 8/20 [00:31<01:04,  5.37s/it]

With 80 clusters, there are 48.75% of the clusters with silhouette score below average score.


 45%|████▌     | 9/20 [00:37<01:01,  5.60s/it]

With 90 clusters, there are 48.888888888888886% of the clusters with silhouette score below average score.


 50%|█████     | 10/20 [00:44<01:00,  6.07s/it]

With 100 clusters, there are 41.0% of the clusters with silhouette score below average score.


 55%|█████▌    | 11/20 [00:52<00:59,  6.58s/it]

With 110 clusters, there are 42.72727272727273% of the clusters with silhouette score below average score.


 60%|██████    | 12/20 [00:59<00:54,  6.83s/it]

With 120 clusters, there are 42.5% of the clusters with silhouette score below average score.


 65%|██████▌   | 13/20 [01:06<00:48,  6.90s/it]

With 130 clusters, there are 46.15384615384615% of the clusters with silhouette score below average score.


 70%|███████   | 14/20 [01:14<00:42,  7.14s/it]

With 140 clusters, there are 47.14285714285714% of the clusters with silhouette score below average score.


 75%|███████▌  | 15/20 [01:22<00:37,  7.44s/it]

With 150 clusters, there are 46.0% of the clusters with silhouette score below average score.


 80%|████████  | 16/20 [01:31<00:31,  7.92s/it]

With 160 clusters, there are 48.75% of the clusters with silhouette score below average score.


 85%|████████▌ | 17/20 [01:40<00:24,  8.06s/it]

With 170 clusters, there are 45.294117647058826% of the clusters with silhouette score below average score.


 90%|█████████ | 18/20 [01:49<00:16,  8.35s/it]

With 180 clusters, there are 45.0% of the clusters with silhouette score below average score.


 95%|█████████▌| 19/20 [01:58<00:08,  8.67s/it]

With 190 clusters, there are 45.78947368421053% of the clusters with silhouette score below average score.


100%|██████████| 20/20 [02:07<00:00,  6.40s/it]

With 200 clusters, there are 43.5% of the clusters with silhouette score below average score.





In [7]:
print(cache)

{10: 0.5, 20: 0.5, 30: 0.4666666666666667, 40: 0.4, 50: 0.36, 60: 0.45, 70: 0.4714285714285714, 80: 0.4875, 90: 0.4888888888888889, 100: 0.41, 110: 0.42727272727272725, 120: 0.425, 130: 0.46153846153846156, 140: 0.4714285714285714, 150: 0.46, 160: 0.4875, 170: 0.45294117647058824, 180: 0.45, 190: 0.45789473684210524, 200: 0.435}


In [8]:
# {10: 0.5, 20: 0.5, 30: 0.4666666666666667, 40: 0.4, 50: 0.36, 60: 0.45, 70: 0.4714285714285714, 80: 0.4875, 90: 0.4888888888888889, 100: 0.41, 110: 0.42727272727272725, 120: 0.425, 130: 0.46153846153846156, 140: 0.4714285714285714, 150: 0.46, 160: 0.4875, 170: 0.45294117647058824, 180: 0.45, 190: 0.45789473684210524, 200: 0.435}


In [6]:
optimal_n = 50

In [7]:
def distance(l1, l2):
    n = len(l1)
    return np.sqrt(np.sum(np.square((np.array(l1)-np.array(l2)))))

distance([1,0,2,3], [5,0,-1,3])

5.0

In [8]:
working_model = KMeans(n_clusters=optimal_n, init='k-means++', random_state=42)
working_model.fit(sentence_embeddings)
cluster_labels = working_model.labels_
index_dict = {}
for i in range(optimal_n):
    index_dict[i] = []
for i in range(len(cluster_labels)):
    index_dict[cluster_labels[i]].append(i)

In [11]:
cluster_distance = []
def representative():
    centers = working_model.cluster_centers_
    rep = []
    for cluster,index_list in index_dict.items():
        min_distance, optimal_index = -1, -1
        center = centers[cluster]
        for index in index_list:
            d = distance(center, sentence_embeddings[index])
            cluster_distance.append(d)
            optimal_index = index
            if min_distance == -1:
                min_distance = d
            else:
                if d < min_distance:
                    optimal_index = index
                    min_distance = d
        rep.append(sentences[optimal_index])
    return rep



In [13]:
representative()
cluster_distance

[0.8167127,
 0.6405485,
 0.60156476,
 0.67891896,
 0.781663,
 0.67911106,
 0.6037236,
 0.75456154,
 0.85098606,
 0.36623496,
 0.41218135,
 0.5266544,
 0.3840952,
 0.7844107,
 0.80208737,
 0.6026588,
 0.5604279,
 0.5016287,
 0.72509867,
 0.56052446,
 0.42505485,
 0.6213612,
 0.5569222,
 0.35462525,
 0.6827434,
 0.6333616,
 0.5512688,
 0.39999673,
 0.6159101,
 0.35666457,
 0.61047924,
 0.54173106,
 0.65159637,
 0.4462665,
 0.6544379,
 0.42219636,
 0.7949206,
 0.44896957,
 0.42143172,
 0.47145364,
 0.80826294,
 0.32160643,
 0.5516192,
 0.6882394,
 0.704184,
 0.45612255,
 0.4052241,
 0.8181527,
 0.720499,
 0.48753786,
 0.5191831,
 0.36623496,
 0.45056948,
 0.8017569,
 0.47507375,
 0.5200323,
 0.5874928,
 0.60876054,
 0.40215242,
 0.6716741,
 0.43834272,
 0.48262677,
 0.8456758,
 0.7749148,
 0.7627704,
 0.43846855,
 0.5688198,
 0.5118384,
 0.8379794,
 0.49840423,
 0.8058474,
 0.63861877,
 0.7991041,
 0.79454345,
 0.4771055,
 0.5054319,
 0.51412404,
 0.88151044,
 0.53991526,
 0.8279822,
 0.6

In [20]:
distance_with_index = [[cluster_distance[i], i] for i in range(50)]
distance_with_index.sort()
print(distance_with_index)
index_sorted = [distance_with_index[i][1] for i in range(50)]

[[0.32160643, 41], [0.35462525, 23], [0.35666457, 29], [0.36623496, 9], [0.3840952, 12], [0.39999673, 27], [0.4052241, 46], [0.41218135, 10], [0.42143172, 38], [0.42219636, 35], [0.42505485, 20], [0.4462665, 33], [0.44896957, 37], [0.45612255, 45], [0.47145364, 39], [0.48753786, 49], [0.5016287, 17], [0.5266544, 11], [0.54173106, 31], [0.5512688, 26], [0.5516192, 42], [0.5569222, 22], [0.5604279, 16], [0.56052446, 19], [0.60156476, 2], [0.6026588, 15], [0.6037236, 6], [0.61047924, 30], [0.6159101, 28], [0.6213612, 21], [0.6333616, 25], [0.6405485, 1], [0.65159637, 32], [0.6544379, 34], [0.67891896, 3], [0.67911106, 5], [0.6827434, 24], [0.6882394, 43], [0.704184, 44], [0.720499, 48], [0.72509867, 18], [0.75456154, 7], [0.781663, 4], [0.7844107, 13], [0.7949206, 36], [0.80208737, 14], [0.80826294, 40], [0.8167127, 0], [0.8181527, 47], [0.85098606, 8]]


In [18]:
representatives = representative()
print(representatives[41])
print([sentences[i] for i in index_dict[41]])

would also like to thank assistance
['like to thankAndrea Stoller andHyazinthDobrowinski for assistance', 'would also like to thank technical assistance', 'is thank the', 'would to thank', 'would like to acknowledge Amanda in completing work', 'thank the of', 'thank the', 'thank for helpful discussions', 'would also like to thank Christine ( FHNW ) for her technical assistance', 'would like to acknowledge Julie their assistance in completing work', 'provided review', 'can be found online', 'would like to acknowledge Julie their assistance in completing Mielke , H', 'thank', 'competing thank for critical editing of', 'thank Dr', 'would also like to thank Weisbrod , ( FHNW ) for her technical assistance', 'thank for critical editing of', 'would like to acknowledge Amanda in completing Mielke , H', 'would like to acknowledge Julie for their assistance in completing work', 'would also like to thank Christine Weisbrod , ( FHNW ) for her technical assistance', 'thank assistance', 'are greatl

In [21]:
for i in range(50):
    print(representatives[index_sorted[i]])

would also like to thank assistance
are employees of LLC ( Andrew R Cook , Charles B Breckenridge , Daniel J Minnema , Kim Z Travis , Nicholas C Sturgess and
is part
supplement
were performed
is founder of P Statistics Ltd
prolongs survival in mice by inhibiting spontaneous tumor attributed regulating
conceived the experiments , Xiao Yang performed the experiments ,
was supported by the potential work was supported by the
is the Yaksh , faculty member ,
have made a significant contribution approved
do not necessarily represent the official position of
are employees of The is a wholly owned subsidiary of The
be developmental signaling :
was funded by the provided by China Scholarship Council for Ji - Liang Zhang ( No
provides expert testimony in litigation involving ex- posure to silica and has been engaged to provide consultation , expert advice , on scientific matters related to these subjects by both public entities , including the National Stone , Sand , and
are no competing are no 

In [17]:
for rep in representatives:
    print(rep)

was supported by a Mary Anne Stock Student Research Award from the Simon Watkins for the use of the processing of tissue at the
are employees
are employees of the study ;
conceived the performed most experiments assisted someex- periments
was supported by
result analysis
is conflict interest
reports grants from the
has received consultant fees from Advocacy Solutions , Cohn
supplement
conceived the experiments , Xiao Yang performed the experiments ,
decreases cellular Rad51 pro- tein levels Combined treatment arachidin-1 further decreases the expression of that are associated with enhanced chemo - sensitivity to
were performed
is director of Ltd
supplied the materials for testing , and the Center for Occupational / Environ- mental Risk Analysis
was provided by The authors declare that they have no conflict of interest
wrote the manuscript
are no competing are no competing
are employed by a supplier of
is / 56 ( 2010 ) 357–364
have made a significant contribution approved
induced is ass

In [20]:
def cluster_distance(i):
    indexes = index_dict[i]
    cluster_size = len(indexes)
    total_distance = 0
    for i1 in range(cluster_size-1):
        for i2 in range(i1+1, cluster_size):
            total_distance += distance(sentence_embeddings[indexes[i1]], sentence_embeddings[indexes[i2]])
    return total_distance*2/cluster_size/(cluster_size-1)


In [21]:
cluster_distance(25)

0.9547890501587014

In [22]:
max_distance, cluster_with_max_distance = -1, -1
for i in range(optimal_n):
    temp = cluster_distance(i)
    if max_distance == -1:
        max_distance = temp
        cluster_with_max_distance = i
    elif max_distance < temp:
        max_distance = temp
        cluster_with_max_distance = i
print(cluster_with_max_distance)
print(max_distance)

5
1.2574970663202052


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN

In [5]:
dbscan = DBSCAN(eps=0.1, min_samples=10)
db_clusters = dbscan.fit_predict(sentence_embeddings)

In [4]:
# Get unique clusters
unique_clusters = np.unique(db_clusters)

# Print the clusters and corresponding predicates
for cluster in unique_clusters:
    cluster_predicates = np.array(sentences)[db_clusters == cluster]
    print(f"Cluster {cluster}:")
    for predicate in cluster_predicates:
        print(predicate)

Cluster -1:
are declared authors
have any conflict of interest in publication
established as a novel mast cell stabilizer , the capacity preventing of sta-
contracted by are contracted
was no conflict of interest any of the authors was
conceived the performed most experiments Shasha Yin ,
declare there no
received grant funding from received grant funding from
supplied the materials for testing , and the Center for Occupational / Environ- mental Risk
has consulted with studies for Grace Co
is are employees of director of S Fry
declare no competing financial interest
is interest document
is consultant for has has received speaker fees from
performing the assays
was supported by a Mary Anne Stock Student Research Award from the Simon Watkins for the use of the electron microscopes at the
are employed by Inc ,
ticals Conflict of interest
is Jianjian Tao
was supported in part by a startup fund from The
like to thankAndrea Stoller andHyazinthDobrowinski for assistance
is a former and all ot

In [22]:
print(unique_clusters)

[-1]
