In [1]:
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_samples, silhouette_score
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from tqdm import tqdm
import pickle

model = SentenceTransformer('all-MiniLM-L6-v2')

# Our sentences we like to encode
sentences = []

with open("filtered_predicates_v2.txt", "r") as file:
    for line in file:
        sentences.append(line.rstrip())

# Sentences are encoded by calling model.encode()
sentence_embeddings = model.encode(sentences)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(sentences[:20])

['induced', 'is a former', 'projects over', 'thank Professor for us tech-', 'block', 'granted to I', 'presenting at', 'is for from the , from', 'written by', 'written', 'is is the', 'is the Safer Nanomaterials the Oregon', 'supported by an', 'is grateful to', 'supported by the ( ICCA ) Initiative', 'providing stably transfected HEK293 OATP1B3 and Alexandra Heussner the technical', 'isolated from', 'received a partial Graduate Fellowship from the', 'is to ( Cell Physiology Department ,', 'managing']


In [3]:
def kmeans(kmeans_model, n):
    cluster_labels = kmeans_model.labels_
    silhouette_avg = silhouette_score(sentence_embeddings, cluster_labels)
    sample_silhouette_values = silhouette_samples(sentence_embeddings, cluster_labels)
    count = 0
    for i in range(n):
        cluster_score = np.array(sample_silhouette_values[cluster_labels == i]).mean()
        if cluster_score <= silhouette_avg:
            count += 1
    model_rate = count/n
    return model_rate

In [4]:
models = {}
satisfied_models = []
min_rate = 1
k = -1
optimal_number = -1
cache = {}
for n_cluster in tqdm(range(10, 201, 10)):
    current_model = KMeans(n_clusters=n_cluster, init='k-means++', random_state=42)
    current_model.fit(sentence_embeddings)
    models[n_cluster] = current_model
    failing_rate = kmeans(current_model, n_cluster)
    cache[n_cluster] = failing_rate
    if failing_rate == 0:
        if k != -1:
            k = n_cluster
        print(f'With {n_cluster} clusters, the silhouette constraint is satisfied.')
    else:
        if min_rate == 1:
            min_rate = failing_rate
        else:
            if failing_rate < min_rate:
                min_rate = failing_rate
                optimal_number = n_cluster
        print(f'With {n_cluster} clusters, there are {100*failing_rate}% of the clusters with silhouette score below average score.')
    if k == -1:
        k = optimal_number

# kmeans_models = list(models.values())
# with open('kmeans_models.pkl', 'wb') as file:
#     pickle.dump(kmeans_models, file)

  5%|▌         | 1/20 [00:03<01:03,  3.32s/it]

With 10 clusters, there are 60.0% of the clusters with silhouette score below average score.


 10%|█         | 2/20 [00:07<01:07,  3.73s/it]

With 20 clusters, there are 60.0% of the clusters with silhouette score below average score.


 15%|█▌        | 3/20 [00:12<01:14,  4.39s/it]

With 30 clusters, there are 53.333333333333336% of the clusters with silhouette score below average score.


 20%|██        | 4/20 [00:18<01:21,  5.08s/it]

With 40 clusters, there are 57.49999999999999% of the clusters with silhouette score below average score.


 25%|██▌       | 5/20 [00:25<01:28,  5.87s/it]

With 50 clusters, there are 54.0% of the clusters with silhouette score below average score.


 30%|███       | 6/20 [00:33<01:29,  6.42s/it]

With 60 clusters, there are 48.333333333333336% of the clusters with silhouette score below average score.


 35%|███▌      | 7/20 [00:55<02:31, 11.66s/it]

With 70 clusters, there are 47.14285714285714% of the clusters with silhouette score below average score.


 40%|████      | 8/20 [01:16<02:53, 14.43s/it]

With 80 clusters, there are 45.0% of the clusters with silhouette score below average score.


 45%|████▌     | 9/20 [01:39<03:10, 17.35s/it]

With 90 clusters, there are 46.666666666666664% of the clusters with silhouette score below average score.


 50%|█████     | 10/20 [02:02<03:08, 18.87s/it]

With 100 clusters, there are 50.0% of the clusters with silhouette score below average score.


 55%|█████▌    | 11/20 [02:24<02:58, 19.85s/it]

With 110 clusters, there are 50.0% of the clusters with silhouette score below average score.


 60%|██████    | 12/20 [02:49<02:52, 21.53s/it]

With 120 clusters, there are 45.83333333333333% of the clusters with silhouette score below average score.


 65%|██████▌   | 13/20 [03:12<02:33, 21.94s/it]

With 130 clusters, there are 43.84615384615385% of the clusters with silhouette score below average score.


 70%|███████   | 14/20 [03:35<02:13, 22.28s/it]

With 140 clusters, there are 43.57142857142857% of the clusters with silhouette score below average score.


 75%|███████▌  | 15/20 [03:57<01:51, 22.24s/it]

With 150 clusters, there are 50.66666666666667% of the clusters with silhouette score below average score.


 80%|████████  | 16/20 [04:22<01:31, 22.96s/it]

With 160 clusters, there are 50.625% of the clusters with silhouette score below average score.


 85%|████████▌ | 17/20 [04:46<01:10, 23.44s/it]

With 170 clusters, there are 48.8235294117647% of the clusters with silhouette score below average score.


 90%|█████████ | 18/20 [05:10<00:47, 23.56s/it]

With 180 clusters, there are 51.66666666666667% of the clusters with silhouette score below average score.


 95%|█████████▌| 19/20 [05:37<00:24, 24.39s/it]

With 190 clusters, there are 48.421052631578945% of the clusters with silhouette score below average score.


100%|██████████| 20/20 [06:04<00:00, 18.20s/it]

With 200 clusters, there are 52.0% of the clusters with silhouette score below average score.





In [5]:
print(cache)

{10: 0.6, 20: 0.6, 30: 0.5333333333333333, 40: 0.575, 50: 0.54, 60: 0.48333333333333334, 70: 0.4714285714285714, 80: 0.45, 90: 0.4666666666666667, 100: 0.5, 110: 0.5, 120: 0.4583333333333333, 130: 0.43846153846153846, 140: 0.4357142857142857, 150: 0.5066666666666667, 160: 0.50625, 170: 0.48823529411764705, 180: 0.5166666666666667, 190: 0.4842105263157895, 200: 0.52}


In [8]:
# {10: 0.5, 20: 0.5, 30: 0.4666666666666667, 40: 0.4, 50: 0.36, 60: 0.45, 70: 0.4714285714285714, 80: 0.4875, 90: 0.4888888888888889, 100: 0.41, 110: 0.42727272727272725, 120: 0.425, 130: 0.46153846153846156, 140: 0.4714285714285714, 150: 0.46, 160: 0.4875, 170: 0.45294117647058824, 180: 0.45, 190: 0.45789473684210524, 200: 0.435}
# With filtered_predicates.txt

In [6]:
# {10: 0.6, 20: 0.6, 30: 0.5333333333333333, 40: 0.575, 50: 0.54, 60: 0.48333333333333334, 70: 0.4714285714285714, 80: 0.45, 90: 0.4666666666666667, 100: 0.5, 110: 0.5, 120: 0.4583333333333333, 130: 0.43846153846153846, 140: 0.4357142857142857, 150: 0.5066666666666667, 160: 0.50625, 170: 0.48823529411764705, 180: 0.5166666666666667, 190: 0.4842105263157895, 200: 0.52}
# With filtered_predicates_v2.txt

In [7]:
optimal_n = 140

In [8]:
def distance(l1, l2):
    n = len(l1)
    return np.sqrt(np.sum(np.square((np.array(l1)-np.array(l2)))))

distance([1,0,2,3], [5,0,-1,3])

5.0

In [9]:
working_model = KMeans(n_clusters=optimal_n, init='k-means++', random_state=42)
working_model.fit(sentence_embeddings)
cluster_labels = working_model.labels_
index_dict = {}
for i in range(optimal_n):
    index_dict[i] = []
for i in range(len(cluster_labels)):
    index_dict[cluster_labels[i]].append(i)

In [10]:
cluster_distance = []
def representative():
    centers = working_model.cluster_centers_
    rep = []
    for cluster,index_list in index_dict.items():
        min_distance, optimal_index = -1, -1
        center = centers[cluster]
        for index in index_list:
            d = distance(center, sentence_embeddings[index])
            cluster_distance.append(d)
            optimal_index = index
            if min_distance == -1:
                min_distance = d
            else:
                if d < min_distance:
                    optimal_index = index
                    min_distance = d
        rep.append(sentences[optimal_index])
    return rep



In [11]:
representative()
cluster_distance

[0.7645346,
 0.7143166,
 0.7507553,
 0.74552673,
 0.8549099,
 0.8698995,
 0.72851455,
 0.7214784,
 0.8413487,
 0.7432567,
 0.751543,
 0.6972208,
 0.6765127,
 0.72758985,
 0.73130715,
 0.8894827,
 0.74340135,
 0.7297431,
 0.66772974,
 0.6832282,
 0.6635579,
 0.70923305,
 0.84065,
 0.9881564,
 0.73843485,
 0.74246675,
 0.63799524,
 0.8243013,
 0.39840227,
 0.5679412,
 0.5694921,
 0.58934253,
 0.36336225,
 0.3888908,
 0.5632187,
 0.45235646,
 0.41680536,
 0.35589227,
 0.576988,
 0.61738884,
 0.36646464,
 0.5714897,
 0.42259806,
 0.3338802,
 0.5151564,
 0.46313614,
 0.66055965,
 0.37747833,
 0.5917981,
 0.44105566,
 0.73575795,
 0.53755546,
 0.6495886,
 0.85796154,
 0.58725,
 0.4830444,
 0.74462056,
 0.51943475,
 0.5705938,
 0.6817922,
 0.3558922,
 0.43911937,
 0.71809906,
 0.50073415,
 0.81372577,
 0.6215584,
 0.8019395,
 0.63593477,
 0.4525265,
 0.36985782,
 0.26938665,
 0.4337068,
 0.28664535,
 0.85259813,
 0.396731,
 0.32706568,
 0.73266095,
 0.3848059,
 0.26938662,
 0.26938662,
 0.269

In [15]:
distance_with_index = [[cluster_distance[i], i] for i in range(optimal_n)]
distance_with_index.sort()
print(distance_with_index)
index_sorted = [distance_with_index[i][1] for i in range(optimal_n)]

[[0.26938662, 78], [0.26938662, 79], [0.26938662, 91], [0.26938662, 97], [0.26938662, 111], [0.26938665, 70], [0.26938668, 80], [0.26938668, 82], [0.26938668, 94], [0.26938668, 109], [0.26938668, 125], [0.28664517, 104], [0.28664535, 72], [0.32706568, 75], [0.3338802, 43], [0.34860924, 85], [0.3558922, 60], [0.35589227, 37], [0.36336225, 32], [0.36646464, 40], [0.36985782, 69], [0.37404546, 92], [0.3765652, 131], [0.37747833, 47], [0.38333786, 101], [0.3848059, 77], [0.3864682, 127], [0.3888908, 33], [0.3911237, 110], [0.39660603, 136], [0.396731, 74], [0.39840227, 28], [0.40276715, 103], [0.40754777, 129], [0.40800002, 84], [0.4138294, 137], [0.41592902, 89], [0.41680536, 36], [0.42259806, 42], [0.42906722, 102], [0.4329527, 139], [0.4337068, 71], [0.4337068, 119], [0.4390148, 135], [0.43911937, 61], [0.43949094, 96], [0.44105566, 49], [0.45235646, 35], [0.4525265, 68], [0.4538829, 126], [0.45731375, 95], [0.45953074, 117], [0.46073264, 108], [0.46313614, 45], [0.46867606, 115], [0.48

In [16]:
representatives = representative()
print(representatives[78])
print([sentences[i] for i in index_dict[78]])

has been funded by
['was partially funded by', 'was financially supported by', 'was financially supported', 'was financially supported by The', 'were funded through', 'funded by the ,', 'funded by Leading Pro-', 'was funded', 'was funded by', 'was funded for his', 'were funded by the', 'was funded by ,', 'was funded by', 'funded do', 'funded by We the', 'has been funded in', 'has not been funded by', 'was supported by funding from', 'was financed by The', 'was funded under', 'was financed the at', 'was funded in', 'was partially supported funding the', 'funded by the', 'funded the', 'sponsored funded funded', 'funded this', 'been funded through', 'funded', 'funded by the', 'funding this', 'were funded by', 'Funded by', 'were funded by a special', 'funded through', 'funded by We thank not necessarily reflect the', 'was funded from internal', 'funding the', 'funded by', 'was financially supported by a', 'funded by the the', 'funded by', 'was supported financially by', 'was financed by', 

In [17]:
for i in range(optimal_n):
    print(representatives[index_sorted[i]])

has been funded by
related to the nitrite in the
funded by the Ministry of Technology ( MEST ) in
OHIC2019G03 ) of
extend a sincere
involved in the
win
Review the IRIS Process , on , on
reports contractual
attenuate
Cited in
is lower than the maternal
has licensed to
Assessing their
acknowledge the technical
was totally supported by Intramural Program
induces significant
is This has not been
supported by the through the
used for primary
is Associate Professor at Faculty andDentalMedicine , Cairo
was funded by the Spanish MEC ( BIOMTOOLS ,
is a post doctoral
continued
receiving a 10 fold lower
were
acknowledge the ,
reviewed the
were much influenced by
involved with
is at the Columbia Uni-
Cancel Registration of Containing the
absorbed onto
is the '' MajorNew special
arresting them at
work for The
sanctioned to
would like to thank
is express their
was supported by the The Korea Institute of Planning & Evaluation for Technology in
wish , (
thank
1 ) , 1–27
is also indebted to ,
grant the

In [18]:
for rep in representatives:
    print(rep)

flowing into the tidal Anacostia River , Washington , DC ,
radiolabeled
funded by ( São do
were supported , respectively , by a Canadian
Caused by
Regarding of
were employ-
cited by
follow
are thankful to Lindsay Sosinski of The
is a scientific
thank Mrs
were responsible for conducting
is a named
is the
received the ECETOC Stewards Dr
submitted to , Division of Behavioral Science , National Institute for Health , Cincinnati , Ohio , USA , Environmental Health Testing Cincinnati ILSI Europe ,
contributed to the
was provided by
is gratefully acknowledged
sponsored by
grants P01 CA104177 , the Samuel
Department of Chemistry ,
prepared by
is their including :
is on Emergency Guidance Levels for Selected Submarine Contaminants , on Toxicology ,
are acknowledged
the Fundamental Research Funds for the
Cancel Registration of Containing the
served as the
makes
conformwith
supported by the through the
reviewed the
are grateful for the many
Department of Pathology , Branch ,
would like to thank
i

In [19]:
def cluster_distance(i):
    indexes = index_dict[i]
    cluster_size = len(indexes)
    total_distance = 0
    for i1 in range(cluster_size-1):
        for i2 in range(i1+1, cluster_size):
            total_distance += distance(sentence_embeddings[indexes[i1]], sentence_embeddings[indexes[i2]])
    return total_distance*2/cluster_size/(cluster_size-1)


In [20]:
cluster_distance(25)

1.2308489523493513

In [21]:
max_distance, cluster_with_max_distance = -1, -1
for i in range(optimal_n):
    temp = cluster_distance(i)
    if max_distance == -1:
        max_distance = temp
        cluster_with_max_distance = i
    elif max_distance < temp:
        max_distance = temp
        cluster_with_max_distance = i
print(cluster_with_max_distance)
print(max_distance)

16
1.2433329412112686


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN

In [23]:
dbscan = DBSCAN(eps=0.5, min_samples=30)
db_clusters = dbscan.fit_predict(sentence_embeddings)

In [24]:
# Get unique clusters
unique_clusters = np.unique(db_clusters)

# Print the clusters and corresponding predicates
for cluster in unique_clusters:
    cluster_predicates = np.array(sentences)[db_clusters == cluster]
    print(f"Cluster {cluster}:")
    for predicate in cluster_predicates:
        print(predicate)

Cluster -1:
induced
is a former
projects over
thank Professor for us tech-
block
granted to I
presenting at
is for from the , from
written by
written
is is the
is the Safer Nanomaterials the Oregon
supported by an
is grateful to
supported by the ( ICCA ) Initiative
providing stably transfected HEK293 OATP1B3 and Alexandra Heussner the technical
isolated from
received a partial Graduate Fellowship from the
is to ( Cell Physiology Department ,
managing
be linked to
is to acknowledge
Miami Valley Innovation Center , The Procter and Gamble Company ,
would like to thank
wish to express their
Subacute oral
assisting in
vol
must be performed to determine the full
offered by the
transformed standard
are also thankful to Council of Re-
aerosol
could also be an unrecognized
is as
thank valuable
may own
am
Assay with
are also thankful to
may interfere with
provided financial
niehs
methylated
was supported by the National Natural Scientific Founda-
supervised by
ordinated by the
Cancel Registratio

In [25]:
print(unique_clusters)

[-1  0]
