In [18]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN, AgglomerativeClustering

In [19]:
df = pd.read_csv('book.tsv', sep='\t', header=None)
# df = pd.read_csv('book-snippet.tsv', sep='\t', header=None)
source = df.iloc[:, 3]
source = source.dropna()
# authors = source[145:165]
# authors = pd.concat([authors, source[420:440]]) 
authors = source[:5000]

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
author_embeddings = [model.encode(author) for author in authors]

In [20]:
cosine_similarities = cosine_similarity(author_embeddings)
print(cosine_similarities.shape)

(5000, 5000)


In [21]:
complement = np.ones(cosine_similarities.shape)
complement -= cosine_similarities
complement = np.maximum(complement, 0)
print(cosine_similarities)
print(complement)
print(authors)

[[1.0000001  0.13771227 0.13771227 ... 0.0793032  0.06516084 0.06516084]
 [0.13771227 0.99999976 0.99999976 ... 0.24057013 0.27983385 0.27983385]
 [0.13771227 0.99999976 0.99999976 ... 0.24057013 0.27983385 0.27983385]
 ...
 [0.0793032  0.24057013 0.24057013 ... 1.0000002  0.95719796 0.95719796]
 [0.06516084 0.27983385 0.27983385 ... 0.95719796 1.0000001  1.0000001 ]
 [0.06516084 0.27983385 0.27983385 ... 0.95719796 1.0000001  1.0000001 ]]
[[0.00000000e+00 8.62287730e-01 8.62287730e-01 ... 9.20696802e-01
  9.34839159e-01 9.34839159e-01]
 [8.62287730e-01 2.38418579e-07 2.38418579e-07 ... 7.59429872e-01
  7.20166147e-01 7.20166147e-01]
 [8.62287730e-01 2.38418579e-07 2.38418579e-07 ... 7.59429872e-01
  7.20166147e-01 7.20166147e-01]
 ...
 [9.20696802e-01 7.59429872e-01 7.59429872e-01 ... 0.00000000e+00
  4.28020358e-02 4.28020358e-02]
 [9.34839159e-01 7.20166147e-01 7.20166147e-01 ... 4.28020358e-02
  0.00000000e+00 0.00000000e+00]
 [9.34839159e-01 7.20166147e-01 7.20166147e-01 ... 4.280

In [22]:
clustering = AgglomerativeClustering(n_clusters=None, linkage='complete', distance_threshold=0.3).fit(complement)

# Get cluster labels
cluster_labels = clustering.labels_
print(f'Num clusters: {clustering.n_clusters_}')

# Group authors based on cluster labels
clustered_authors = {}
for i, label in enumerate(cluster_labels):
    if label not in clustered_authors:
        clustered_authors[label] = []
    clustered_authors[label].append(authors.iloc[i])

# Print authors in each cluster
for cluster, authors_names in clustered_authors.items():
    print(f'Cluster {cluster}:')
    print(authors_names)

Num clusters: 1145
Cluster 555:
['Not Available', 'Not Available', 'Not Available', 'Not Available', 'Not Available', 'Not Available', 'Not Available', 'Not Available', 'Not Available', 'Not Available']
Cluster 1096:
['Knuth, Donald E.', 'Knuth, Donald E.', 'Knuth, Donald E.', 'Knuth, Donald E.', 'KNUTH, DONALD E.', 'Knuth, Donald E.', 'KNUTH, DONALD E.', 'KNUTH, DONALD E.', 'Knuth, Donald E.', 'Knuth, Donald E.', 'KNUTH, DONALD E.', 'Knuth, Donald E.', 'KNUTH, DONALD E.', 'Knuth, Donald E.', 'Knuth, Donald E.', 'Knuth, Donald E.', 'KNUTH, DONALD E.', 'Knuth, Donald E.', 'Knuth, Donald E.', 'Knuth, Donald E.', 'KNUTH, DONALD E.', 'Knuth, Donald E.', 'Knuth, Donald E.', 'Knuth, Donald E.', 'Knuth, Donald E.', 'Knuth, Donald E.', 'Knuth, Donald E.', 'Knuth, Donald E.', 'Knuth, Donald E.', 'Knuth, Donald E.', 'Knuth, Donald E.', 'Knuth, Donald E.', 'Knuth, Donald E.', 'Knuth, Donald E.', 'Knuth, Donald E.', 'Knuth, Donald E.', 'KNUTH, DONALD E.', 'Knuth, Donald E.', 'Knuth, Donald E.', 'K

In [23]:
# dclustering = DBSCAN(min_samples=1).fit(complement)
# cosine_similarities = np.maximum(complement, 0)
# dbscan = DBSCAN(eps=0.5, min_samples=2, metric='precomputed')
# labels = dbscan.fit_predict(cosine_similarities)

# # Print the cluster labels
# print("Cluster labels:", labels[:900])


In [27]:
# Brute force approach: Group together all indices where cosine similarity is > 0.8
groups = {} 
for i in range(len(cosine_similarities)):
    groups[i] = []

for i in range(len(cosine_similarities)):
    for j in range(i+1, len(cosine_similarities[0])):
        if cosine_similarities[i][j] > 0.8:
            groups[i].append(j)
            # groups[j].append(i)

In [29]:
graph = groups.copy()
clusters = []
visited = set()

def dfs(v, curr_cluster):
    visited.add(v)
    curr_cluster.append(v)
    for w in graph[v]:
        if w not in visited:
            dfs(w, curr_cluster)

for node in graph:
    if node not in visited:
        cluster = []
        dfs(node, cluster)
        clusters.append(cluster)

print(clusters)

[[0, 104, 1411, 1412, 3707, 3708, 3717, 3720, 3722, 3723], [1, 2, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 37, 38, 39, 41, 42, 43, 44, 45, 48, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 76, 77, 78, 80, 82, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 105, 106, 107, 108, 109, 110, 111, 114, 115, 116, 118, 119, 121, 122, 123, 124, 125, 126, 127, 128, 129, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 236, 237, 238, 239, 240, 242, 243, 244, 245, 246, 249, 251, 252, 253, 254, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 279, 280, 285, 286, 287, 288, 289, 291, 292, 294, 295, 297, 298, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 313, 314, 315, 316, 317, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 330, 331, 333, 334, 336

In [34]:
with open('output.txt', 'w') as f:
    for clus in clusters:
        f.write('[')
        for idx in clus:
            f.write(f'{authors.iloc[idx]}; ')
        f.write(']\n\n')