In [2]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN, AgglomerativeClustering

In [8]:
# Load the header-less, tab-separated book table and take the column at
# position 3, which this notebook treats as the author-name strings.
df = pd.read_csv('book.tsv', sep='\t', header=None)
source = df.iloc[:, 3]
source = source.dropna()

# Keep only the first 5000 non-null entries: the pairwise similarity matrix
# computed from these embeddings grows quadratically with this count.
authors = source[:5000]

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
# Encode every name in a single call so the model can batch the inputs,
# instead of invoking it once per string from a Python loop. The result has
# one embedding row per author, in the same order as `authors`.
author_embeddings = model.encode(authors.tolist())

In [9]:
# Pairwise cosine similarity between every pair of author embeddings.
cosine_similarities = cosine_similarity(author_embeddings)

# Sanity check: the matrix should be square, one row/column per author.
similarity_shape = cosine_similarities.shape
print(similarity_shape)

(5000, 5000)


In [15]:
# Turn similarities into distances (1 - similarity). The subtraction is done
# in float64, and any slightly negative results caused by rounding
# (similarities marginally above 1) are floored at zero.
complement = np.clip(1.0 - cosine_similarities.astype(np.float64), 0, None)

# Show the similarity matrix, the derived distance matrix and the names.
for shown in (cosine_similarities, complement, authors):
    print(shown)

[[1.0000001  0.13771227 0.13771227 ... 0.0793032  0.06516084 0.06516084]
 [0.13771227 0.99999976 0.99999976 ... 0.24057013 0.27983385 0.27983385]
 [0.13771227 0.99999976 0.99999976 ... 0.24057013 0.27983385 0.27983385]
 ...
 [0.0793032  0.24057013 0.24057013 ... 1.0000002  0.95719796 0.95719796]
 [0.06516084 0.27983385 0.27983385 ... 0.95719796 1.0000001  1.0000001 ]
 [0.06516084 0.27983385 0.27983385 ... 0.95719796 1.0000001  1.0000001 ]]
[[0.00000000e+00 8.62287730e-01 8.62287730e-01 ... 9.20696802e-01
  9.34839159e-01 9.34839159e-01]
 [8.62287730e-01 2.38418579e-07 2.38418579e-07 ... 7.59429872e-01
  7.20166147e-01 7.20166147e-01]
 [8.62287730e-01 2.38418579e-07 2.38418579e-07 ... 7.59429872e-01
  7.20166147e-01 7.20166147e-01]
 ...
 [9.20696802e-01 7.59429872e-01 7.59429872e-01 ... 0.00000000e+00
  4.28020358e-02 4.28020358e-02]
 [9.34839159e-01 7.20166147e-01 7.20166147e-01 ... 4.28020358e-02
  0.00000000e+00 0.00000000e+00]
 [9.34839159e-01 7.20166147e-01 7.20166147e-01 ... 4.280

In [13]:
# Show the last selected author. Indexing from the end avoids hard-coding the
# sample size, so this still works when fewer than 5000 names were available.
print(authors.iloc[-1])

Harold, Elliote Rusty


In [None]:
# `complement` already holds pairwise cosine distances, so the estimator must
# be told the input is a precomputed distance matrix. Without this it treats
# each row as a feature vector and measures Euclidean distance between rows,
# which makes the 0.3 threshold meaningless.
# NOTE(review): on scikit-learn < 1.2 this argument is named `affinity`.
clustering = AgglomerativeClustering(
    n_clusters=None,
    metric='precomputed',
    linkage='complete',
    distance_threshold=0.3,
).fit(complement)

# Get cluster labels
cluster_labels = clustering.labels_
print(f'Num clusters: {clustering.n_clusters_}')

# Group authors based on cluster labels (label -> list of author names).
# Labels are positional, so names are looked up with .iloc.
clustered_authors = {}
for i, label in enumerate(cluster_labels):
    clustered_authors.setdefault(label, []).append(authors.iloc[i])

# Print authors in each cluster
for cluster, authors_names in clustered_authors.items():
    print(f'Cluster {cluster}:')
    print(authors_names)

In [33]:
# Build the non-negative distance matrix under its own name. Rebinding
# `cosine_similarities` to distances would make that name misleading and would
# corrupt the distance computation if that earlier cell were re-run.
distance_matrix = np.maximum(complement, 0)

# Density-based clustering directly on the precomputed cosine distances.
# NOTE(review): with eps=0.5 the printed labels are almost all 0, which
# suggests the neighbourhood radius is too loose for this data - consider a
# smaller eps.
dbscan = DBSCAN(eps=0.5, min_samples=2, metric='precomputed')
labels = dbscan.fit_predict(distance_matrix)

# Print the cluster labels
print("Cluster labels:", labels[:900])


Cluster labels: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 0 