In [51]:
from scripts.interactive_search import *
import networkx as nx
import numpy as np
from networkx.algorithms.community.centrality import girvan_newman
import nltk
from nltk.corpus import stopwords

In [52]:
def read_metadata(path):
    """Load the metadata CSV at `path`, keeping only rows with a usable abstract.

    The `abstract` column is cast to `str`; rows whose abstract then equals the
    placeholder 'Unknown' or the text 'nan' (presumably what a missing value
    turns into after the cast) are removed.

    Parameters
    ----------
    path
        Location of the CSV file, passed straight to `pd.read_csv`.

    Returns
    -------
    pandas.DataFrame
        The remaining rows. The index is not reset, so it still carries the
        row positions of the original file.
    """
    metadata = pd.read_csv(path).astype({'abstract': 'str'})
    unusable = metadata['abstract'].isin(['Unknown', 'nan'])
    return metadata[~unusable]

In [53]:
# Fetch the NLTK English stop-word corpus (reported as up-to-date when already cached).
nltk.download('stopwords')
# Load the paper metadata, keeping only rows with a usable abstract.
df_meta = read_metadata(METADATA_PATH)
# METADATA_PATH, MODEL_PATH and SentenceTransformer are not defined in this notebook;
# presumably they arrive through the star import of scripts.interactive_search -- TODO confirm.
model = SentenceTransformer(MODEL_PATH)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\val\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [54]:
# English stop words as a set, for fast membership tests when counting title words.
stop_words = set(stopwords.words('english'))
# Abstract texts as a plain Python list, in DataFrame row order.
abstracts = df_meta['abstract'].tolist()
# Titles re-indexed from 0 so that titles[i] belongs to the same row as abstracts[i].
titles = df_meta['title'].reset_index(drop=True)

In [55]:
# Sanity check: the hard-coded count identifies the metadata snapshot this notebook
# was written against; a different count means a different dataset was loaded.
# NOTE: this only holds before `abstracts` is truncated in the following cell.
EXPECTED_ABSTRACT_COUNT = 37576
assert len(abstracts) == EXPECTED_ABSTRACT_COUNT, (
    "unexpected corpus size: got " + repr(len(abstracts))
    + " abstracts, expected " + repr(EXPECTED_ABSTRACT_COUNT)
)

In [56]:
# Keep only the first 1000 abstracts for the rest of the notebook
# (presumably to keep encoding and clustering time manageable).
# NOTE(review): this rebinds `abstracts`, so the corpus-size assert in the
# previous cell fails if that cell is re-run after this one.
abstracts = abstracts[:1000]

In [57]:
# Encode the retained abstracts with the loaded model; row i of the result is
# presumably the embedding of abstracts[i] -- verify against the model's API.
embeddings = model.encode(abstracts, show_progress_bar=True)

Batches: 100%|███████████████████████████████| 125/125 [00:22<00:00,  5.55it/s]


In [58]:
def compute_adjacency(embeddings, epsilon):
    """Build a thresholded cosine-similarity matrix for a set of embeddings.

    Parameters
    ----------
    embeddings : array-like, 2-D
        One embedding per row (the layout `cdist` requires).
    epsilon : float
        Minimum similarity to keep; entries strictly below it are set to 0.

    Returns
    -------
    numpy.ndarray
        Square matrix of pairwise cosine similarities in which every entry
        below `epsilon`, and every undefined (NaN) entry, is 0. Diagonal
        entries are thresholded like any other entry.
    """
    cosine_distances = scipy.spatial.distance.cdist(embeddings, embeddings, "cosine")
    cosine_similarities = 1 - cosine_distances
    # Cosine similarity is undefined (NaN) when an embedding is all zeros, and
    # `NaN < epsilon` is False, so a NaN would otherwise pass the threshold and
    # end up as an edge weight. Treat undefined similarities as "no edge".
    drop = np.isnan(cosine_similarities) | (cosine_similarities < epsilon)
    cosine_similarities[drop] = 0
    return cosine_similarities

In [59]:
# Similarity threshold: pairs whose cosine similarity is below this value get weight 0.
epsilon = 0.85
adjacency = compute_adjacency(embeddings, epsilon)

In [60]:
# Share of matrix entries that kept a non-zero weight after thresholding.
nonzero_share = np.count_nonzero(adjacency) * 100 / adjacency.size
print("Non-zero weight percentage: {0:.2f}%".format(nonzero_share))

Non-zero weight percentage: 2.93%


In [61]:
# Turn the thresholded similarity matrix into a graph: one node per row index,
# one edge per non-zero entry.
G = nx.from_numpy_array(adjacency)
component_count = nx.number_connected_components(G)
print("Number of connected components: {0!r}".format(component_count))

Number of connected components: 369


In [62]:
# Map each node index to its paper title. Titles are not guaranteed to be
# unique, and relabelling several nodes to the same label would merge them
# into a single node, so a repeated title gets the node index appended.
new_labels = {}
used_labels = set()
for i in range(len(G)):
    label = titles[i]
    if label in used_labels:
        label = "{0} [{1}]".format(label, i)
    used_labels.add(label)
    new_labels[i] = label

In [63]:
# Replace the integer node ids by their labels. With copy=True (the default)
# a new graph is built, which then takes over the name G.
G = nx.relabel_nodes(G, new_labels, copy=True)

In [66]:
def compute_clusters_girvan(G, iterations):
    """Run Girvan-Newman on `G` and return the partition reached after `iterations` steps.

    Parameters
    ----------
    G : networkx graph
        Graph handed to `girvan_newman`.
    iterations : int
        Number of partitions to draw from the generator; must be at least 1.

    Returns
    -------
    The last partition drawn, exactly as yielded by `girvan_newman`. If the
    generator is exhausted before `iterations` partitions were drawn, the last
    partition it did yield is returned (None if it yielded nothing).

    Raises
    ------
    ValueError
        If `iterations` is smaller than 1. Previously this case surfaced as an
        UnboundLocalError on the return statement.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1, got " + repr(iterations))
    clusters = None
    # zip() polls range() first, so no partition beyond `iterations` is computed,
    # and an exhausted generator ends the loop instead of raising StopIteration.
    for i, clusters in zip(range(iterations), girvan_newman(G)):
        print("Computing clusters, iteration " + repr(i+1) + '/' + repr(iterations), end='\r')
    return clusters

In [67]:
# Number of Girvan-Newman partitions to step through before stopping.
iterations = 2
clusters = compute_clusters_girvan(G, iterations)

Computing clusters, iteration 2/2

In [41]:
min_community_size = 6  # communities with fewer members are not reported
top_k = 3               # number of most frequent title words shown per community

# Iterate over `clusters`, the partition returned by compute_clusters_girvan;
# the name `communities` used here before is never assigned in this notebook.
for community in clusters:
    if len(community) < min_community_size:
        continue
    word_count = {}
    for title in community:
        if not isinstance(title, str):
            # Node labels are titles; skip labels that are not text (e.g. a missing title).
            continue
        # split() without an argument collapses runs of whitespace, so empty
        # strings are never counted as words.
        for word in title.lower().split():
            if word in stop_words:
                continue
            word_count[word] = word_count.get(word, 0) + 1
    # Rank once so that the words and their counts are printed in the same order.
    ranked = sorted(word_count.items(), key=lambda item: item[1], reverse=True)[:top_k]
    print([word for word, _ in ranked])
    print([count for _, count in ranked])

['covid-19', 'coronavirus', 'novel']
[208, 199, 129]


In [42]:
# Export the relabelled graph as GEXF. The path is relative to the current
# working directory; presumably the `data/` directory must already exist.
nx.write_gexf(G, "data/graph.gexf")