In [1]:
from scripts.interactive_search import *
import networkx as nx
import numpy as np
from networkx.algorithms.community.centrality import girvan_newman
import nltk
from nltk.corpus import stopwords

In [2]:
# Make sure the NLTK stop-word list is available locally; when it is already
# present the call only reports that the package is up to date.
nltk.download('stopwords')
# cache_corpus is not defined in this notebook; presumably it comes from the
# star import of scripts.interactive_search -- verify there what it loads.
corpus = cache_corpus()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\val\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Corpus size 37576


In [3]:
# 37576 is the corpus size printed when the corpus was loaded; any other length
# means the cached corpus was built from a different dataset.
assert len(corpus) == 37576 # Ensure that the corpus was computed with the expected dataset

In [4]:
# Work on the first 1000 documents only.
# NOTE(review): this rebinds `corpus`, so re-running the size assertion after
# this cell fails (1000 != 37576); run the notebook top to bottom.
corpus = corpus[:1000]

In [5]:
# SentenceTransformer and MODEL_PATH are not defined in this notebook;
# presumably both come from the star import of scripts.interactive_search.
model = SentenceTransformer(MODEL_PATH)

In [6]:
# Encode the (truncated) corpus with the loaded model; show_progress_bar=True
# prints per-batch progress while encoding.
embeddings = model.encode(corpus, show_progress_bar=True)

Batches: 100%|███████████████████████████████| 125/125 [00:23<00:00,  5.34it/s]


In [7]:
def compute_adjacency(embeddings, epsilon):
    """Return a thresholded pairwise cosine-similarity matrix.

    Parameters
    ----------
    embeddings : array-like
        Input vectors, one per row; every row is compared with every row.
    epsilon : float
        Similarity threshold. Entries that are not ``>= epsilon`` are set to 0.

    Returns
    -------
    numpy.ndarray
        Square matrix whose entry ``[i, j]`` is ``1 - cosine_distance(i, j)``
        when that value is at least ``epsilon`` and 0 otherwise. The diagonal
        (each vector compared with itself) is not cleared.
    """
    # NOTE(review): `scipy` is not imported explicitly in this notebook's import
    # cell; presumably it is provided by the star import -- verify.
    cosine_distances = scipy.spatial.distance.cdist(embeddings, embeddings, "cosine")
    cosine_similarities = 1 - cosine_distances
    # Keep only entries that are provably >= epsilon. Writing the mask as a
    # negated ">=" instead of "<" also zeroes NaN similarities (produced when a
    # row has zero norm), which a plain "< epsilon" test would silently let
    # through as non-zero weights.
    cosine_similarities[~(cosine_similarities >= epsilon)] = 0
    return cosine_similarities

In [23]:
# Similarity threshold: compute_adjacency zeroes every pairwise cosine
# similarity below this value, so only strongly similar pairs keep a weight.
epsilon = 0.85
adjacency = compute_adjacency(embeddings, epsilon)

In [24]:
# Share of matrix entries that survived the threshold, as a percentage with
# two decimals. The count covers the whole matrix, diagonal included.
print(
    "Non-zero weight percentage: "
    f"{adjacency[adjacency != 0].size * 100 / adjacency.size:.2f}"
    "%"
)

Non-zero weight percentage: 2.93%


In [25]:
# Build a networkx graph from the thresholded similarity matrix and report how
# many connected components it has.
G = nx.from_numpy_array(adjacency)
print(f"Number of connected components: {nx.number_connected_components(G)!r}")

Number of connected components: 369


In [26]:
# Load the paper metadata table.
# `pd` and METADATA_PATH are not defined in this notebook; presumably both come
# from the star import of scripts.interactive_search -- verify.
df = pd.read_csv(METADATA_PATH)

In [27]:
# Force the abstract column to str so it can be compared with plain strings;
# the following cell relies on this to drop the 'nan' placeholder values.
df = df.astype({'abstract': 'str'})

In [28]:
# Drop rows whose abstract is one of the two placeholder strings, using a
# single combined boolean mask.
df = df[(df['abstract'] != 'Unknown') & (df['abstract'] != 'nan')]

In [29]:
# Titles of the rows that survived the abstract filter; they still carry the
# original (gappy) DataFrame index at this point.
titles = df['title']

In [30]:
# Renumber titles 0..n-1 so they can be looked up by graph node position.
# NOTE(review): assumes row i of the filtered metadata matches corpus[i] --
# confirm against how cache_corpus builds the corpus.
titles = titles.reset_index(drop=True)

In [31]:
# Map each integer node id of G to the title stored at the same position.
# NOTE(review): titles are not checked for uniqueness or for being strings
# here; verify before relying on them as node labels.
new_labels = {node_id: titles[node_id] for node_id in range(len(G))}

In [32]:
# Replace the integer node ids with paper titles (returns a relabeled copy).
# NOTE(review): if two nodes map to the same title they presumably collapse
# into one node -- confirm that titles are unique for this subset.
G = nx.relabel_nodes(G, new_labels, copy=True)

In [42]:
# Create the Girvan-Newman iterator; nothing is computed until next() is
# called on it in the following cell.
generator = girvan_newman(G)

In [43]:
# Advance the Girvan-Newman generator a fixed number of steps; `communities`
# ends up holding the result of the last step. Progress is written on a single
# line that is overwritten at each step via the carriage return.
iterations = 4
for i in range(iterations):
    communities = next(generator)
    print(f"{i + 1}/{iterations}", end='\r')

4/4

In [44]:
# English stop words as a set, so the per-word membership test in the keyword
# count below is a constant-time lookup.
stop_words = set(stopwords.words('english'))

In [49]:
# Print the most frequent non-stop-word title words of every large community.
min_community_size = 10  # communities with fewer members are skipped
top_k = 3                # number of top words reported per community

for community in communities:
    if len(community) < min_community_size:
        continue
    word_count = {}
    for title in community:
        # Community members are the node labels of G, expected to be title
        # strings. Skip anything else (e.g. a missing title that stayed a
        # float NaN) rather than crashing on .lower().
        if not isinstance(title, str):
            continue
        # split() without an argument splits on any run of whitespace, so
        # repeated spaces no longer produce '' tokens that get counted as
        # words, and tabs/newlines separate words as well.
        for word in title.lower().split():
            if word in stop_words:
                continue
            word_count[word] = word_count.get(word, 0) + 1
    # Rank once so the words and the counts printed below always line up.
    ranked = sorted(word_count.items(), key=lambda item: item[1], reverse=True)[:top_k]
    print([word for word, _ in ranked])
    print([count for _, count in ranked])

['covid-19', 'coronavirus', 'novel']
[207, 197, 128]


In [37]:
# Export the graph in GEXF format to a path relative to the working directory.
# NOTE(review): this cell's execution count (37) is lower than the cells above
# it, so it was run out of order; presumably the data/ directory must already
# exist -- verify on a fresh Restart & Run All.
nx.write_gexf(G, "data/graph.gexf")