In [60]:
import pandas as pd
import pickle
from scripts.interactive_search import *
import numpy as np
import networkx as nx
from networkx.algorithms import community

In [61]:
# Change this function to select a different paper set
def read_metadata(path):
    df = pd.read_csv(path)
    df = df.astype({'abstract': 'str'})
    df = df[df['abstract'] != 'Unknown']
    df = df[df['abstract'] != 'nan']
    df = df[df['language'] == 'en']
    df = df[df['publish_time'].str[:4] == '2020']
    #df = df.drop_duplicates(['title','abstract'])
    #df = df.drop_duplicates(['title'])
    #df = df.drop_duplicates(['abstract'])
    df = df.reset_index(drop=True)
    df = df[:1000]
    return df

In [62]:
load_from_pickle = True
#EMBEDDINGS_PATH = 'data/embeddings_top5k.pkl'
#EMBEDDINGS_PATH = 'data/embeddings_2020.pkl'
EMBEDDINGS_PATH = 'data/embeddings_2020_top1k.pkl'
#METADATA_PATH = 'data/metadata_new_new.csv'
METADATA_PATH = 'data/metadata_en_20_topics.csv'

df_meta = read_metadata(METADATA_PATH)
abstracts = [a for a in df_meta['abstract']]
titles = df_meta['title'].reset_index(drop=True)

if load_from_pickle:
    with open(EMBEDDINGS_PATH, 'rb') as file:
        embeddings = pickle.load(file)

else:
    model = SentenceTransformer(MODEL_PATH)
    embeddings = model.encode(abstracts, show_progress_bar=True)
    with open(EMBEDDINGS_PATH, 'wb') as file:
        pickle.dump(embeddings, file)

In [63]:
df_meta[df_meta['title'] == "Genomic variance of the 2019-nCoV coronavirus"]

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,...,url,language,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7
64,uqls3p01,,biorxiv,Genomic variance of the 2019-nCoV coronavirus,10.1101/2020.02.02.931162,,,biorxiv,AbstractThere is rising global concern for the...,2020-02-04,...,https://doi.org/10.1101/2020.02.02.931162,en,,,,0.642545,0.340377,,,
888,zel9a3u6,,WHO,Genomic variance of the 2019-nCoV coronavirus,10.1002/jmv.25700,,,unk,Abstract There is rising global concern for th...,2020-02-19,...,https://doi.org/10.1002/jmv.25700,en,,,,0.705986,0.278361,,,


In [64]:
df_meta['title'].value_counts()

In Case You Haven't Heard…                                                                                                                                                                                 2
TWIRLS, an automated topic-wise inference method based on massive literature, suggests a possible mechanism via ACE2 for the pathological changes in the human host after coronavirus infection            2
Genome Detective Coronavirus Typing Tool for rapid identification and characterization of novel coronavirus genomes                                                                                        2
Genomic variance of the 2019-nCoV coronavirus                                                                                                                                                              2
Analysis of psychological state and clinical psychological intervention model of patients with COVID-19                                                                             

In [65]:
def compute_adjacency(embeddings, epsilon):
    cosine_distances = scipy.spatial.distance.cdist(embeddings, embeddings, "cosine")
    cosine_similarities = 1 - cosine_distances
    cosine_similarities[cosine_similarities < epsilon] = 0
    np.fill_diagonal(cosine_similarities, 0.0)
    return cosine_similarities

In [66]:
epsilon = 0.85
adjacency = compute_adjacency(embeddings, epsilon)
G = nx.from_numpy_array(adjacency)

In [67]:
print(len(G))

1000


In [68]:
index_to_title = {}
for i in list(G.nodes):
    index_to_title.update({i: titles[i]})

In [69]:
G = nx.relabel.relabel_nodes(G, index_to_title)

In [70]:
print(len(G))

996


In [71]:
def unordered_connected_components(G):
    for c in nx.connected_components(G):
        yield G.subgraph(c)
        
def connected_components(G):
    return sorted(list(unordered_connected_components(G)), key=len, reverse=True)

In [72]:
subgraphs = connected_components(G)

In [73]:
G = subgraphs[0]  # Keep only the main connected component

In [74]:
def compute_clusters_fluid(G, n_clusters=10, dtype='title'):
    clusters = []
    iterator = community.asyn_fluidc(G, n_clusters, max_iter=1)
    for cluster in iterator:
        clusters.append(cluster)

    return clusters

In [75]:
clusters = compute_clusters_fluid(G, n_clusters=3, dtype='title')  # Works quite well with high epsilon (.85) and only keeping main connected component
sum = 0
for cluster in clusters:
    sum += len(cluster)

In [76]:
title_to_cluster = {}
i = 0
for cluster in clusters:
    for title in cluster:
        if type(title) != str:
            print("Error: title is not a string => " + repr(title))
        title_to_cluster.update({title: i})
    i += 1
nx.set_node_attributes(G, title_to_cluster, "cluster")

In [77]:
for subgraph in subgraphs[1:]:
    G = nx.compose(G, subgraph)

In [78]:
nx.write_gexf(G, "data/graph.gexf")

In [79]:
s = 0
for subgraph in subgraphs:
    s += len(subgraph)
    
print(s)

996


The end.