# Extract information to visualize the citation graph

- Node table: DOI, in-degree, topic, and title of each paper in the graph
- Edge list: (source, target) DOI pairs

In [1]:
import pandas as pd
# Load the paper table (one row per paper record, including its reference
# DOIs and author info). index_col=False keeps every CSV column as data.
df_compdata_ref_auth = pd.read_csv(
    './data/compdata_ref_author.csv',
    index_col=False,
)

In [2]:
# Keep only the first row for each distinct title, then report how many
# papers remain.
df_compdata_ref_auth_redup = df_compdata_ref_auth.drop_duplicates(subset='title')
df_compdata_ref_auth_redup.shape[0]

5686

In [3]:
# Show a single row to inspect the available columns.
df_compdata_ref_auth_redup.head(1)

Unnamed: 0.1,Unnamed: 0,title,abstract,journal,DOI,date,collection,published,nauthors,topic.Healthcare,...,numcit,influcit,DL_Pdf,DL_Full,DL_Abstract,ref,authorsId,authorsNames,p_doi,p_journal
0,22,"Multivariate analysis of CT imaging, laborator...",PURPOSE: To develop and externally validate a ...,Abdom Radiol (NY),33098478,2020-10-25,pubmed,,6,0.000988,...,0,0,,,,"10.5152/TJAR.2014.83436, 10.1159/000509274, 10...","4593769, 2001055410, 2001050867, 16053632, 144...","S. Hectors, Sadjad Riyahi, Hreedi Dev, K. Kris...",10.1007/s00261-020-02823-w,Abdominal Radiology


In [4]:
# Generate edge_list
import numpy as np

# Map each paper's DOI to its raw reference string (a ', '-separated list of
# DOIs, or NaN when the 'ref' cell is empty). If the same DOI appears on
# several rows, the last row wins.
doi2ref = dict(zip(df_compdata_ref_auth_redup['p_doi'],
                   df_compdata_ref_auth_redup['ref']))

edge_list = list() # list of (citing doi, cited doi) pairs
for citing_doi, refs in doi2ref.items():
    # Missing values arrive as NaN. Checking for "is a string" (instead of
    # `type(x) is float`) also covers np.float64 NaN and None, and skips
    # rows whose own DOI is missing so no NaN node ends up in the graph.
    if not isinstance(citing_doi, str) or not isinstance(refs, str):
        continue

    for cited_doi in refs.split(', '):
        cited_doi = cited_doi.strip()  # tolerate stray whitespace around a DOI
        if cited_doi == 'NA': # ignore NA placeholders in the reference list
            continue
        if cited_doi not in doi2ref: # drop edge if the cited paper is not in compdata
            continue
        # NOTE: a DOI repeated within one reference list yields a repeated edge.
        edge_list.append((citing_doi, cited_doi))

edge_list[1]

('10.1007/s00261-020-02823-w', '10.1007/s00261-020-02671-8')

In [5]:
# First element of an edge tuple is the citing (source) paper's DOI.
edge_list[1][0]

'10.1007/s00261-020-02823-w'

In [6]:
import networkx as nx

# Directed citation graph: one edge per (citing DOI, cited DOI) pair.
# Passing the edge list to the constructor adds all edges (and their
# endpoint nodes) in one step.
G = nx.DiGraph(edge_list)

In [7]:
# Lookup tables from a paper's DOI to its topic and to its title.
# When a DOI occurs on more than one row, the first occurrence is kept.
pdoi2topic = {}
pdoi2title = {}
paper_rows = zip(
    df_compdata_ref_auth_redup['p_doi'],
    df_compdata_ref_auth_redup['topic'],
    df_compdata_ref_auth_redup['title'],
)
for paper_doi, paper_topic, paper_title in paper_rows:
    if paper_doi not in pdoi2topic:
        pdoi2topic[paper_doi] = paper_topic
        pdoi2title[paper_doi] = paper_title

In [8]:
# Assemble one row per graph node: DOI, in-degree (number of incoming
# citation edges), topic and title, then save the table to disk.
nodes = list(G.nodes())
degree = [in_deg for (node, in_deg) in G.in_degree(nodes)]
topic = [pdoi2topic[doi] for doi in nodes]
title = [pdoi2title[doi] for doi in nodes]
pd_node_inf = pd.DataFrame({
    'p_doi': nodes,
    'degree': degree,
    'topic': topic,
    'title': title,
})
pd_node_inf.to_csv('pd_node_inf.csv')

In [9]:
import csv

# Write the edge list as a two-column CSV with a header row.
# newline='' is required by the csv module: the writer emits its own line
# terminators, and without it platforms that translate '\n' (e.g. Windows)
# produce '\r\r\n', which shows up as blank rows between records.
with open('edge_list.csv', 'w', newline='') as out:
    csv_out = csv.writer(out)
    csv_out.writerow(['source', 'target'])
    csv_out.writerows(edge_list)

In [10]:
# Export the node table ordered from most- to least-cited.
# The target file has a .tsv extension, so write it tab-separated; the
# default separator would produce a comma-separated file under a .tsv name.
pd_node_inf.sort_values(by='degree', ascending=False).to_csv('top_referenced_papers.tsv', sep='\t')

In [11]:
# Count citation edges per (citing topic, cited topic) pair.
topiclinks = dict()

for citing_doi, cited_doi in edge_list:
    topic_pair = (pdoi2topic[citing_doi], pdoi2topic[cited_doi])
    topiclinks[topic_pair] = topiclinks.get(topic_pair, 0) + 1

# Fixed topic order used for both the rows (citing topic) and the columns
# (cited topic) of the matrix. Pairs involving a topic not listed here are
# counted in `topiclinks` but do not appear in the matrix.
topics = ['Chest X-Ray', 'Clinics', 'Drug discovery', 'Epidemiology', 'Genomics', 'Healthcare']
mat_topic_links = np.zeros((len(topics),len(topics)))

for row, citing_topic in enumerate(topics):
    for col, cited_topic in enumerate(topics):
        # A topic pair with no citation edge has no entry in `topiclinks`;
        # default to 0 instead of raising KeyError.
        mat_topic_links[row, col] = topiclinks.get((citing_topic, cited_topic), 0)

mat_topic_links

array([[603.,  43.,  12.,  35.,  23.,  23.],
       [ 45., 443.,  28.,   7.,  19.,  25.],
       [  6.,  88., 629.,  21., 156.,  20.],
       [ 52.,  33.,  20., 980.,  65.,  21.],
       [  1.,  19., 108.,  36., 517.,   6.],
       [297.,  74.,  60., 104.,  50., 109.]])

In [12]:
# Turn the topic-to-topic count matrix into a labelled table: a leading
# 'topics' column names each row's citing topic, and the remaining columns
# are the cited topics. Save the result as CSV.
pd_topics = pd.DataFrame({'topics': topics})
pd_mat_topic_links = pd.DataFrame(mat_topic_links, columns=topics)

pd_topic_links = pd.concat([pd_topics, pd_mat_topic_links], axis=1)
pd_topic_links.to_csv('topic_links.csv')