# Extract info to visualize graph

## Load citation data

In [1]:
import pandas as pd
# Folder that holds the input CSV and receives every file this notebook writes.
inoutpath = 'data/'

# index_col=False tells pandas not to use the first column as the row index.
citations_csv = f'{inoutpath}compdata_ext_ref.csv'
df_compdata_ref_auth = pd.read_csv(citations_csv, index_col=False)

In [2]:
# Keep one row per paper title (the first occurrence wins).
# The explicit .copy() makes the de-duplicated frame independent of the loaded
# one, so assigning to its columns later neither raises SettingWithCopyWarning
# nor risks writing to a temporary object.
df_compdata_ref_auth_redup = df_compdata_ref_auth.drop_duplicates(subset=['title']).copy()
len(df_compdata_ref_auth_redup)

16896

## Generate edge_list
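- Edge_list: one `(source, target)` pair of DOIs per citation, where the source paper cites the target paper. Citations of papers that are not in the dataset are dropped.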

In [3]:
import numpy as np

# doi2ref: paper DOI -> raw reference string (reference DOIs separated by ', ').
# If a DOI appears on several rows, the last row wins (plain dict assignment).
doi2ref = dict(zip(df_compdata_ref_auth_redup['p_doi'],
                   df_compdata_ref_auth_redup['ref']))

# edge_list: (source DOI, target DOI) pairs, meaning "source cites target".
edge_list = []
for source_doi, refs in doi2ref.items():
    # A missing reference list is not a string (pandas loads it as NaN).
    # Checking for str rather than for the exact type `float` also covers
    # None, numpy floats and pd.NA, which would otherwise crash on .split().
    if not isinstance(refs, str):
        continue

    for target_doi in refs.split(', '):
        if target_doi == 'NA':  # placeholder for a reference without a DOI
            continue
        if target_doi not in doi2ref:  # keep only citations between papers in the dataset
            continue
        edge_list.append((source_doi, target_doi))

edge_list[1] # source->target

('10.1016/j.jviromet.2021.114197', '10.1016/j.meegid.2020.104351')

## Construct graph from edge_list

In [4]:
import networkx as nx

# Directed citation graph: every (source, target) pair in edge_list becomes
# an edge source -> target; nodes are created implicitly from the edges.
G = nx.DiGraph(edge_list)

## Save edge_list to csv file

In [5]:
import csv
# Write the edges as a two-column CSV (header: source,target).
# newline='' is required by the csv module: without it the writer's own
# '\r\n' line endings are translated again on Windows, producing blank rows.
# The encoding is set explicitly so the file does not depend on the
# platform's default locale.
with open(inoutpath + 'edge_list.csv', 'w', newline='', encoding='utf-8') as out:
    edge_writer = csv.writer(out)
    edge_writer.writerow(['source', 'target'])
    edge_writer.writerows(edge_list)

## Get top referenced papers

In [6]:
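# Disabled: pd_node_inf is only created in the "Save node info to csv file" cell further down,
# so this line can only be run after that cell.
#pd_node_inf.sort_values(by='degree', ascending=False).to_csv(inoutpath + 'top_referenced_papers.csv')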

# Retrieve degree, topic and title for nodes
## Retrieve node topic and title from data

In [16]:
# Topic labels as stored in the data, and the display labels that replace them.
# The two lists are matched by position.
old_topics = ['Imaging', 'Clinics', 'Drug discovery', 'Epidemiology', 'Genomics', 'Healthcare']
topics = ['Clinical Imaging', 'Clinical Medicine', 'Pharmacology', 'Epidemiology', 'Genomics', 'Healthcare']

# Build a new frame with the renamed topics instead of assigning through .loc:
# the frame comes from drop_duplicates(), so an in-place column assignment
# triggers SettingWithCopyWarning. Rebinding the same name keeps later cells
# working, and re-running this cell gives the same result.
df_compdata_ref_auth_redup = df_compdata_ref_auth_redup.assign(
    topic=df_compdata_ref_auth_redup['topic'].replace(old_topics, topics)
)
df_compdata_ref_auth_redup.topic.head(10)

0    Clinical Medicine
1             Genomics
2         Pharmacology
3         Epidemiology
4           Healthcare
5           Healthcare
6     Clinical Imaging
7         Pharmacology
8    Clinical Medicine
9           Healthcare
Name: topic, dtype: object

In [17]:
# Lookup tables from paper DOI to its topic and to its title.
# When a DOI occurs on more than one row, the first row encountered is kept.
pdoi2topic = {}
pdoi2title = {}
paper_rows = zip(
    df_compdata_ref_auth_redup['p_doi'],
    df_compdata_ref_auth_redup['topic'],
    df_compdata_ref_auth_redup['title'],
)
for paper_doi, paper_topic, paper_title in paper_rows:
    if paper_doi not in pdoi2topic:
        pdoi2topic[paper_doi] = paper_topic
        pdoi2title[paper_doi] = paper_title

#pdoi2topic

## Save node info to csv file

In [18]:
# One row per graph node: DOI, in-degree, topic and title.
# The 'degree' column holds the in-degree, i.e. the number of edges pointing at the node.
nodes = list(G.nodes())
degree = [G.in_degree(node) for node in nodes]
topic = [pdoi2topic[node] for node in nodes]
title = [pdoi2title[node] for node in nodes]

pd_node_inf = pd.DataFrame({
    'p_doi': nodes,
    'degree': degree,
    'topic': topic,
    'title': title,
})
pd_node_inf.to_csv(inoutpath + 'pd_node_inf.csv')