# Preprocessing

The preprocessing of data occurs in this notebook. Note that in this context, `future_time` equals `leadtime`.

## Concept Name Retrieval

We map CUIs to their Unified Medical Language System names. 

In [None]:
import pandas as pd
import requests
from lxml.html import fromstring
from tqdm.notebook import tqdm

In [None]:
# UMLS releases queried for concept names: UMLS_VERSION is tried first and
# UMLS_VERSION_2 is the fallback when the lookup in the first release fails.
UMLS_VERSION = '2020AA'
UMLS_VERSION_2 = '2021AA'

In [None]:
# read in given data: concept-concept (cc) and paper-concept (pc) edge lists,
# each with a 'src' and a 'dst' column
edges_cc = pd.read_csv('../data/edges_cc.csv')
edges_pc = pd.read_csv('../data/edges_pc.csv')

In [None]:
# every value that appears where a concept is expected: the target of a paper-concept
# edge or either end of a concept-concept edge. These will act as nodes.
temp_concepts = set(edges_pc['dst']) | set(edges_cc['src']) | set(edges_cc['dst'])
# keep only CUIs, i.e. values whose text starts with 'C'; the other values present
# in the given data are nan, 1, and 2.
concepts = []
for concept in temp_concepts:
    if str(concept)[0] == 'C':
        concepts.append(concept)

Create a file `../UMLS_API_KEY.txt` to store your API key from https://documentation.uts.nlm.nih.gov/rest/authentication.html.

In [None]:
# read in API key; surrounding whitespace is stripped because a trailing newline in
# the file would otherwise be sent to the server as part of the key
with open('../UMLS_API_KEY.txt', 'r') as f:
    API_KEY = f.read().strip()

In [None]:
# taken from https://github.com/HHS/uts-rest-api
def generate_service_ticket():
    """Generate a single-use service ticket for the UMLS REST API.

    Two requests are made: the first exchanges the module-level ``API_KEY`` for a
    Ticket-Granting Ticket (TGT) URL, the second posts to that URL to obtain the
    service ticket.

    Returns:
        str: body of the second response, to be passed as the ``ticket`` query
        parameter of a content request.

    Raises:
        requests.HTTPError: if either request comes back with an error status.
        IndexError: if the first response contains no ``<form action="...">``.
    """
    headers = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/plain", "User-Agent":"python"}

    # get Ticket-Granting Ticket: its URL is the `action` attribute of the returned form
    r = requests.post('https://utslogin.nlm.nih.gov/cas/v1/api-key', data={'apikey': API_KEY}, headers=headers)
    # fail with the real HTTP error instead of an opaque parsing error further down
    r.raise_for_status()
    tgt = fromstring(r.text).xpath('//form/@action')[0]

    # get Service Ticket
    r = requests.post(tgt, data={'service': 'http://umlsks.nlm.nih.gov'}, headers=headers)
    r.raise_for_status()
    return r.text

In [None]:
# retrieve names for every CUI
# cui_names_list collects {'CUI', 'name'} records; failed_cui collects the CUIs that
# could not be resolved in either UMLS release.
cui_names_list = []
failed_cui = []
count = 0
for cui in tqdm(concepts):
    # try the primary release first, then the fallback release
    for version in (UMLS_VERSION, UMLS_VERSION_2):
        try:
            data = requests.get('https://uts-ws.nlm.nih.gov/rest/content/{}/CUI/{}?ticket={}'.format(version, cui, generate_service_ticket())).json()
            name = data['result']['name']
        except Exception:
            # best effort: any failure (network, bad ticket, unknown CUI) moves on to the
            # next release. `Exception` rather than a bare except keeps Ctrl-C working.
            continue
        cui_names_list.append({'CUI': cui, 'name': name})
        break
    else:
        # neither release returned a name
        failed_cui.append(cui)
    count += 1
    if count % 1000 == 0:
        print('Length of cui_names_list : {}; length of failed_cui : {}'.format(len(cui_names_list), len(failed_cui)))

In [None]:
# save as .csv (columns: CUI, name). CUIs collected in failed_cui are not part of
# this file.
cui_names = pd.DataFrame(cui_names_list)
cui_names.to_csv('../data/cui_names.csv', index=False)

In [None]:
# save failed CUIs to manually add to cui_names.csv
# (one 'CUI' column listing every concept whose name lookup failed in both releases)
failed_cui_df = pd.DataFrame(failed_cui, columns=['CUI'])
failed_cui_df.to_csv('../data/failed_cui.csv', index=False)

## Clean Data v1

We perform the first round of data cleaning by performing the following tasks:
1. Self loops are dropped.
2. Papers without valid publication dates (i.e., missing year or month) are dropped because we cannot know whether or not they should be training or validation data.
3. Paper-concept edges in which the paper does not have a valid date are dropped. This results in a set of concepts that have at least one valid associated paper.
4. Paper-paper edges in which either node does not have a valid date are dropped.
5. Using the set of concepts formed in step 3, concept-concept edges in which either node is not in the set of valid concepts are dropped.
6. Duplicate concept-concept and paper-concept edges or edges with nodes in reverse order (i.e., A-B, B-A) are condensed into one edge.

In [None]:
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
# read in given data: concept-concept (cc), paper-concept (pc) and paper-paper (pp)
# edge lists, plus the paper table (the cells below use its 'id', 'year' and 'month' columns)
edges_cc = pd.read_csv('../data/edges_cc.csv')
edges_pc = pd.read_csv('../data/edges_pc.csv')
edges_pp = pd.read_csv('../data/edges_pp.csv')
papers = pd.read_csv('../data/papers.csv')

In [None]:
# remove self loops, i.e. edges whose two endpoints are the same node
edges_cc = edges_cc.loc[edges_cc['src'].ne(edges_cc['dst'])]
edges_pp = edges_pp.loc[edges_pp['src'].ne(edges_pp['dst'])]
edges_pc = edges_pc.loc[edges_pc['src'].ne(edges_pc['dst'])]
# shown as the cell output; these counts are taken after the self loops are gone
'Starting lengths - edges_cc : {}, edges_pc : {}, edges_pp : {}, papers : {}'.format(len(edges_cc), len(edges_pc), len(edges_pp), len(papers))

In [None]:
# keep only papers for which both the month and the year are present
papers = papers.dropna(subset=['month', 'year'])
'After dropping invalid dates - papers : {}'.format(len(papers))

In [None]:
# order papers chronologically (year first, then month) and renumber the rows
papers = papers.sort_values(by=['year', 'month']).reset_index(drop=True)

In [None]:
# ids of the papers that survived the date filter
valid_papers = papers['id'].tolist()

# a paper-concept edge is kept only if its paper (the 'src' end) is valid
edges_pc = edges_pc.loc[edges_pc['src'].isin(valid_papers)]

# paper-paper edges: ids are cast to strings before matching, and both ends must be valid
edges_pp = edges_pp.astype(str)
edges_pp = edges_pp.loc[
    edges_pp['src'].isin(valid_papers)
    & edges_pp['dst'].isin(valid_papers)
]

In [None]:
# a concept is valid when at least one remaining paper-concept edge points at it
valid_concepts = list(set(edges_pc['dst']))

# a concept-concept edge is kept only if both of its concepts are valid, i.e. each
# still has an associated paper
edges_cc = edges_cc.loc[
    edges_cc['src'].isin(valid_concepts)
    & edges_cc['dst'].isin(valid_concepts)
]

In [None]:
# convert to DataFrame: single-column frames ('paper' / 'CUI') so the valid node
# lists can be written out as CSV files
df_valid_papers = pd.DataFrame({'paper': valid_papers})
df_valid_concepts = pd.DataFrame({'CUI': valid_concepts})

In [None]:
# unique concept-concept edges, merge duplicates: each pair is sorted so that A-B and
# B-A collapse into one edge. dict.fromkeys removes duplicates while keeping
# first-seen order, so the rows written to disk come out in the same order on every
# run (iterating a set of strings does not guarantee that).
cc_pairs = (tuple(sorted(pair)) for pair in zip(edges_cc['src'], edges_cc['dst']))
unique_edges_cc = pd.DataFrame(list(dict.fromkeys(cc_pairs)), columns=['src', 'dst'])

# unique paper-concept edges, merge duplicates (direction is kept: src = paper, dst = concept)
pc_pairs = zip(edges_pc['src'], edges_pc['dst'])
unique_edges_pc = pd.DataFrame(list(dict.fromkeys(pc_pairs)), columns=['src', 'dst'])

In [None]:
# summary of what is left after the first cleaning round
print('Total papers : {}; total concepts: {}'.format(len(df_valid_papers), len(df_valid_concepts)))
print('Ending lengths - edges_cc : {}, edges_pc : {}, edges_pp : {}'.format(len(unique_edges_cc), len(unique_edges_pc), len(edges_pp)))

In [None]:
# write out processed data
# (these files are read back, and some are overwritten, by the later sections)
unique_edges_cc.to_csv('../data/edges_cc_processed.csv', index=False)
unique_edges_pc.to_csv('../data/edges_pc_processed.csv', index=False)
edges_pp.to_csv('../data/edges_pp_processed.csv', index=False)
papers.to_csv('../data/papers_processed.csv', index=False)
df_valid_papers.to_csv('../data/valid_papers.csv', index=False)
df_valid_concepts.to_csv('../data/valid_concepts.csv', index=False)

## Abstract Retrieval 

We retrieve the abstracts of the papers.

In [None]:
import pandas as pd
import requests
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup

Download `metadata.csv` from https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge and store it in `../data`.

In [None]:
# CORD-19 metadata (it carries the abstracts), indexed by cord_uid for lookup by paper id
cord_meta = pd.read_csv('../data/metadata.csv').set_index('cord_uid')

In [None]:
# read in valid papers
valid_papers = list(pd.read_csv('../data/valid_papers.csv')['paper'])

In [None]:
# split the papers by source in one pass: PubMed ids are purely numeric, CORD-19 ids
# are alphanumeric
pubmed, cord = [], []
for paper_id in valid_papers:
    (pubmed if paper_id.isnumeric() else cord).append(paper_id)
s_pubmed = set(pubmed)

In [None]:
# get all possible CORD 19 abstracts as (paper id, abstract) tuples
cord_abstract = []
for paper in tqdm(cord):
    try:
        cord_abstract.append((paper, cord_meta.loc[paper]['abstract']))
    except KeyError:
        # the paper id is not present in the metadata: skip it. Only KeyError is
        # swallowed so that real problems (and Ctrl-C) are no longer hidden.
        continue

Create a file `../NCBI_API_KEY.txt` to store your API key from https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities.

In [None]:
# read in API key; surrounding whitespace is stripped because a trailing newline in
# the file would otherwise end up inside the request URL
with open('../NCBI_API_KEY.txt', 'r') as f:
    API_KEY = f.read().strip()

In [None]:
# retrieve all possible PubMed abstracts as (paper id, abstract text) tuples
pubmed_abstract = []
for paper in tqdm(pubmed):
    try:
        data = requests.get('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id={}&retmode=XML&rettype=abstract&api_key={}'.format(paper, API_KEY)).text
        # the parser is named explicitly so the result does not depend on which parsers
        # happen to be installed; lxml is already a dependency of this notebook
        data = BeautifulSoup(data, 'lxml')
        # NOTE: only the first <AbstractText> element of the record is read
        pubmed_abstract.append((paper, data.pubmedarticleset.pubmedarticle.abstract.abstracttext.get_text()))
    except Exception:
        # best effort: papers without a retrievable abstract are skipped (an absent tag
        # surfaces as AttributeError on None). `Exception` keeps Ctrl-C working.
        continue

In [None]:
# write out the retrieved abstracts, leaving out rows that contain NaN
pubmed_abstract_df = pd.DataFrame(pubmed_abstract, columns=['paper', 'abstract']).dropna()
pubmed_abstract_df.to_csv('../data/pubmed_abstracts.csv', index=False)

cord_abstract_df = pd.DataFrame(cord_abstract, columns=['paper', 'abstract']).dropna()
cord_abstract_df.to_csv('../data/cord_abstracts.csv', index=False)

## Node Embeddings

We create the node embeddings using the names and abstracts we retrieved.

In [None]:
import nltk
import pandas as pd
import re
import gensim.downloader as api
import numpy as np
from nltk.corpus import stopwords
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# read in Google News Word2Vec
wv = api.load('word2vec-google-news-300')

### Paper Nodes

In [None]:
# read in retrieved abstracts (columns: paper, abstract)
cord_abstracts = pd.read_csv('../data/cord_abstracts.csv')
pubmed_abstracts = pd.read_csv('../data/pubmed_abstracts.csv')

In [None]:
# tokenise every abstract for w2v: alphabetic tokens only, lower-cased.
# Each entry is (paper id, list of tokens).
cord_tokenize = [
    (paper, [word.lower() for word in nltk.word_tokenize(abstract) if word.isalpha()])
    for paper, abstract in tqdm(list(zip(cord_abstracts['paper'], cord_abstracts['abstract'])))
]
pubmed_tokenize = [
    (paper, [word.lower() for word in nltk.word_tokenize(abstract) if word.isalpha()])
    for paper, abstract in tqdm(list(zip(pubmed_abstracts['paper'], pubmed_abstracts['abstract'])))
]

In [None]:
# rebuild one space-separated string per abstract from its tokens, as input for TF-IDF
cord_processed = [' '.join(tokens) for _, tokens in tqdm(cord_tokenize)]
pubmed_processed = [' '.join(tokens) for _, tokens in tqdm(pubmed_tokenize)]

In [None]:
# get TF-IDF values: fit on all abstracts and keep one IDF weight per vocabulary term
vect = TfidfVectorizer()
tfidf_matrix = vect.fit_transform(cord_processed+pubmed_processed)
# vocabulary_ maps each term to its column, which is also its position in idf_.
# Building the lookup from it avoids get_feature_names(), which newer scikit-learn
# releases no longer provide.
tfidf_values = {term: vect.idf_[column] for term, column in vect.vocabulary_.items()}

In [None]:
# get all TF-IDF weighted averages of w2v vectors for each abstract, which will be the node feature for the paper node
def _embed_tokens(tokens):
    """Average the weighted word2vec vectors of `tokens`.

    Tokens missing from `wv` use the vector of 'unk'. Each vector is multiplied by the
    token's weight in `tfidf_values`; tokens without a weight keep a factor of 1.
    An empty token list yields the plain 'unk' vector.

    Returns:
        (embedding, only_unk): the mean vector, and a flag that is True when not a
        single token was found in `wv`.
    """
    vectors = []
    found_known = False
    for word in tokens:
        try:
            word_vector = wv[word]
            found_known = True
        except KeyError:
            word_vector = wv['unk']
        # multiply into a NEW array: an in-place `*=` would write into the array handed
        # out by `wv` (altering the model for every later lookup) or fail if that array
        # is read-only, in which case the weighting would be skipped without notice
        vectors.append(word_vector * float(tfidf_values.get(word, 1.0)))
    if not vectors:
        vectors.append(wv['unk'])
    return np.mean(vectors, axis=0), not found_known

abstract_embeddings = []
# number of papers whose embedding is built from the 'unk' vector only
count = 0
for paper, abstract in tqdm(cord_tokenize):
    embedding, only_unk = _embed_tokens(abstract)
    count += only_unk
    abstract_embeddings.append((paper, embedding))

for paper, abstract in tqdm(pubmed_tokenize):
    embedding, only_unk = _embed_tokens(abstract)
    count += only_unk
    abstract_embeddings.append((paper, embedding))

In [None]:
# this is the number of papers whose final embedding is built only from the 'unk' vector
# (no token found in the word2vec vocabulary), not the number of papers with one or more unknown tokens
'Number of paper nodes with unk : {}'.format(count)

In [None]:
# convert to DataFrame and save
# each row holds a paper id and its embedding vector (the array is stored in a single cell)
embeddings = pd.DataFrame(abstract_embeddings, columns=['paper', 'embedding'])
embeddings.to_hdf('../data/paper_embeddings.h5', index = False, key='df')

### Concept Nodes

In [None]:
# UMLS names indexed by CUI, and the list of valid concepts
cui_names = pd.read_csv('../data/cui_names.csv').set_index('CUI')
valid_concepts = pd.read_csv('../data/valid_concepts.csv')['CUI'].tolist()

In [None]:
# tokenise every concept name for w2v: alphabetic tokens only, lower-cased.
# Each entry is (CUI, list of tokens).
concept_tokenize = []
for concept in tqdm(valid_concepts):
    concept_name = cui_names.loc[concept]['name']
    tokens = [token.lower() for token in nltk.word_tokenize(concept_name) if token.isalpha()]
    concept_tokenize.append((concept, tokens))

In [None]:
# one space-separated string per concept name for TF-IDF; names that have no tokens
# left are skipped
concept_processed = [' '.join(tokens) for _, tokens in tqdm(concept_tokenize) if len(tokens) > 0]

In [None]:
# get TF-IDF values: fit on the concept names and keep one IDF weight per vocabulary term
vect = TfidfVectorizer()
tfidf_matrix = vect.fit_transform(concept_processed)
# vocabulary_ maps each term to its column, which is also its position in idf_.
# Building the lookup from it avoids get_feature_names(), which newer scikit-learn
# releases no longer provide.
tfidf_values = {term: vect.idf_[column] for term, column in vect.vocabulary_.items()}

In [None]:
# get all TF-IDF weighted averages of w2v vectors for each concept name, which will be the node feature for the concept node
concept_embeddings = []
# number of concepts whose embedding is built from the 'unk' vector only
count = 0
for concept, name in tqdm(concept_tokenize):
    vectors = []
    found_known = False
    for word in name:
        try:
            word_vector = wv[word]
            found_known = True
        except KeyError:
            # word not in the word2vec vocabulary: fall back to the 'unk' vector
            word_vector = wv['unk']
        # multiply into a NEW array: an in-place `*=` would write into the array handed
        # out by `wv` (altering the model for every later lookup) or fail if that array
        # is read-only, in which case the weighting would be skipped without notice.
        # Words without a TF-IDF weight keep a factor of 1.
        vectors.append(word_vector * float(tfidf_values.get(word, 1.0)))
    if not vectors:
        # the name had no usable tokens at all
        vectors.append(wv['unk'])
    if not found_known:
        count += 1
    concept_embeddings.append((concept, np.mean(vectors, axis=0)))

In [None]:
# this is the number of concepts whose final embedding is built only from the 'unk' vector
# (no token found in the word2vec vocabulary), not the number of concepts with one or more unknown tokens
'Number of concept nodes with unk : {}'.format(count)

In [None]:
# convert to DataFrame and save
# each row holds a CUI and its embedding vector (the array is stored in a single cell)
embeddings = pd.DataFrame(concept_embeddings, columns=['CUI', 'embedding'])
embeddings.to_hdf('../data/concept_embeddings.h5', index = False, key='df')

## Clean Data v2

We perform the second round of data cleaning by performing the following tasks:
1. Papers without a valid abstract are dropped because they do not have node features.
2. Paper-concept and paper-paper edges in which the paper became invalidated due to step 1 are dropped.
3. Concept-concept edges in which a concept became invalidated due to its paper being dropped in step 2 are dropped.

In [None]:
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
# read in data: the outputs of "Clean Data v1" plus the node embeddings
edges_cc = pd.read_csv('../data/edges_cc_processed.csv')
edges_pc = pd.read_csv('../data/edges_pc_processed.csv')
edges_pp = pd.read_csv('../data/edges_pp_processed.csv')
papers = pd.read_csv('../data/papers_processed.csv')
valid_papers = list(pd.read_csv('../data/valid_papers.csv')['paper'])
valid_concepts = list(pd.read_csv('../data/valid_concepts.csv')['CUI'])
# only papers with a retrieved abstract have a row in paper_embeddings.h5
paper_node_features = pd.read_hdf('../data/paper_embeddings.h5')
concept_node_features = pd.read_hdf('../data/concept_embeddings.h5')

In [None]:
'Starting lengths - edges_cc : {}, edges_pc : {}, edges_pp : {}, valid_papers : {}; valid_concepts: {}'.format(len(edges_cc), len(edges_pc), len(edges_pp), len(valid_papers), len(valid_concepts))

In [None]:
# keep only the papers that have an embedding, i.e. a valid abstract; embedding ids
# are compared as strings
papers = papers.loc[papers['id'].isin(list(map(str, paper_node_features['paper'])))].reset_index(drop=True)

In [None]:
# ids of the papers that still have node features
valid_papers = papers['id'].tolist()

# a paper-concept edge is kept only if its paper (the 'src' end) is valid
edges_pc = edges_pc.loc[edges_pc['src'].isin(valid_papers)]

# paper-paper edges: ids are cast to strings before matching, and both ends must be valid
edges_pp = edges_pp.astype(str)
edges_pp = edges_pp.loc[
    edges_pp['src'].isin(valid_papers)
    & edges_pp['dst'].isin(valid_papers)
]

In [None]:
# a concept is valid when at least one remaining paper-concept edge points at it
valid_concepts = list(set(edges_pc['dst']))

# a concept-concept edge is kept only if both of its concepts are valid, i.e. each
# still has an associated paper
edges_cc = edges_cc.loc[
    edges_cc['src'].isin(valid_concepts)
    & edges_cc['dst'].isin(valid_concepts)
]

In [None]:
'Ending lengths - edges_cc : {}, edges_pc : {}, edges_pp : {}, valid_papers : {}; valid_concepts: {}'.format(len(edges_cc), len(edges_pc), len(edges_pp), len(valid_papers), len(valid_concepts))

In [None]:
# convert to DataFrame
df_valid_papers = pd.DataFrame({'paper': valid_papers})
df_valid_concepts = pd.DataFrame({'CUI': valid_concepts})

# write out processed data
# NOTE: this overwrites the files produced by "Clean Data v1", so re-running the
# v1 section afterwards resets them to the pre-abstract-filter state
edges_cc.to_csv('../data/edges_cc_processed.csv', index=False)
edges_pc.to_csv('../data/edges_pc_processed.csv', index=False)
edges_pp.to_csv('../data/edges_pp_processed.csv', index=False)
papers.to_csv('../data/papers_processed.csv', index=False)
df_valid_papers.to_csv('../data/valid_papers.csv', index=False)
df_valid_concepts.to_csv('../data/valid_concepts.csv', index=False)

## Date Assignments

We assign dates to all the edges and correct dates on the papers.

In [None]:
import pandas as pd
from tqdm.notebook import tqdm

### CC Edges

In [None]:
# read in data for cc edges: the paper table supplies the dates and the
# paper-concept edges link each concept to the papers that mention it
papers = pd.read_csv('../data/papers_processed.csv')
edges_cc = pd.read_csv('../data/edges_cc_processed.csv')
edges_pc = pd.read_csv('../data/edges_pc_processed.csv')

In [None]:
# map every concept to the set of papers it is linked to through a paper-concept edge
pc_dict = {}
for paper, concept in tqdm(zip(edges_pc['src'], edges_pc['dst']), total=len(edges_pc)):
    pc_dict.setdefault(concept, set()).add(paper)

In [None]:
# set index to paper IDs
papers.set_index('id', inplace = True)

In [None]:
# date every concept-concept edge with the earliest (year, month) among the papers that
# are linked to both of its concepts
edges_cc_dates_list = []
failed_intersection = []
for src, dst in tqdm(zip(edges_cc['src'], edges_cc['dst']), total=len(edges_cc)):
    shared_papers = list(pc_dict[src].intersection(pc_dict[dst]))
    if len(shared_papers) == 0:
        # there are no papers that link the two concepts, so we cannot assign a date to it
        failed_intersection.append((src, dst))
        continue
    # tuples compare year first and month second, so min() picks the earliest date
    year, month = min(
        (papers.loc[paper]['year'], papers.loc[paper]['month'])
        for paper in shared_papers
    )
    edges_cc_dates_list.append((src, dst, year, month, len(shared_papers)))

In [None]:
# create DataFrame: dated edges (num_paper_link = number of papers linking the two
# concepts) and the edges for which no linking paper was found
edges_cc_dates = pd.DataFrame(edges_cc_dates_list, columns = ['src', 'dst', 'year', 'month', 'num_paper_link'])
failed_cc_edges = pd.DataFrame(failed_intersection, columns = ['src', 'dst'])

In [None]:
'Number of final cc edges : {}; number of failed cc edges : {}'.format(len(edges_cc_dates), len(failed_cc_edges))

In [None]:
# sort by date
edges_cc_dates = edges_cc_dates.sort_values(['year', 'month']).reset_index(drop=True)

# correct dates that are after May 2021. The clamp must be applied to the dated frame:
# edges_cc itself has no 'year'/'month' columns. Sort order is unaffected because the
# clamped rows are already the last ones.
edges_cc_dates.loc[(edges_cc_dates['year']==2021) & (edges_cc_dates['month']>5), 'month'] = 5

# save
edges_cc_dates.to_csv('../data/edges_cc_dates.csv', index = False)
failed_cc_edges.to_csv('../data/failed_cc_edges.csv', index = False)

### PP Edges

In [None]:
# read in data for pp edges
edges_pp = pd.read_csv('../data/edges_pp_processed.csv')

In [None]:
# date every paper-paper edge with the year and month of its source paper
# (`papers` is looked up by paper id, using the id as a string)
edges_pp_dates_list = []
for src, dst in tqdm(zip(edges_pp['src'], edges_pp['dst']), total=len(edges_pp)):
    src_paper = papers.loc[str(src)]
    edges_pp_dates_list.append((src, dst, src_paper['year'], src_paper['month']))

In [None]:
#create DataFrame
edges_pp_dates = pd.DataFrame(edges_pp_dates_list, columns = ['src', 'dst', 'year', 'month'])

In [None]:
# sort by date
edges_pp_dates = edges_pp_dates.sort_values(['year', 'month']).reset_index(drop=True)

# correct dates that are after May 2021. The clamp must be applied to the dated frame:
# edges_pp itself has no 'year'/'month' columns.
edges_pp_dates.loc[(edges_pp_dates['year']==2021) & (edges_pp_dates['month']>5), 'month'] = 5

# save
edges_pp_dates.to_csv('../data/edges_pp_dates.csv', index = False)

### PC Edges

In [None]:
# read in data for pc edges
edges_pc = pd.read_csv('../data/edges_pc_processed.csv')

In [None]:
# date every paper-concept edge with the year and month of its paper (the 'src' end)
edges_pc_dates_list = []
for src, dst in tqdm(zip(edges_pc['src'], edges_pc['dst']), total=len(edges_pc)):
    src_paper = papers.loc[str(src)]
    edges_pc_dates_list.append((src, dst, src_paper['year'], src_paper['month']))

In [None]:
#create DataFrame
edges_pc_dates = pd.DataFrame(edges_pc_dates_list, columns = ['src', 'dst', 'year', 'month'])

In [None]:
# sort by date
edges_pc_dates = edges_pc_dates.sort_values(['year', 'month']).reset_index(drop=True)

# correct dates that are after May 2021. The clamp must be applied to the dated frame:
# edges_pc itself has no 'year'/'month' columns.
edges_pc_dates.loc[(edges_pc_dates['year']==2021) & (edges_pc_dates['month']>5), 'month'] = 5

# save
edges_pc_dates.to_csv('../data/edges_pc_dates.csv', index = False)

### Papers

In [None]:
# read in data
# (re-read from disk: the in-memory `papers` frame was re-indexed by id above)
papers = pd.read_csv('../data/papers_processed.csv')

# correct dates that are after May 2021: any later month of 2021 is set to May
papers.loc[(papers['year']==2021) & (papers['month']>5), 'month'] = 5

# save
papers.to_csv('../data/papers_processed.csv', index = False)

## Network Formation

We form a network using the data we preprocessed.

In [None]:
import pandas as pd
import networkx as nx
from tqdm.notebook import tqdm

In [None]:
# read in data: the dated edge lists and the valid node lists
papers = pd.read_csv('../data/papers_processed.csv')
edges_cc = pd.read_csv('../data/edges_cc_dates.csv')
edges_pc = pd.read_csv('../data/edges_pc_dates.csv')
edges_pp = pd.read_csv('../data/edges_pp_dates.csv')
# paper-paper endpoints are handled as strings
edges_pp['src'] = edges_pp['src'].astype(str)
edges_pp['dst'] = edges_pp['dst'].astype(str)
valid_concepts = list(pd.read_csv('../data/valid_concepts.csv')['CUI'])
valid_papers = list(pd.read_csv('../data/valid_papers.csv')['paper'])

In [None]:
'Number of papers: {}; Number of concepts: {}'.format(len(valid_papers), len(valid_concepts))

In [None]:
# Graph object (undirected)
G = nx.Graph()

# add both concept and paper nodes; each node carries a `data` attribute holding a
# dict with its type
for paper in tqdm(valid_papers):
    G.add_node(paper, data = {'type': 'paper'})
    
for concept in tqdm(valid_concepts):
    G.add_node(concept, data = {'type': 'concept'})

In [None]:
'Number of nodes: {}'.format(G.number_of_nodes())

In [None]:
# add concept-concept, paper-concept, and paper-paper edges to G
# every edge gets a `type` attribute; the per-type counts are the growth of the
# graph's edge count after each batch is added
past_edge_num = 0

G.add_edges_from(list(zip(edges_cc['src'], edges_cc['dst'])), type='cc')
print('Number of cc edges: {}'.format(G.number_of_edges()))
past_edge_num = G.number_of_edges()

G.add_edges_from(list(zip(edges_pc['src'], edges_pc['dst'])), type='pc')
print('Number of pc edges: {}'.format(G.number_of_edges() - past_edge_num))
past_edge_num = G.number_of_edges()

G.add_edges_from(list(zip(edges_pp['src'], edges_pp['dst'])), type='pp')
print('Number of pp edges: {}'.format(G.number_of_edges() - past_edge_num))

print('Number of total edges: {}'.format(G.number_of_edges()))

In [None]:
# write networkX object to binary
# Plain pickle is used because the later sections read this file back with
# pickle.load, and nx.readwrite.gpickle is no longer available in networkx 3.x.
import pickle

with open('../data/graph_all_data_undirected.gpickle', 'wb') as handle:
    pickle.dump(G, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Graph Sampling

We utilize the Forest Fire Sampler from [Leskovec & Faloutsos](https://cs.stanford.edu/~jure/pubs/sampling-kdd06.pdf) to sample our network.

In [None]:
import networkx as nx
import pickle
from littleballoffur import ForestFireSampler

In [None]:
# read in graph (the file written at the end of "Network Formation"; only unpickle
# files you produced yourself)
with open('../data/graph_all_data_undirected.gpickle', 'rb') as handle:
    g = pickle.load(handle)

In [None]:
# keep only the largest connected component, which is the original graph minus ~6000 nodes.
# Nodes are relabelled with integers; the original label is kept in the 'name' attribute.
largest_component = max(nx.connected_components(g), key=len)
sub_g = nx.convert_node_labels_to_integers(g.subgraph(largest_component), label_attribute='name')

# starting statistics
num_nodes = sub_g.number_of_nodes()
num_edges = sub_g.number_of_edges()
print('Starting number of nodes : {}; number of edges : {}'.format(num_nodes, num_edges))

In [None]:
# fraction of nodes to keep
PERCENTAGE = 0.15

# sample from network. The target size is computed from sub_g itself rather than from
# `num_nodes`, which this cell overwrites below; re-running the cell therefore no longer
# shrinks the sample further. It is truncated to a whole number because it is a node count.
sampler = ForestFireSampler(number_of_nodes=int(sub_g.number_of_nodes() * PERCENTAGE))
sampled = sampler.sample(sub_g)

num_nodes = sampled.number_of_nodes()
num_edges = sampled.number_of_edges()
print('Sampled number of nodes : {}; number of edges : {}'.format(num_nodes, num_edges))

In [None]:
# give the sampled nodes their original labels back (stored in the 'name' attribute)
mapping = {node: attrs['name'] for node, attrs in sampled.nodes(data=True)}
sampled = nx.relabel_nodes(sampled, mapping)

# the 'name' attribute is now redundant, so remove it from every node
for _, attrs in sampled.nodes(data=True):
    attrs.pop('name')

In [None]:
# save the sampled graph; the later sections read it back with pickle.load
with open('../data/sampled_undirected.gpickle', 'wb') as handle:
    pickle.dump(sampled, handle, protocol=pickle.HIGHEST_PROTOCOL)

## networkX to PyTorch Geometric

We convert our networkX graph object to a PyTorch Geometric compatible format. We write out a version of the network at every timestep between January 2014 and May 2021 as they will be used to construct the sequences of graphs.

In [None]:
import networkx as nx
import pickle
import torch
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from tqdm.notebook import tqdm

In [None]:
# read in sampled graph (written by "Graph Sampling"; only unpickle files you
# produced yourself)
with open('../data/sampled_undirected.gpickle', 'rb') as handle:
    g = pickle.load(handle)

In [None]:
# read in data: dated edges, valid node lists and node embeddings
papers = pd.read_csv('../data/papers_processed.csv')
edges_cc = pd.read_csv('../data/edges_cc_dates.csv')
edges_pc = pd.read_csv('../data/edges_pc_dates.csv')
edges_pp = pd.read_csv('../data/edges_pp_dates.csv')
# paper-paper endpoints are handled as strings
edges_pp['src'] = edges_pp['src'].astype(str)
edges_pp['dst'] = edges_pp['dst'].astype(str)
valid_concepts = pd.read_csv('../data/valid_concepts.csv')
valid_papers = pd.read_csv('../data/valid_papers.csv')

# embeddings indexed by node id (paper ids as strings, concepts by CUI)
paper_node_features = pd.read_hdf('../data/paper_embeddings.h5')
paper_node_features['paper'] = paper_node_features['paper'].astype(str)
paper_node_features = paper_node_features.set_index('paper')
concept_node_features = pd.read_hdf('../data/concept_embeddings.h5')
concept_node_features = concept_node_features.set_index('CUI')

# one lookup table for all nodes. pd.concat replaces DataFrame.append, which pandas 2.x
# no longer provides; the result is the same row-wise stacking.
node_features = pd.concat([paper_node_features, concept_node_features])

In [None]:
nodes = set(g.nodes())

In [None]:
# restrict every table to the sampled graph
def _rows_in_sample(frame, column):
    """Rows of `frame` whose `column` value is a sampled node, renumbered from 0."""
    return frame[frame[column].isin(nodes)].reset_index(drop = True)

def _edges_in_sample(frame):
    """Edges of `frame` whose two endpoints are both sampled nodes, renumbered from 0."""
    return frame[(frame['src'].isin(nodes)) & (frame['dst'].isin(nodes))].reset_index(drop = True)

papers = _rows_in_sample(papers, 'id')
valid_papers = _rows_in_sample(valid_papers, 'paper')
valid_concepts = _rows_in_sample(valid_concepts, 'CUI')
edges_cc = _edges_in_sample(edges_cc)
edges_pc = _edges_in_sample(edges_pc)
edges_pp = _edges_in_sample(edges_pp)

In [None]:
# raw row arrays of the dated edge lists (columns: src, dst, year, month, ...)
cc_val = edges_cc.values
pc_val = edges_pc.values
pp_val = edges_pp.values
# node order used everywhere below: all concepts first, then all papers
node_names = np.concatenate((valid_concepts.values.flatten(), valid_papers.values.flatten()))

# this is the first write into sampled_graphs, so make sure the directory exists
import os
os.makedirs('../data/sampled_graphs', exist_ok=True)

with open('../data/sampled_graphs/node_names.pickle', 'wb') as handle:
    pickle.dump(node_names, handle,  protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# give every node an integer index: concepts take 0..C-1, papers follow from C onwards
index_dict = {}
for position, concept_row in enumerate(tqdm(valid_concepts.values)):
    index_dict[concept_row[0]] = position
paper_offset = len(valid_concepts)
for position, paper_row in enumerate(tqdm(valid_papers.values)):
    index_dict[paper_row[0]] = paper_offset + position

with open('../data/sampled_graphs/index_dict.pickle', 'wb') as handle:
    pickle.dump(index_dict, handle,  protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# edge list in PyTorch compatible format: edge_index[0] holds sources, edge_index[1] targets
edge_index = [[], []]
tensor_edge_index = torch.LongTensor(edge_index)

# every node gets a self loop so that it exists in the graph even without other edges
for node_table in (valid_concepts, valid_papers):
    for row in tqdm(node_table.values):
        node = index_dict[row[0]]
        edge_index[0].append(node)
        edge_index[1].append(node)

In [None]:
# add all edges up to and including 1/2014
def _append_edges_through(rows, stop_date):
    """Append to `edge_index`, in both directions, every leading row of `rows` dated on or before `stop_date`.

    `rows` holds (src, dst, year, month, ...) records sorted by date; scanning stops at
    the first row dated after `stop_date`. Returns the number of rows consumed, which
    is the position of the first row that was not added.
    """
    consumed = 0
    for row in tqdm(rows):
        src, dst, year, month = row[:4]
        if (year, month) > stop_date:
            break
        # undirected graph: store the edge once per direction
        edge_index[0].append(index_dict[src])
        edge_index[1].append(index_dict[dst])
        edge_index[0].append(index_dict[dst])
        edge_index[1].append(index_dict[src])
        consumed += 1
    return consumed

STOP_DATE = (2014, 1)
# positions of the next not-yet-added row in each sorted edge list
cc_index = _append_edges_through(cc_val, STOP_DATE)
pc_index = _append_edges_through(pc_val, STOP_DATE)
pp_index = _append_edges_through(pp_val, STOP_DATE)
tensor_edge_index = torch.LongTensor(edge_index)

In [None]:
# perform dimensionality reduction on the node features using PCA
# Stack the per-node vectors (in node_names order) into one 2-D array first: building a
# tensor directly from a Python list of arrays is very slow.
features = np.stack([node_features.loc[node]['embedding'] for node in node_names])
# fixed random_state so the projection is reproducible if a randomized solver is used
pca = PCA(n_components=32, random_state=12345)
pca.fit(features)
features = torch.tensor(pca.transform(features), dtype=torch.float32)

In [None]:
# save the reduced node features; rows follow the order of node_names
with open('../data/sampled_graphs/node_features.pickle', 'wb') as handle:
    pickle.dump(features, handle,  protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# write out every version of the network from January 2014 to May 2021 to be used in the graph sequences
def _extend_edges_to(rows, start, through_date):
    """Append to `edge_index`, in both directions, the rows from `start` on that are dated on or before `through_date`.

    `rows` holds (src, dst, year, month, ...) records sorted by date; scanning stops at
    the first row that is not dated on or before `through_date`. Returns the position
    of the first row that was not added.
    """
    position = start
    while position < len(rows):
        src, dst, year, month = rows[position][:4]
        if not (year, month) <= through_date:
            break
        # undirected graph: store the edge once per direction
        edge_index[0].append(index_dict[src])
        edge_index[1].append(index_dict[dst])
        edge_index[0].append(index_dict[dst])
        edge_index[1].append(index_dict[src])
        position += 1
    return position

date_index = []
graph_date = (2014, 1)
# NOTE(review): with `<` the last snapshot written is April 2021, so (2021, 5) is not
# part of date_index - confirm this is intended given the section description.
while graph_date < (2021, 5):
    print("Graph date : {}".format(graph_date))
    
    # snapshot of all edges dated on or before graph_date
    with open('../data/sampled_graphs/graph_{}_{}.pickle'.format(graph_date[0], graph_date[1]), 'wb') as handle:
        pickle.dump(tensor_edge_index, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    date_index.append(graph_date)
    # advance one month, rolling December over into January of the next year
    graph_date = (graph_date[0]+graph_date[1]//12, graph_date[1]%12+1)
    # grow the edge list up to the new date, continuing where each list left off
    cc_index = _extend_edges_to(cc_val, cc_index, graph_date)
    pc_index = _extend_edges_to(pc_val, pc_index, graph_date)
    pp_index = _extend_edges_to(pp_val, pp_index, graph_date)
    tensor_edge_index = torch.LongTensor(edge_index)

In [None]:
# write out date indices: the list of (year, month) tuples, one per written graph,
# in chronological order
with open('../data/sampled_graphs/date_index.pickle', 'wb') as handle:
    pickle.dump(date_index, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Leadtime Assignments

We assign leadtime to concept node pairs.

In [None]:
import pandas as pd
import networkx as nx
import numpy as np
from tqdm.notebook import tqdm

In [None]:
np.random.seed(12345)

In [None]:
# read in sampled graph and date indices
# (pickle is imported here because this section's import cell does not include it)
import pickle

with open('../data/sampled_undirected.gpickle', 'rb') as handle:
    g = pickle.load(handle)

with open('../data/sampled_graphs/date_index.pickle', 'rb') as handle:
    date_index = pickle.load(handle)

### Positive Samples

In [None]:
# dated concept-concept edges, restricted to edges whose two ends are both in the
# sampled network
nodes = set(g.nodes())
edges_cc = pd.read_csv('../data/edges_cc_dates.csv')
edges_cc = edges_cc.loc[
    edges_cc['src'].isin(nodes)
    & edges_cc['dst'].isin(nodes)
].reset_index(drop=True)

In [None]:
# all edges formed after or in January 2019 are used.
# Selecting by year gives the same rows as slicing after the last December 2018 row of
# the date-sorted table, but it also works when the sample has no December 2018 edge.
to_use = edges_cc[edges_cc['year'] >= 2019].reset_index(drop=True)

In [None]:
# assign a leadtime. 0 = 1 month (immediate future), 1 = 1 year, 2 = 2 years, 3 = 3 years
# (drawn uniformly at random per edge; reproducible through the np.random.seed call above)
to_use = to_use.assign(future_time=pd.Series(np.random.randint(0,4, len(to_use))).values)
to_use

In [None]:
# for every positive sample, work out the date that lies `leadtime` before the edge
# formed: code 0 steps back one month (January rolls back to December of the previous
# year), any other code steps back that many years
date_data = []
for row in tqdm(to_use.values):
    year, month, leadtime = row[2], row[3], row[5]
    if leadtime != 0:
        end_date = [year - leadtime, month]
    elif month == 1:
        end_date = [year - 1, 12]
    else:
        end_date = [year, month - 1]
    date_data.append(list(row) + end_date)

# position of that date in date_index (end) and the position BACK months earlier (start)
BACK = 36
for row in date_data:
    end_position = date_index.index((row[-2], row[-1]))
    row.extend([end_position - BACK, end_position])

In [None]:
# convert to DataFrame and save
# end_year/end_month are the formation date minus the leadtime; end_index is that date's
# position in date_index and start_index is 36 positions earlier
date_data = pd.DataFrame(date_data, columns = ['src', 'dst', 'year', 'month', 'num_paper_link', 'future_time', 'end_year', 'end_month', 'start_index', 'end_index'])
date_data.to_csv('../data/sampled_graphs/date_data_pos.csv', index =False)

### Negative Samples

In [None]:
# read in sampled graph (same file that was loaded at the start of this section)
with open('../data/sampled_undirected.gpickle', 'rb') as handle:
    g = pickle.load(handle)

In [None]:
# randomly sample negative node pairs: as many concept pairs without an edge in the
# sampled graph as there are positive samples
concept_ids = valid_concepts.values[:, 0]
# keyed by the unordered pair so that (a, b) and (b, a) count as one sample; a dict keeps
# insertion order, so the result is reproducible for a fixed random seed
sampled_pairs = {}
# NOTE(review): this loops forever if fewer unconnected pairs exist than positive samples
while len(sampled_pairs) < len(date_data):
    src = concept_ids[np.random.randint(0, len(concept_ids))]
    dst = concept_ids[np.random.randint(0, len(concept_ids))]
    # a node paired with itself is not a meaningful negative sample
    if src == dst or g.has_edge(src, dst):
        continue
    sampled_pairs.setdefault(frozenset((src, dst)), (src, dst))

list_no_cc = list(sampled_pairs.values())
no_cc = pd.DataFrame(list_no_cc, columns=['src', 'dst'])

In [None]:
# assign date of May 2021 to all node pairs and draw a leadtime for each.
# leadtime: 0 = 1 month (immediate future), 1 = 1 year, 2 = 2 years, 3 = 3 years
# The leadtime array is sized from no_cc itself instead of from the positive-sample table.
no_cc = no_cc.assign(
    year=2021.0,
    month=5.0,
    future_time=np.random.randint(0, 4, len(no_cc)),
)
no_cc

In [None]:
# for every negative sample, work out the date that lies `leadtime` before its assigned
# date: code 0 steps back one month (January rolls back to December of the previous
# year), any other code steps back that many years
neg_date_data = []
for row in tqdm(no_cc.values):
    year, month, leadtime = row[2], row[3], row[4]
    if leadtime != 0:
        end_date = [year - leadtime, month]
    elif month == 1:
        end_date = [year - 1, 12]
    else:
        end_date = [year, month - 1]
    neg_date_data.append(list(row) + end_date)

# position of that date in date_index (end) and the position BACK months earlier (start)
BACK = 36
for row in neg_date_data:
    end_position = date_index.index((row[-2], row[-1]))
    row.extend([end_position - BACK, end_position])

In [None]:
# convert to DataFrame and save
# same layout as the positive samples, minus the num_paper_link column
neg_date_data = pd.DataFrame(neg_date_data, columns = ['src', 'dst', 'year', 'month', 'future_time', 'end_year', 'end_month', 'start_index', 'end_index'])
neg_date_data.to_csv('../data/sampled_graphs/date_data_neg.csv', index =False)