Update the variables below to match your local setup.
`NEO4J_OUTPUT_DIR` is the directory where the data to be imported into Neo4j is written.

In [1]:
# folder to store files for neo4j import
NEO4J_OUTPUT_DIR = "../neo4j-import"

In [2]:
import pandas as pd
import os

In [3]:
# CORD-19 metadata from kaggle.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning this read
# otherwise emits and gives every column a single, consistent dtype.
metadata_path = '../data/metadata.csv'
metadata_df = pd.read_csv(metadata_path, low_memory=False)

# text-document relationships produced from Stage 2, used to get the list of documents for which to extract metadata
sentences_path = os.path.join(NEO4J_OUTPUT_DIR, 'text_edges.csv')
sentences_df = pd.read_csv(sentences_path)
all_paper_ids = set(sentences_df['doc_id:END_ID(Document)'])
print('total number of doccument/paper IDs after Stage 2:', len(all_paper_ids))

# formatted metadata for Neo4j import
output_path = os.path.join(NEO4J_OUTPUT_DIR, 'metadata.csv')

  interactivity=interactivity, compiler=compiler, result=result)


total number of doccument/paper IDs after Stage 2: 178900


In [4]:
# metadata statistics
# Abstracts are lower-cased and stripped before de-duplication, so variants that
# differ only in case or surrounding whitespace are counted once.
normalised_abstracts = metadata_df['abstract'].dropna().str.lower().str.strip()
all_abstracts = set(normalised_abstracts)
print('Total number of abstracts in metadata =', len(all_abstracts))

# Distinct non-missing identifiers per ID column.
pmc_ids = set(metadata_df['pmcid'].dropna())
print('Total number of PMC IDs in metadata =', len(pmc_ids))

pubmed_ids = set(metadata_df['pubmed_id'].dropna())
print('Total number of PubMed IDs in metadata =', len(pubmed_ids))

Total number of abstracts in metadata = 309175
Total number of PMC IDs in metadata = 170722
Total number of PubMed IDs in metadata = 233312


In [5]:
# original header: the metadata columns kept for the import
header = [
    'cord_uid',
    'title',
    'authors',
    'journal',
    'publish_time',
]

# headers for neo4j import: same columns, same order, renamed with Neo4j type suffixes
header_neo4j = [
    'doc_id:ID(Document)',
    'title:STRING',
    'authors:STRING',
    'journal:STRING',
    'publish_time:DATE',
]

In [6]:
# format metadata for Neo4j import
# Pipeline: parse publish dates -> order newest first -> keep only the import
# columns -> keep only documents seen in Stage 2 -> one row per cord_uid.
# Because rows are ordered newest first, drop_duplicates retains the most
# recently published record of each cord_uid.
metadata_df = (
    metadata_df
    .assign(publish_time=pd.to_datetime(metadata_df['publish_time']))
    .sort_values(by='publish_time', ascending=False)
    .loc[:, header]
    .loc[lambda frame: frame['cord_uid'].isin(all_paper_ids)]
    .dropna(subset=['cord_uid'])
    .drop_duplicates(subset=['cord_uid'])
)
# rename the columns to the typed Neo4j headers (same order as `header`)
metadata_df.columns = header_neo4j
paper_ids = set(metadata_df['doc_id:ID(Document)'])

print('total number of doccument/paper IDs after Stage 2 with metadata information:', len(paper_ids))

total number of doccument/paper IDs after Stage 2 with metadata information: 178900


In [7]:
# add papers w/o metadata
# Documents seen in Stage 2 but absent from the formatted metadata are appended
# as rows that carry only the ID column; their other fields stay empty.
id_column = header_neo4j[0]
# Compare against the IDs currently in the frame so that re-running this cell
# does not append the same documents a second time.
missing_ids = all_paper_ids.difference(set(metadata_df[id_column]))
# Sorting (key=str tolerates non-string IDs) makes the appended row order
# reproducible; iterating a set directly would not be.
temp_df = pd.DataFrame({id_column: sorted(missing_ids, key=str)})
if len(temp_df) > 0:
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    metadata_df = pd.concat([metadata_df, temp_df], sort=False)
# every row, with or without metadata, becomes a node labelled 'Document'
metadata_df[':LABEL'] = 'Document'

In [8]:
# write the formatted metadata table to output_path as .csv, without the row index
metadata_df.to_csv(path_or_buf=output_path, index=False)