In [2]:
%load_ext watermark
%watermark -v -n -m -p numpy,scipy,sklearn,pandas

Sun Apr 12 2020 

CPython 3.7.4
IPython 5.5.0

numpy 1.16.4
scipy 1.3.1
sklearn 0.0
pandas 0.24.2

compiler   : MSC v.1915 64 bit (AMD64)
system     : Windows
release    : 10
machine    : AMD64
processor  : Intel64 Family 6 Model 158 Stepping 10, GenuineIntel
CPU cores  : 12
interpreter: 64bit


In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import os
# Absolute path of the directory one level above the notebook's working directory.
PROJ_ROOT = os.path.abspath(os.pardir)
print(PROJ_ROOT)

E:\Root\GitHub\COVID-19-research-paper-analysis


In [4]:
import os
import json
from pprint import pprint
from copy import deepcopy

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm


## formats the author's names in a consistent manner

def format_name(author):
    """Build a single display name from an author record.

    Parameters
    ----------
    author : dict
        Author record with a ``'first'`` string, a ``'middle'`` list of
        strings and a ``'last'`` string. Missing or empty parts are tolerated.

    Returns
    -------
    str
        The non-empty name parts joined by single spaces in the order
        first, middle..., last. A record without any usable part yields ``""``.
    """
    parts = [author.get('first'), *(author.get('middle') or []), author.get('last')]
    # Skip empty/missing parts so the result never carries leading, trailing
    # or doubled spaces (e.g. a record that only has a last name).
    return " ".join(part for part in parts if part)

## parses metadata block to extract text of author affiliations, e.g., university

def format_affiliation(affiliation):
    """Render an affiliation record as one comma-separated string.

    The institution (when present and non-empty) comes first, followed by
    every value of the ``'location'`` dict in its stored order. An
    affiliation with neither yields ``""``.
    """
    parts = []

    institution = affiliation.get('institution')
    if institution:
        parts.append(institution)

    location = affiliation.get('location')
    if location:
        parts += list(location.values())

    return ", ".join(parts)


## parses metadata block to extract text of the authors

def format_authors(authors, with_affiliation=False):
    """Join the formatted names of ``authors`` into one comma-separated string.

    With ``with_affiliation=True`` each name is followed by its formatted
    affiliation in parentheses, unless that affiliation formats to an
    empty string, in which case only the name is emitted.
    """
    def render(author):
        label = format_name(author)
        if not with_affiliation:
            return label
        place = format_affiliation(author['affiliation'])
        return f"{label} ({place})" if place else label

    return ", ".join(map(render, authors))

## formats the body content block of the article text

def format_body(body_text):
    """Collapse a list of paragraph dicts into one text blob grouped by section.

    Paragraphs that share a ``'section'`` value are concatenated (without any
    separator) in their original order; sections appear in order of first
    occurrence. Each section is emitted as its title, a blank line, its text
    and another blank line.
    """
    grouped = {}
    for paragraph in body_text:
        grouped.setdefault(paragraph['section'], []).append(paragraph['text'])

    return "".join(
        heading + "\n\n" + "".join(chunks) + "\n\n"
        for heading, chunks in grouped.items()
    )

## formats the bibliography metadata

def format_bib(bibs):
    """Render bibliography entries as one ``"; "``-separated string.

    Parameters
    ----------
    bibs : dict or iterable of dict
        Either a mapping whose values are the entries, or the entries
        themselves. Each entry needs the keys ``'title'``, ``'authors'``,
        ``'venue'`` and ``'year'``.

    Returns
    -------
    str
        One ``"title, authors, venue, year"`` item per entry, joined by
        ``"; "``. Every field is passed through ``str()``, so a ``None``
        year appears as ``"None"``.

    The input is never modified.
    """
    # isinstance (rather than an exact type comparison) also unwraps dict
    # subclasses such as OrderedDict.
    entries = bibs.values() if isinstance(bibs, dict) else bibs

    formatted = []
    for bib in entries:
        # Format the authors into a local value instead of overwriting
        # bib['authors']; the caller's data stays untouched without having
        # to deep-copy the whole bibliography first.
        fields = [
            bib['title'],
            format_authors(bib['authors'], with_affiliation=False),
            bib['venue'],
            bib['year'],
        ]
        formatted.append(", ".join(str(field) for field in fields))

    return "; ".join(formatted)

In [5]:
##loads the files into memory as a list

def load_files(dirname):
    """Parse every file in ``dirname`` as JSON and return the results as a list.

    Parameters
    ----------
    dirname : str
        Directory containing the JSON files; a trailing path separator is
        optional.

    Returns
    -------
    list
        One parsed JSON object per directory entry, ordered by file name.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # result reproducible across runs and platforms.
    filenames = sorted(os.listdir(dirname))
    raw_files = []

    for filename in tqdm(filenames):
        # os.path.join works whether or not dirname ends with a separator,
        # and the context manager closes each handle right after parsing.
        with open(os.path.join(dirname, filename), 'rb') as handle:
            raw_files.append(json.load(handle))

    return raw_files

## pushes the json loaded files into data frame(s)

def generate_clean_df(all_files):
    """Flatten parsed paper JSON objects into a pandas DataFrame.

    Parameters
    ----------
    all_files : iterable of dict
        Parsed papers. Each needs ``'paper_id'``, ``'metadata'`` (with
        ``'title'`` and ``'authors'``), ``'body_text'`` and ``'bib_entries'``;
        ``'abstract'`` is optional.

    Returns
    -------
    pandas.DataFrame
        One row per paper with the columns ``paper_id``, ``title``,
        ``authors``, ``affiliations``, ``abstract``, ``text``,
        ``bibliography``, ``raw_authors`` and ``raw_bibliography``. The two
        ``raw_*`` columns hold the unformatted author list and bibliography
        mapping.
    """
    col_names = ['paper_id', 'title', 'authors',
                 'affiliations', 'abstract', 'text',
                 'bibliography', 'raw_authors', 'raw_bibliography']

    rows = []
    for paper in tqdm(all_files):
        metadata = paper['metadata']
        authors = metadata['authors']
        bib_entries = paper['bib_entries']

        rows.append([
            paper['paper_id'],
            metadata['title'],
            format_authors(authors),
            format_authors(authors, with_affiliation=True),
            # A paper without an 'abstract' key gets an empty abstract instead
            # of aborting the whole conversion with a KeyError.
            # NOTE(review): presumably the pmc_json files are the ones lacking
            # this key - confirm against the dataset schema.
            format_body(paper.get('abstract', [])),
            format_body(paper['body_text']),
            format_bib(bib_entries),
            authors,
            bib_entries,
        ])

    return pd.DataFrame(rows, columns=col_names)

In [6]:
## count the raw JSON articles available in the biorxiv/medrxiv directory

biorxiv_dir = '../data/interim/biorxiv_medrxiv/pdf_json/'
filenames = os.listdir(biorxiv_dir)
print(f"Number of articles retrieved from biorxiv: {len(filenames)}")

Number of articles retrieved from biorxiv: 1625


In [7]:
## Loop through all files and save their parsed JSON content in the list

all_files = []

for filename in filenames:
    # the context manager closes each handle right after parsing instead of
    # leaving one open file per article behind
    with open(os.path.join(biorxiv_dir, filename), 'rb') as handle:
        all_files.append(json.load(handle))

In [8]:
## inspect the top-level structure of the first parsed article

file = all_files[0]
print(f"Dictionary keys: {file.keys()}")

Dictionary keys: dict_keys(['paper_id', 'metadata', 'abstract', 'body_text', 'bib_entries', 'ref_entries', 'back_matter'])


In [9]:
## Pretty-print the 'abstract' entry of the first article: a list of paragraph
## dicts carrying 'text', 'section', 'cite_spans' and 'ref_spans'

pprint(file['abstract'])

[{'cite_spans': [],
  'ref_spans': [],
  'section': 'Abstract',
  'text': 'word count: 194 22 Text word count: 5168 23 24 25 author/funder. '
          'All rights reserved. No reuse allowed without permission. Abstract '
          '27 The positive stranded RNA genomes of picornaviruses comprise a '
          'single large open reading 28 frame flanked by 5′ and 3′ '
          'untranslated regions (UTRs). Foot-and-mouth disease virus (FMDV) 29 '
          'has an unusually large 5′ UTR (1.3 kb) containing five structural '
          'domains. These include the 30 internal ribosome entry site (IRES), '
          'which facilitates initiation of translation, and the cis-acting 31 '
          'replication element (cre). Less well characterised structures are a '
          '5′ terminal 360 nucleotide 32 stem-loop, a variable length '
          'poly-C-tract of approximately 100-200 nucleotides and a series of '
          '33 two to four tandemly repeated pseudoknots (PKs). We investigated

In [10]:
## container type, paragraph count and per-paragraph keys of "body_text"
## for the first article

print(f"body_text type: {type(file['body_text'])}")
print(f"body_text length: {len(file['body_text'])}")
print(f"body_text keys: {file['body_text'][0].keys()}")

body_text type: <class 'list'>
body_text length: 20
body_text keys: dict_keys(['text', 'cite_spans', 'ref_spans', 'section'])


In [11]:
## each "body_text" paragraph nests further lists (cite_spans, ref_spans);
## show the first two paragraphs, truncating anything deeper than 3 levels

print("body_text content:")
pprint(file['body_text'][:2], depth=3)

body_text content:
[{'cite_spans': [],
  'ref_spans': [{...}],
  'section': '',
  'text': 'VP3, and VP0 (which is further processed to VP2 and VP4 during '
          'virus assembly) (6). The P2 64 and P3 regions encode the '
          'non-structural proteins 2B and 2C and 3A, 3B (1-3) (VPg), 3C pro '
          'and 4 structural protein-coding region is replaced by reporter '
          'genes, allow the study of genome 68 replication without the '
          'requirement for high containment (9, 10) ( figure 1A ).'},
 {'cite_spans': [{...}, {...}, {...}, {...}, {...}, {...}],
  'ref_spans': [],
  'section': '70',
  'text': 'The FMDV 5′ UTR is the largest known picornavirus UTR, comprising '
          'approximately 1300 71 nucleotides and containing several highly '
          'structured regions. The first 360 nucleotides at the 5′ 72 end are '
          'predicted to fold into a single large stem loop termed the '
          'S-fragment, followed by a The PKs were originally predicted 

In [12]:
## group the paragraph texts by their "section" title
## (same grouping that format_body performs) and list the titles found

texts = [(di['section'], di['text']) for di in file['body_text']]
texts_di = {}
for section, text in texts:
    texts_di[section] = texts_di.get(section, "") + text

pprint(list(texts_di.keys()))

['',
 '70',
 '120',
 '135',
 '136',
 '144',
 '301',
 'Function of the PKs in replication is dependent on downstream interactions '
 'and 350',
 '368',
 '468',
 '479']


In [13]:
## emit every section title followed by its grouped text, separated by
## blank lines; only the first 3000 characters are displayed

body = ""

for section, text in texts_di.items():
    body += f"{section}\n\n{text}\n\n"

print(body[:3000])



VP3, and VP0 (which is further processed to VP2 and VP4 during virus assembly) (6). The P2 64 and P3 regions encode the non-structural proteins 2B and 2C and 3A, 3B (1-3) (VPg), 3C pro and 4 structural protein-coding region is replaced by reporter genes, allow the study of genome 68 replication without the requirement for high containment (9, 10) ( figure 1A ).

70

The FMDV 5′ UTR is the largest known picornavirus UTR, comprising approximately 1300 71 nucleotides and containing several highly structured regions. The first 360 nucleotides at the 5′ 72 end are predicted to fold into a single large stem loop termed the S-fragment, followed by a The PKs were originally predicted in 1987 and consist of two to four tandem repeats of a ~48 86 nucleotide region containing a small stem loop and downstream interaction site (figure 1B) 87 (12). Due to the sequence similarity between the PKs (figure 1C), it is speculated that they 88 were formed by duplication events during viral replication, p

In [14]:
## format_body wraps the two previous cells in a single call; the first
## 3000 characters match the output above

print(format_body(file['body_text'])[:3000])



VP3, and VP0 (which is further processed to VP2 and VP4 during virus assembly) (6). The P2 64 and P3 regions encode the non-structural proteins 2B and 2C and 3A, 3B (1-3) (VPg), 3C pro and 4 structural protein-coding region is replaced by reporter genes, allow the study of genome 68 replication without the requirement for high containment (9, 10) ( figure 1A ).

70

The FMDV 5′ UTR is the largest known picornavirus UTR, comprising approximately 1300 71 nucleotides and containing several highly structured regions. The first 360 nucleotides at the 5′ 72 end are predicted to fold into a single large stem loop termed the S-fragment, followed by a The PKs were originally predicted in 1987 and consist of two to four tandem repeats of a ~48 86 nucleotide region containing a small stem loop and downstream interaction site (figure 1B) 87 (12). Due to the sequence similarity between the PKs (figure 1C), it is speculated that they 88 were formed by duplication events during viral replication, p

## Metadata

In [15]:
## keys available in the metadata dictionary of the first article

print(all_files[0]['metadata'].keys())

dict_keys(['title', 'authors'])


In [16]:
## title stored in the metadata of the first article
## NOTE(review): the trailing "2 3" in the printed title looks like residue
## from the PDF extraction - confirm before relying on titles verbatim

print(all_files[0]['metadata']['title'])

The RNA pseudoknots in foot-and-mouth disease virus are dispensable for genome replication but essential for the production of infectious virus. 2 3


In [17]:
## raw author records of one article (index 52), first three shown;
## each record carries first/middle/last/suffix, email and affiliation

authors = all_files[52]['metadata']['authors']
pprint(authors[:3])

[{'affiliation': {},
  'email': '',
  'first': 'D',
  'last': 'Luis',
  'middle': ['Sc Eric'],
  'suffix': ''},
 {'affiliation': {'institution': 'Brazilian Federal Court of Accounts',
                  'laboratory': '',
                  'location': {'country': 'Brazil',
                               'postCode': '70042-900',
                               'region': 'DF',
                               'settlement': 'Brasília'}},
  'email': '',
  'first': 'Barroso',
  'last': 'Cavalcante',
  'middle': [],
  'suffix': ''},
 {'affiliation': {},
  'email': '',
  'first': 'M',
  'last': 'Cristina',
  'middle': ['D Juliana'],
  'suffix': ''}]


In [18]:
## apply format_name and format_affiliation to every author selected above

for author in authors:
    print(f"Name: {format_name(author)}")
    print(f"Affiliation: {format_affiliation(author['affiliation'])}")
    print()

Name: D Sc Eric Luis
Affiliation: 

Name: Barroso Cavalcante
Affiliation: Brazilian Federal Court of Accounts, 70042-900, Brasília, DF, Brazil

Name: M D Juliana Cristina
Affiliation: 

Name: Cardoso Ferreira
Affiliation: Federal University of São Paulo, 04021-001, São Paulo, SP, Brazil



In [19]:
## metadata of an article with a longer author list (index 4),
## nesting truncated at depth 4

pprint(all_files[4]['metadata'], depth=4)

{'authors': [{'affiliation': {'institution': 'University of Georgia',
                              'laboratory': '',
                              'location': {...}},
              'email': '',
              'first': 'Salman',
              'last': 'Butt',
              'middle': ['L'],
              'suffix': ''},
             {'affiliation': {'institution': 'University of Georgia',
                              'laboratory': '',
                              'location': {...}},
              'email': '',
              'first': 'Eric',
              'last': 'Erwood',
              'middle': ['C'],
              'suffix': ''},
             {'affiliation': {'institution': 'University of Georgia',
                              'laboratory': '',
                              'location': {...}},
              'email': '',
              'first': 'Jian',
              'last': 'Zhang',
              'middle': [],
              'suffix': ''},
             {'affiliation': {'institution': 'Univ

In [20]:
## format the complete author list of article 4, first as bare names and
## then with the optional affiliation suffix

authors = all_files[4]['metadata']['authors']
print("Formatting without affiliation:",
      format_authors(authors, with_affiliation=False),
      sep="\n")
print("\nFormatting with affiliation:",
      format_authors(authors, with_affiliation=True),
      sep="\n")

Formatting without affiliation:
Salman L Butt, Eric C Erwood, Jian Zhang, Holly S Sellers, Kelsey Young, Kevin K Lahmers, James B Stanton

Formatting with affiliation:
Salman L Butt (University of Georgia, 30602, Athens, GA, USA), Eric C Erwood (University of Georgia, 30602, Athens, GA, USA), Jian Zhang (University of Georgia, 30602, Athens, GA, USA), Holly S Sellers (University of Georgia, 30602, Athens, GA, USA), Kelsey Young (University of Georgia, 30602, Athens, GA, USA), Kevin K Lahmers (Virginia Polytechnical Institute and State University, 24061, Blacksburg, VA, USA), James B Stanton (University of Georgia, 30602, Athens, GA, USA)


## Bibliography

In [21]:
## 'bib_entries' is a dict of reference records; keep only the records
## and show the first two (nesting truncated at depth 4)
bibs = list(file['bib_entries'].values())
pprint(bibs[:2], depth=4)

[{'authors': [{'first': 'T', 'last': 'Jackson', 'middle': [], 'suffix': ''},
              {'first': 'T', 'last': 'Tuthill', 'middle': [...], 'suffix': ''},
              {'first': 'D', 'last': 'Rowlands', 'middle': [...], 'suffix': ''},
              {'first': 'N',
               'last': 'Stonehouse',
               'middle': [...],
               'suffix': ''}],
  'issn': '',
  'other_ids': {},
  'pages': '',
  'ref_id': 'b0',
  'title': 'Genetic economy in 598 picornaviruses: Foot-and-mouth disease '
           'virus replication exploits alternative precursor 599 cleavage '
           'pathways',
  'venue': 'PLOS Pathog',
  'volume': '13',
  'year': 2017},
 {'authors': [{'first': 'N',
               'last': 'Sanderson',
               'middle': [...],
               'suffix': ''},
              {'first': 'N', 'last': 'Knowles', 'middle': [...], 'suffix': ''},
              {'first': 'D', 'last': 'King', 'middle': [...], 'suffix': ''},
              {'first': 'E', 'last': 'Cottam', 

In [22]:
## format_authors also works on the author records of a bibliography entry
## (these carry no affiliation, hence with_affiliation=False)

format_authors(bibs[1]['authors'], with_affiliation=False)

'N D Sanderson, N J Knowles, D P King, E M Cottam'

In [23]:
## format the first five bibliography entries as
## "title, authors, venue, year" items separated by "; "

bib_formatted = format_bib(bibs[:5])
print(bib_formatted)

Genetic economy in 598 picornaviruses: Foot-and-mouth disease virus replication exploits alternative precursor 599 cleavage pathways, T Jackson, T J Tuthill, D J Rowlands, N J Stonehouse, PLOS Pathog, 2017; A universal protocol to 602 generate consensus level genome sequences for foot-and-mouth disease virus and other 603 positive-sense polyadenylated RNA viruses using the Illumina MiSeq, N D Sanderson, N J Knowles, D P King, E M Cottam, BMC Genomics, 2014; Library preparation for highly accurate population 606 sequencing of RNA viruses, A Acevedo, R Andino, Nat Protoc, 2014; IDBA-UD: a de novo assembler for 608 single-cell and metagenomic sequencing data with highly uneven depth, Y Peng, Hcm Leung, S M Yiu, Fyl Chin, , 2012; Basic local alignment 611 search tool, S F Altschul, W Gish, W Miller, E W Myers, D J Lipman, J Mol Biol, 1990


## Convert to CSV

In [24]:
## build one feature row per biorxiv article; this is the same feature list
## that generate_clean_df assembles, written out step by step
cleaned_files = []

for file in tqdm(all_files):
    # column order must match col_names in the next cell
    features = [
        file['paper_id'],
        file['metadata']['title'],
        format_authors(file['metadata']['authors']),
        format_authors(file['metadata']['authors'], 
                       with_affiliation=True),
        format_body(file['abstract']),
        format_body(file['body_text']),
        format_bib(file['bib_entries']),
        file['metadata']['authors'],   # raw author records, unformatted
        file['bib_entries']            # raw bibliography mapping, unformatted
    ]
    
    cleaned_files.append(features)

HBox(children=(FloatProgress(value=0.0, max=1625.0), HTML(value='')))




In [25]:
## name the columns (same order as the feature rows built above), load the
## rows into a pandas DataFrame and preview the first ten articles

col_names = [
    'paper_id', 
    'title', 
    'authors',
    'affiliations', 
    'abstract', 
    'text', 
    'bibliography',
    'raw_authors',
    'raw_bibliography'
]

clean_df = pd.DataFrame(cleaned_files, columns=col_names)
clean_df.head(10)

Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography
0,0015023cc06b5362d332b3baf348d11567ca2fbb,The RNA pseudoknots in foot-and-mouth disease ...,"Joseph C Ward, Lidia Lasecka-Dykes, Chris Neil...","Joseph C Ward, Lidia Lasecka-Dykes, Chris Neil...",Abstract\n\nword count: 194 22 Text word count...,"\n\nVP3, and VP0 (which is further processed t...",Genetic economy in 598 picornaviruses: Foot-an...,"[{'first': 'Joseph', 'middle': ['C'], 'last': ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Genetic..."
1,00340eea543336d54adda18236424de6a5e91c9d,Analysis Title: Regaining perspective on SARS-...,"Carla Mavian, Simone Marini, Costanza Manes, I...","Carla Mavian (University of Florida, Gainesvil...","Abstract\n\nDuring the past three months, a ne...","\n\nIn December 2019, a novel coronavirus, SAR...","Situation Report -43, , Coronavirus disease 20...","[{'first': 'Carla', 'middle': [], 'last': 'Mav...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Situati..."
2,004f0f8bb66cf446678dc13cf2701feec4f36d76,Healthcare-resource-adjusted vulnerabilities t...,"Hanchu Zhou, Jiannan Yang, Kaicheng Tang, † , ...","Hanchu Zhou (City University of Hong Kong, Hon...",,Introduction\n\nThe 2019-nCoV epidemic has spr...,World Health Organizations. Novel Coronavirus ...,"[{'first': 'Hanchu', 'middle': [], 'last': 'Zh...","{'BIBREF0': {'ref_id': 'b0', 'title': 'World H..."
3,00911cf4f99a3d5ae5e5b787675646a743574496,CHEER: hierarCHical taxonomic classification f...,"Jiayu Shang, Yanni Sun","Jiayu Shang (City University of Hong Kong, Hon...",Abstract\n\nThe fast accumulation of viral met...,"Introduction\n\nMetagenomic sequencing, which ...",Application of metagenomics in the human gut m...,"[{'first': 'Jiayu', 'middle': [], 'last': 'Sha...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Applica..."
4,00d16927588fb04d4be0e6b269fc02f0d3c2aa7b,"Real-time, MinION-based, amplicon sequencing f...","Salman L Butt, Eric C Erwood, Jian Zhang, Holl...","Salman L Butt (University of Georgia, 30602, A...",Abstract\n\nInfectious bronchitis (IB) causes ...,"Introduction\n\nInfectious bronchitis (IB), wh...",Emergence of novel strains of avian infectious...,"[{'first': 'Salman', 'middle': ['L'], 'last': ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Emergen..."
5,00eb9220dc8cd351393b6b035323d350f103f8c6,Title: Impact of COVID-19 on psychiatric asses...,"Victor M Castro, Roy H Perlis",Victor M Castro (Massachusetts General Hospita...,,Introduction\n\nEvidence from prior pandemics ...,The psychological impact of severe acute respi...,"[{'first': 'Victor', 'middle': ['M'], 'last': ...","{'BIBREF4': {'ref_id': 'b4', 'title': 'The psy..."
6,0139ea4ca580af99b602c6435368e7fdbefacb03,A Combined Evidence Approach to Prioritize Nip...,"Nishi Kumari, Ayush Upadhyay, Kishan Kalia, Ra...","Nishi Kumari (Panjab University, Chandigarh, I...",Abstract\n\nNipah Virus (NiV) came into limeli...,Introduction\n\nNipah is an infectious negativ...,"Molecular biology of Hendra and Nipah viruses,...","[{'first': 'Nishi', 'middle': [], 'last': 'Kum...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Molecul..."
7,013d9d1cba8a54d5d3718c229b812d7cf91b6c89,Assessing spread risk of Wuhan novel coronavir...,"Shengjie Lai, Isaac I Bogoch, Nick W Ruktanonc...","Shengjie Lai (University of Southampton, UK), ...",Abstract\n\nBackground: A novel coronavirus (2...,"Introduction\n\nIn December 2019, a cluster of...",A Novel Coronavirus Genome Identified in a Clu...,"[{'first': 'Shengjie', 'middle': [], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'A Novel..."
8,018fb5e62fbbcae07d57d94d29ac630dcc4dccf9,TITLE: The early scientific literature respons...,"Davide Gori, Erik Boetto, Maria Pia Fantini","Davide Gori (University of Bologna), Erik Boet...",Abstract\n\nAll rights reserved. No reuse allo...,Introduction\n\nRecent events highlight how em...,"A""IV to, G F Gao, From, Z""IKV: Attacks from E...","[{'first': 'Davide', 'middle': [], 'last': 'Go...","{'BIBREF0': {'ref_id': 'b0', 'title': 'A""IV to..."
9,01d162d7fae6aaba8e6e60e563ef4c2fca7b0e18,"TWIRLS, an automated topic-wise inference meth...","Xiaoyang Ji, Chunming Zhang, Yubo Zhai, Zhongh...","Xiaoyang Ji (Chinese Academy of Sciences, Beij...",Abstract\n\nFaced with the current large-scale...,Introduction\n\nThe sudden outbreak of the new...,A pneumonia outbreak associated with a new cor...,"[{'first': 'Xiaoyang', 'middle': [], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'A pneum..."


In [26]:
## write the cleaned biorxiv table to CSV in the current working directory
## (index=False keeps the row index out of the file)

clean_df.to_csv('biorxiv_clean.csv', index=False)

In [27]:
## repeat the loading step for the other JSON collections, starting with the
## custom_license PMC files; load_files keeps every parsed article in memory

pmc_dir = '../data/interim/custom_license/pmc_json/'
pmc_files = load_files(pmc_dir)


HBox(children=(FloatProgress(value=0.0, max=7802.0), HTML(value='')))




In [None]:
## count the custom_license PMC articles and inspect the keys of the first one
filenames = os.listdir(pmc_dir)
print("Number of articles retrieved from custom_license (pmc_json):", len(filenames))


filename = os.path.join(pmc_dir, filenames[0])
# close the handle as soon as the JSON is parsed
with open(filename, 'rb') as handle:
    file = json.load(handle)

print("Dictionary keys:", file.keys())

In [30]:
## load every custom_license PDF-derived JSON article into memory
pdf_dir = '../data/interim/custom_license/pdf_json/'
pdf_files = load_files(pdf_dir)

HBox(children=(FloatProgress(value=0.0, max=26505.0), HTML(value='')))




In [None]:
## flatten the PMC articles, write them to CSV and preview the table
## NOTE(review): this cell has no execution count in the saved notebook,
## so no output was recorded for it - confirm it runs before relying on clean_pmc.csv
pmc_df = generate_clean_df(pmc_files)
pmc_df.to_csv('clean_pmc.csv', index=False)
pmc_df.head()

In [31]:
## count the custom_license PDF articles and inspect the keys of the first one
filenames = os.listdir(pdf_dir)
print("Number of articles retrieved from custom_license (pdf_json):", len(filenames))


filename = os.path.join(pdf_dir, filenames[0])
# close the handle as soon as the JSON is parsed
with open(filename, 'rb') as handle:
    file = json.load(handle)

print("Dictionary keys:", file.keys())

Number of articles retrieved from biorxiv: 26505
Dictionary keys: dict_keys(['paper_id', 'metadata', 'abstract', 'body_text', 'bib_entries', 'ref_entries', 'back_matter'])


In [32]:
## flatten the custom_license PDF articles, write them to CSV in the current
## working directory and preview the first rows
pdf_df = generate_clean_df(pdf_files)
pdf_df.to_csv('clean_pdf.csv', index=False)
pdf_df.head()

HBox(children=(FloatProgress(value=0.0, max=26505.0), HTML(value='')))




Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography
0,0001418189999fea7f7cbe3e82703d71c85a6fe5,Absence of surface expression of feline infect...,"E Cornelissen, H L Dewerchin, E Van Hamme, H J...","E Cornelissen (Ghent University, Salisburylaan...",Abstract\n\nFeline infectious peritonitis viru...,Introduction\n\nFeline infectious peritonitis ...,Using direct immunofluorescence to detect coro...,"[{'first': 'E', 'middle': [], 'last': 'Corneli...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Using d..."
1,00016663c74157a66b4d509d5c4edffd5391bbe0,,,,,Introduction\n\nViruses are increasingly recog...,"Principles of Virology in Fields Virology, R C...",[],"{'BIBREF0': {'ref_id': 'b0', 'title': 'Princip..."
2,0005d253951fedc237715a37db147032eea28912,,,,,\n\nSynthesized by William Prusoff in the late...,Molecular basis of interferon resistance in he...,[],"{'BIBREF0': {'ref_id': 'b0', 'title': 'Molecul..."
3,00060fb61742ff60e4e3ba4648c74a34cfe9560d,,"Jesi Kim, Todd Thomsen, Naomi Sell, Andrew J G...","Jesi Kim (Harvard Medical School, Boston, MA, ...",,Introduction\n\nThe outbreak of a novel corona...,An interactive web-based dashboard to track CO...,"[{'first': 'Jesi', 'middle': [], 'last': 'Kim'...","{'BIBREF0': {'ref_id': 'b0', 'title': 'An inte..."
4,00073cb65dd2596249230fab8b15a71c4a135895,Risk Parameters of Fulminant Acute Respiratory...,"Shoji Kawachi, San Thi Luong, Mika Shigematsu,...","Shoji Kawachi, San Thi Luong (National Hospita...",Abstract\n\nA clinical picture of patients wit...,"\n\nSince then, many clinical case reports hav...",Characterization of avian H5N1 influenza virus...,"[{'first': 'Shoji', 'middle': [], 'last': 'Kaw...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Charact..."


In [33]:
## same load -> flatten -> CSV pipeline for the comm_use_subset collection
## NOTE: pdf_dir, pdf_files and pdf_df are rebound here, so from this cell on
## they no longer refer to the custom_license data
pdf_dir = '../data/interim/comm_use_subset/pdf_json/'
pdf_files = load_files(pdf_dir)
pdf_df = generate_clean_df(pdf_files)
pdf_df.to_csv('comm_use_subset.csv', index=False)

HBox(children=(FloatProgress(value=0.0, max=9524.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9524.0), HTML(value='')))




In [34]:
## same load -> flatten -> CSV pipeline for the noncomm_use_subset collection
## NOTE: pdf_dir, pdf_files and pdf_df are rebound once more, replacing the
## values assigned by the earlier cells
pdf_dir = '../data/interim/noncomm_use_subset/pdf_json/'
pdf_files = load_files(pdf_dir)
pdf_df = generate_clean_df(pdf_files)
pdf_df.to_csv('noncomm_use_subset.csv', index=False)

HBox(children=(FloatProgress(value=0.0, max=2490.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2490.0), HTML(value='')))


