# COVID-19 Workbook
In this workbook, I try to replicate some graphs that others have made.
Example taken from Tarun Paparaju (on Kaggle).

## Install and import libraries

In [1]:
!pip install -q pycountry

In [3]:
!pip install folium

Collecting folium
  Downloading https://files.pythonhosted.org/packages/fd/a0/ccb3094026649cda4acd55bf2c3822bb8c277eb11446d13d384e5be35257/folium-0.10.1-py2.py3-none-any.whl (91kB)
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/81/6d/31c83485189a2521a75b4130f1fee5364f772a0375f81afff619004e5237/branca-0.4.0-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.0 folium-0.10.1


In [4]:
import os
import gc
import re
import folium
from scipy import stats

import warnings
warnings.filterwarnings("ignore")

import math
import numpy as np
import scipy as sp
import pandas as pd

import pycountry
from sklearn import metrics
from sklearn.utils import shuffle
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

import nltk
from textblob import TextBlob
from wordcloud import WordCloud
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import random
import networkx as nx
from pandas import Timestamp

import requests
from IPython.display import HTML

In [6]:
!pip install plotly

Collecting plotly
  Downloading https://files.pythonhosted.org/packages/f5/c3/03a183b94441da857e7d2b0564cb482bd15824dc1af2d2b337ea6e538c8f/plotly-4.5.4-py2.py3-none-any.whl (7.1MB)
Collecting retrying>=1.3.3 (from plotly)
  Downloading https://files.pythonhosted.org/packages/44/ef/beae4b4ef80902f22e3af073397f079c96969c69b2c7d52a57ea9ae61c9d/retrying-1.3.3.tar.gz
Building wheels for collected packages: retrying
  Building wheel for retrying (setup.py): started
  Building wheel for retrying (setup.py): finished with status 'done'
  Stored in directory: C:\Users\Renate\AppData\Local\pip\Cache\wheels\d7\a9\33\acc7b709e2a35caa7d4cae442f6fe6fbf2c43f80823d46460c
Successfully built retrying
Installing collected packages: retrying, plotly
Successfully installed plotly-4.5.4 retrying-1.3.3


In [7]:
import seaborn as sns
from tqdm import tqdm
import matplotlib.cm as cm
import matplotlib.pyplot as plt

import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

tqdm.pandas()
np.random.seed(0)
%env PYTHONHASHSEED=0

import warnings
warnings.filterwarnings("ignore")

env: PYTHONHASHSEED=0


In [9]:
!pip install tqdm



In [11]:
# First, clean the JSON files from biorxiv, following xhlulu
# (EDA, parse JSON and generate clean CSV).
import os
import json
from pprint import pprint
from copy import deepcopy

import numpy as np
import pandas as pd
# Import the tqdm callable, not the module: a bare `import tqdm` rebinds the
# name to the module, and every later `tqdm(iterable)` call then fails with
# "TypeError: 'module' object is not callable".
#from tqdm.notebook import tqdm
from tqdm import tqdm

Unhide the cell below to find the definition of the following functions:

- format_name(author)
- format_affiliation(affiliation)
- format_authors(authors, with_affiliation=False)
- format_body(body_text)
- format_bib(bibs)

In [12]:
def format_name(author):
    """Return the author's full name as one space-separated string.

    Args:
        author: Mapping with the keys 'first' (str), 'middle' (list of str)
            and 'last' (str).

    Returns:
        The first, middle and last name parts joined by single spaces.
        Empty parts are skipped, so an author without a last name yields
        '†' rather than '† ' (no stray leading/trailing space).

    Raises:
        KeyError: If 'first', 'middle' or 'last' is missing.
    """
    parts = [author['first'], *author['middle'], author['last']]
    # Skip empty strings so the join never produces leading, trailing or
    # doubled spaces.
    return " ".join(part for part in parts if part)


def format_affiliation(affiliation):
    """Render an affiliation mapping as a comma-separated string.

    Args:
        affiliation: Mapping that may contain an 'institution' string and a
            'location' mapping of strings.

    Returns:
        The institution (when present and non-empty) followed by all values
        of the location mapping, joined by ", ". An affiliation with neither
        gives an empty string.
    """
    location = affiliation.get('location')
    parts = list(location.values()) if location else []

    institution = affiliation.get('institution')
    if institution:
        # The institution always leads the location details.
        parts.insert(0, institution)

    return ", ".join(parts)

def format_authors(authors, with_affiliation=False):
    """Join the formatted names of several authors into one string.

    Args:
        authors: Iterable of author mappings accepted by format_name. When
            with_affiliation is true, each must also carry an 'affiliation'
            key.
        with_affiliation: If true, append "(affiliation)" to every author
            whose formatted affiliation is non-empty.

    Returns:
        The author entries joined by ", ".
    """
    def render(author):
        # One author -> "Name" or "Name (affiliation)".
        name = format_name(author)
        if not with_affiliation:
            return name
        affiliation = format_affiliation(author['affiliation'])
        return f"{name} ({affiliation})" if affiliation else name

    return ", ".join(render(author) for author in authors)

def format_body(body_text):
    """Concatenate paragraphs into a single string, grouped by section.

    Args:
        body_text: List of dicts, each with a 'section' and a 'text' key.

    Returns:
        A string in which every distinct section title (in order of first
        appearance) is followed by a blank line, the concatenated text of
        all paragraphs in that section, and another blank line. An empty
        list gives an empty string.
    """
    text_by_section = {}
    for paragraph in body_text:
        heading = paragraph['section']
        # Paragraphs of the same section are glued together with no separator.
        text_by_section[heading] = text_by_section.get(heading, "") + paragraph['text']

    return "".join(
        heading + "\n\n" + content + "\n\n"
        for heading, content in text_by_section.items()
    )

def format_bib(bibs):
    """Format bibliography entries as one "; "-separated string.

    Args:
        bibs: Either a list of bibliography-entry dicts or a mapping whose
            values are such dicts. Each entry must provide the keys
            'title', 'authors', 'venue' and 'year'.

    Returns:
        Every entry rendered as "title, authors, venue, year" (authors
        formatted by format_authors without affiliations), with the entries
        joined by "; ". The input is left unmodified.

    Raises:
        KeyError: If an entry lacks one of the required keys.
    """
    # isinstance also accepts dict subclasses (e.g. OrderedDict); an exact
    # type comparison would treat those as a list and iterate over the keys.
    if isinstance(bibs, dict):
        bibs = list(bibs.values())

    formatted = []
    for bib in bibs:
        # Keep the formatted authors in a local instead of writing them back
        # into the entry: the caller's data stays untouched without having
        # to deep-copy the whole bibliography first.
        authors = format_authors(bib['authors'], with_affiliation=False)
        fields = [bib['title'], authors, bib['venue'], bib['year']]
        formatted.append(", ".join(str(field) for field in fields))

    return "; ".join(formatted)

Unhide the cell below to find the definition of the following functions:

- load_files(dirname)
- generate_clean_df(all_files)

In [13]:
def load_files(dirname):
    """Parse every entry of a directory as JSON.

    Args:
        dirname: Path of the directory to read. A trailing path separator
            is optional.

    Returns:
        A list holding the parsed content of each entry, in the order
        returned by os.listdir.

    Raises:
        OSError: If the directory or one of its entries cannot be opened.
        json.JSONDecodeError: If an entry does not contain valid JSON.
    """
    raw_files = []

    for filename in tqdm(os.listdir(dirname)):
        # os.path.join works with or without a trailing separator on
        # dirname; plain string concatenation builds a wrong path when the
        # separator is missing.
        path = os.path.join(dirname, filename)
        # The context manager closes each handle right after parsing.
        with open(path, 'rb') as handle:
            raw_files.append(json.load(handle))

    return raw_files

def generate_clean_df(all_files):
    """Build a DataFrame with one cleaned row per parsed paper.

    Args:
        all_files: Iterable of paper dicts providing the keys 'paper_id',
            'metadata' (with 'title' and 'authors'), 'abstract',
            'body_text' and 'bib_entries'.

    Returns:
        A pandas DataFrame with the columns paper_id, title, authors,
        affiliations, abstract, text, bibliography, raw_authors and
        raw_bibliography. The last two columns hold the unformatted author
        list and bibliography mapping of each paper.

    Raises:
        KeyError: If a paper lacks one of the required keys.
    """
    col_names = ['paper_id', 'title', 'authors',
                 'affiliations', 'abstract', 'text',
                 'bibliography', 'raw_authors', 'raw_bibliography']

    cleaned_files = []
    for paper in tqdm(all_files):
        cleaned_files.append([
            paper['paper_id'],
            paper['metadata']['title'],
            format_authors(paper['metadata']['authors']),
            format_authors(paper['metadata']['authors'],
                           with_affiliation=True),
            format_body(paper['abstract']),
            format_body(paper['body_text']),
            format_bib(paper['bib_entries']),
            paper['metadata']['authors'],
            paper['bib_entries'],
        ])

    # No `.head()` call here: inside a function its result is discarded, so
    # it only cost time. Callers can preview the returned frame themselves.
    return pd.DataFrame(cleaned_files, columns=col_names)

C:\Users\Renate\Documents\GitHub\Data-Projects\Kaggle - Covid-19\biorxiv_medrxiv\biorxiv_medrxiv

## Biorxiv: Exploration
Let's first take a quick glance at the biorxiv subset of the data. We will also use this opportunity to load all of the json files into a list of nested dictionaries (each dict is an article).

In [17]:
biorxiv_dir = 'C:/Users/Renate/Documents/GitHub/Data-Projects/Kaggle - Covid-19/biorxiv_medrxiv/biorxiv_medrxiv/'
filenames = os.listdir(biorxiv_dir)
print("Number of articles retrieved from biorxiv:", len(filenames))

Number of articles retrieved from biorxiv: 885


In [18]:
# Load every article into memory; each parsed JSON file becomes one entry
# of all_files.
all_files = []

for filename in filenames:
    # os.path.join does not depend on biorxiv_dir ending with a separator,
    # and the `with` block closes each file handle right after parsing.
    with open(os.path.join(biorxiv_dir, filename), 'rb') as handle:
        all_files.append(json.load(handle))

In [19]:
file = all_files[0]
print("Dictionary keys:", file.keys())

Dictionary keys: dict_keys(['paper_id', 'metadata', 'abstract', 'body_text', 'bib_entries', 'ref_entries', 'back_matter'])


## Biorxiv: Abstract
The abstract is a list of dictionaries with a fairly simple structure.

In [21]:
pprint(file['abstract'])

[{'cite_spans': [],
  'ref_spans': [],
  'section': 'Abstract',
  'text': 'word count: 194 22 Text word count: 5168 23 24 25 author/funder. '
          'All rights reserved. No reuse allowed without permission. Abstract '
          '27 The positive stranded RNA genomes of picornaviruses comprise a '
          'single large open reading 28 frame flanked by 5′ and 3′ '
          'untranslated regions (UTRs). Foot-and-mouth disease virus (FMDV) 29 '
          'has an unusually large 5′ UTR (1.3 kb) containing five structural '
          'domains. These include the 30 internal ribosome entry site (IRES), '
          'which facilitates initiation of translation, and the cis-acting 31 '
          'replication element (cre). Less well characterised structures are a '
          '5′ terminal 360 nucleotide 32 stem-loop, a variable length '
          'poly-C-tract of approximately 100-200 nucleotides and a series of '
          '33 two to four tandemly repeated pseudoknots (PKs). We investigated

## Biorxiv: body text
Let's find out what the body_text list looks like.

In [22]:
print("body_text type:", type(file['body_text']))
print("body_text length:", len(file['body_text']))
print("body_text keys:", file['body_text'][0].keys())

body_text type: <class 'list'>
body_text length: 20
body_text keys: dict_keys(['text', 'cite_spans', 'ref_spans', 'section'])


In [23]:
print(all_files[0]['metadata'].keys())

dict_keys(['title', 'authors'])


In [24]:
print(all_files[0]['metadata']['title'])

The RNA pseudoknots in foot-and-mouth disease virus are dispensable for genome replication but essential for the production of infectious virus. 2 3


In [36]:
authors = all_files[1]['metadata']['authors']
pprint(authors[:4])

[{'affiliation': {'institution': 'City University of Hong Kong',
                  'laboratory': '',
                  'location': {'country': 'China', 'settlement': 'Hong Kong'}},
  'email': '',
  'first': 'Hanchu',
  'last': 'Zhou',
  'middle': [],
  'suffix': ''},
 {'affiliation': {'institution': 'City University of Hong Kong',
                  'laboratory': '',
                  'location': {'country': 'China', 'settlement': 'Hong Kong'}},
  'email': '',
  'first': 'Jiannan',
  'last': 'Yang',
  'middle': [],
  'suffix': ''},
 {'affiliation': {'institution': 'City University of Hong Kong',
                  'laboratory': '',
                  'location': {'country': 'China', 'settlement': 'Hong Kong'}},
  'email': '',
  'first': 'Kaicheng',
  'last': 'Tang',
  'middle': [],
  'suffix': ''},
 {'affiliation': {},
  'email': '',
  'first': '†',
  'last': '',
  'middle': [],
  'suffix': ''}]


In [37]:
texts = [(di['section'], di['text']) for di in file['body_text']]
texts_di = {di['section']: "" for di in file['body_text']}
for section, text in texts:
    texts_di[section] += text

pprint(list(texts_di.keys()))

['',
 '70',
 '120',
 '135',
 '136',
 '144',
 '301',
 'Function of the PKs in replication is dependent on downstream interactions '
 'and 350',
 '368',
 '468',
 '479']


In [28]:
body = ""

for section, text in texts_di.items():
    body += section
    body += "\n\n"
    body += text
    body += "\n\n"

print(body[:3000])



VP3, and VP0 (which is further processed to VP2 and VP4 during virus assembly) (6). The P2 64 and P3 regions encode the non-structural proteins 2B and 2C and 3A, 3B (1-3) (VPg), 3C pro and 4 structural protein-coding region is replaced by reporter genes, allow the study of genome 68 replication without the requirement for high containment (9, 10) ( figure 1A ).

70

The FMDV 5′ UTR is the largest known picornavirus UTR, comprising approximately 1300 71 nucleotides and containing several highly structured regions. The first 360 nucleotides at the 5′ 72 end are predicted to fold into a single large stem loop termed the S-fragment, followed by a The PKs were originally predicted in 1987 and consist of two to four tandem repeats of a ~48 86 nucleotide region containing a small stem loop and downstream interaction site (figure 1B) 87 (12). Due to the sequence similarity between the PKs (figure 1C), it is speculated that they 88 were formed by duplication events during viral replication, p

In [38]:

authors = all_files[1]['metadata']['authors']
pprint(authors[:3])


[{'affiliation': {'institution': 'City University of Hong Kong',
                  'laboratory': '',
                  'location': {'country': 'China', 'settlement': 'Hong Kong'}},
  'email': '',
  'first': 'Hanchu',
  'last': 'Zhou',
  'middle': [],
  'suffix': ''},
 {'affiliation': {'institution': 'City University of Hong Kong',
                  'laboratory': '',
                  'location': {'country': 'China', 'settlement': 'Hong Kong'}},
  'email': '',
  'first': 'Jiannan',
  'last': 'Yang',
  'middle': [],
  'suffix': ''},
 {'affiliation': {'institution': 'City University of Hong Kong',
                  'laboratory': '',
                  'location': {'country': 'China', 'settlement': 'Hong Kong'}},
  'email': '',
  'first': 'Kaicheng',
  'last': 'Tang',
  'middle': [],
  'suffix': ''}]


In [39]:
for author in authors:
    print("Name:", format_name(author))
    print("Affiliation:", format_affiliation(author['affiliation']))
    print()

Name: Hanchu Zhou
Affiliation: City University of Hong Kong, Hong Kong, China

Name: Jiannan Yang
Affiliation: City University of Hong Kong, Hong Kong, China

Name: Kaicheng Tang
Affiliation: City University of Hong Kong, Hong Kong, China

Name: † 
Affiliation: 

Name: Qingpeng Zhang
Affiliation: City University of Hong Kong, Hong Kong, China

Name: Zhidong Cao
Affiliation: Chinese Academy of Sciences, Beijing, China

Name: Dirk Pfeiffer
Affiliation: City University of Hong Kong, Hong Kong, China

Name: Daniel Dajun Zeng
Affiliation: Chinese Academy of Sciences, Beijing, China



In [40]:
pprint(all_files[4]['metadata'], depth=4)

{'authors': [{'affiliation': {'institution': 'University of Southampton',
                              'laboratory': '',
                              'location': {...}},
              'email': '',
              'first': 'Shengjie',
              'last': 'Lai',
              'middle': [],
              'suffix': ''},
             {'affiliation': {'institution': 'University of Toronto',
                              'laboratory': '',
                              'location': {...}},
              'email': '',
              'first': 'Isaac',
              'last': 'Bogoch',
              'middle': ['I'],
              'suffix': ''},
             {'affiliation': {'institution': 'University of Southampton',
                              'laboratory': '',
                              'location': {...}},
              'email': '',
              'first': 'Nick',
              'last': 'Ruktanonchai',
              'middle': ['W'],
              'suffix': ''},
             {'affiliation': {'in

In [41]:
authors = all_files[4]['metadata']['authors']
print("Formatting without affiliation:")
print(format_authors(authors, with_affiliation=False))
print("\nFormatting with affiliation:")
print(format_authors(authors, with_affiliation=True))

Formatting without affiliation:
Shengjie Lai, Isaac I Bogoch, Nick W Ruktanonchai, Alexander Watts, Xin Lu, Weizhong Yang, Hongjie Yu, Kamran Khan, Andrew J Tatem

Formatting with affiliation:
Shengjie Lai (University of Southampton, UK), Isaac I Bogoch (University of Toronto, Toronto, Canada), Nick W Ruktanonchai (University of Southampton, UK), Alexander Watts (St. Michael's Hospital, Toronto, Canada), Xin Lu (National University of Defense Technology, Changsha, China), Weizhong Yang (Chinese Academy of Medical Sciences & Peking Union Medical College), Hongjie Yu (Fudan University, Shanghai, China), Kamran Khan (University of Toronto, Toronto, Canada), Andrew J Tatem (University of Southampton, UK)


In [42]:
bibs = list(file['bib_entries'].values())
pprint(bibs[:2], depth=4)

[{'authors': [{'first': 'T', 'last': 'Jackson', 'middle': [], 'suffix': ''},
              {'first': 'T', 'last': 'Tuthill', 'middle': [...], 'suffix': ''},
              {'first': 'D', 'last': 'Rowlands', 'middle': [...], 'suffix': ''},
              {'first': 'N',
               'last': 'Stonehouse',
               'middle': [...],
               'suffix': ''}],
  'issn': '',
  'other_ids': {},
  'pages': '',
  'ref_id': 'b0',
  'title': 'Genetic economy in 598 picornaviruses: Foot-and-mouth disease '
           'virus replication exploits alternative precursor 599 cleavage '
           'pathways',
  'venue': 'PLOS Pathog',
  'volume': '13',
  'year': 2017},
 {'authors': [{'first': 'N',
               'last': 'Sanderson',
               'middle': [...],
               'suffix': ''},
              {'first': 'N', 'last': 'Knowles', 'middle': [...], 'suffix': ''},
              {'first': 'D', 'last': 'King', 'middle': [...], 'suffix': ''},
              {'first': 'E', 'last': 'Cottam', 

In [43]:
format_authors(bibs[1]['authors'], with_affiliation=False)

'N D Sanderson, N J Knowles, D P King, E M Cottam'

In [44]:
bib_formatted = format_bib(bibs[:5])
print(bib_formatted)

Genetic economy in 598 picornaviruses: Foot-and-mouth disease virus replication exploits alternative precursor 599 cleavage pathways, T Jackson, T J Tuthill, D J Rowlands, N J Stonehouse, PLOS Pathog, 2017; A universal protocol to 602 generate consensus level genome sequences for foot-and-mouth disease virus and other 603 positive-sense polyadenylated RNA viruses using the Illumina MiSeq, N D Sanderson, N J Knowles, D P King, E M Cottam, BMC Genomics, 2014; Library preparation for highly accurate population 606 sequencing of RNA viruses, A Acevedo, R Andino, Nat Protoc, 2014; IDBA-UD: a de novo assembler for 608 single-cell and metagenomic sequencing data with highly uneven depth, Y Peng, Hcm Leung, S M Yiu, Fyl Chin, , 2012; Basic local alignment 611 search tool, S F Altschul, W Gish, W Miller, E W Myers, D J Lipman, J Mol Biol, 1990


In [49]:
! pip3 install tqdm

Collecting tqdm
  Downloading https://files.pythonhosted.org/packages/47/55/fd9170ba08a1a64a18a7f8a18f088037316f2a41be04d2fe6ece5a653e8f/tqdm-4.43.0-py2.py3-none-any.whl (59kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.43.0


You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [53]:
! pip install --upgrade pip

Collecting pip
  Downloading https://files.pythonhosted.org/packages/54/0c/d01aa759fdc501a58f431eb594a17495f15b88da142ce14b5845662c13f3/pip-20.0.2-py2.py3-none-any.whl (1.4MB)
Installing collected packages: pip
  Found existing installation: pip 19.0.3
    Uninstalling pip-19.0.3:
      Successfully uninstalled pip-19.0.3


Could not install packages due to an EnvironmentError: [WinError 5] Access is denied: 'C:\\Users\\Renate\\AppData\\Local\\Temp\\pip-uninstall-rrij6h7c\\pip.exe'
Consider using the `--user` option or check the permissions.



In [54]:
!pip install pip --upgrade 

Requirement already up-to-date: pip in c:\users\renate\anaconda3\anaconda\lib\site-packages (20.0.2)


In [56]:
from tqdm import tqdm

In [57]:
# Build one row of cleaned fields per paper. This repeats the loop inside
# generate_clean_df; the rows are turned into a DataFrame in a later cell.
cleaned_files = []

# NOTE: the loop variable rebinds the notebook-level name `file`, which an
# earlier cell set to all_files[0].
for file in tqdm(all_files):
    features = [
        file['paper_id'],
        file['metadata']['title'],
        # Author names only.
        format_authors(file['metadata']['authors']),
        # Author names, each followed by "(affiliation)" when one is available.
        format_authors(file['metadata']['authors'], 
                       with_affiliation=True),
        format_body(file['abstract']),
        format_body(file['body_text']),
        format_bib(file['bib_entries']),
        # Unformatted originals, kept next to the formatted text.
        file['metadata']['authors'],
        file['bib_entries']
    ]
    
    cleaned_files.append(features)

100%|███████████████████████████████████████████████████████████████████████████████| 885/885 [00:01<00:00, 530.98it/s]


In [58]:
col_names = [
    'paper_id', 
    'title', 
    'authors',
    'affiliations', 
    'abstract', 
    'text', 
    'bibliography',
    'raw_authors',
    'raw_bibliography'
]

clean_df = pd.DataFrame(cleaned_files, columns=col_names)
clean_df.head()

Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography
0,0015023cc06b5362d332b3baf348d11567ca2fbb,The RNA pseudoknots in foot-and-mouth disease ...,"Joseph C Ward, Lidia Lasecka-Dykes, Chris Neil...","Joseph C Ward, Lidia Lasecka-Dykes, Chris Neil...",Abstract\n\nword count: 194 22 Text word count...,"\n\nVP3, and VP0 (which is further processed t...",Genetic economy in 598 picornaviruses: Foot-an...,"[{'first': 'Joseph', 'middle': ['C'], 'last': ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Genetic..."
1,004f0f8bb66cf446678dc13cf2701feec4f36d76,Healthcare-resource-adjusted vulnerabilities t...,"Hanchu Zhou, Jiannan Yang, Kaicheng Tang, † , ...","Hanchu Zhou (City University of Hong Kong, Hon...",,Introduction\n\nThe 2019-nCoV epidemic has spr...,World Health Organizations. Novel Coronavirus ...,"[{'first': 'Hanchu', 'middle': [], 'last': 'Zh...","{'BIBREF0': {'ref_id': 'b0', 'title': 'World H..."
2,00d16927588fb04d4be0e6b269fc02f0d3c2aa7b,"Real-time, MinION-based, amplicon sequencing f...","Salman L Butt, Eric C Erwood, Jian Zhang, Holl...","Salman L Butt (University of Georgia, 30602, A...",Abstract\n\nInfectious bronchitis (IB) causes ...,"Introduction\n\nInfectious bronchitis (IB), wh...",Emergence of novel strains of avian infectious...,"[{'first': 'Salman', 'middle': ['L'], 'last': ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Emergen..."
3,0139ea4ca580af99b602c6435368e7fdbefacb03,A Combined Evidence Approach to Prioritize Nip...,"Nishi Kumari, Ayush Upadhyay, Kishan Kalia, Ra...","Nishi Kumari (Panjab University, Chandigarh, I...",Abstract\n\nNipah Virus (NiV) came into limeli...,Introduction\n\nNipah is an infectious negativ...,"Molecular biology of Hendra and Nipah viruses,...","[{'first': 'Nishi', 'middle': [], 'last': 'Kum...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Molecul..."
4,013d9d1cba8a54d5d3718c229b812d7cf91b6c89,Assessing spread risk of Wuhan novel coronavir...,"Shengjie Lai, Isaac I Bogoch, Nick W Ruktanonc...","Shengjie Lai (University of Southampton, UK), ...",Abstract\n\nBackground: A novel coronavirus (2...,"Introduction\n\nIn December 2019, a cluster of...",A Novel Coronavirus Genome Identified in a Clu...,"[{'first': 'Shengjie', 'middle': [], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'A Novel..."


In [59]:
clean_df.to_csv('C:/Users/Renate/Documents/GitHub/Data-Projects/Kaggle - Covid-19/biorxiv_clean.csv', index=False)