In [5]:
import json
import os
from pathlib import Path
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import requests
import csv
import yaml
from collections import Counter
import numpy as np

In [6]:
root_path = Path('..')
data_path = root_path / "data"
raw_data_path = data_path /'raw'
processed_data_path = data_path / 'processed'
figures = root_path / 'figures'

In [14]:
def get_children(tree, nodes, parent):
    nodes.append(process_node(tree, parent))
    if 'children' in tree:
        for child in tree['children']:
            get_children(child, nodes, tree)

In [16]:
def bfs_count_children(tree):
    cnt = 0
    q = []
    q.append(tree)

    while len(q) > 0:
        n = q.pop(0)
        cnt += 1
        if 'children' in n:
            for child in n['children']:
                q.append(child)
    return cnt - 1

In [15]:
def process_node(node, parent):
    # node['branch_attrs', 'name', 'node_attrs']
    name = node['name']
    # num_descendants = descendants_count(node)
    attrs = {}
    mutations = {}
    p = -1
    children_count = bfs_count_children(node)
    for node_attr in node['node_attrs']:
        val = node['node_attrs'][node_attr]
        if isinstance(val, dict) and 'value' in val:
            # print(node_attr, val['value'])
            attrs[node_attr] = val['value']

    if 'div' in node['node_attrs']:
        attrs['div'] = node['node_attrs']['div']

    if 'branch_attrs' in node and 'mutations' in node['branch_attrs']:
        mutations = node['branch_attrs']

    if parent is not None:
        p = node['node_attrs']['tree_id']

    return (node['node_attrs']['tree_id'], name, p, children_count, attrs, mutations)

In [7]:
base_url = 'https://nextstrain.org/charon/getDataset?prefix=/ncov/global'
name = 'covid.json'

In [8]:
resp = requests.get(base_url)

In [9]:
with open(raw_data_path / 'covid.json', 'w') as f:
    f.write(resp.text)
with open(os.path.join('..', 'data', 'raw', '{}.json'.format('covid'))) as f:
    data = json.load(f)

In [12]:
tree = data['tree']

In [13]:
q = []
q.append(tree)
idx = 0
while len(q) > 0:
    n = q.pop(0)
    n['node_attrs']['tree_id'] = idx
    idx += 1
    if 'children' in n:
        for child in n['children']:
            q.append(child)

In [17]:
nodes_list = []
get_children(tree, nodes_list, None)

In [18]:
k1 = list({'age',
               'author',
               'clade_membership',
               'country',
               'division',
               'gisaid_epi_isl',
               'host',
               'location',
               'num_date',
               'originating_lab',
               'recency',
               'region',
               'submitting_lab'}) + ['div']  # ,'labels', 'mutations']

In [21]:
for node_item in nodes_list:
    print(node_item)

(0, 'NODE_0000000', -1, 7142, {'S1_mutations': 0, 'clade_membership': '19A', 'num_date': 2019.921229494423, 'subclade_membership': '19A', 'region': 'Asia', 'div': 0}, {'labels': {'clade': '19A'}, 'mutations': {}})
(1, 'Wuhan/WH01/2019', 1, 0, {'GISAID_clade': 'L', 'S1_mutations': 0, 'age': '44', 'author': 'Weijun Chen et al', 'clade_membership': '19A', 'country': 'China', 'country_exposure': 'China', 'division': 'Hubei', 'division_exposure': 'Hubei', 'genbank_accession': 'LR757998.1', 'gisaid_epi_isl': 'EPI_ISL_406798', 'host': 'Human', 'location': 'Wuhan', 'num_date': 2019.9849315068493, 'originating_lab': "General Hospital of Central Theater Command of People's Liberation Army of China", 'pango_lineage': 'B', 'recency': 'Older', 'region': 'Asia', 'sex': 'Male', 'subclade_membership': '19A', 'submitting_lab': "BGI & Institute of Microbiology, Chinese Academy of Sciences & Shandong First Medical University & Shandong Academy of Medical Sciences & General Hospital of Central Theater Com

(695, 'NODE_0000368', 695, 230, {'S1_mutations': 1, 'clade_membership': '20A', 'num_date': 2020.1478682664745, 'subclade_membership': '20A', 'region': 'Europe', 'div': 4}, {'mutations': {}})
(792, 'NODE_0000369', 792, 3, {'S1_mutations': 1, 'clade_membership': '20A', 'num_date': 2020.1516393442623, 'subclade_membership': '20A', 'region': 'Europe', 'div': 4}, {'mutations': {}})
(905, 'Brazil/SP-01/2020_travel_history', 905, 1, {'num_date': 2020.1516393442623, 'region': 'Europe', 'div': 4}, {})
(1035, 'Brazil/SP-01/2020', 1035, 0, {'GISAID_clade': 'G', 'S1_mutations': 1, 'age': '61', 'author': 'Jaqueline Goes de Jesus et al', 'clade_membership': '20A', 'country': 'Brazil', 'country_exposure': 'Italy', 'division': 'São Paulo', 'division_exposure': 'Lombardy', 'gisaid_epi_isl': 'EPI_ISL_412964', 'host': 'Human', 'location': 'São Paulo', 'num_date': 2020.1516393442623, 'originating_lab': 'Hospital Israelita Albert Einstein', 'pango_lineage': 'B.1', 'recency': 'Older', 'region': 'South Ameri

(2089, 'NODE_0000824', 2089, 4, {'S1_mutations': 1, 'clade_membership': '20A', 'num_date': 2020.217213114754, 'subclade_membership': '20A', 'region': 'Europe', 'div': 4}, {'mutations': {}})
(2268, 'BosniaandHerzegovina/ChVir7349/2020', 2268, 0, {'GISAID_clade': 'G', 'S1_mutations': 1, 'author': 'Victor M Corman et al', 'clade_membership': '20A', 'country': 'Bosnia and Herzegovina', 'country_exposure': 'Bosnia and Herzegovina', 'division': 'Sarajevo', 'division_exposure': 'Sarajevo', 'gisaid_epi_isl': 'EPI_ISL_462456', 'host': 'Human', 'num_date': 2020.217213114754, 'originating_lab': 'Clinical Center, University of Sarajevo', 'pango_lineage': 'B.1', 'recency': 'Older', 'region': 'Europe', 'subclade_membership': '20A', 'submitting_lab': 'Charite Universitatsmedizin Berlin, Institute of Virology', 'div': 4}, {'mutations': {}})
(2269, 'NODE_0000322', 2269, 2, {'S1_mutations': 1, 'clade_membership': '20A', 'num_date': 2020.2937158469945, 'subclade_membership': '20A', 'region': 'South Ameri

(3013, 'India/KA-NIMH-SEQ-84/2020', 3013, 0, {'GISAID_clade': 'GH', 'S1_mutations': 2, 'author': 'Chitra Pattabiraman et al', 'clade_membership': '20A', 'country': 'India', 'country_exposure': 'India', 'division': 'Karnataka', 'division_exposure': 'Karnataka', 'gisaid_epi_isl': 'EPI_ISL_995174', 'host': 'Human', 'num_date': 2020.8975409836066, 'originating_lab': 'BBMP Urban PHC', 'pango_lineage': 'B.1.36', 'recency': 'One month ago', 'region': 'Asia', 'subclade_membership': '20A/N.194L', 'submitting_lab': 'Department of Neurovirology, National Institute of Mental Health and Neurosciences (NIMHANS)', 'div': 26}, {'labels': {'aa': 'ORF1b: L261F, E735D; ORF3a: S216T, G224C; S: L822F, K1191N'}, 'mutations': {'ORF1b': ['L261F', 'E735D'], 'ORF3a': ['S216T', 'G224C'], 'S': ['L822F', 'K1191N'], 'nuc': ['G4300T', 'C10336T', 'C12076T', 'G14250T', 'C14583T', 'G15672T', 'C24026T', 'G25135C', 'T26038A', 'G26062T']}})
(2803, 'NODE_0001343', 2803, 9, {'S1_mutations': 2, 'clade_membership': '20A', 'nu

(837, 'Australia/QLD973/2020', 837, 0, {'GISAID_clade': 'GH', 'S1_mutations': 1, 'author': 'Son Nguyen et al et al', 'clade_membership': '20A', 'country': 'Australia', 'country_exposure': 'Australia', 'division': 'Queensland', 'division_exposure': 'Queensland', 'gisaid_epi_isl': 'EPI_ISL_530231', 'host': 'Human', 'num_date': 2020.2527322404371, 'originating_lab': 'Queensland Health Forensic and Scientific Services, Public Health Virology', 'pango_lineage': 'B.1', 'recency': 'Older', 'region': 'Oceania', 'subclade_membership': '20A', 'submitting_lab': 'Public Health Virology Laboratory, Forensic and Scientific Services, Queensland Health', 'div': 9}, {'labels': {'aa': 'N: R95L; ORF1b: V2178F; ORF9b: V92L'}, 'mutations': {'N': ['R95L'], 'ORF1b': ['V2178F'], 'ORF9b': ['V92L'], 'nuc': ['G19999T', 'G28557T']}})
(838, 'Congo/Q001C112AQ_2101-TM-r1-026/2021', 838, 0, {'GISAID_clade': 'GH', 'S1_mutations': 2, 'author': 'Angel Angelov et al', 'clade_membership': '20A', 'country': 'Republic of th

(7000, 'NODE_0002176', 7000, 12, {'S1_mutations': 2, 'clade_membership': '20G', 'num_date': 2020.8164326133642, 'subclade_membership': '20G', 'region': 'North America', 'div': 17}, {'mutations': {}})
(7015, 'USA/SD-CDC-2-3880625/2021', 7015, 0, {'GISAID_clade': 'GH', 'S1_mutations': 2, 'age': '65', 'author': 'Krista Queen et al', 'clade_membership': '20G', 'country': 'USA', 'country_exposure': 'USA', 'division': 'South Dakota', 'division_exposure': 'South Dakota', 'gisaid_epi_isl': 'EPI_ISL_1094543', 'host': 'Human', 'num_date': 2021.0671232876712, 'originating_lab': 'SD Public Health Laboratory', 'pango_lineage': 'B.1.2', 'recency': 'One week ago', 'region': 'North America', 'sex': 'Male', 'subclade_membership': '20G', 'submitting_lab': 'Respiratory Viruses Branch, Division of Viral Diseases, Centers for Disease Control and Prevention', 'div': 24}, {'labels': {'aa': 'ORF1a: S3195G; ORF1b: G940C, V1271L; ORF3a: T14I; S: Q1180R'}, 'mutations': {'ORF1a': ['S3195G'], 'ORF1b': ['G940C', 'V

(424, 'NODE_0002375', 424, 2, {'S1_mutations': 2, 'clade_membership': '20B', 'num_date': 2020.8510623802322, 'subclade_membership': '20B', 'region': 'South America', 'div': 15}, {'labels': {'aa': 'ORF1a: L2146F, A3209V; S: T33I'}, 'mutations': {'ORF1a': ['L2146F', 'A3209V'], 'S': ['T33I'], 'nuc': ['T6637G', 'C6701T', 'C9891T', 'T10480C', 'C17502T', 'C18138T', 'C21660T']}})
(482, 'Peru/LIM-UPCH-0339/2020', 482, 0, {'GISAID_clade': 'GR', 'S1_mutations': 2, 'age': '42', 'author': 'Pablo Tsukayama et al', 'clade_membership': '20B', 'country': 'Peru', 'country_exposure': 'Peru', 'division': 'Lima', 'division_exposure': 'Lima', 'gisaid_epi_isl': 'EPI_ISL_812494', 'host': 'Human', 'num_date': 2020.9494535519125, 'originating_lab': 'Laboratorio de Referencia Nacional de Virus Respiratorios, Instituto Nacional de Salud Peru', 'pango_lineage': 'B.1.1.29', 'recency': 'Older', 'region': 'South America', 'sex': 'Female', 'subclade_membership': '20B', 'submitting_lab': 'Laboratorio de Genómica Micro

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [90]:
vremia = []
mesto = []
mutations = []
with open('{}.tsv'.format('covid'), 'w') as f:
    f.write('\t'.join(['idx', 'parent', 'name', 'child_count'] + k1 + ['labels', 'mutations']) + os.linesep)
    for node_item in nodes_list:
        node_id = node_item[0]
        name = node_item[1]
        parent = node_item[2]
        child_count = node_item[3]
        _attrs = node_item[4]
        mut = node_item[-1]
        mut_part = ['', '']
        if 'labels' in mut:
            mut_part[0] = str(mut['labels'])
        if 'mutations' in mut and len(mut['mutations']) > 0:
            mut_part[1] = str(mut['mutations'])

        attr_line = [_attrs.get(key, '') for key in k1]

            #         line = '\t'.join([node_item[0]]
            #                          + [str(node_item[1][key]) for key in k1 ]
            #                          +mut_part
            #                         )
        vremia.append(attr_line[11])
        mutations.append(mut_part[1])
        mesto.append([attr_line[4], attr_line[6]])
        line = [node_id, parent, name, child_count] + attr_line + mut_part
        f.write('\t'.join([str(itm) for itm in line]) + os.linesep)
    

In [93]:
df = pd.DataFrame({'Place' : mesto, 'Time' : vremia, 'Mutations' : mutations})

In [96]:
df.to_csv(data_path/'nextstrain_data.csv')

2911193664864030