# Cryostation initialization

In the real world, a cryostation is a device for storing viruses.  Cryostations keep virus samples (from different locations or generations) for later use in vaccine or gene sequencing research.

The `covidvu.cryostation` module will be used for storing all the processed data sources and manipulation results, for quick bundling and access.

## TinyDB

- [API reference](https://tinydb.readthedocs.io/en/latest/api.html)
- [Documentation](https://tinydb.readthedocs.io/en/latest/index.html)


---
## General purpose functions

In [None]:
import json
import os
import shutil
import tqdm

In [None]:
from covidvu.cryostation import Cryostation

In [None]:
from covidvu.pipeline.vujson import SITE_DATA

In [None]:
def loadDatasetFrom(source = 'confirmed', suffix = ''):
    """
    Load one of the JSON data bundles stored under SITE_DATA.

    Arguments
    ---------
        source
    Base name of the bundle, e.g. 'confirmed' or 'deaths'.

        suffix
    Optional text appended to the base name before the '.json' extension,
    e.g. '-US' selects 'confirmed-US.json'.

    Returns
    -------
    The decoded JSON document.

    Raises
    ------
    OSError if the file cannot be opened, ValueError if it isn't valid JSON.
    """
    fileName = os.path.join(SITE_DATA, '%s%s.json' % (source, suffix))

    # The context manager guarantees the handle is closed even if decoding
    # fails; JSON text is UTF-8, so don't depend on the platform default.
    with open(fileName, 'r', encoding = 'utf-8') as inputFile:
        return json.load(inputFile)

In [None]:
def updateCasesIn(cryostation, casesType = 'confirmed'):
    """
    Store the `casesType` dataset in the cryostation, one item per dataset key.

    Each dataset key (loaded with loadDatasetFrom(casesType)) is looked up in
    the cryostation.  Existing items keep every field they already have and
    gain or replace the `casesType` field; keys not found start from an empty
    item.  Every item carries its own name in the 'key' field.

    Arguments
    ---------
        cryostation
    The storage object; must support .get(key, default) and item assignment.

        casesType
    Name of the dataset to load and of the item field that receives it.
    """
    sourceDataset = loadDatasetFrom(casesType)

    for key in tqdm.tqdm(sourceDataset.keys()):
        item = cryostation.get(key, dict())
        # Look for the literal 'key' field.  Testing for the country name
        # itself never matches an item field, which would reset the item and
        # discard case types stored by earlier calls.
        if 'key' not in item:
            item['key'] = key

        item[casesType] = sourceDataset[key]

        cryostation[key] = item

In [None]:
storage = Cryostation('database/virustrack.db')

---
## First time use

The routines in this notebook perform a one-time-only database initialization, based on the existing data bundles.

## Global cases

In [None]:
updateCasesIn(storage, 'confirmed')

In [None]:
updateCasesIn(storage, 'deaths')

In [None]:
storage.close()

### Capitals, ISO codes, languages

In [None]:
from covidvu.virustrack.countryinfo import COUNTRIES_INFO

In [None]:
storage = Cryostation('database/virustrack.db')

In [None]:
# Attach the static country information to every country listed in
# COUNTRIES_INFO.  Countries not stored yet are reported and get a bare
# entry first, so the update below always finds something to extend.
for country, countryInfo in tqdm.tqdm(COUNTRIES_INFO.items()):
    if not (country in storage):
        print('MISSING: %s' % country)
        storage[country] = { 'key': country, }

    # Read, modify, and write back: the item is reassigned so the storage
    # sees the change.
    element = storage[country]
    element['info'] = countryInfo
    storage[country] = element

In [None]:
storage.close()

---
## US states

In [None]:
def update(element, casesType, suffix = '-US'):
    """
    Copy a per-state dataset into `element`, one sub-entry per dataset key.

    The dataset is loaded with loadDatasetFrom(casesType, suffix).  For each
    of its keys, element[key] is created as an empty dict when absent, then
    its `casesType` field is set to the dataset value.

    Returns the same `element` object, modified in place.
    """
    perStateCases = loadDatasetFrom(casesType, suffix)

    for stateName, cases in tqdm.tqdm(perStateCases.items()):
        if stateName not in element:
            element[stateName] = dict()

        element[stateName][casesType] = cases

    return element

In [None]:
storage = Cryostation('database/virustrack.db')

In [None]:
element = storage['US']

In [None]:
element = update(element, 'confirmed')

In [None]:
element = update(element, 'deaths')

In [None]:
storage['US'] = element

In [None]:
storage.close()

### US Regions

In [None]:
storage = Cryostation('database/virustrack.db')

In [None]:
element = storage['US']

In [None]:
element = update(element, 'confirmed', '-US-Regions')

In [None]:
element = update(element, 'deaths', '-US-Regions')

In [None]:
storage['US'] = element

In [None]:
storage.close()

Each US state belongs to a region -- let's mark them here so the regions exist in the data, not in the code.

In [None]:
from covidvu.virustrack.countryinfo import US_REGIONS

In [None]:
storage = Cryostation('database/virustrack.db')

In [None]:
element = storage['US']

In [None]:
# Record the region on every state that is present in the US element;
# states listed in US_REGIONS but missing from the element are skipped.
for state, region in tqdm.tqdm(US_REGIONS.items()):
    if state not in element:
        continue
    element[state]['region'] = region

In [None]:
storage['US'] = element

In [None]:
storage.close()

---
## US Counties time series

The data import cutoff is 23:00 on the date the data was captured.  Hourly data snapshots are available if needed.

In [None]:
import json
import os
import shutil

### Populate cryostation with US counties and locations

In [None]:
storage = Cryostation('database/virustrack.db')

In [None]:
country = storage['US']

In [None]:
# Every key of the US element except these is treated as a valid state name.
nonStateKeys = ('key', 'confirmed', 'deaths', '!Total US', 'info',
                'Other', 'Midwest', 'Northeast', 'South', 'West')

validStatesRegions = list()
for elementKey in country.keys():
    if elementKey in nonStateKeys:
        continue
    validStatesRegions.append(elementKey)

In [None]:
# Create an empty time-series slot for every county whose state is known.
# Counties that belong to an unknown state are collected in invalidCounties
# as 'county, state\n' lines.
invalidCounties = list()

# Read the resource through a context manager so the handle is closed once
# the JSON has been decoded, instead of lingering until garbage collection.
with open('./resources/counties-US-all-20200324.json', 'r', encoding = 'utf-8') as inputFile:
    countyElements = json.load(inputFile)

for countyElement in tqdm.tqdm(countyElements):
    state  = countyElement['province']
    county = countyElement['county']
    if state in validStatesRegions:
        if 'counties' not in country[state]:
            country[state]['counties'] = dict()
        # The date-keyed series start empty; they are filled per snapshot.
        country[state]['counties'][county] = { 'confirmed': dict(),
                                               'coordinates': countyElement['coordinates'],
                                               'deaths': dict(),
                                             }
    else:
        invalidCounties.append('%s, %s\n' % (county, state))

In [None]:
storage['US'] = country

In [None]:
storage.close()

### Make the counties time series

Extract the county data for each specific date, from the `.tar.bz2` file.  These files are backed up on the server.

In [None]:
storage = Cryostation('database/virustrack.db')
country = storage['US']

In [None]:
def extractCountiesSnapshotFrom(fileName, snapshotsDir, siteDir = SITE_DATA):
    """
    Extract the counties bundle from a snapshot archive and derive its date.

    Arguments
    ---------
        fileName
    Archive name; the text between the first '-' and the following '.' must
    start with the date as YYYYMMDD.

        snapshotsDir
    Directory that holds the archive.

        siteDir
    Directory used to build the returned path.
    NOTE(review): tar extracts to 'site-data/' relative to the current
    working directory; this assumes siteDir resolves to that directory.

    Returns
    -------
    A tuple (path to counties-US-all.json under siteDir, 'YYYY-MM-DD').

    Raises
    ------
    subprocess.CalledProcessError if tar exits with a non-zero status.
    """
    import subprocess

    archivePath = os.path.join(snapshotsDir, fileName)
    command = [ 'tar', 'xvjf', archivePath, 'site-data/counties-US-all.json', ]
    print(' '.join(command))
    # An argument list bypasses the shell, so paths containing spaces or
    # shell metacharacters reach tar intact.  check = True stops right here
    # when extraction fails, instead of failing later on a missing file.
    subprocess.run(command, check = True)

    dateDec = fileName.split('-')[1].split('.')[0]
    print(dateDec)
    # Slice the day by position so a stamp longer than 8 characters still
    # yields the day and not its last two characters.
    dateStamp = '%s-%s-%s' % (dateDec[:4], dateDec[4:6], dateDec[6:8])

    return os.path.join(siteDir, 'counties-US-all.json'), dateStamp

In [None]:
import time

def processCountiesForDate(source_Date, country):
    """
    Merge one day's county snapshot into the country element, then delete it.

    Arguments
    ---------
        source_Date
    Sequence whose item 0 is the path of the snapshot JSON file and whose
    item 1 is the date stamp used as the time-series key.

        country
    Country element, modified in place.  Only counties that already exist
    under country[state]['counties'] are updated; unknown states, states
    stored without a 'counties' entry, and unknown counties are skipped.

    The snapshot file is removed after a successful merge.
    """
    sourceFile = source_Date[0]
    dateStamp  = source_Date[1]
    print(sourceFile, dateStamp)

    # Close the handle before the file is unlinked below.
    with open(sourceFile, 'r', encoding = 'utf-8') as inputFile:
        dataset = json.load(inputFile)

    for state in tqdm.tqdm(dataset.keys()):
        # The state checks don't depend on the county, so run them once.
        if state not in country:
            continue
        stateElement = country[state]
        if 'counties' not in stateElement:
            continue
        counties = stateElement['counties']

        for county in dataset[state]:
            if county not in counties:
                continue
            counties[county]['confirmed'][dateStamp] = dataset[state][county]['confirmed']
            counties[county]['deaths'][dateStamp] = dataset[state][county]['deaths']

    os.unlink(sourceFile)

In [None]:
snapshotsDir = './snapshots'

# Only directory entries whose name contains 'snapshot' are treated as
# archives; each one is extracted and merged into the country element.
for archive in os.listdir(snapshotsDir):
    if 'snapshot' not in archive:
        continue
    sourceFile, dateStamp = extractCountiesSnapshotFrom(archive, snapshotsDir)
    processCountiesForDate((sourceFile, dateStamp), country)

In [None]:
storage['US'] = country
storage.close()

---
## Experiments


In [None]:
storage = Cryostation('database/virustrack.db')

In [None]:
country = storage['US']

In [None]:
print(country['info'])

In [None]:
# Show the deaths recorded for 2020-04-05 under every entry of the US
# element, leaving out the country-level fields.
countryLevelKeys = ('confirmed', 'deaths', 'info', '!Total US', 'key')
for entry in country.keys():
    if entry in countryLevelKeys:
        continue
    print(entry, country[entry]['deaths']['2020-04-05'])

In [None]:
storage.close()

---
&#169; The VirusTrack/COVIDvu Open Source Development Team