In [1]:
import json
from pathlib import Path
import os

import pandas as pd
import s3fs

In [2]:
def read_cluster_csv(file_path, endpoint_url='https://storage.budsc.midwest-datascience.com'):
    """Read a CSV object from an S3-compatible store into a DataFrame.

    Args:
        file_path: Object path handed to ``S3FileSystem.open``
            (e.g. ``'data/external/tidynomicon/site.csv'``).
        endpoint_url: Base URL of the S3-compatible endpoint to query.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv`` with default options.
    """
    s3 = s3fs.S3FileSystem(
        anon=True,  # anonymous access; no credentials are supplied
        client_kwargs={
            'endpoint_url': endpoint_url
        }
    )
    # Close the remote handle deterministically once parsing is done instead
    # of leaving it open until garbage collection.
    with s3.open(file_path, mode='rb') as remote_file:
        return pd.read_csv(remote_file)

# Output layout: <cwd>/results/kvdb/<table>.json
current_dir = Path.cwd()
results_dir = current_dir / 'results'
kv_data_dir = results_dir / 'kvdb'
# Create the whole directory chain up front; a no-op when it already exists.
kv_data_dir.mkdir(parents=True, exist_ok=True)

# One JSON file per source table.
people_json = kv_data_dir / 'people.json'
visited_json = kv_data_dir / 'visited.json'
sites_json = kv_data_dir / 'sites.json'
measurements_json = kv_data_dir / 'measurements.json'

In [3]:
class KVDB(object):
    """Minimal key-value store persisted as a single JSON file.

    Keys are normalized to ``str`` on every read and write. JSON object keys
    are always strings, so this keeps lookups consistent before and after a
    save/load cycle and lets callers pass ints or tuples (e.g.
    ``(619, 'DR-1')``) without ``save()`` failing on a non-serializable key.
    """

    def __init__(self, db_path):
        """Open the store at ``db_path``, loading its contents if the file exists."""
        self._db_path = Path(db_path)
        self._db = {}
        self._load_db()

    @staticmethod
    def _normalize_key(key):
        """Return the string form under which ``key`` is stored."""
        return str(key)

    def _load_db(self):
        """Replace the in-memory dict with the file's contents, if the file exists."""
        if self._db_path.exists():
            with open(self._db_path, encoding='utf-8') as f:
                self._db = json.load(f)

    def get_value(self, key):
        """Return the value stored under ``key``, or ``None`` when absent."""
        return self._db.get(self._normalize_key(key))

    def set_value(self, key, value):
        """Store ``value`` under ``key`` in memory; call ``save()`` to persist."""
        self._db[self._normalize_key(key)] = value

    def save(self):
        """Write the whole store to ``db_path`` as indented JSON."""
        # Serialize before opening the file: opening with 'w' truncates it, so
        # a serialization error must surface while the old contents are intact.
        payload = json.dumps(self._db, indent=2)
        with open(self._db_path, 'w', encoding='utf-8') as f:
            f.write(payload)

In [4]:
def create_sites_kvdb():
    """Populate the sites store with one record per ``site_id`` and save it."""
    store = KVDB(sites_json)
    sites = read_cluster_csv('data/external/tidynomicon/site.csv')
    for key, rows in sites.groupby('site_id'):
        # Only the first row of each group becomes that key's record.
        first_record = rows.to_dict(orient='records')[0]
        store.set_value(key, first_record)
    store.save()


create_sites_kvdb()

In [5]:
def create_people_kvdb():
    """Populate the people store with one record per ``person_id`` and save it."""
    store = KVDB(people_json)
    people = read_cluster_csv('data/external/tidynomicon/person.csv')
    for key, rows in people.groupby('person_id'):
        # Only the first row of each group becomes that key's record.
        first_record = rows.to_dict(orient='records')[0]
        store.set_value(key, first_record)
    store.save()


create_people_kvdb()

In [27]:
# Open the people store; the original pointed this at sites_json by mistake.
peopleDB = KVDB(people_json)

In [28]:
# Fetch the person table from the remote store for interactive inspection.
peopleDF = read_cluster_csv('data/external/tidynomicon/person.csv')

In [29]:
# Iterating a groupby yields (key, sub-frame) pairs; print only the keys.
for person_id, group_df in peopleDF.groupby('person_id'):
    print(person_id)

danforth
dyer
lake
pb
roe


In [30]:
# Print the sub-frame belonging to each person_id group.
for person_id, group_df in peopleDF.groupby('person_id'):
    print(group_df)

  person_id personal_name family_name
4  danforth         Frank    Danforth
  person_id personal_name family_name
0      dyer       William        Dyer
  person_id personal_name family_name
2      lake      Anderson        Lake
  person_id personal_name family_name
1        pb         Frank     Pabodie
  person_id personal_name family_name
3       roe     Valentina     Roerich


In [4]:
# Open the visits store; contents are loaded if visited.json already exists.
db = KVDB(visited_json)

In [5]:
db

<__main__.KVDB at 0x7fee4a26a730>

In [6]:
# Fetch the visited table from the remote store for interactive inspection.
df = read_cluster_csv('data/external/tidynomicon/visited.csv')

In [7]:
df

Unnamed: 0,visit_id,site_id,visit_date
0,619,DR-1,1927-02-08
1,622,DR-1,1927-02-10
2,734,DR-3,1930-01-07
3,735,DR-3,1930-01-12
4,751,DR-3,1930-02-26
5,752,DR-3,
6,837,MSK-4,1932-01-14
7,844,DR-1,1932-03-22


In [9]:
# Grouping by two columns yields tuple keys of the form (visit_id, site_id).
for composite_id, group_df in df.groupby(['visit_id', 'site_id']):
    print(composite_id)

(619, 'DR-1')
(622, 'DR-1')
(734, 'DR-3')
(735, 'DR-3')
(751, 'DR-3')
(752, 'DR-3')
(837, 'MSK-4')
(844, 'DR-1')


In [19]:
# Each iteration overwrites `test`, so it ends up holding the last group's frame.
for composite_id, group_df in df.groupby(['visit_id', 'site_id']):
    test = group_df

In [21]:
# dict() over a DataFrame maps each column name to its Series, not to scalar values.
dict(test)

{'visit_id': 7    844
 Name: visit_id, dtype: int64,
 'site_id': 7    DR-1
 Name: site_id, dtype: object,
 'visit_date': 7    1932-03-22
 Name: visit_date, dtype: object}

In [15]:
# Print the sub-frame for each (visit_id, site_id) group.
# NOTE(review): the saved output of this cell shows a DataFrameGroupBy repr,
# which this loop cannot print; the cell was presumably edited after it last
# ran. Re-run to refresh the output.
for composite_id, group_df in df.groupby(['visit_id', 'site_id']):
    print(group_df)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fee48dbff70>

In [10]:
# Print the sub-frame for each (visit_id, site_id) group.
for composite_id, group_df in df.groupby(['visit_id', 'site_id']):
    print(group_df)

   visit_id site_id  visit_date
0       619    DR-1  1927-02-08
   visit_id site_id  visit_date
1       622    DR-1  1927-02-10
   visit_id site_id  visit_date
2       734    DR-3  1930-01-07
   visit_id site_id  visit_date
3       735    DR-3  1930-01-12
   visit_id site_id  visit_date
4       751    DR-3  1930-02-26
   visit_id site_id visit_date
5       752    DR-3        NaN
   visit_id site_id  visit_date
6       837   MSK-4  1932-01-14
   visit_id site_id  visit_date
7       844    DR-1  1932-03-22


In [26]:
# Print the first row of each group as a plain dict.
# to_dict(orient='records')[0] is already a dict, so the outer dict() only copies it.
for composite_id, group_df in df.groupby(['visit_id', 'site_id']):
    print(dict(group_df.to_dict(orient='records')[0]))

{'visit_id': 619, 'site_id': 'DR-1', 'visit_date': '1927-02-08'}
{'visit_id': 622, 'site_id': 'DR-1', 'visit_date': '1927-02-10'}
{'visit_id': 734, 'site_id': 'DR-3', 'visit_date': '1930-01-07'}
{'visit_id': 735, 'site_id': 'DR-3', 'visit_date': '1930-01-12'}
{'visit_id': 751, 'site_id': 'DR-3', 'visit_date': '1930-02-26'}
{'visit_id': 752, 'site_id': 'DR-3', 'visit_date': nan}
{'visit_id': 837, 'site_id': 'MSK-4', 'visit_date': '1932-01-14'}
{'visit_id': 844, 'site_id': 'DR-1', 'visit_date': '1932-03-22'}


In [11]:
# Print the first row of each group as a {column: value} dict.
for composite_id, group_df in df.groupby(['visit_id', 'site_id']):
    print(group_df.to_dict(orient='records')[0])

{'visit_id': 619, 'site_id': 'DR-1', 'visit_date': '1927-02-08'}
{'visit_id': 622, 'site_id': 'DR-1', 'visit_date': '1927-02-10'}
{'visit_id': 734, 'site_id': 'DR-3', 'visit_date': '1930-01-07'}
{'visit_id': 735, 'site_id': 'DR-3', 'visit_date': '1930-01-12'}
{'visit_id': 751, 'site_id': 'DR-3', 'visit_date': '1930-02-26'}
{'visit_id': 752, 'site_id': 'DR-3', 'visit_date': nan}
{'visit_id': 837, 'site_id': 'MSK-4', 'visit_date': '1932-01-14'}
{'visit_id': 844, 'site_id': 'DR-1', 'visit_date': '1932-03-22'}


In [11]:
# Scratch database file given as a relative path (not the visited_json under results/kvdb).
kvdb_path = 'visits.json'

In [12]:
# Open a KVDB on the scratch path; an existing file at that path would be loaded.
kvdb = KVDB(kvdb_path)

In [8]:
# Composite (visit_id, site_id) key used for a quick set/get round-trip check.
testKey = (619, 'DR-1')

In [9]:
testKey

(619, 'DR-1')

In [6]:
# Sample visit record used to exercise set_value/get_value.
testVal = {
    'visit_id': 619,
    'site_id': 'DR-1',
    'visit_date': '1927-02-08',
}

In [7]:
testVal

{'visit_id': 619, 'site_id': 'DR-1', 'visit_date': '1927-02-08'}

In [13]:
# Store the sample record under the tuple key; nothing is written to disk until save().
kvdb.set_value(testKey, testVal)

In [15]:
# Read the record back through the same tuple key.
retrieved_value = kvdb.get_value(testKey)

In [16]:
retrieved_value

{'visit_id': 619, 'site_id': 'DR-1', 'visit_date': '1927-02-08'}

In [6]:
def create_visits_kvdb():
    """Populate the visits store, keyed by ``str((visit_id, site_id))``, and save it."""
    db = KVDB(visited_json)
    df = read_cluster_csv('data/external/tidynomicon/visited.csv')
    for composite_id, group_df in df.groupby(['visit_id', 'site_id']):
        # groupby may hand back NumPy scalars inside the key tuple. Under
        # NumPy >= 2 their repr is e.g. "np.int64(619)", which would leak into
        # str(tuple). Unwrapping to native Python values keeps every key in the
        # "(619, 'DR-1')" form regardless of the NumPy version.
        native_id = tuple(
            part.item() if hasattr(part, 'item') else part
            for part in composite_id
        )
        # Only the first row of each group becomes that key's record.
        db.set_value(str(native_id), group_df.to_dict(orient='records')[0])
    db.save()


create_visits_kvdb()

In [7]:
def create_measurements_kvdb():
    """Populate the measurements store and save it.

    Keys have the form ``str((visit_id, person_id, quantity))``.
    """
    db = KVDB(measurements_json)
    df = read_cluster_csv('data/external/tidynomicon/measurements.csv')
    for composite_id, group_df in df.groupby(['visit_id', 'person_id', 'quantity']):
        # groupby may hand back NumPy scalars inside the key tuple. Under
        # NumPy >= 2 their repr is e.g. "np.int64(619)", which would leak into
        # str(tuple). Unwrapping to native Python values keeps the key format
        # independent of the NumPy version.
        native_id = tuple(
            part.item() if hasattr(part, 'item') else part
            for part in composite_id
        )
        # Only the first row of each group becomes that key's record.
        db.set_value(str(native_id), group_df.to_dict(orient='records')[0])
    db.save()


create_measurements_kvdb()

In [None]:
def create_sites_kvdb():
    """Populate the sites store with one record per ``site_id`` and save it."""
    store = KVDB(sites_json)
    sites = read_cluster_csv('data/external/tidynomicon/site.csv')
    for key, rows in sites.groupby('site_id'):
        # Only the first row of each group becomes that key's record.
        first_record = rows.to_dict(orient='records')[0]
        store.set_value(key, first_record)
    store.save()


def create_people_kvdb():
    """Populate the people store with one record per ``person_id`` and save it."""
    store = KVDB(people_json)
    people = read_cluster_csv('data/external/tidynomicon/person.csv')
    for key, rows in people.groupby('person_id'):
        # Only the first row of each group becomes that key's record.
        first_record = rows.to_dict(orient='records')[0]
        store.set_value(key, first_record)
    store.save()


def create_visits_kvdb():
    """Populate the visits store, keyed by ``str((visit_id, site_id))``, and save it."""
    db = KVDB(visited_json)
    df = read_cluster_csv('data/external/tidynomicon/visited.csv')
    for composite_id, group_df in df.groupby(['visit_id', 'site_id']):
        # groupby may hand back NumPy scalars inside the key tuple. Under
        # NumPy >= 2 their repr is e.g. "np.int64(619)", which would leak into
        # str(tuple). Unwrapping to native Python values keeps every key in the
        # "(619, 'DR-1')" form regardless of the NumPy version.
        native_id = tuple(
            part.item() if hasattr(part, 'item') else part
            for part in composite_id
        )
        # Only the first row of each group becomes that key's record.
        db.set_value(str(native_id), group_df.to_dict(orient='records')[0])
    db.save()

def create_measurements_kvdb():
    """Populate the measurements store and save it.

    Keys have the form ``str((visit_id, person_id, quantity))``.
    """
    db = KVDB(measurements_json)
    df = read_cluster_csv('data/external/tidynomicon/measurements.csv')
    for composite_id, group_df in df.groupby(['visit_id', 'person_id', 'quantity']):
        # groupby may hand back NumPy scalars inside the key tuple. Under
        # NumPy >= 2 their repr is e.g. "np.int64(619)", which would leak into
        # str(tuple). Unwrapping to native Python values keeps the key format
        # independent of the NumPy version.
        native_id = tuple(
            part.item() if hasattr(part, 'item') else part
            for part in composite_id
        )
        # Only the first row of each group becomes that key's record.
        db.set_value(str(native_id), group_df.to_dict(orient='records')[0])
    db.save()

In [None]:
# Rebuild all four stores; each call fetches its CSV and rewrites its JSON file.
create_sites_kvdb()
create_people_kvdb()
create_visits_kvdb()
create_measurements_kvdb()

Experiment: key the visits store by a single column (`visit_id`) instead of the composite `(visit_id, site_id)` key.

In [1]:
import json
from pathlib import Path
import os

import pandas as pd
import s3fs

def read_cluster_csv(file_path, endpoint_url='https://storage.budsc.midwest-datascience.com'):
    """Read a CSV object from an S3-compatible store into a DataFrame.

    Args:
        file_path: Object path handed to ``S3FileSystem.open``
            (e.g. ``'data/external/tidynomicon/site.csv'``).
        endpoint_url: Base URL of the S3-compatible endpoint to query.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv`` with default options.
    """
    s3 = s3fs.S3FileSystem(
        anon=True,  # anonymous access; no credentials are supplied
        client_kwargs={
            'endpoint_url': endpoint_url
        }
    )
    # Close the remote handle deterministically once parsing is done instead
    # of leaving it open until garbage collection.
    with s3.open(file_path, mode='rb') as remote_file:
        return pd.read_csv(remote_file)

# Output layout: <cwd>/results/kvdb/<table>.json
current_dir = Path.cwd()
results_dir = current_dir / 'results'
kv_data_dir = results_dir / 'kvdb'
# Create the whole directory chain up front; a no-op when it already exists.
kv_data_dir.mkdir(parents=True, exist_ok=True)

# One JSON file per source table.
people_json = kv_data_dir / 'people.json'
visited_json = kv_data_dir / 'visited.json'
sites_json = kv_data_dir / 'sites.json'
measurements_json = kv_data_dir / 'measurements.json'

In [2]:
class KVDB(object):
    """Minimal key-value store persisted as a single JSON file.

    Keys are normalized to ``str`` on every read and write. JSON object keys
    are always strings, so this keeps lookups consistent before and after a
    save/load cycle and lets callers pass ints or tuples (e.g.
    ``(619, 'DR-1')``) without ``save()`` failing on a non-serializable key.
    """

    def __init__(self, db_path):
        """Open the store at ``db_path``, loading its contents if the file exists."""
        self._db_path = Path(db_path)
        self._db = {}
        self._load_db()

    @staticmethod
    def _normalize_key(key):
        """Return the string form under which ``key`` is stored."""
        return str(key)

    def _load_db(self):
        """Replace the in-memory dict with the file's contents, if the file exists."""
        if self._db_path.exists():
            with open(self._db_path, encoding='utf-8') as f:
                self._db = json.load(f)

    def get_value(self, key):
        """Return the value stored under ``key``, or ``None`` when absent."""
        return self._db.get(self._normalize_key(key))

    def set_value(self, key, value):
        """Store ``value`` under ``key`` in memory; call ``save()`` to persist."""
        self._db[self._normalize_key(key)] = value

    def save(self):
        """Write the whole store to ``db_path`` as indented JSON."""
        # Serialize before opening the file: opening with 'w' truncates it, so
        # a serialization error must surface while the old contents are intact.
        payload = json.dumps(self._db, indent=2)
        with open(self._db_path, 'w', encoding='utf-8') as f:
            f.write(payload)

In [5]:
def create_visits_kvdb():
    """Populate the visits store keyed by ``visit_id`` alone and save it."""
    db = KVDB(visited_json)
    df = read_cluster_csv('data/external/tidynomicon/visited.csv')
    for visit_id, group_df in df.groupby('visit_id'):
        # JSON object keys are always strings: an int key is written as "619"
        # and comes back as "619" on reload. Storing the string form up front
        # gives the same key before and after a save/load cycle.
        # Only the first row of each group becomes that key's record.
        db.set_value(str(visit_id), group_df.to_dict(orient='records')[0])
    db.save()

In [7]:
# Rebuild visited.json keyed by visit_id alone.
# NOTE(review): KVDB loads any existing visited.json before writing, so records
# saved earlier under composite keys stay in the file alongside the new
# visit_id keys unless the file is removed first.
create_visits_kvdb()