In [30]:
import json
from pathlib import Path
import os

import pandas as pd
import s3fs


def read_cluster_csv(file_path, endpoint_url='https://storage.budsc.midwest-datascience.com'):
    """Read a CSV file from the S3-compatible cluster storage into a DataFrame.

    Args:
        file_path: Object path (bucket/key) of the CSV file on the storage endpoint.
        endpoint_url: Base URL of the S3-compatible service to connect to.

    Returns:
        pandas.DataFrame parsed from the remote CSV file.
    """
    # Anonymous access: no credentials are sent with the requests.
    s3 = s3fs.S3FileSystem(
        anon=True,
        client_kwargs={
            'endpoint_url': endpoint_url
        }
    )
    # Open the object in a context manager so the remote handle is closed
    # as soon as pandas has finished parsing, even if parsing fails.
    with s3.open(file_path, mode='rb') as remote_file:
        return pd.read_csv(remote_file)

# All output of this notebook is written below <working directory>/results/kvdb.
current_dir = Path.cwd()
results_dir = current_dir / 'results'
kv_data_dir = results_dir / 'kvdb'
# Create the output folder (and any missing parents); a no-op when it already exists.
kv_data_dir.mkdir(parents=True, exist_ok=True)

# One JSON file per source table.
people_json = kv_data_dir / 'people.json'
visited_json = kv_data_dir / 'visited.json'
sites_json = kv_data_dir / 'sites.json'
measurements_json = kv_data_dir / 'measurements.json'

In [31]:
class KVDB(object):
    """Minimal key-value store persisted as one JSON object in a file.

    The whole store is held in memory as a dict; nothing is written to disk
    until ``save()`` is called.

    JSON object keys are always strings, so keys are normalised to the string
    form JSON gives them when they are set and when they are looked up. This
    makes ``get_value(619)`` find the same entry before saving and after the
    file has been reloaded.
    """

    def __init__(self, db_path):
        """Open the store at ``db_path``, loading its content if the file exists.

        Args:
            db_path: Location of the JSON file (str or path-like).
        """
        self._db_path = Path(db_path)
        self._db = {}
        self._load_db()

    @staticmethod
    def _normalize_key(key):
        """Return ``key`` in the string form it has once written to JSON."""
        if isinstance(key, str):
            return key
        try:
            # Let the json module do the coercion so the in-memory key is
            # exactly what a save/load round trip would produce.
            return next(iter(json.loads(json.dumps({key: None}))))
        except TypeError:
            # Not usable as a JSON key: keep it unchanged so behaviour for
            # such keys stays as before (save() reports the problem).
            return key

    def _load_db(self):
        """Replace the in-memory content with the file's content, if the file exists."""
        if self._db_path.exists():
            with open(self._db_path, encoding='utf-8') as f:
                self._db = json.load(f)

    def get_value(self, key):
        """Return the value stored under ``key``, or ``None`` if there is none."""
        return self._db.get(self._normalize_key(key))

    def set_value(self, key, value):
        """Store ``value`` under ``key`` in memory, replacing any previous value."""
        self._db[self._normalize_key(key)] = value

    def save(self):
        """Write the whole store to the JSON file, overwriting its previous content."""
        # Make sure the target folder exists so saving to a fresh location works.
        self._db_path.parent.mkdir(parents=True, exist_ok=True)
        with open(self._db_path, 'w', encoding='utf-8') as f:
            json.dump(self._db, f, indent=2)

In [32]:
# I am going to try to do all of this locally rather than on the cluster. We shall see how it goes.
# The read_cluster_csv calls are replaced with pd.read_csv,
# and the CSV files come from my local copy of the data folder.

# Here I opted to use the table's primary key (site_id) for the grouping.
def create_sites_kvdb(csv_path='/home/totennacht/Bellevue/dsc650/data/external/tidynomicon/site.csv'):
    """Build the sites key-value store from the site CSV and save it to ``sites_json``.

    Each row is stored as a dict under its ``site_id``. Entries already present
    in the JSON file are loaded first and overwritten key by key. If a
    ``site_id`` occurs more than once, only the first of its rows is kept.

    Args:
        csv_path: CSV file to read. Defaults to the local copy used when this
            notebook was written. The cluster copy is
            'data/external/tidynomicon/site.csv' (see ``read_cluster_csv``).
    """
    db = KVDB(sites_json)
    df = pd.read_csv(csv_path)
    for site_id, group_df in df.groupby('site_id'):
        # One record per key: take the first row of the group.
        db.set_value(site_id, group_df.to_dict(orient='records')[0])
    db.save()


# The key choice here was a bit arbitrary: person_id seems to be the person's last name, but the table
# really should have a dedicated unique value as its primary key. In this data set the last names are
# unique, so person_id will do, but if I were building it I would use a unique integer id here.
def create_people_kvdb(csv_path='/home/totennacht/Bellevue/dsc650/data/external/tidynomicon/person.csv'):
    """Build the people key-value store from the person CSV and save it to ``people_json``.

    Each row is stored as a dict under its ``person_id``. Entries already
    present in the JSON file are loaded first and overwritten key by key. If a
    ``person_id`` occurs more than once, only the first of its rows is kept.

    Args:
        csv_path: CSV file to read. Defaults to the local copy used when this
            notebook was written. The cluster copy is
            'data/external/tidynomicon/person.csv' (see ``read_cluster_csv``).
    """
    db = KVDB(people_json)
    df = pd.read_csv(csv_path)
    for person_id, group_df in df.groupby('person_id'):
        # One record per key: take the first row of the group.
        db.set_value(person_id, group_df.to_dict(orient='records')[0])
    db.save()

# visits could be identified by a composite id made of visit_id and site_id.
# Interestingly enough, this table also has a primary key of its own (visit_id),
# so I can simply group by that primary key to get what I want.
def create_visits_kvdb(csv_path='/home/totennacht/Bellevue/dsc650/data/external/tidynomicon/visited.csv'):
    """Build the visits key-value store from the visited CSV and save it to ``visited_json``.

    Each row is stored as a dict under its ``visit_id``. Entries already
    present in the JSON file are loaded first and overwritten key by key. If a
    ``visit_id`` occurs more than once, only the first of its rows is kept.

    Args:
        csv_path: CSV file to read. Defaults to the local copy used when this
            notebook was written. The cluster copy is
            'data/external/tidynomicon/visited.csv' (see ``read_cluster_csv``).
    """
    db = KVDB(visited_json)
    df = pd.read_csv(csv_path)
    for visit_id, group_df in df.groupby('visit_id'):
        # One record per key: take the first row of the group.
        db.set_value(visit_id, group_df.to_dict(orient='records')[0])
    db.save()

# measurements is identified by the composite key (visit_id, person_id, quantity).
# It has no single unique primary-key column, so I have to go a different route:
# if I group by one column I lose information, because every column has repeated values.
# pandas to the rescue!
# I could also have split each line at the second comma and turned the leading part into the key,
# or built keys from the two foreign keys in a loop if I needed to.
# pandas handles this idea nicely, though: writing the rows out as a list of records
# gave me the result I was looking for and was really efficient.
def create_measurements_kvdb(
    csv_path='/home/totennacht/Bellevue/dsc650/data/external/tidynomicon/measurements.csv',
    json_path=None,
):
    """Write the measurements CSV to JSON as a list of row records.

    Unlike the other builders this does not go through ``KVDB``: the rows are
    written as a JSON array with one object per CSV row, in file order.

    Args:
        csv_path: CSV file to read. Defaults to the local copy used when this
            notebook was written.
        json_path: Where to write the JSON file. Defaults to
            ``measurements_json``, the same results folder the other builders
            write to.
    """
    if json_path is None:
        # Use the shared output location instead of a machine-specific path.
        json_path = measurements_json
    df = pd.read_csv(csv_path)
    df.to_json(path_or_buf=json_path, orient='records', indent=2)




In [33]:
# Build every JSON file, in the same order as the tables were introduced.
for _build_kvdb in (
    create_sites_kvdb,
    create_people_kvdb,
    create_visits_kvdb,
    create_measurements_kvdb,
):
    _build_kvdb()

In [34]:
# A quick check to see how my measurements file compares to the others.
# Read it back from the shared results location rather than a machine-specific path.
my_measurements = pd.read_json(measurements_json)
my_measurements.head()

Unnamed: 0,visit_id,person_id,quantity,reading
0,619,dyer,rad,9.82
1,619,dyer,sal,0.13
2,622,dyer,rad,7.8
3,622,dyer,sal,0.09
4,734,pb,rad,8.41


In [35]:
# Read the visits file back from the shared results location rather than a machine-specific path.
my_visits = pd.read_json(visited_json)
# Interesting: read_json turns the top-level keys (the visit ids) into columns, so I had to
# transpose the frame to get the one-row-per-visit layout I was expecting. But it looks good.
my_visits.head().T


Unnamed: 0,visit_id,site_id,visit_date
619,619,DR-1,1927-02-08
622,622,DR-1,1927-02-10
734,734,DR-3,1930-01-07
735,735,DR-3,1930-01-12
751,751,DR-3,1930-02-26
752,752,DR-3,
837,837,MSK-4,1932-01-14
844,844,DR-1,1932-03-22
