# Community Health Mapping

In [1]:
import pandas as pd
import numpy as np
import re
import awswrangler as wr

In [65]:
def extractKeys(filename: str, mapping: dict = None) -> dict:
    """
    Extract community id -> health id pairs from a checkin or period export.

    Each line of the file is scanned for 24-character runs of lowercase
    letters and digits (the shape of a Mongo key). The matches are read in
    order of appearance as [record id, community id, health id]. Lines that
    yield fewer than three matches (for example the CSV header) are printed
    and skipped.

    Parameters
    ----------
    filename : str
        Path of the export file to scan.
    mapping : dict, optional
        Existing mapping to extend in place, so several files can be merged
        into one lookup. A new dict is created when omitted. If a community
        id occurs more than once, the last line seen wins.

    Returns
    -------
    dict
        The ``mapping`` object that was passed in (or the newly created one),
        updated with every pair found in the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # Avoid a shared mutable default: build a fresh dict per call when the
    # caller does not supply one.
    if mapping is None:
        mapping = {}

    # Mongo keys are 24 characters long and alpha numeric.
    p = re.compile(r'[0-9a-z]{24}')

    # 'utf-8-sig' decodes UTF-8 independently of the platform default and
    # drops a leading byte-order mark if the export has one.
    with open(filename, 'r', encoding='utf-8-sig') as f:
        for line in f:
            # All key-like tokens on the line, in order of appearance.
            keys = p.findall(line)  # [recordid, communityid, healthid]
            if len(keys) < 3:
                print('Failed for - {}'.format(line))
                continue
            mapping[keys[1]] = keys[2]  # Community id -> health id

    return mapping

In [69]:
# Merge both exports into a single community id -> health id lookup.
# The checkin file is read first; the periods file is applied on top of it.
mapping = {}
for export_path in ('raw_data/checkin_uid_mapped.csv',
                    'raw_data/periods_uid_mapped.csv'):
    mapping = extractKeys(export_path, mapping=mapping)

Failed for - ﻿"/_id/$oid,/community_id/$oid,""/exercise"",""/mood"",""/products/0"",""/products/1"",""/products/2"",""/products/3"",""/products/4"",""/products/5"",""/products/6"",""/products/7"",""/sleep"",""/timestamp/$date"",""/uid/$oid"",""/water"",""/weight"",""/weight_unit""",,,,,

Failed for - ﻿"/_id/$oid,""/community_id/$oid"",""/end_time/$date"",""/flow/0/flow_time/$date"",""/flow/0/value"",""/flow/1/flow_time/$date"",""/flow/1/value"",""/flow/10/flow_time/$date"",""/flow/10/value"",""/flow/11/flow_time/$date"",""/flow/11/value"",""/flow/12/flow_time/$date"",""/flow/12/value"",""/flow/13/flow_time/$date"",""/flow/13/value"",""/flow/14/flow_time/$date"",""/flow/14/value"",""/flow/15/flow_time/$date"",""/flow/15/value"",""/flow/16/flow_time/$date"",""/flow/16/value"",""/flow/17/flow_time/$date"",""/flow/17/value"",""/flow/18/flow_time/$date"",""/flow/18/value"",""/flow/19/flow_time/$date"",""/flow/19/value"",""/flow/2/flow_time/$date"",""/flow/2/value"",""/flow/20/flow_time/$dat

In [70]:
# Tabulate the lookup: each dict key (community id) becomes a row label and
# its value (health id) fills the single 'healthid' column.
frame = pd.DataFrame.from_dict(
    mapping,
    orient='index',
    columns=['healthid'],
)
frame.shape

(42619, 1)

In [71]:
# Copy the row labels (the community ids used as dict keys) into a regular
# column so the id is available as data and not only as the index.
frame['communityid'] = frame.index.values

In [72]:
# Preview the first rows: the index and 'communityid' should hold the same value.
frame.head()

Unnamed: 0,healthid,communityid
5e4a9bb73c52676149bb86f2,5f8e89640054a6c5fab1544d,5e4a9bb73c52676149bb86f2
5efa276ea61e8a770265691e,5f8e90320054a6c5fab1544e,5efa276ea61e8a770265691e
5efb7a38be8ac404c2a8f8e9,5f8f274466dae0d0328adb80,5efb7a38be8ac404c2a8f8e9
5edfc5e819e422142035b922,5f8f175466dae0d0328adb76,5edfc5e819e422142035b922
5e81f0663e55077bcf96dfc1,5f8f260566dae0d0328adb7e,5e81f0663e55077bcf96dfc1


In [57]:
# Select the named AWS profile through the AWS_PROFILE environment variable;
# presumably this is what the S3 write below authenticates with — confirm.
# NOTE(review): this cell's execution count (57) is lower than its neighbours
# (72, 74), so it was last run earlier in the session than the cells around it.
%set_env AWS_PROFILE=opensci

env: AWS_PROFILE=opensci


In [74]:
# Storing the mapping data on Data Lake
# The frame is written as a Parquet dataset under the S3 prefix and registered
# as table `uid_mapping` in database `menolife`.
# mode="overwrite" keeps the cell idempotent: the mapping is rebuilt in full
# by the cells above, so the default mode ("append") would add another copy of
# every row each time the notebook is re-run.
# NOTE(review): index=True also stores the frame index, which holds the same
# community ids as the 'communityid' column — confirm both are wanted.
wr.s3.to_parquet(
    df=frame,
    path="s3://menolife-opensci/dataset/mapping/",
    dataset=True,
    mode="overwrite",
    database="menolife",
    table="uid_mapping",
    index=True
)

{'paths': ['s3://menolife-opensci/dataset/mapping/7d81ce2707e647a2be5c6076bf2188bc.snappy.parquet'],
 'partitions_values': {}}