In [1]:
import wearipedia
import pandas as pd
from dotenv import load_dotenv
import datetime
import boto3
import io
import gzip
import os
import json
import re
import math

In [2]:
# Load credentials and bucket settings from the local .env file.
load_dotenv('../.env/.env')

BUCKET_NAME = os.environ.get("WEARIPEDIA_S3_BUCKET_NAME")
PREFIX = "wearipedia/raw/cronometer/"
# BACKFILL=True: the fetch loop requests the full start_date..end_date window
# once instead of one request per missing day.
BACKFILL = True

email_address = os.environ.get('CRONOMETER_EMAIL')
password = os.environ.get('CRONOMETER_PASSWORD')

# '2025-04-23' is the first start date for logging.
start_date = '2025-04-23'  #@param {type:"string"}
# The window ends two days before today.
end_date = (datetime.datetime.today() - datetime.timedelta(days=2)).strftime('%Y-%m-%d')  #@param {type:"string"}
synthetic = False  #@param {type:"boolean"}

In [3]:
# Instantiate the Cronometer device; log in only when real (non-synthetic)
# data is requested.
device = wearipedia.get_device("cronometer/cronometer")

if not synthetic:
    credentials = {"username": email_address, "password": password}
    device.authenticate(credentials)

Authentication successful


In [4]:
def list_s3_filenames(bucket_name: str, folder_prefix: str, region="us-east-2"):
    """Return the keys of all objects in ``bucket_name`` under ``folder_prefix``.

    Args:
        bucket_name: Name of the S3 bucket to list.
        folder_prefix: Key prefix passed to ``list_objects_v2``.
        region: Region used to build the S3 client.

    Returns:
        A list of object keys, in the order the paginator yields them.
        Pages without a ``"Contents"`` entry contribute nothing.
    """
    client = boto3.client("s3", region_name=region)
    pages = client.get_paginator("list_objects_v2").paginate(
        Bucket=bucket_name, Prefix=folder_prefix
    )
    return [entry["Key"] for page in pages for entry in page.get("Contents", [])]

In [5]:
def extract_unique_dates_from_keys(keys: list[str]) -> set[str]:
    """Collect the distinct ``YYYY-MM-DD`` strings found in the given keys.

    Only the first date-shaped substring of each key is considered; keys
    without one are ignored.

    Args:
        keys: Object keys (or any strings) to scan.

    Returns:
        The set of matched date strings.
    """
    iso_date = re.compile(r"\d{4}-\d{2}-\d{2}")
    hits = (iso_date.search(key) for key in keys)
    return {hit.group(0) for hit in hits if hit}

In [6]:
def get_missing_dates(start_date: str, end_date: str, existing_dates: list[str]) -> list[str]:
    """List the days in ``[start_date, end_date]`` that are not in ``existing_dates``.

    Args:
        start_date: First day of the window, formatted ``YYYY-MM-DD``.
        end_date: Last day of the window (inclusive), formatted ``YYYY-MM-DD``.
        existing_dates: Date strings that are already present.

    Returns:
        The missing dates as ``YYYY-MM-DD`` strings, sorted ascending.
        Empty when ``end_date`` is before ``start_date``.
    """
    fmt = "%Y-%m-%d"
    first_day = datetime.datetime.strptime(start_date, fmt)
    last_day = datetime.datetime.strptime(end_date, fmt)
    already_present = set(existing_dates)

    # Enumerate every day of the inclusive window.
    one_day = datetime.timedelta(days=1)
    wanted = set()
    for offset in range((last_day - first_day).days + 1):
        wanted.add((first_day + offset * one_day).strftime(fmt))

    return sorted(wanted.difference(already_present))

In [7]:
def sanitize_nan(obj):
    """Recursively replace non-finite floats with ``None``.

    ``json.dumps`` writes NaN and +/-infinity as the bare tokens ``NaN``,
    ``Infinity`` and ``-Infinity``, which are not valid JSON. Mapping them to
    ``None`` makes them serialize as ``null`` instead.

    Args:
        obj: Any value; dicts, lists and tuples are traversed recursively.

    Returns:
        A copy of ``obj`` in which every NaN/inf float is ``None``. Dict keys
        are left untouched, containers keep their type (dict/list/tuple), and
        any other value is returned unchanged.
    """
    if isinstance(obj, float):
        return obj if math.isfinite(obj) else None
    if isinstance(obj, dict):
        return {k: sanitize_nan(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [sanitize_nan(i) for i in obj]
    if isinstance(obj, tuple):
        # json.dumps serializes tuples as arrays, so their items need cleaning too.
        return tuple(sanitize_nan(i) for i in obj)
    return obj

In [8]:
def upload_dicts_as_gzipped_jsonl_to_s3(records: list[dict], data_type: str, end_date: str, bucket: str, prefix: str, region: str = "us-east-2"):
    """Serialize ``records`` as gzipped JSON Lines and upload them to S3.

    Each record is passed through ``sanitize_nan`` and written as one JSON
    document per line. The object key is
    ``{prefix}{YYYY}/{MM}/{DD}/{data_type}_{end_date}.jsonl.gz``, where the
    date folders come from replacing ``-`` with ``/`` in ``end_date``.

    Args:
        records: Dictionaries to write, one per output line.
        data_type: Dataset name used as the filename stem.
        end_date: Date string (``YYYY-MM-DD``) used for the folder and filename.
        bucket: Destination S3 bucket.
        prefix: Key prefix; it is concatenated as-is, so it should end with ``/``.
        region: Region used to build the S3 client.
    """
    buffer = io.BytesIO()

    # Write the records as gzip-compressed JSON Lines in memory.
    with gzip.GzipFile(fileobj=buffer, mode="w") as gz:
        for item in records:
            line = json.dumps(sanitize_nan(item)) + "\n"
            gz.write(line.encode('utf-8'))

    # Rewind so the upload reads from the start of the compressed payload.
    buffer.seek(0)

    # S3 keys always use "/" as the separator. os.path.join would produce a
    # backslash on Windows, so the key is assembled explicitly.
    filename = f"{data_type}_{end_date}.jsonl.gz"
    key = f"{prefix}{end_date.replace('-', '/')}/{filename}"

    # Upload to S3
    s3 = boto3.client("s3", region_name=region)
    s3.upload_fileobj(buffer, bucket, key)

    print(f"Uploaded {filename} to S3 bucket.")

In [9]:
# Compare the dates already present in S3 keys against the configured window.
bucket, prefix = BUCKET_NAME, PREFIX

files = list_s3_filenames(bucket, prefix)
existing_dates = extract_unique_dates_from_keys(files)

missing_dates = get_missing_dates(
    start_date=start_date,
    end_date=end_date,
    existing_dates=existing_dates,
)
print(missing_dates)

['2025-04-23', '2025-04-24', '2025-04-25', '2025-04-26', '2025-04-27', '2025-04-28', '2025-04-29', '2025-04-30', '2025-05-01', '2025-05-02', '2025-05-03', '2025-05-04', '2025-05-05', '2025-05-06', '2025-05-07', '2025-05-08', '2025-05-09', '2025-05-10', '2025-05-11', '2025-05-12', '2025-05-13', '2025-05-14', '2025-05-15', '2025-05-16', '2025-05-17', '2025-05-18', '2025-05-19', '2025-05-20', '2025-05-21', '2025-05-22', '2025-05-23', '2025-05-24', '2025-05-25', '2025-05-26', '2025-05-27', '2025-05-28', '2025-05-29', '2025-05-30', '2025-05-31', '2025-06-01', '2025-06-02', '2025-06-03', '2025-06-04', '2025-06-05', '2025-06-06', '2025-06-07', '2025-06-08', '2025-06-09', '2025-06-10', '2025-06-11', '2025-06-12', '2025-06-13', '2025-06-14', '2025-06-15', '2025-06-16', '2025-06-17', '2025-06-18', '2025-06-19', '2025-06-20', '2025-06-21', '2025-06-22', '2025-06-23', '2025-06-24', '2025-06-25', '2025-06-26', '2025-06-27', '2025-06-28', '2025-06-29', '2025-06-30', '2025-07-01', '2025-07-02', '2025

In [10]:
# Fetch each dataset from Cronometer and upload it to S3.
# - BACKFILL: a single pass that requests the whole start_date..end_date window
#   and stores it under end_date (only runs when at least one date is missing).
# - Otherwise: one request and one upload per missing date.
datasets = ['dailySummary', 'servings', 'exercises', 'biometrics']

for date in missing_dates:
    if BACKFILL:
        params = {"start_date": start_date, "end_date": end_date}
        date = end_date
    else:
        params = {"start_date": date, "end_date": date}

    for dataset in datasets:
        try:
            data = device.get_data(dataset, params=params)
        except Exception as exc:
            # Best effort: skip a dataset that fails to download, but say so
            # instead of hiding the error. Unlike a bare `except`, this lets
            # KeyboardInterrupt/SystemExit propagate.
            print(f"Skipping {dataset} for {date}: {type(exc).__name__}: {exc}")
            continue

        upload_dicts_as_gzipped_jsonl_to_s3(
            data,
            data_type=dataset,
            end_date=date,
            bucket=BUCKET_NAME,
            prefix=PREFIX
        )

    if BACKFILL:
        # The whole window was handled in this first pass.
        break

Uploaded dailySummary_2025-07-13.jsonl.gz to S3 bucket.
Uploaded servings_2025-07-13.jsonl.gz to S3 bucket.
Uploaded exercises_2025-07-13.jsonl.gz to S3 bucket.
Uploaded biometrics_2025-07-13.jsonl.gz to S3 bucket.
