In [19]:
from dotenv import load_dotenv
load_dotenv(dotenv_path='../.env')

import requests
import os
from datetime import datetime, timedelta
from pprint import pprint
import pandas as pd
from pathlib import Path
import boto3
import shutil
import gzip
import json

In [2]:
import logging

class LoggerConfig:
    """Class-level configuration for the process-wide root logger.

    ``configure_root_logger`` attaches two handlers to the root logger: a
    ``FileHandler`` writing to a timestamped ``.log`` file under ``_log_path``
    and a console ``StreamHandler``. Both use ``_formatter`` and ``_log_level``.
    Call the ``set_*`` classmethods *before* the first configuration; they have
    no effect on handlers that are already attached.
    """

    # Set to True (on LoggerConfig itself) once the root logger has its handlers.
    _is_configured = False
    # Directory that receives the timestamped log file.
    _log_path = Path('./logs')
    # Level applied to the root logger and to both handlers.
    _log_level = logging.INFO
    _formatter = logging.Formatter('[%(asctime)s]  %(filename)s:%(lineno)d - %(levelname)s - %(message)s')

    @classmethod
    def set_log_path(cls, path: str):
        """Set the directory in which the log file will be created."""
        cls._log_path = Path(path)

    @classmethod
    def set_log_level(cls, level: int):
        """Set the level used for the root logger and its handlers."""
        cls._log_level = level

    @classmethod
    def set_formatter(cls, format: logging.Formatter):
        """Set the formatter shared by the file and console handlers."""
        cls._formatter = format

    @classmethod
    def configure_root_logger(cls):
        """Attach the file and console handlers to the root logger once.

        Creates ``_log_path`` if needed. Subsequent calls, through this class
        or any subclass, return immediately.
        """
        # The guard lives on the base class on purpose: writing it through
        # ``cls`` would create a per-subclass flag, so a call via a subclass
        # followed by a call via the base class would add the handlers twice.
        if LoggerConfig._is_configured:
            return

        # Root Logger
        root = logging.getLogger()
        root.setLevel(cls._log_level)

        cls._log_path.mkdir(parents=True, exist_ok=True)
        filepath = cls._log_path / f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"

        # File handler and console handler share level and formatter.
        handlers = (logging.FileHandler(filepath), logging.StreamHandler())
        for handler in handlers:
            handler.setLevel(cls._log_level)
            handler.setFormatter(cls._formatter)
            root.addHandler(handler)

        LoggerConfig._is_configured = True

class Logger(LoggerConfig):
    """Thin wrapper around a named ``logging.Logger``.

    Instantiating it first makes sure the root logger is configured, then
    stores ``logging.getLogger(name)``. Any attribute not found on the wrapper
    (``info``, ``warning``, ``exception``, ...) is forwarded to that logger.
    """

    def __init__(self, name: str):
        self.configure_root_logger()
        self.logger = logging.getLogger(name)

    def __getattr__(self, name):
        # __getattr__ only runs after normal lookup has failed. If 'logger'
        # itself is missing (instance created without __init__, e.g. through
        # __new__ as copy/pickle do), reading self.logger below would re-enter
        # this method forever and end in RecursionError.
        if name == "logger":
            raise AttributeError(name)
        return getattr(self.logger, name)

In [3]:
# Notebook-wide logger; the name identifies this notebook as the record source.
logger = Logger(name="api-call.ipynb")

TODO
- Perform Initial Load from API into a database
- Perform Incremental Load from API into a database

Initial Load
- How to fetch using the URL? - Work in progress
- Where to dump the data initially? - S3
- Do you need to dump the data? - Yes: it avoids re-fetching from the API if the pipeline fails

In [None]:
def fetch_crime_data(delta: int, pagesize: int = 50000, max_pages: int = 15, timeout: float = 60.0):
    """Yield pages of crime records updated within the last ``delta`` days.

    Pages are requested one at a time with POST from the City of Chicago
    ``crimes`` view, filtered by ``updated_on >= now - delta days``. Each
    yielded item is the parsed JSON body of one page.

    Iteration stops when:
      * a response has a status other than 200 (logged as an error),
      * a page comes back as an empty list, or
      * ``max_pages`` pages have been yielded (logged as a warning).

    Args:
        delta: Look-back window in days, counted from the local current time.
        pagesize: Number of rows requested per page.
        max_pages: Upper bound on the number of pages requested and yielded.
        timeout: Seconds passed to ``requests.post`` as its ``timeout``.

    Yields:
        The decoded JSON payload of each non-empty page, in page order.
    """
    import logging  # local import so this cell does not rely on another cell's imports

    log = logging.getLogger(__name__)

    # For Backfill
    # min_update = f"SELECT min(updated_on)"
    # max_update = f"SELECT max(updated_on)"

    # timespec='milliseconds' always emits exactly three fractional digits.
    # Slicing isoformat()[:-3] instead cuts off the seconds whenever
    # microsecond == 0, because isoformat() then omits the fraction entirely.
    last_update = (datetime.now() - timedelta(days=delta)).isoformat(timespec='milliseconds')

    # Implementing CDC, batch incremental loads only
    query = f"SELECT * WHERE updated_on >= '{last_update}'"

    url = "https://data.cityofchicago.org/api/v3/views/crimes/query.json"
    headers = {
        'X-App-Token' : os.getenv('APP_TOKEN')
    }

    # Need to add retry logic
    # The cap is enforced by the range, i.e. before a request is issued, so no
    # page is downloaded only to be thrown away.
    for pagenum in range(1, max_pages + 1):
        body = {
            'query' : query,
            'page' : {
                'pageNumber' : pagenum,
                'pageSize' : pagesize
            },
            "includeSynthetic": False
        }

        res = requests.post(
            url,
            json=body,
            headers=headers,
            timeout=timeout
        )

        if res.status_code != 200:
            # Stop like before, but leave a trace: the load is incomplete.
            log.error("Page %d request failed with HTTP %s; stopping fetch", pagenum, res.status_code)
            return

        page = res.json()  # decode once and reuse
        if page == []:
            return

        yield page

    log.warning("Reached max_pages=%d; remaining pages (if any) were not fetched", max_pages)

In [5]:
# Scratch check: ISO week-numbering year of today (first field of isocalendar()).
# NOTE(review): the ISO year can differ from the calendar year in the days
# around New Year - confirm the ISO year is what is wanted here.
datetime.now().isocalendar()[0]

2025

In [6]:
# Scratch check of the date-partitioned prefix; datetime's format spec is
# applied through strftime, so this yields "/raw/ingest=YYYY-MM-DD".
s3_path = f"/raw/ingest={datetime.now():%Y-%m-%d}"
Path(s3_path)

PosixPath('/raw/ingest=2025-07-22')

In [None]:
# Landing zone for fetched data from API
# Serialize - compress and save with a specific file name
# Connect to s3 bucket and upload to bucket

# local path example:
# ./tmp/metadata.json
# ./tmp/part-0001.json.gz
# ./tmp/part-0002.json.gz

# S3 path example:
# Structure s3://<bucket>/<key>
# Might also add metadata for each ingest=(datetime)
# s3://     open-crime-etl/     raw/ingest=(datetime1)/part-0001.json.gz
# s3://     open-crime-etl/     raw/ingest=(datetime1)/part-0002.json.gz
# s3://     open-crime-etl/     raw/ingest=(datetime2)/part-0001.json.gz
# s3://     open-crime-etl/     raw/ingest=(datetime2)/part-0002.json.gz

tmp = Path('./tmp')
# Resolve the ingest date once so the log line and the S3 prefix cannot
# disagree when the run straddles midnight.
ingest_date = datetime.now().strftime('%Y-%m-%d')
s3_key = Path(f"raw/ingest={ingest_date}")
# Create the local folder structure (remember whether it was really missing,
# so the log below only claims a creation that actually happened)
tmp_was_missing = not tmp.exists()
tmp.mkdir(parents=True, exist_ok=True)

BUCKET_NAME = 'open-crime-etl'
PAGESIZE = 500
DELTA = 29

client = boto3.client('s3')

logger.info(f"Initiating pipeline: ingest={ingest_date}")
if tmp_was_missing:
    logger.info("tmp folder missing. Created 'tmp'")

try:
    logger.info("Fetching data from API")
    # Iterates over each batch yielded by fetch_crime_data
    for i, data in enumerate(fetch_crime_data(delta=DELTA, pagesize=PAGESIZE)):
        logger.info(f"Batch {i+1}")
        filename = f"part-{i+1:04}.json.gz"
        filepath = tmp / filename

        # Explicit encoding: text-mode gzip otherwise uses the locale default.
        with gzip.open(filepath, 'wt', encoding='utf-8') as f:
            json.dump(data, f)
        logger.info(f"Saved file to: {filepath.as_posix()}")

        # Upload to s3
        object_key = (s3_key / filename).as_posix()
        client.upload_file(Filename=filepath.as_posix(), Bucket=BUCKET_NAME, Key=object_key)
        logger.info(f"Uploaded file to: s3://{BUCKET_NAME}/{object_key}")
except Exception:
    # Record the failure in the log file before the traceback propagates.
    logger.exception("Pipeline failed")
    raise
finally:
    # Purge the temp folder - also when a dump or upload failed, so a broken
    # run does not leave stale part files behind for the next one.
    if tmp.exists():
        logger.info("Clearing tmp")
        for item in tmp.iterdir():
            # unlink() only works on files; anything else is left to rmtree.
            if item.is_file():
                item.unlink()
                logger.info(f"Delete file: {item}")
        shutil.rmtree(tmp)

logger.info("Pipeline terminated")

[2025-07-22 16:41:45,127]  1805862261.py:25 - INFO - Initiating pipeline: ingest=2025-07-22
[2025-07-22 16:41:45,128]  1805862261.py:27 - INFO - tmp folder missing. Created 'tmp'
[2025-07-22 16:41:45,129]  1805862261.py:31 - INFO - Fetching data from API
[2025-07-22 16:41:45,464]  1805862261.py:34 - INFO - Batch 1
[2025-07-22 16:41:45,484]  1805862261.py:40 - INFO - Saved file to: tmp/part-0001.json.gz
[2025-07-22 16:41:46,067]  1805862261.py:45 - INFO - Uploaded file to: s3://open-crime-etl/raw/ingest=2025-07-22/part-0001.json.gz
[2025-07-22 16:41:46,412]  1805862261.py:34 - INFO - Batch 2
[2025-07-22 16:41:46,431]  1805862261.py:40 - INFO - Saved file to: tmp/part-0002.json.gz
[2025-07-22 16:41:46,711]  1805862261.py:45 - INFO - Uploaded file to: s3://open-crime-etl/raw/ingest=2025-07-22/part-0002.json.gz
[2025-07-22 16:41:47,047]  1805862261.py:34 - INFO - Batch 3
[2025-07-22 16:41:47,070]  1805862261.py:40 - INFO - Saved file to: tmp/part-0003.json.gz
[2025-07-22 16:41:47,612]  180