In [2]:
from dotenv import load_dotenv
load_dotenv(dotenv_path='../.env')

import requests
import os
from datetime import datetime, timedelta
from pprint import pprint
import pandas as pd
from pathlib import Path
import boto3
import shutil
import gzip
import json

In [3]:
import logging

class LoggerConfig:
    """Process-wide logging setup shared by every ``Logger`` instance.

    Settings are stored in class attributes. The first call to
    ``configure_root_logger`` attaches one file handler and one console
    handler to the root logger; later calls are no-ops. Change the settings
    (path, level, formatter) *before* the first ``Logger`` is created.
    """

    # True once the root logger has been configured by this class object.
    _is_configured = False
    # Directory that receives one timestamped ``.log`` file per configuration.
    _log_path = Path('./logs')
    # Threshold applied to the root logger and to both handlers.
    _log_level = logging.INFO
    _formatter = logging.Formatter('[%(asctime)s]  %(filename)s:%(lineno)d - %(levelname)s - %(message)s')
    # Attribute set on handlers installed here so they can be recognised later.
    _HANDLER_TAG = '_installed_by_logger_config'

    @classmethod
    def set_log_path(cls, path: str):
        """Set the directory in which the log file will be created."""
        cls._log_path = Path(path)

    @classmethod
    def set_log_level(cls, level: int):
        """Set the level used for the root logger and both handlers."""
        cls._log_level = level

    @classmethod
    def set_formatter(cls, format: logging.Formatter):
        """Set the formatter applied to the file and console handlers."""
        cls._formatter = format

    @classmethod
    def configure_root_logger(cls):
        """Attach a file handler and a console handler to the root logger.

        Does nothing when this class has already configured the root logger.
        Creates ``_log_path`` if needed and writes to a file named after the
        current timestamp.
        """
        if cls._is_configured:
            return

        # Root Logger
        root = logging.getLogger()
        root.setLevel(cls._log_level)

        # Re-running the defining notebook cell creates a fresh class whose
        # ``_is_configured`` is False again while the old handlers are still
        # attached to the root logger. Drop those first so every record is
        # emitted once instead of once per re-run.
        for stale in [h for h in root.handlers if getattr(h, cls._HANDLER_TAG, False)]:
            root.removeHandler(stale)
            stale.close()

        cls._log_path.mkdir(parents=True, exist_ok=True)
        filepath = cls._log_path / f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"

        # File handler first, console handler second (same order as before).
        for handler in (logging.FileHandler(filepath), logging.StreamHandler()):
            handler.setLevel(cls._log_level)
            handler.setFormatter(cls._formatter)
            setattr(handler, cls._HANDLER_TAG, True)
            root.addHandler(handler)

        cls._is_configured = True

class Logger(LoggerConfig):
    """Named logger that makes sure the root logger is configured first.

    Attribute access that is not resolved on this object (``info``,
    ``warning``, ``error``, ...) is forwarded to the wrapped
    ``logging.Logger``.
    """

    def __init__(self, name: str):
        """Configure the root logger if needed and wrap ``logging.getLogger(name)``."""
        self.configure_root_logger()
        self.logger = logging.getLogger(name)

    def __getattr__(self, name):
        """Forward unknown attributes to the wrapped ``logging.Logger``."""
        # __getattr__ only runs after normal lookup has failed. If the missing
        # attribute is ``logger`` itself (instance built without __init__, e.g.
        # during copy/unpickling), reading ``self.logger`` below would re-enter
        # this method forever and end in RecursionError.
        if name == "logger":
            raise AttributeError(f"{type(self).__name__!r} object has no attribute 'logger'")
        return getattr(self.logger, name)

In [4]:
# Notebook-wide logger; creating it also runs the one-time root logger setup.
logger = Logger(name="api-call")

TODO
- Perform Initial Load from API into a database
- Perform Incremental Load from API into a database

Initial Load
- How to fetch using the url? - Work in progress
- Where to dump the data initially? (S3) - Yes
- Do you need to dump the data? - Yes: Avoid re-fetching from API if pipeline fails

In [5]:
def fetch_crime_data(delta: int, pagesize: int = 50000, max_pages: int = 14, timeout: float = 60):
    """Yield pages of crime records updated within the last ``delta`` days.

    Posts a paged query to the City of Chicago crimes endpoint and yields the
    decoded JSON body of each page until an empty page is returned, a non-200
    response is received, or ``max_pages`` pages have been yielded.

    Args:
        delta: Look-back window in days, measured from ``datetime.now()``.
        pagesize: Number of records requested per page.
        max_pages: Maximum number of pages to yield; ``None`` removes the cap.
            The default keeps the previous hard-coded limit (14 pages
            yielded) without issuing the extra request that used to be
            fetched and thrown away.
        timeout: Seconds passed to ``requests.post`` so a stalled connection
            raises instead of hanging indefinitely.

    Yields:
        The decoded JSON payload of each non-empty page (presumably a list of
        record dicts - confirm against the API response).

    TODO
    - Implement retries if failed to connect to api server
    """

    # For Backfill
    # min_update = f"SELECT min(updated_on)"
    # max_update = f"SELECT max(updated_on)"

    # isoformat() omits the fractional part when microsecond == 0, so trimming
    # with [:-3] would cut the seconds off; timespec always gives milliseconds.
    # NOTE(review): datetime.now() is naive local time - confirm that
    # ``updated_on`` is compared in the same time zone.
    last_update = (datetime.now() - timedelta(days=delta)).isoformat(timespec='milliseconds')

    # Implementing CDC, batch incremental loads only
    query = f"SELECT * WHERE updated_on >= '{last_update}'"

    url = "https://data.cityofchicago.org/api/v3/views/crimes/query.json"
    headers = {
        'X-App-Token' : os.getenv('APP_TOKEN')
    }

    log = logging.getLogger(__name__)

    # Need to add retry logic
    pagenum = 1
    while max_pages is None or pagenum <= max_pages:
        body = {
            'query' : query,
            'page' : {
                'pageNumber' : pagenum,
                'pageSize' : pagesize
            },
            "includeSynthetic": False
        }

        res = requests.post(
            url,
            json=body,
            headers=headers,
            timeout=timeout
        )

        if res.status_code != 200:
            # Stop paging as before, but leave a trace: the caller otherwise
            # cannot tell a failed request from the natural end of the data.
            log.warning("Stopping fetch: page %d returned HTTP %d", pagenum, res.status_code)
            break

        # Decode once; large pages are expensive to parse.
        page = res.json()
        if not page:
            break

        yield page
        pagenum += 1

In [6]:
def get_conn():
    """Placeholder for the Snowflake connection.

    Not implemented yet: there is no connection logic, so the call
    returns ``None``.
    """
    return None

def create_log_table():
    """Placeholder for creating the ``pipeline_logs`` table.

    Only the DDL statement is prepared; nothing is executed yet and the
    function returns ``None``.
    """
    # DDL for the log table; not sent anywhere yet.
    ddl = """CREATE TABLE IF NOT EXISTS pipeline_logs (
                id INT AUTOINCREMENT PRIMARY KEY
                , ingested_at TIMESTAMP
                , status VARCHAR(10)
                , batch_count INT
                , batch_size INT
                , file_location TEXT
            )"""
    return None

In [None]:
# Landing zone for data fetched from the API
#   1. Serialize - compress each batch and save it under a specific file name
#   2. Connect to the S3 bucket and upload the staged files
#   3. Purge the local staging folder

# local path example:
# ./tmp/metadata.json
# ./tmp/part-0001.json.gz
# ./tmp/part-0002.json.gz

# S3 path example:
# Structure s3://<bucket>/<key>
# Might also add metadata for each ingest=(datetime)
# s3://     open-crime-etl/     raw/ingest=(datetime1)/part-0001.json.gz
# s3://     open-crime-etl/     raw/ingest=(datetime1)/part-0002.json.gz
# s3://     open-crime-etl/     raw/ingest=(datetime2)/part-0001.json.gz
# s3://     open-crime-etl/     raw/ingest=(datetime2)/part-0002.json.gz

BUCKET_NAME = 'open-crime-etl'
PAGESIZE = 500
DELTA = 29

# Evaluate the ingest date once so the S3 prefix and the log line cannot
# disagree when the run straddles midnight.
ingest_date = datetime.now().strftime('%Y-%m-%d')

tmp = Path('./tmp')
s3_key = Path(f"raw/ingest={ingest_date}")

client = boto3.client('s3')

logger.info(f"Initiating pipeline: ingest={ingest_date}")

# Only report the folder as created when it really was missing.
if not tmp.exists():
    tmp.mkdir(parents=True, exist_ok=True)
    logger.info("tmp folder missing. Created 'tmp'")

logger.info("Fetching data from API")
# Files written by THIS run. Anything else found in tmp is a leftover from an
# earlier run and must not be uploaded under today's ingest prefix.
staged_files = []
# Iterates over each batch yielded by fetch_crime_data
for i, data in enumerate(fetch_crime_data(delta=DELTA, pagesize=PAGESIZE), start=1):
    filename = f"part-{i:04}.json.gz"
    filepath = tmp / filename

    # Staging batch in local storage for upload
    with gzip.open(filepath, 'wt', encoding='utf-8') as f:
        json.dump(data, f)
    staged_files.append(filepath)
    logger.info(f"Saved file to: {filepath.as_posix()}")


logger.info(f"Uploading to s3://{BUCKET_NAME}")
for file in staged_files:
    # Path.name is separator-agnostic, unlike splitting str(path) on "/".
    filename = file.name
    object_key = (s3_key / filename).as_posix()

    # Upload to s3
    client.upload_file(Filename=file.as_posix(), Bucket=BUCKET_NAME, Key=object_key)
    logger.info(f"Uploaded file to: s3://{BUCKET_NAME}/{object_key}")

# Update Metadata in Snowflake
#  - Check if log table exists, else create
#  - Insert metadata into log table
#  - Attributes: ingested_at, records, total_batch, batch_size, status, s3_location

# Purge the temp folder
if tmp.exists():
    logger.info("Clearing tmp")
    for item in tmp.iterdir():
        # unlink() raises on directories; rmtree below removes those.
        if item.is_file():
            item.unlink()
            logger.info(f"Deleted file: {item}")
    shutil.rmtree(tmp)

logger.info("Pipeline terminated")

[2025-07-22 20:35:20,285]  credentials.py:1216 - INFO - Found credentials in environment variables.
[2025-07-22 20:35:20,530]  924352149.py:28 - INFO - Initiating pipeline: ingest=2025-07-22
[2025-07-22 20:35:20,531]  924352149.py:29 - INFO - tmp folder missing. Created 'tmp'
[2025-07-22 20:35:20,531]  924352149.py:31 - INFO - Fetching data from API
[2025-07-22 20:35:24,131]  924352149.py:40 - INFO - Saved file to: tmp/part-0001.json.gz
[2025-07-22 20:35:24,857]  924352149.py:40 - INFO - Saved file to: tmp/part-0002.json.gz
[2025-07-22 20:35:25,261]  924352149.py:40 - INFO - Saved file to: tmp/part-0003.json.gz
[2025-07-22 20:35:25,694]  924352149.py:40 - INFO - Saved file to: tmp/part-0004.json.gz
[2025-07-22 20:35:26,098]  924352149.py:40 - INFO - Saved file to: tmp/part-0005.json.gz
[2025-07-22 20:35:26,455]  924352149.py:40 - INFO - Saved file to: tmp/part-0006.json.gz
[2025-07-22 20:35:26,856]  924352149.py:40 - INFO - Saved file to: tmp/part-0007.json.gz
[2025-07-22 20:35:27,216]

In [13]:
# Peek at the name of the first entry found in the staging folder.
for i in tmp.iterdir():
    # Path.name works with any path separator; str(i).split("/") does not.
    filename = i.name
    print(filename)
    break

part-0007.json.gz


In [None]:
import yaml

# Optional pipeline settings: ``configs`` stays None when ./config.yml is absent.
config_file = Path('./config.yml')
configs = None
if config_file.exists():
    # Read as UTF-8 explicitly; a bare open() uses the platform's locale
    # encoding, which can mis-decode non-ASCII values in the file.
    with open(config_file, 'r', encoding='utf-8') as f:
        configs = yaml.safe_load(f)


<class 'str'>
