In [1]:
from dotenv import load_dotenv
load_dotenv(dotenv_path='../.env')

import requests
import os
from datetime import datetime, timedelta, timezone
from pathlib import Path
import boto3
import shutil
import gzip
import json

In [2]:
import logging

class LoggerConfig:
    """
    Shared, one-time configuration of the root logger.

    Class attributes hold the settings (log directory, level, formatter);
    ``configure_root_logger`` applies them once by attaching a file handler
    and a console handler to the root logger.
    """

    # True once the root logger has been configured; always stored on
    # LoggerConfig itself so every subclass sees the same flag.
    _is_configured = False
    # Directory that receives one timestamped log file per configuration.
    _log_path = Path('./logs')
    # Level applied to the root logger and to both handlers.
    _log_level = logging.INFO
    _formatter = logging.Formatter('[%(asctime)s]  %(filename)s:%(lineno)d - %(levelname)s - %(message)s')

    @classmethod
    def set_log_path(cls, path: str):
        """Set the directory log files are written to (takes effect on the next configuration)."""
        cls._log_path = Path(path)

    @classmethod
    def set_formatter(cls, format: logging.Formatter):
        """Set the formatter used by both handlers (takes effect on the next configuration)."""
        cls._formatter = format

    @classmethod
    def configure_root_logger(cls):
        """
        Attach a file handler and a console handler to the root logger.

        Does nothing if the root logger was already configured through this
        class or any of its subclasses.
        """
        if cls._is_configured:
            return

        # Root Logger
        logger = logging.getLogger()
        logger.setLevel(cls._log_level)

        cls._log_path.mkdir(parents=True, exist_ok=True)
        filepath = cls._log_path / f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log"

        # File Logger
        filehandler = logging.FileHandler(filepath, encoding='utf-8')
        filehandler.setLevel(cls._log_level)
        filehandler.setFormatter(cls._formatter)

        # Console Logger
        streamhandler = logging.StreamHandler()
        streamhandler.setLevel(cls._log_level)
        streamhandler.setFormatter(cls._formatter)

        # Add handlers
        logger.addHandler(filehandler)
        logger.addHandler(streamhandler)

        # Record the flag on the base class, not on ``cls``: writing it to a
        # subclass would leave LoggerConfig (and sibling subclasses) unaware
        # and they would attach a second pair of handlers.
        LoggerConfig._is_configured = True

class Logger(LoggerConfig):
    """
    Thin wrapper around a named ``logging.Logger``.

    Creating an instance configures the root logger (first time only) and
    then forwards every unknown attribute access to the wrapped logger, so
    ``Logger("x").info(...)`` behaves like ``logging.getLogger("x").info(...)``.
    """

    def __init__(self, name: str):
        self.configure_root_logger()
        self.logger = logging.getLogger(name)

    def __getattr__(self, name):
        # __getattr__ only runs when normal lookup fails. If ``logger`` itself
        # is missing (instance created without __init__, e.g. by copy/pickle),
        # delegating to ``self.logger`` would recurse until RecursionError.
        if name == "logger":
            raise AttributeError(name)
        return getattr(self.logger, name)

In [3]:
# Notebook-wide logger; constructing Logger also configures the root logger on first use.
logger = Logger("api-call")

TODO
- Perform Initial Load from API into a database
- Perform Incremental Load from API into a database

Initial Load
- How to fetch using the url? - Work in progress
- Where to dump the data initially? (S3) - Yes
- Do you need to dump the data? - Yes: Avoid re-fetching from API if pipeline fails

In [4]:
def fetch_crime_data(pagesize: int = 50000, max_pages: int = 15, timeout: float = 120):
    """
    Fetch crime records from the City of Chicago query API, page by page.

    Requests rows whose ``updated_on`` lies between a fixed start date
    (2025-01-01T01:01:01.001) and the current local time.

    Args:
        pagesize: Number of rows requested per page.
        max_pages: Upper bound on the number of pages requested in one call.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A flat list with the rows of every page fetched. Fetching stops at the
        first empty page or after ``max_pages`` pages, whichever comes first.

    Raises:
        requests.HTTPError: If the API answers with a status other than 200.
    """

    # TODO: backfill mode (derive the window from min/max updated_on) and
    # incremental/CDC mode (updated_on >= last successful run).

    # timespec pins millisecond precision. Slicing isoformat() instead would
    # cut into the seconds whenever the microsecond component is exactly 0.
    start_date = datetime(2025, 1, 1, 1, 1, 1, 1000).isoformat(timespec='milliseconds')
    last_update = datetime.now().isoformat(timespec='milliseconds')

    query = f"SELECT * WHERE updated_on BETWEEN '{start_date}' AND '{last_update}'"

    url = "https://data.cityofchicago.org/api/v3/views/crimes/query.json"
    headers = {
        'X-App-Token' : os.getenv('APP_TOKEN')
    }

    # TODO: add retry logic
    results = []
    for pagenum in range(1, max_pages + 1):
        print(pagenum)
        body = {
            'query' : query,
            'page' : {
                'pageNumber' : pagenum,
                'pageSize' : pagesize
            },
            "includeSynthetic": True
        }

        res = requests.post(
            url,
            json=body,
            headers=headers,
            timeout=timeout
        )

        # Fail loudly: an error payload must never be merged into the results.
        if res.status_code != 200:
            raise requests.HTTPError(
                f"Crime API returned status {res.status_code} for page {pagenum}: {res.text[:500]}",
                response=res
            )

        # Parse the body once per page.
        page = res.json()
        if not page:
            break

        results.extend(page)

    # Always hand back what was collected, including when the page cap is hit.
    return results

In [5]:
# Fetch with the default page size; the page number is printed as each page is requested.
results = fetch_crime_data()

1
2
3
4
5
6
7
8


In [6]:
import pandas as pd
# Load the fetched records into a DataFrame and preview the first rows.
df = pd.DataFrame(results)
df.head()

Unnamed: 0,id,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,...,:updated_at,:@computed_region_awaf_s7ux,:@computed_region_6mkv_f3dw,:@computed_region_vrxf_vc4k,:@computed_region_bdys_3d7i,:@computed_region_43wa_7qmu,:@computed_region_rpca_8um6,:@computed_region_d9mm_jgwp,:@computed_region_d3ds_rm58,:@computed_region_8hcu_yrd4
0,13908594,JJ343629,2025-07-22T00:00:00.000,042XX S WELLS ST,2010,NARCOTICS,MANUFACTURE / DELIVER - AMPHETAMINES,GAS STATION,True,False,...,2025-07-30T11:29:58.693Z,12,14924,3,189,9,37,23,106,8
1,13908610,JJ343637,2025-07-22T00:00:00.000,075XX S ST LAWRENCE AVE,051A,ASSAULT,AGGRAVATED - HANDGUN,SIDEWALK,False,True,...,2025-07-30T11:29:58.693Z,31,21546,67,496,32,61,20,227,4
2,13909013,JJ343892,2025-07-22T00:00:00.000,006XX W 70TH ST,1320,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,...,2025-07-30T11:29:58.693Z,31,21559,66,479,32,11,17,214,4
3,13909047,JJ344016,2025-07-22T00:00:00.000,012XX W GRAND AVE,0910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,...,2025-07-30T11:29:58.693Z,41,22620,25,521,46,49,15,76,27
4,13909372,JJ344733,2025-07-22T00:00:00.000,010XX W ADDISON ST,1330,CRIMINAL TRESPASS,TO LAND,SPORTS ARENA / STADIUM,True,False,...,2025-07-30T11:29:58.693Z,38,21186,57,691,25,53,5,12,44


In [8]:
# Field names present on the first fetched record.
list(results[0].keys())

['id',
 'case_number',
 'date',
 'block',
 'iucr',
 'primary_type',
 'description',
 'location_description',
 'arrest',
 'domestic',
 'beat',
 'district',
 'ward',
 'community_area',
 'fbi_code',
 'x_coordinate',
 'y_coordinate',
 'year',
 'updated_on',
 'latitude',
 'longitude',
 'location',
 ':id',
 ':version',
 ':created_at',
 ':updated_at',
 ':@computed_region_awaf_s7ux',
 ':@computed_region_6mkv_f3dw',
 ':@computed_region_vrxf_vc4k',
 ':@computed_region_bdys_3d7i',
 ':@computed_region_43wa_7qmu',
 ':@computed_region_rpca_8um6',
 ':@computed_region_d9mm_jgwp',
 ':@computed_region_d3ds_rm58',
 ':@computed_region_8hcu_yrd4']

In [9]:
import pandas as pd

# Compare the dataset's own id/timestamps with the ':'-prefixed fields side by side.
df = pd.DataFrame(results)
df[[
    'id',
    'date',             # Date when the crime occurred; crimes from the last 7 days won't be listed
    'updated_on',       # Records keep getting updated daily
    ':id',
    ':version',
    ':created_at', 
    ':updated_at'
    ]].head()

Unnamed: 0,id,date,updated_on,:id,:version,:created_at,:updated_at
0,13908594,2025-07-22T00:00:00.000,2025-07-29T15:42:06.000,row-zg3e-sfsb_x6zu,rv-a4cb~7x3n.ugei,2025-07-30T11:29:50.817Z,2025-07-30T11:29:58.693Z
1,13908610,2025-07-22T00:00:00.000,2025-07-29T15:42:06.000,row-n6e5~b5gq~ydgm,rv-d9w5_nfxg.mzcm,2025-07-30T11:29:50.817Z,2025-07-30T11:29:58.693Z
2,13909013,2025-07-22T00:00:00.000,2025-07-29T15:42:06.000,row-gdzi-bfu6-nghj,rv-q37s~pwx5-9tux,2025-07-30T11:29:50.817Z,2025-07-30T11:29:58.693Z
3,13909047,2025-07-22T00:00:00.000,2025-07-29T15:42:06.000,row-d662.zmwb-cun3,rv-6jqh~4yr8-qgbk,2025-07-30T11:29:50.817Z,2025-07-30T11:29:58.693Z
4,13909372,2025-07-22T00:00:00.000,2025-07-29T15:42:06.000,row-cqcx_gair~xh6j,rv-za4j_2abr_njkn,2025-07-30T11:29:50.817Z,2025-07-30T11:29:58.693Z


In [None]:
from snowflake.sqlalchemy import URL

from sqlalchemy import create_engine, MetaData, Table
from sqlalchemy.engine.url import URL as postgresURL

# Snowflake connection settings consumed by create_snowflake_conn().
# Only the password comes from the environment; the other values are hard-coded here.
DB_ACCOUNT = 'QYKNAZY-GD18580'
DB_NAME = 'OPEN_CRIME_ETL'
DB_SCHEMA = 'PUBLIC'
DB_ROLE = 'ACCOUNTADMIN'
DB_WH = 'COMPUTE_WH'
DB_USER = 'hyderreza'
# None when SNOWFLAKE_DB_PASSWORD is not set in the environment.
DB_PASS = os.getenv("SNOWFLAKE_DB_PASSWORD")

def create_postgres_conn():
    """
    Secondary DB to create a local copy when snowflake subscription expires

    Connection settings are read from the environment so credentials do not
    have to live in the notebook; each falls back to the local development
    default when its variable is unset:

        POSTGRES_HOST      (default 'localhost')
        POSTGRES_PORT      (default 5432)
        POSTGRES_USER      (default 'admin')
        POSTGRES_PASSWORD  (default 'admin')
        POSTGRES_DB        (default 'mydb')

    Returns:
        The engine produced by ``create_engine`` for the assembled URL.
    """

    url = postgresURL.create(
        drivername='postgresql',
        host=os.getenv('POSTGRES_HOST', 'localhost'),
        port=int(os.getenv('POSTGRES_PORT', '5432')),
        username=os.getenv('POSTGRES_USER', 'admin'),
        password=os.getenv('POSTGRES_PASSWORD', 'admin'),
        database=os.getenv('POSTGRES_DB', 'mydb')
    )

    return create_engine(url)

def create_snowflake_conn():
    """
    Build an engine for Snowflake from the module-level DB_* settings.

    Returns:
        The engine produced by ``create_engine`` for the assembled Snowflake URL.
    """
    # Collect the connection parameters first, then expand them into the URL helper.
    settings = {
        'account': DB_ACCOUNT,
        'user': DB_USER,
        'password': DB_PASS,
        'database': DB_NAME,
        'schema': DB_SCHEMA,
        'role': DB_ROLE,
        'warehouse': DB_WH,
    }
    snowflake_url = URL(**settings)
    engine = create_engine(snowflake_url)
    return engine

def drop_table(engine, table: Table):
    """Drop ``table`` through ``engine``; ``checkfirst=True`` asks for an existence check before the drop is issued."""
    table.drop(bind=engine, checkfirst=True)


: 

In [None]:
from sqlalchemy import Table, Column, String, Integer, Text, TIMESTAMP, text, TIME, Date
from sqlalchemy import insert, update

def create_log_table(engine):
    """
    Define the ``pipeline_logs`` table and create it through ``engine``.

    ``create_all`` is called with ``checkfirst=True``.

    Returns:
        The ``Table`` object describing ``pipeline_logs``.
    """
    registry = MetaData()

    # One row per pipeline run; run_id is the auto-incrementing primary key.
    columns = [
        Column("run_id", Integer, primary_key=True, autoincrement=True),
        Column("ingested_at", Date),
        Column("start_time", TIME),
        Column("end_time", TIME),
        Column("status", String(10)),
        Column("batch_count", Integer),
        Column("batch_size", Integer),
        Column("config", Text),
        Column("file_location", Text),
    ]
    pipeline_logs = Table("pipeline_logs", registry, *columns)

    registry.create_all(engine, checkfirst=True)
    return pipeline_logs

# Other tables to create - crime, ward, holidays, dim_date

def initialize_run_log(engine, logs_table, config: dict):
    """
    Insert a new ``RUNNING`` row into the run-log table and return its id.

    Args:
        engine: Engine used to open the transaction.
        logs_table: Table to insert into; must expose a ``run_id`` column.
        config: Pipeline configuration. ``config['batchsize']`` (if present)
            is stored as ``batch_size``; the whole dict is stored as its
            ``str()`` representation.

    Returns:
        The ``run_id`` of the inserted row.
    """
    # Take one timestamp so the date and the time always describe the same
    # instant (two separate now() calls can straddle midnight).
    started = datetime.now(timezone.utc)

    with engine.begin() as conn:
        st = insert(logs_table).values(
            # Bind native date/time objects: that is what Date/TIME columns
            # expect, whereas pre-formatted strings only work on some drivers.
            ingested_at = started.date(),
            start_time = started.time().replace(microsecond=0),
            status = 'RUNNING',
            batch_size = config.get('batchsize'),
            config = str(config)
        ).returning(logs_table.c.run_id)

        result = conn.execute(st)
        run_id = result.scalar()
        return run_id

def update_run_log(engine, logs_table, run_id, batch_count, file_location, status: str = "SUCCESS"):
    """
    Close out a run-log row: stamp the end time and record the outcome.

    Args:
        engine: Engine used to open the transaction.
        logs_table: Run-log table; must expose a ``run_id`` column.
        run_id: Id of the row to update.
        batch_count: Number of batches processed by the run.
        file_location: Where the run's output was written.
        status: Final status to store. Defaults to "SUCCESS"; pass another
            value to record a run that did not complete.
    """
    # Bind a native time object (second precision) instead of a formatted
    # string, which is what a TIME column expects.
    finished = datetime.now(timezone.utc).time().replace(microsecond=0)

    with engine.begin() as conn:
        st = update(logs_table).where(logs_table.c.run_id == run_id).values(
            end_time = finished,
            status = status,
            batch_count = batch_count,
            file_location = file_location
        )

        conn.execute(st)

# Use the local Postgres engine and make sure the pipeline_logs table exists.
engine = create_postgres_conn()
log_table = create_log_table(engine)

: 

In [None]:
# log_table.drop(engine,checkfirst=True)
# Reflect the tables visible through `engine` and list their names.
# NOTE(review): the recorded output is empty while the inspector cell reports
# 'pipeline_logs' — possibly the cells were run out of order; confirm on a fresh kernel.
meta = MetaData()
meta.reflect(engine)
meta.tables.keys()

dict_keys([])

: 

In [None]:
from sqlalchemy import inspect
# List the table names reported by the database inspector for `engine`.
inspector = inspect(engine)
inspector.get_table_names()

['pipeline_logs']

: 

In [None]:
# Landing zone for fetched data from API
# Serialize - compress and save with a specific file name
# Connect to s3 bucket and upload to bucket

# local path example:
# ./tmp/metadata.json
# ./tmp/part-0001.json.gz
# ./tmp/part-0002.json.gz

# S3 path example:
# Structure s3://<bucket>/<key> 
# Might also add metadata for each ingest=(datetime)
# s3://     open-crime-etl/     raw/ingest=(datetime1)/part-0001.json.gz
# s3://     open-crime-etl/     raw/ingest=(datetime1)/part-0002.json.gz
# s3://     open-crime-etl/     raw/ingest=(datetime2)/part-0001.json.gz
# s3://     open-crime-etl/     raw/ingest=(datetime2)/part-0002.json.gz

# Pipeline start => Init Log(ingested_at, start_time, status, config) -> Fetch API data -> Load to S3 -> Load to Snowflake -> Update Log(end_time, status, batch_count, batch_size, config, file_location)
# End-to-end run: open a run-log row, stage each fetched batch as a gzipped
# JSON file under ./tmp, upload the staged files to S3, purge ./tmp, then
# close the run-log row.
config = {
      'batchsize': 1000,
      'delta': 29
}

tmp = Path('./tmp')
# Resolve the ingest date once so the S3 prefix and the log line cannot
# disagree if the run crosses midnight.
ingest_date = datetime.now().strftime('%Y-%m-%d')
s3_key = Path(f"raw/ingest={ingest_date}")
tmp.mkdir(parents=True, exist_ok=True)

BUCKET_NAME = 'open-crime-etl'
PAGESIZE = config.get('batchsize', 5000)
DELTA = config.get('delta', 14)
client = boto3.client('s3')


logger.info(f"Initiating pipeline: ingest={ingest_date}")

# Init Log(ingested_at, start_time, status, config) and get run_id
run_id = initialize_run_log(engine=engine, logs_table=log_table, config=config)

# Check if REQUIRED tables are in place - [crime, holidays, etc]

logger.info(f"Fetching data from API")

count = 0
# Iterates over each batch yielded by fetch_crime_data
# NOTE(review): this loop expects fetch_crime_data to accept `delta` and to
# yield one list of records per page — confirm the definition in scope matches.
for i, data in enumerate(fetch_crime_data(delta=DELTA,pagesize=PAGESIZE)):
    filename = f"part-{i+1:04}.json.gz"
    filepath = tmp / filename

    # Staging batch in local storage for upload
    with gzip.open(filepath,'wt') as f:
        json.dump(data, f)
    logger.info(f"Saved file to: {filepath.as_posix()}")
    
    count+=1

logger.info(f"Uploading to s3://{BUCKET_NAME}")
# sorted() gives a stable part-0001, part-0002, ... upload order
for file in sorted(tmp.iterdir()):
    # Path.name is separator-agnostic; splitting str(file) on "/" is not.
    filename = file.name

    # Upload to s3
    client.upload_file(Filename=file.as_posix(), Bucket=BUCKET_NAME, Key=f"{(s3_key / filename).as_posix()}")
    logger.info(f"Uploaded file to: s3://{BUCKET_NAME}/{(s3_key / filename).as_posix()}")

# Check if crime table exists else create one
# Fetch Records from s3 or just use what you have locally stored to load it into DB

# Purge the temp folder
if tmp.exists():
    logger.info("Clearing tmp")
    for item in tmp.iterdir():
        item.unlink()
        logger.info(f"Deleted file: {item}")
    shutil.rmtree(tmp)

# Update metadata
update_run_log(engine=engine, logs_table=log_table, run_id=run_id, batch_count=count, file_location=f"s3://{BUCKET_NAME}/{s3_key.as_posix()}")

logger.info("Pipeline terminated") 

[2025-07-24 19:06:16,866]  3742988564.py:34 - INFO - Initiating pipeline: ingest=2025-07-24


[2025-07-24 19:06:16,873]  3742988564.py:39 - INFO - Fetching data from API
[2025-07-24 19:06:17,360]  3742988564.py:50 - INFO - Saved file to: tmp/part-0001.json.gz
[2025-07-24 19:06:17,828]  3742988564.py:50 - INFO - Saved file to: tmp/part-0002.json.gz
[2025-07-24 19:06:18,362]  3742988564.py:50 - INFO - Saved file to: tmp/part-0003.json.gz
[2025-07-24 19:06:18,813]  3742988564.py:50 - INFO - Saved file to: tmp/part-0004.json.gz
[2025-07-24 19:06:19,313]  3742988564.py:50 - INFO - Saved file to: tmp/part-0005.json.gz
[2025-07-24 19:06:19,781]  3742988564.py:50 - INFO - Saved file to: tmp/part-0006.json.gz
[2025-07-24 19:06:20,379]  3742988564.py:50 - INFO - Saved file to: tmp/part-0007.json.gz
[2025-07-24 19:06:20,896]  3742988564.py:50 - INFO - Saved file to: tmp/part-0008.json.gz
[2025-07-24 19:06:21,422]  3742988564.py:50 - INFO - Saved file to: tmp/part-0009.json.gz
[2025-07-24 19:06:21,945]  3742988564.py:50 - INFO - Saved file to: tmp/part-0010.json.gz
[2025-07-24 19:06:22,398

: 