In [1]:
from sqlalchemy import MetaData, Table, Column, String, Integer, Float, Date, DateTime, func, select
from sqlalchemy.engine.base import Engine
from common.connect import create_postgres_conn, create_aws_conn
from db.tables import create_date_table, create_crime_table, create_log_table
from utils.helper import generate_date_range
from dotenv import load_dotenv
import os

# Load variables from the project's .env file into the process environment so
# connection settings and API tokens can be read with os.getenv().
load_dotenv()

# Postgres connection settings. Every value can be overridden through the
# environment / .env file; the fallbacks are the local development values,
# so the notebook behaves exactly as before when nothing is set.
# NOTE(review): keep non-development credentials in .env only - never commit them.
db_params = {
    "host": os.getenv("DB_HOST", "localhost"),
    "port": os.getenv("DB_PORT", "5433"),
    "username": os.getenv("DB_USERNAME", "admin"),
    "password": os.getenv("DB_PASSWORD", "admin"),
    "db": os.getenv("DB_NAME", "mydb"),
}

# Engine used by the database cells below.
engine = create_postgres_conn(**db_params)

In [3]:
# Reflect whatever tables currently exist in the database into a fresh
# MetaData object and print their names.
meta = MetaData()
meta.reflect(bind=engine)
print(meta.tables.keys())

# Destructive reset, intentionally left disabled: uncomment to drop every
# reflected table and print what remains.
# meta.drop_all(bind=engine, checkfirst=True)
# print(meta.tables.keys())

dict_keys([])


In [8]:
# Inspect the Python type of the latest ingestion timestamp in the pipeline log.
# `meta` may have been reflected before the tables were created (it was empty
# at first reflection), so refresh it and check for the table explicitly
# instead of failing with a bare KeyError.
meta.reflect(bind=engine)
log_table = meta.tables.get('pipeline_logs')

if log_table is None:
    print("Table 'pipeline_logs' not found - run the table-creation cell first")
else:
    with engine.begin() as conn:
        st = select(func.max(log_table.c.ingested_at))
        result = conn.execute(st)
        # scalar() yields the single MAX value (None when the table has no rows).
        print(type(result.scalar()))

<class 'datetime.date'>


In [4]:
# Tables this notebook creates itself, in creation order:
#   1. log table, 2. crime, 3. dim_date.
# police_stations and ward_offices are deliberately absent:
# they WILL BE HANDLED IN DBT.
log_table, crime_table, date_table = (
    create_log_table(engine=engine),    # Log table
    create_crime_table(engine=engine),  # crime
    create_date_table(engine=engine),   # dim_date
)


In [6]:
# Reflect again so `meta` picks up the tables created in the previous cell,
# then show the table names as the cell result.
meta.reflect(bind=engine)
meta.tables.keys()

dict_keys(['pipeline_logs', 'crime', 'date'])

In [None]:
import requests
import gzip
import json
from pathlib import Path
from datetime import datetime

def save_to_path(save_to: str, date: "str | datetime", pagenum: int, data: dict) -> None:
    """Write one page of API results to a gzip-compressed JSON file.

    The file is staged at ``<save_to>/<YYYY>/<MM>/part-<NNNN>.json.gz``
    (the tmp/year/month/part layout), where year and month come from ``date``
    and ``NNNN`` is the zero-padded page number. An existing file with the
    same name is overwritten.

    Args:
        save_to: Root staging directory, e.g. ``./tmp``.
        date: Selects the year/month partition. Either a ``datetime`` or a
            string formatted as ``%Y-%m-%dT%H:%M:%S.%f``.
        pagenum: Page number used in the file name.
        data: JSON-serialisable payload to store.

    Raises:
        ValueError: If ``date`` is a string that does not match the format.
    """
    if isinstance(date, datetime):
        parsed_date = date
    else:
        parsed_date = datetime.strptime(date, '%Y-%m-%dT%H:%M:%S.%f')

    # <root>/<year>/<zero-padded month>, e.g. ./tmp/2024/01
    partition_dir = Path(save_to) / str(parsed_date.year) / f"{parsed_date.month:02}"

    # exist_ok=True already tolerates an existing directory, so no separate
    # exists() check is needed.
    partition_dir.mkdir(parents=True, exist_ok=True)

    filepath = partition_dir / f"part-{pagenum:04}.json.gz"

    # Staging batch in local storage for upload. The encoding is explicit so
    # the file does not depend on the platform's default text encoding.
    with gzip.open(filepath, 'wt', encoding='utf-8') as f:
        json.dump(data, f)

def fetch_data_api(start_date, end_date, pagesize: int, save_path: str,
                   max_pages: int = 50, timeout: float = 60.0) -> int:
    """Download every page of records updated in a date window and stage them locally.

    Posts a paged query (``updated_on BETWEEN start_date AND end_date``) to the
    City of Chicago crimes endpoint and hands each non-empty page to
    ``save_to_path``. Paging stops at the first empty page.

    Args:
        start_date: Lower bound of the window. Interpolated into the query and
            also passed to ``save_to_path`` to choose the output partition.
        end_date: Upper bound of the window. Interpolated into the query.
        pagesize: Number of records requested per page.
        save_path: Root directory handed to ``save_to_path``.
        max_pages: Safety cap. Reaching this page number while the API still
            returns data raises instead of looping indefinitely.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        The number of pages that were saved.

    Raises:
        RuntimeError: On a non-200 response, or when the page cap is reached
            while data is still being returned.

    TODO
    - Two modes: full or incremental
    - Default Date: 2024-01-01
    - Methodology: query month by month i.e relativedelta(months=1)
    """

    query = f"SELECT * WHERE updated_on BETWEEN '{start_date}' AND '{end_date}'"
    url = "https://data.cityofchicago.org/api/v3/views/crimes/query.json"
    headers = {'X-App-Token': os.getenv('APP_TOKEN')}

    pagenum = 1
    while True:
        body = {
            'query': query,
            'page': {
                'pageNumber': pagenum,
                'pageSize': pagesize
            },
            "includeSynthetic": True
        }
        # Without a timeout a stalled connection would block the run forever.
        res = requests.post(url, json=body, headers=headers, timeout=timeout)

        if res.status_code != 200:
            raise RuntimeError(f"API returned status {res.status_code} at page {pagenum}")

        # Decode the body once and reuse it for both the check and the save.
        payload = res.json()

        # An empty page marks the end of the data. This is checked before the
        # page cap so a result set that ends exactly at the cap returns
        # normally instead of being reported as an error.
        if payload == []:
            return pagenum - 1

        if pagenum >= max_pages:
            raise RuntimeError(f"Reached page limit {max_pages}, stopping to prevent infinite loop")

        save_to_path(save_to=save_path, date=start_date, pagenum=pagenum, data=payload)

        pagenum += 1

In [46]:
from datetime import datetime

# Backfill window handed to generate_date_range, which splits it into the
# sub-ranges fetched below.
# NOTE(review): the staged output suggests one sub-range per month - confirm
# against utils.helper.generate_date_range.
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 3, 1)
date_range = generate_date_range(start_date=start_date, end_date=end_date)

# Fetch and stage each sub-range. The keys are indexed directly so a malformed
# entry fails here with a KeyError instead of sending a query containing 'None'.
for d in date_range:
    sd = d['start_date']
    ed = d['end_date']
    fetch_data_api(start_date=sd, end_date=ed, pagesize=1000, save_path="./tmp")


In [82]:
source_path = Path("./tmp")
destination_path = Path("raw/")

# Map every staged file <source_path>/<year>/<month>/<file> to its object key
# raw/year=<year>/month=<month>/<file>.
keys = []
for p in source_path.rglob("*.gz"):
    print(p)
    # Take the components relative to source_path instead of splitting the
    # string and dropping the first piece, which only works while source_path
    # is exactly one directory deep.
    year, month, file = p.relative_to(source_path).parts
    key = destination_path / f"year={year}" / f"month={month}" / file
    keys.append(key.as_posix())
keys

tmp/2024/01/part-0002.json.gz
tmp/2024/01/part-0001.json.gz
tmp/2024/02/part-0002.json.gz
tmp/2024/02/part-0001.json.gz


['raw/year=2024/month=01/part-0002.json.gz',
 'raw/year=2024/month=01/part-0001.json.gz',
 'raw/year=2024/month=02/part-0002.json.gz',
 'raw/year=2024/month=02/part-0001.json.gz']

In [None]:
# Need a transformation layer before loading the data into the DB

In [None]:
from sqlalchemy import select, func

# Start from a fresh reflection of the database schema.
meta = MetaData()
meta.reflect(bind=engine)
log_table = meta.tables['pipeline_logs']

# Query to get the file_location of the last ingested run: select the
# file_location whose ingested_at equals the maximum ingested_at in the log.
st1 = select(func.max(log_table.c.ingested_at)).scalar_subquery()
st = select(log_table.c.file_location).where(log_table.c.ingested_at == st1)

with engine.begin() as conn:
    # scalar() keeps only the first column of the first row (None if no rows).
    results = conn.execute(st).scalar()


About the source
- Frequency of update - Daily [based on updated_on attribute]

Incremental
- If the db exists
- Check the last updated_on date and fetch records updated after that date

Full load (work on this first)
- If the db doesn't exist, perform a full load
- Set a base date to fetch records from for the full load, e.g. 2024-01-01
- DAG-based full load (backfill), or full load within a DAG run + incremental


In [None]:
# get data from s3, filter by ingest
# AWS credentials and region come from the environment (.env).
aws_params = dict(
    access_key=os.getenv("AWS_ACCESS_KEY_ID"),
    secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
    region=os.getenv("AWS_REGION"),
)

# NOTE(review): despite its name, `s3_client` is used as a resource-style
# object below (it exposes .Bucket) - confirm what create_aws_conn returns.
s3_client = create_aws_conn(resource='s3', **aws_params)

BUCKET_NAME = 'open-crime-etl'
SOURCE = "raw/"
PREFIX = "year=/month="  # not referenced anywhere in this cell

# We need to know about the last ingested date from the metadata

# Print the key of every object stored under the SOURCE prefix.
bucket = s3_client.Bucket(BUCKET_NAME)
for obj in bucket.objects.filter(Prefix=SOURCE):
    print(obj.key)

# Remaining steps, not implemented yet:
# Uncompress it

# json loads

# Transform

# sqlalchemy insert into postgres


raw/ingest=2025-07-27/part-0001.json.gz
raw/ingest=2025-07-27/part-0002.json.gz
raw/ingest=2025-07-27/part-0003.json.gz
raw/ingest=2025-07-27/part-0004.json.gz
raw/ingest=2025-07-27/part-0005.json.gz
raw/ingest=2025-07-27/part-0006.json.gz
raw/ingest=2025-07-27/part-0007.json.gz
raw/ingest=2025-07-27/part-0008.json.gz
raw/ingest=2025-07-27/part-0009.json.gz
raw/ingest=2025-07-27/part-0010.json.gz
raw/ingest=2025-07-27/part-0011.json.gz
raw/ingest=2025-07-27/part-0012.json.gz
raw/ingest=2025-07-27/part-0013.json.gz
raw/ingest=2025-07-27/part-0014.json.gz
raw/ingest=2025-07-27/part-0015.json.gz
raw/ingest=2025-07-27/part-0016.json.gz
raw/ingest=2025-07-27/part-0017.json.gz
raw/ingest=2025-07-27/part-0018.json.gz
raw/ingest=2025-07-27/part-0019.json.gz
raw/ingest=2025-07-27/part-0020.json.gz
raw/ingest=2025-07-27/part-0021.json.gz
raw/ingest=2025-07-27/part-0022.json.gz
