In [100]:
from sqlalchemy import MetaData, Table, Column, String, Integer, Float, Date, DateTime
from sqlalchemy.engine.base import Engine
from common.connect import create_postgres_conn, create_aws_conn
from db.tables import create_date_table, create_crime_table, create_log_table
from utils.helper import generate_date_range
from dotenv import load_dotenv
import os

load_dotenv()

db_params = {
    "host": 'localhost',
    "port": '5433',
    "username": 'admin',
    "password": 'admin',
    "db": 'mydb',
}
engine = create_postgres_conn(**db_params)

In [3]:
meta = MetaData()
meta.reflect(engine)
print(meta.tables.keys())

# meta.drop_all(bind=engine, checkfirst=True)
# print(meta.tables.keys())

dict_keys(['pipeline_logs', 'crime', 'date'])


In [4]:
# Decide tables we need to create

# Log table
log_table = create_log_table(engine=engine)

# crime
crime_table = create_crime_table(engine=engine)

# police_stations
# : WILL BE HANDLED IN DBT

# ward_offices
# : WILL BE HANDLED IN DBT

# dim_date
date_table = create_date_table(engine=engine)


In [5]:
meta.reflect(engine)
meta.tables.keys()

dict_keys(['pipeline_logs', 'crime', 'date'])

In [None]:
from datetime import datetime
updated_on = False # Where do we get this? Possibly from log table?
start_date = updated_on if updated_on else datetime(2024,1,1) # Either fallback date or updated_on date
end_date = datetime.now()
date_range = generate_date_range(start_date=start_date, end_date=end_date)
date_range

[{'start_date': '2024-01-01T00:00:00.000',
  'end_date': '2024-02-01T00:00:00.000'},
 {'start_date': '2024-02-01T00:00:00.000',
  'end_date': '2024-03-01T00:00:00.000'},
 {'start_date': '2024-03-01T00:00:00.000',
  'end_date': '2024-04-01T00:00:00.000'},
 {'start_date': '2024-04-01T00:00:00.000',
  'end_date': '2024-05-01T00:00:00.000'},
 {'start_date': '2024-05-01T00:00:00.000',
  'end_date': '2024-06-01T00:00:00.000'},
 {'start_date': '2024-06-01T00:00:00.000',
  'end_date': '2024-07-01T00:00:00.000'},
 {'start_date': '2024-07-01T00:00:00.000',
  'end_date': '2024-08-01T00:00:00.000'},
 {'start_date': '2024-08-01T00:00:00.000',
  'end_date': '2024-09-01T00:00:00.000'},
 {'start_date': '2024-09-01T00:00:00.000',
  'end_date': '2024-10-01T00:00:00.000'},
 {'start_date': '2024-10-01T00:00:00.000',
  'end_date': '2024-11-01T00:00:00.000'},
 {'start_date': '2024-11-01T00:00:00.000',
  'end_date': '2024-12-01T00:00:00.000'},
 {'start_date': '2024-12-01T00:00:00.000',
  'end_date': '2025-01

In [None]:
import requests

def fetch_data(start_date, end_date, pagesize):
    query = f"SELECT * WHERE updated_on BETWEEN '{start_date}' AND '{end_date}'"

    url = "https://data.cityofchicago.org/api/v3/views/crimes/query.json"
    headers = {
        'X-App-Token' : os.getenv('APP_TOKEN')
    }

    pagenum = 1
    results = []
    while True:
        print(pagenum)
        body = {
            'query' : query,
            'page' : {
                'pageNumber' : pagenum,
                'pageSize' : pagesize
            },
            "includeSynthetic": True
        }

        res = requests.post(
            url,
            json=body,
            headers=headers
        )

        if res.status_code != 200:
            print("Error status code")
            print(res.json())
            return (pagenum-1, [])

        if res.json() == []:
            return (pagenum-1, results)
        
        if pagenum >= 15:
            print("Infinite loop")
            return (None, None)

        results.append(res.json())
        
        pagenum += 1

In [None]:
from pathlib import Path
import json
import gzip

def save_to_path(path: str, pagenum: int, data) -> None:
    # Iterates over each batch yielded by fetch_crime_data()
    path = Path(path)
    if not path.exists():
        path.mkdir(parents=True, exist_ok=True)

    for p in range(1, pagenum+1):
        filename = f"part-{p:04}.json.gz"
        filepath = path / filename

        # Staging batch in local storage for upload
        with gzip.open(filepath,'wt') as f:
            json.dump(data, f)

In [None]:
# Need to figure how to query the API for full load or incremental (Mapped date_range to handle both)
# should we query the API using a certain date window? I'm not sure what the benefit of this is? (Going with monthly window)

pagesize = 500

batch = []
for dr in date_range:
# set date to default to
    start_date = dr.get('start_date')
    end_date = dr.get('end_date')

    # What are the chances of OOM?
    count, results = fetch_data(start_date=start_date, end_date=end_date)

    # Save path
    save_to_path()

    
    

    

1
2
3
4
5
1
2
3
4
1
2
3
4
1
2
3
1
2
3


In [120]:
final = []
for count, size in batch:
    final.append((count, size/1024))

final

[(5, 14.9296875),
 (4, 9.0859375),
 (4, 9.0234375),
 (3, 6.1796875),
 (3, 4.5859375)]

In [None]:
# Need a transformation layer before loading the data into the DB

In [None]:
from sqlalchemy import select, func

meta = MetaData()
meta.reflect(engine)

log_table = meta.tables['pipeline_logs']

# Query to get the file_location of the last ingested run
with engine.begin() as conn:
    st1 = select(func.max(log_table.c.ingested_at)).scalar_subquery()
    st = select(
            log_table.c.file_location
        ).where(log_table.c.ingested_at == st1)
    results = conn.execute(st).scalar()


About the source
- Frequency of update - Daily [based on updated_on attribute]

Incremental
- if db exists
- Check last updated_on date and fetch records past that date

Full load (work on this first)
- If db doesnt exist, perform full load
- set a base date to fetch records for full load maybe 2024-01-01
- Dag based full load (backfill) or full load within a dag run + incremental


In [None]:
# get data from s3, filter by ingest
aws_params = {
    "access_key" : os.getenv("AWS_ACCESS_KEY_ID"),
    "secret_access_key" : os.getenv("AWS_SECRET_ACCESS_KEY"),
    "region" : os.getenv("AWS_REGION")
}

s3_client = create_aws_conn(resource='s3', **aws_params)

BUCKET_NAME = 'open-crime-etl'
SOURCE = "raw/"
PREFIX = f"ingest=YYYY-MM-DD/"

# We need to know about the last ingested date from the metadata

bucket = s3_client.Bucket(BUCKET_NAME)
for i in bucket.objects.filter(Prefix=f"{SOURCE}"):
    print(i.key)

# Uncompress it

# json loads

# Transform

# sqlalchemy insert into postgres


raw/ingest=2025-07-27/part-0001.json.gz
raw/ingest=2025-07-27/part-0002.json.gz
raw/ingest=2025-07-27/part-0003.json.gz
raw/ingest=2025-07-27/part-0004.json.gz
raw/ingest=2025-07-27/part-0005.json.gz
raw/ingest=2025-07-27/part-0006.json.gz
raw/ingest=2025-07-27/part-0007.json.gz
raw/ingest=2025-07-27/part-0008.json.gz
raw/ingest=2025-07-27/part-0009.json.gz
raw/ingest=2025-07-27/part-0010.json.gz
raw/ingest=2025-07-27/part-0011.json.gz
raw/ingest=2025-07-27/part-0012.json.gz
raw/ingest=2025-07-27/part-0013.json.gz
raw/ingest=2025-07-27/part-0014.json.gz
raw/ingest=2025-07-27/part-0015.json.gz
raw/ingest=2025-07-27/part-0016.json.gz
raw/ingest=2025-07-27/part-0017.json.gz
raw/ingest=2025-07-27/part-0018.json.gz
raw/ingest=2025-07-27/part-0019.json.gz
raw/ingest=2025-07-27/part-0020.json.gz
raw/ingest=2025-07-27/part-0021.json.gz
raw/ingest=2025-07-27/part-0022.json.gz
