In [2]:
import os

from dotenv import load_dotenv
from sqlalchemy import MetaData, Table, Column, String, Integer, Float, Date, DateTime, func, select, text
from sqlalchemy.engine.base import Engine

from common.connect import create_postgres_conn, create_aws_conn
from db.postgres.tables import create_date_table, create_crime_table, create_log_table
from utils.helper import generate_date_range

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Postgres connection settings.
# Each value can be overridden through the environment / .env so credentials do not
# have to live in the notebook; the fallbacks are the local-development values this
# notebook has always used, so behaviour is unchanged when nothing is set.
db_params = {
    "host": os.getenv("NOTEBOOK_DB_HOST", "localhost"),
    "port": os.getenv("NOTEBOOK_DB_PORT", "5433"),
    "username": os.getenv("NOTEBOOK_DB_USER", "admin"),
    "password": os.getenv("NOTEBOOK_DB_PASSWORD", "admin"),
    "db": os.getenv("NOTEBOOK_DB_NAME", "mydb"),
}

# Shared engine used by every database cell below.
engine = create_postgres_conn(**db_params)

In [53]:
# List the tables that currently exist in the `public` schema.
# The statement is wrapped in text(): passing a plain string to Connection.execute()
# is deprecated in SQLAlchemy 1.4 and rejected in 2.0.
query = text("""
    SELECT tablename
    FROM pg_tables
    WHERE schemaname = 'public'
""")

with engine.begin() as conn:
    # scalars() yields the first (only) column of each row.
    result = conn.execute(query).scalars().all()

print(result)

['pipeline_logs', 'crime', 'date']


In [54]:
import pandas as pd

# Toy frame for prototyping the column fragments of an insert/update statement.
# NOTE(review): "c2" is presumably the alias of the incoming-rows relation — confirm where these fragments are used.
data = {'name': ['John', 'Doe', "Mary"], 'age': [20, 21, 19]}
df = pd.DataFrame(data)
columns = list(df.columns)

# "name, age"
insert_columns = ", ".join(columns)
# "c2.name, c2.age"
insert_values = ", ".join(map("c2.{}".format, columns))
# "name = c2.name, age = c2.age"
set_clause = ", ".join(f"{c} = c2.{c}" for c in columns)

insert_values

'c2.name, c2.age'

In [2]:
# Reflect the tables that already exist in the database and show their names.
meta = MetaData()
meta.reflect(bind=engine)
print(meta.tables.keys())

# Kept disabled on purpose: uncommenting drops every reflected table.
# meta.drop_all(bind=engine, checkfirst=True)
# print(meta.tables.keys())

dict_keys(['date', 'crime', 'pipeline_logs'])


In [5]:
# Most recent `ingested_at` recorded in pipeline_logs (None when the table is empty).
log_table = meta.tables['pipeline_logs']
latest_ingest_stmt = select(func.max(log_table.c.ingested_at))

last_src_update = None
with engine.begin() as conn:
    last_src_update = conn.execute(latest_ingest_stmt).scalar()

In [6]:
# Display the max(ingested_at) value read from pipeline_logs.
last_src_update

datetime.date(2025, 8, 3)

In [10]:
# Decide tables we need to create
# Each table is obtained through its project helper, bound to the shared engine.
# NOTE(review): presumably each helper creates the table when missing and returns
# the Table object — confirm in db.postgres.tables.

# Log table
log_table = create_log_table(engine=engine)

# crime
crime_table = create_crime_table(engine=engine)

# police_stations
# : WILL BE HANDLED IN DBT

# ward_offices
# : WILL BE HANDLED IN DBT

# dim_date
date_table = create_date_table(engine=engine)


In [None]:
# Need a transformation layer before loading the data into the DB

About the source
- Frequency of update - Daily [based on updated_on attribute]

Incremental
- If the DB already exists
- Check the last `updated_on` date and fetch only the records updated after that date

Full load (work on this first)
- If the DB doesn't exist, perform a full load
- Set a base date to fetch records from for the full load, e.g. 2024-01-01
- DAG-based full load (backfill), or full load within a DAG run + incremental


Workflow

- Download all files into tmp

- Uncompress it

- json loads

- Transform

- sqlalchemy batch insert into postgres

In [6]:
from datetime import datetime, time

# Parse a YYYY-MM-DD string and show it as a date.
t = '2025-01-01'
parsed = datetime.strptime(t, "%Y-%m-%d")
parsed.date()

datetime.date(2025, 1, 1)

In [3]:
import re
from datetime import datetime, timedelta
from pathlib import Path

# Get data from S3, filtered by load date.
# Credentials and region are read from the environment.
aws_params = {
    "access_key" : os.getenv("AWS_ACCESS_KEY_ID"),
    "secret_access_key" : os.getenv("AWS_SECRET_ACCESS_KEY"),
    "region" : os.getenv("AWS_REGION")
}

s3_client = create_aws_conn(resource='s3', **aws_params)

TMP = Path("./tmp")
BUCKET_NAME = 'open-crime-etl'

# Maximum number of objects to download in this run.
# 1 keeps the single-file smoke test this cell has been doing so far;
# set to None to download every matching object.
MAX_DOWNLOADS = 1

# Design note:
# Need to find a way to detect what is missing from the db to perform ingestion.
# Airflow can take care of this, however we need to handle how we load it from s3 to db.
# In other words, say the ingestion takes place weekly: you would query the api, load
# each page as a batch and sort them by year/month. But when the next weekly batch is
# fetched, how is that organised in s3?
# ## SOLVED BY ADDING A NEW NAMESPACE i.e `load_date=yyyy-mm-dd/` ##
bucket = s3_client.Bucket(BUCKET_NAME)

# TODO: fetch this from pipeline_logs (last ingested_at with status = 'SUCCESS')
# instead of hardcoding it.
last_date = datetime(2025,7,27).date()
today = datetime.now().date()

# Every load date from last_date through today (inclusive), as YYYY-MM-DD strings.
date_range = [(last_date + timedelta(days=i)).strftime("%Y-%m-%d") for i in range((today - last_date).days+1)]
esc_date = "|".join(map(re.escape, date_range))

# Keys look like raw/year=YYYY/month=MM/load_date=YYYY-MM-DD/<file>; only the
# load dates generated above are accepted.
regex = re.compile(rf"^raw/year=\d{{4}}/month=\d{{2}}/load_date=({esc_date})/.*")

# Create the download directory once, up front (no-op when it already exists).
TMP.mkdir(parents=True, exist_ok=True)

downloaded = 0
for obj in bucket.objects.filter(Prefix="raw/"):
    if not regex.match(obj.key):
        continue

    key = obj.key.split("/")
    year = key[1]          # "year=YYYY"
    month = key[2]         # "month=MM"
    ingested_at = key[3]   # "load_date=YYYY-MM-DD"
    file = key[-1]

    # Flatten the partition path into the file name: <load_date>_<YYYYMM>_<file>
    filename = f"{ingested_at.split('=')[-1]}_{year[-4:]}{month[-2:]}_{file}"
    # Pass a plain string path so this also works with clients that do not accept Path objects.
    bucket.download_file(obj.key, str(TMP / filename))

    downloaded += 1
    if MAX_DOWNLOADS is not None and downloaded >= MAX_DOWNLOADS:
        break




In [5]:
# Recursively list everything currently under the TMP download directory.
list(TMP.rglob("*"))

[PosixPath('tmp/2025-08-05_202505_part-0007.json.gz')]

In [None]:
import gzip
import json
import pandas as pd
from db.tables import create_crime_table
from sqlalchemy import insert
from sqlalchemy.dialects.postgresql import insert as upsert
from pprint import pprint

# Stream approach: iterate each file -> uncompress -> load -> transform -> batch upsert into db

# Source fields that are not loaded into the crime table.
col_drop = [
    ':@computed_region_awaf_s7ux',
    ':@computed_region_6mkv_f3dw',
    ':@computed_region_vrxf_vc4k',
    ':@computed_region_bdys_3d7i',
    ':@computed_region_43wa_7qmu',
    ':@computed_region_rpca_8um6',
    ':@computed_region_d9mm_jgwp',
    ':@computed_region_d3ds_rm58',
    ':@computed_region_8hcu_yrd4',
    'location',
    ':id',
    ':version',
    ':created_at',
    'year',
    'updated_on'
]

# Source field name -> crime table column name.
rename_col = {
    'id' : 'crime_id',
    'case_number' : 'case',
    'date' : 'date_of_occurrence',
    'primary_type' : 'primary_description',
    'description' : 'secondary_description',
    ':updated_at' : 'source_updated_on'
}

# Rows per upsert statement.
# Open question: define a batch size for the insert, or rely solely on the batch size
# chosen when the api data was stored in s3? A separate knob makes sense: if the db is
# being throttled by reads, the write batch size can be tuned independently.
batchsize = 500


def _transform_crime_records(records):
    """Shape decoded JSON records into rows for the crime table.

    Args:
        records: the decoded content of one source file (a list of dicts).

    Returns:
        A new DataFrame with the `col_drop` columns removed, the `rename_col`
        renames applied, and every missing value replaced by None.
    """
    frame = (
        pd.DataFrame(records)
        # errors="ignore": a file in which no record carries one of these optional
        # fields simply has no such column, which must not abort the load.
        .drop(columns=col_drop, errors="ignore")
        .rename(columns=rename_col)
    )
    # Cast to object first so missing values in numeric columns also become None
    # (a float column cannot hold None and would keep NaN).
    return frame.astype(object).where(frame.notna(), None)


# Reflect the schema once, not once per file.
meta = MetaData()
meta.reflect(engine)

# Check if table exists else create
crime_table = meta.tables['crime'] if 'crime' in meta.tables else create_crime_table(engine)
key_columns = [pk_column.name for pk_column in crime_table.primary_key.columns.values()]

df = None
for file in TMP.rglob("*.gz"):
    print(file.as_posix())

    # Unzip + load. JSON is decoded as UTF-8 explicitly instead of relying on the
    # locale default, and the file is closed before any database work starts.
    with gzip.open(file, 'rt', encoding='utf-8') as f:
        data = json.load(f)

    # Transform
    df = _transform_crime_records(data)

    # Upsert in batches; each batch is committed in its own transaction.
    for start in range(0, len(df), batchsize):
        batch = df.iloc[start : start + batchsize]

        # Upsert instead of insert: rows whose crime_id already exists are updated.
        st = upsert(crime_table).values(batch.to_dict(orient='records'))
        up_st = st.on_conflict_do_update(
            index_elements=[crime_table.c.crime_id],
            # Overwrite every non-key column with the incoming (EXCLUDED) value.
            set_={c.key: c for c in st.excluded if c.key not in key_columns},
        )

        with engine.begin() as conn:
            conn.execute(up_st)

    # Checkpoint on success, might use sqlite3 for this

tmp/2025-08-03_202505_part-0011.json.gz
tmp/2025-08-02_202505_part-0034.json.gz
tmp/2025-08-03_202507_part-0002.json.gz
tmp/2025-08-03_202505_part-0033.json.gz
tmp/2025-08-03_202503_part-0001.json.gz
tmp/2025-08-02_202505_part-0031.json.gz
tmp/2025-08-03_202505_part-0025.json.gz
tmp/2025-08-02_202505_part-0005.json.gz
tmp/2025-08-03_202505_part-0045.json.gz
tmp/2025-08-03_202505_part-0030.json.gz
tmp/2025-08-03_202505_part-0018.json.gz
tmp/2025-08-03_202505_part-0043.json.gz
tmp/2025-08-03_202505_part-0008.json.gz
tmp/2025-08-02_202505_part-0007.json.gz
tmp/2025-08-02_202504_part-0001.json.gz
tmp/2025-08-02_202506_part-0004.json.gz
tmp/2025-08-03_202505_part-0037.json.gz
tmp/2025-08-03_202505_part-0041.json.gz
tmp/2025-08-02_202505_part-0012.json.gz
tmp/2025-08-02_202505_part-0024.json.gz
tmp/2025-08-02_202506_part-0003.json.gz
tmp/2025-08-02_202505_part-0009.json.gz
tmp/2025-08-03_202505_part-0002.json.gz
tmp/2025-08-02_202505_part-0014.json.gz
tmp/2025-08-02_202505_part-0010.json.gz


In [1]:
# Scratch data, unrelated to the pipeline cells: a payload with a version, a timestamp
# and a list of text highlights keyed by page URL.
# NOTE(review): looks like an export from a browser highlighting tool — confirm or remove.
_page_url = "https://kiewitcareers.kiewit.com/job/Lenexa-Analytics-Engineer-KS/1311874800/"

# All 15 entries share the same first 16 nodePath elements; only the last four differ.
_node_path_head = [5, 9, 1, 1, 5, 5, 1, 4, 1, 13, 1, 1, 1, 1, 1, 12]

_YELLOW = ("yellow", "#ffff00")
_PINK = ("pink", "#ffcccc")

# (colour, id, nodePath tail, occurrenceIndex, text, timestamp)
_highlight_rows = [
    (_YELLOW, "highlight-1754417030634", [0, 1, 0, 0], 1,
     "gather, extract, restructure, analyze and interpret data, and communicate findings and recommendations to the business.",
     1754417030635),
    (_YELLOW, "highlight-1754417120874", [3, 1, 0, 0], 1,
     "strong business acumen",
     1754417120875),
    (_YELLOW, "highlight-1754417128971", [3, 1, 0, 4], 1,
     "many data sources",
     1754417128972),
    (_YELLOW, "highlight-1754417145966", [3, 1, 0, 6], 1,
     "technical skills (SQL/python) to restructure and clean this data",
     1754417145967),
    (_YELLOW, "highlight-1754417173734", [3, 1, 0, 10], 1,
     "output will embody data visualization",
     1754417173735),
    (_YELLOW, "highlight-1754417189224", [3, 1, 0, 14], 1,
     "formulating and testing hypotheses, performing correlation analysis, and other forms of data mining.",
     1754417189224),
    (_YELLOW, "highlight-1754417200041", [3, 1, 0, 18], 1,
     "guide requirement gathering",
     1754417200041),
    (_YELLOW, "highlight-1754417209686", [3, 1, 0, 20], 1,
     "explain results of analysis to team members",
     1754417209687),
    (_YELLOW, "highlight-1754417212959", [3, 1, 0, 22], 0,
     "train stakeholders on how to use decision support tools",
     1754417212960),
    (_YELLOW, "highlight-1754417232680", [3, 1, 0, 26], 1,
     "deep understanding of the core applications we leverage to collect data",
     1754417232680),
    (_YELLOW, "highlight-1754417248751", [4, 1, 0, 14], 1,
     "Python in analytics and software development",
     1754417248752),
    (_YELLOW, "highlight-1754417254821", [4, 1, 0, 18], 1,
     "SQL for MS SQL Server and Snowflake highly preferred)",
     1754417254822),
    (_YELLOW, "highlight-1754417258073", [4, 1, 0, 22], 1,
     "Power BI and Streamlit highly preferred",
     1754417258073),
    (_YELLOW, "highlight-1754417298766", [4, 1, 0, 30], 1,
     "integrating APIs",
     1754417298766),
    (_PINK, "highlight-1754417308165", [4, 1, 0, 31], 1,
     "familiarity",
     1754417308166),
]

d = {
    "version": "1.0",
    "timestamp": 1754417343979,
    "highlights": {
        _page_url: [
            {
                "colorKey": colour[0],
                "colorValue": colour[1],
                "id": highlight_id,
                # A fresh list per entry, so entries never share a nodePath object.
                "nodePath": _node_path_head + path_tail,
                "occurrenceIndex": occurrence,
                "text": text_value,
                "timestamp": stamp,
                "url": _page_url,
            }
            for colour, highlight_id, path_tail, occurrence, text_value, stamp in _highlight_rows
        ]
    },
}

In [16]:
# Print the text of every highlight stored under the first URL key of `d`.
k = [*d['highlights']][0]
texts = [entry['text'] for entry in d['highlights'][k]]
for line in texts:
    print(line)

gather, extract, restructure, analyze and interpret data, and communicate findings and recommendations to the business.
strong business acumen
many data sources
technical skills (SQL/python) to restructure and clean this data
output will embody data visualization
formulating and testing hypotheses, performing correlation analysis, and other forms of data mining.
guide requirement gathering
explain results of analysis to team members
train stakeholders on how to use decision support tools
deep understanding of the core applications we leverage to collect data
Python in analytics and software development
SQL for MS SQL Server and Snowflake highly preferred)
Power BI and Streamlit highly preferred
integrating APIs
familiarity
