In [3]:
import pandas as pd
import pointblank as pb
import function.data as cd
import datetime

In [4]:
endpoint = "https://data.cityofchicago.org/resource/ijzp-q8t2"

start = datetime.datetime(2023,1,1,0,0,0)
end = datetime.datetime(2025,5,1,0,0,0)

In [5]:

df = cd.backfill_chicago_data(endpoint = endpoint, 
                                start = start, 
                                end = end, 
                                offset = 24 * 30,
                                limit = 100000)

In [6]:

df["updated_on"] = pd.to_datetime(df["updated_on"])
df["x_coordinate"] = df["x_coordinate"].astype(float)
df["y_coordinate"] = df["y_coordinate"].astype(float)
df["latitude"] = df["latitude"].astype(float)
df["longitude"] = df["longitude"].astype(float)
df["year"]= df["year"].astype(int)

In [7]:

df.head()

Unnamed: 0,id,case_number,datetime,block,iucr,primary_type,description,location_description,arrest,domestic,...,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude
0,13140855,JG341458,2023-01-01,082XX S JEFFERY BLVD,1754,OFFENSE INVOLVING CHILDREN,AGGRAVATED SEXUAL ASSAULT OF CHILD BY FAMILY M...,APARTMENT,False,True,...,4,8,46,02,1190953.0,1850848.0,2023,2023-09-24 15:41:26,41.745739,-87.575883
1,13180096,JG387858,2023-01-01,075XX S WOLCOTT AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,RESIDENCE,False,False,...,6,17,71,11,1164996.0,1854651.0,2023,2023-08-20 15:40:56,41.756762,-87.670887
2,13168471,JG374193,2023-01-01,013XX W HARRISON ST,460,BATTERY,SIMPLE,SCHOOL - PUBLIC GROUNDS,False,False,...,12,34,28,08B,1167465.0,1897475.0,2023,2023-08-19 15:40:26,41.874223,-87.66061
3,13078152,JG267031,2023-01-01,101XX S BEVERLY AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,RESIDENCE,False,False,...,22,21,73,11,1168800.0,1837525.0,2023,2023-08-19 15:40:26,41.709685,-87.657439
4,13120699,JG314178,2023-01-01,063XX N FAIRFIELD AVE,1544,SEX OFFENSE,SEXUAL EXPLOITATION OF A CHILD,OTHER (SPECIFY),False,False,...,24,50,2,17,1156847.0,1941985.0,2023,2023-08-19 15:40:26,41.996584,-87.698384


In [8]:

df.dtypes

id                              object
case_number                     object
datetime                datetime64[ns]
block                           object
iucr                            object
primary_type                    object
description                     object
location_description            object
arrest                            bool
domestic                          bool
beat                            object
district                        object
ward                            object
community_area                  object
fbi_code                        object
x_coordinate                   float64
y_coordinate                   float64
year                             int64
updated_on              datetime64[ns]
latitude                       float64
longitude                      float64
dtype: object

In [9]:
schema = pb.Schema(
    columns=[
        ("id", "object"),
        ("case_number", "object"),
        ("datetime", "datetime64[ns]"),   
        ("block", "object"),
        ("iucr", "object"),
        ("primary_type", "object"),
        ("description", "object"),
        ("location_description", "object"),
        ("arrest", "bool"),
        ("domestic", "bool"),
        ("beat", "object"),
        ("district", "object"),
        ("ward", "object"),
        ("community_area", "object"),
        ("fbi_code", "object"),
        ("x_coordinate", "float64"),
        ("y_coordinate", "float64"),
        ("year", "int64"),
        ("updated_on", "datetime64[ns]"),
        ("latitude", "float64"),
        ("longitude", "float64")
    ]
)

In [10]:

validation = (
            pb.Validate(data = df,
            tbl_name= "Chicago Crime Data",
            label = "Chicago Crime Data",
            thresholds=pb.Thresholds(warning= 0, error= 0, critical= 0))
            .col_schema_match(schema=schema)
            .col_count_match(count=len(schema.columns)) 
            .col_vals_not_null(columns= ["case_number", "datetime","year"])
            .rows_distinct() 
            .interrogate()
        )

In [11]:

validation

Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation
Chicago Crime DataPandasChicago Crime DataWARNING0ERROR0CRITICAL0,Chicago Crime DataPandasChicago Crime DataWARNING0ERROR0CRITICAL0,Chicago Crime DataPandasChicago Crime DataWARNING0ERROR0CRITICAL0,Chicago Crime DataPandasChicago Crime DataWARNING0ERROR0CRITICAL0,Chicago Crime DataPandasChicago Crime DataWARNING0ERROR0CRITICAL0,Chicago Crime DataPandasChicago Crime DataWARNING0ERROR0CRITICAL0,Chicago Crime DataPandasChicago Crime DataWARNING0ERROR0CRITICAL0,Chicago Crime DataPandasChicago Crime DataWARNING0ERROR0CRITICAL0,Chicago Crime DataPandasChicago Crime DataWARNING0ERROR0CRITICAL0,Chicago Crime DataPandasChicago Crime DataWARNING0ERROR0CRITICAL0,Chicago Crime DataPandasChicago Crime DataWARNING0ERROR0CRITICAL0,Chicago Crime DataPandasChicago Crime DataWARNING0ERROR0CRITICAL0,Chicago Crime DataPandasChicago Crime DataWARNING0ERROR0CRITICAL0,Chicago Crime DataPandasChicago Crime DataWARNING0ERROR0CRITICAL0
Unnamed: 0_level_2,Unnamed: 1_level_2,STEP,COLUMNS,VALUES,TBL,EVAL,UNITS,PASS,FAIL,W,E,C,EXT
#4CA64C,1,col_schema_match  col_schema_match(),—,SCHEMA,,✓,1,1 1.00,0 0.00,●,●,●,—
#4CA64C,2,col_count_match  col_count_match(),—,21,,✓,1,1 1.00,0 0.00,●,●,●,—
#4CA64C,3,col_vals_not_null  col_vals_not_null(),case_number,—,,✓,594K,594K 1.00,0 0.00,●,●,●,—
#4CA64C,4,col_vals_not_null  col_vals_not_null(),datetime,—,,✓,594K,594K 1.00,0 0.00,●,●,●,—
#4CA64C,5,col_vals_not_null  col_vals_not_null(),year,—,,✓,594K,594K 1.00,0 0.00,●,●,●,—
#4CA64C,6,rows_distinct  rows_distinct(),ALL COLUMNS,—,,✓,594K,594K 1.00,0 0.00,●,●,●,—
2025-05-18 15:18:39 UTC6.8337 s2025-05-18 15:18:46 UTC,2025-05-18 15:18:39 UTC6.8337 s2025-05-18 15:18:46 UTC,2025-05-18 15:18:39 UTC6.8337 s2025-05-18 15:18:46 UTC,2025-05-18 15:18:39 UTC6.8337 s2025-05-18 15:18:46 UTC,2025-05-18 15:18:39 UTC6.8337 s2025-05-18 15:18:46 UTC,2025-05-18 15:18:39 UTC6.8337 s2025-05-18 15:18:46 UTC,2025-05-18 15:18:39 UTC6.8337 s2025-05-18 15:18:46 UTC,2025-05-18 15:18:39 UTC6.8337 s2025-05-18 15:18:46 UTC,2025-05-18 15:18:39 UTC6.8337 s2025-05-18 15:18:46 UTC,2025-05-18 15:18:39 UTC6.8337 s2025-05-18 15:18:46 UTC,2025-05-18 15:18:39 UTC6.8337 s2025-05-18 15:18:46 UTC,2025-05-18 15:18:39 UTC6.8337 s2025-05-18 15:18:46 UTC,2025-05-18 15:18:39 UTC6.8337 s2025-05-18 15:18:46 UTC,2025-05-18 15:18:39 UTC6.8337 s2025-05-18 15:18:46 UTC


In [12]:
validation.all_passed()

True

In [13]:

df.to_csv("data/chicago_crime_2023_2025.csv", index = False)