In [11]:
import pandas as pd
from pathlib import Path

pd.set_option("display.max_columns", None)


def get_project_root(start: "Path | str | None" = None, marker: str = ".git") -> Path:
    """Return the nearest directory at or above ``start`` that contains ``marker``.

    Args:
        start: Path where the upward search begins. When omitted, the search
            starts from this file's resolved path if ``__file__`` is defined
            in the module globals, otherwise from the resolved current
            working directory.
        marker: Name of the file or directory whose presence identifies the
            project root. Defaults to ``".git"``.

    Returns:
        The first path, walking from the starting point up through its
        parents, for which ``path / marker`` exists.

    Raises:
        RuntimeError: If no path on the way up to the filesystem root
            contains ``marker``.
    """
    if start is not None:
        current = Path(start).resolve()
    elif "__file__" in globals():
        current = Path(__file__).resolve()
    else:
        current = Path().resolve()

    # Check the starting point itself first, then each ancestor in turn.
    for candidate in (current, *current.parents):
        if (candidate / marker).exists():
            return candidate

    if marker == ".git":
        raise RuntimeError("Project root not found. Ensure you're inside a Git repository.")
    raise RuntimeError(f"Project root not found: no '{marker}' at or above {current}.")

# Anchor every data path to the project root so the notebook does not depend
# on the directory the kernel was started from.
PROJECT_ROOT = get_project_root()

DATA_PATH = PROJECT_ROOT.joinpath("data", "raw")
OUTPUT_DIR = PROJECT_ROOT.joinpath("data", "built_for_eda")

# Create the output directory (and any missing parents); a no-op if it exists.
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

In [12]:
# DATA_PATH is a pathlib.Path, so join with "/" instead of formatting the path
# into a string; pandas accepts path-like objects directly.
df = pd.read_csv(DATA_PATH / "Crimes_-_2001_to_Present_20260216.csv")
df.shape

(8495400, 22)

In [13]:
# Parse the raw timestamp strings (12-hour clock with AM/PM) into datetimes.
df["Date"] = pd.to_datetime(df["Date"], format="%m/%d/%Y %I:%M:%S %p")

# Keep only rows dated 2014 through 2025, with both endpoint years included.
in_study_window = df["Date"].dt.year.between(2014, 2025, inclusive="both")
df = df[in_study_window].copy()  # type: ignore

df.shape

(3031734, 22)

In [4]:
# Columns kept for the EDA dataset; every other column is dropped below.
columns_needed = [
    # when
    "Date",
    # what
    "Primary Type", "Description", "Location Description",
    # flags
    "Arrest", "Domestic",
    # where
    "District", "Beat", "Community Area",
    "Latitude", "Longitude",
]

# Select all rows and only the listed columns, in the listed order.
df = df.loc[:, columns_needed]
df.head()

Unnamed: 0,Date,Primary Type,Description,Location Description,Arrest,Domestic,District,Beat,Community Area,Latitude,Longitude
19660,2025-12-31 23:58:00,ASSAULT,SIMPLE,RESIDENCE,False,False,9.0,931,61.0,41.802549,-87.667246
19661,2025-12-31 23:55:00,MOTOR VEHICLE THEFT,THEFT / RECOVERY - AUTOMOBILE,APARTMENT,False,True,15.0,1522,25.0,41.882329,-87.758411
19662,2025-12-31 23:54:00,BATTERY,"AGGRAVATED P.O. - HANDS, FISTS, FEET, NO / MIN...",RESTAURANT,True,False,12.0,1215,28.0,41.885427,-87.661759
19663,2025-12-31 23:54:00,PUBLIC PEACE VIOLATION,OTHER VIOLATION,AIRCRAFT,False,False,16.0,1651,76.0,41.97629,-87.905227
19664,2025-12-31 23:54:00,PUBLIC PEACE VIOLATION,OTHER VIOLATION,AIRCRAFT,False,False,16.0,1651,76.0,41.97629,-87.905227


In [5]:
# Derive calendar features from the parsed "Date" column in a single step.
# Columns are appended in the order given.
_date = df["Date"].dt
df = df.assign(
    Year=_date.year,
    Month=_date.month,
    Day=_date.day,
    Hour=_date.hour,
    DayOfWeek=_date.dayofweek,
)

In [6]:
# Headline numbers for the filtered dataset. The mean of a boolean column is
# the fraction of True values.
summary_rows = [
    ("Total records:", len(df)),
    ("Unique crime types:", df["Primary Type"].nunique()),
    ("Arrest rate:", df["Arrest"].mean()),
    ("Domestic rate:", df["Domestic"].mean()),
]
for label, value in summary_rows:
    print(label, value)

Total records: 3031734
Unique crime types: 33
Arrest rate: 0.18484405294132006
Domestic rate: 0.1912915183192193


### Save the standard dataset built for EDA

In [8]:
# Preview the first five rows of the frame that is about to be saved.
df.head(n=5)

Unnamed: 0,Date,Primary Type,Description,Location Description,Arrest,Domestic,District,Beat,Community Area,Latitude,Longitude,Year,Month,Day,Hour,DayOfWeek
19660,2025-12-31 23:58:00,ASSAULT,SIMPLE,RESIDENCE,False,False,9.0,931,61.0,41.802549,-87.667246,2025,12,31,23,2
19661,2025-12-31 23:55:00,MOTOR VEHICLE THEFT,THEFT / RECOVERY - AUTOMOBILE,APARTMENT,False,True,15.0,1522,25.0,41.882329,-87.758411,2025,12,31,23,2
19662,2025-12-31 23:54:00,BATTERY,"AGGRAVATED P.O. - HANDS, FISTS, FEET, NO / MIN...",RESTAURANT,True,False,12.0,1215,28.0,41.885427,-87.661759,2025,12,31,23,2
19663,2025-12-31 23:54:00,PUBLIC PEACE VIOLATION,OTHER VIOLATION,AIRCRAFT,False,False,16.0,1651,76.0,41.97629,-87.905227,2025,12,31,23,2
19664,2025-12-31 23:54:00,PUBLIC PEACE VIOLATION,OTHER VIOLATION,AIRCRAFT,False,False,16.0,1651,76.0,41.97629,-87.905227,2025,12,31,23,2


In [None]:
# OUTPUT_DIR is a pathlib.Path, so join with "/" instead of formatting the path
# into a string. index=False keeps the row index out of the saved file.
df.to_csv(OUTPUT_DIR / "standard_data.csv", index=False)