In [2]:
# 01 - Exploratory Data Analysis
# Purpose: Understand crime patterns and prototype predictive models
# Author: Neat
# Date: Feb 2025

In [3]:
# === Setup ===
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from statsmodels.tsa.seasonal import seasonal_decompose
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

from datetime import datetime

In [4]:
from pathlib import Path

# Project layout: the notebook's working directory sits one level below the
# project root, and processed extracts are read from <root>/data/processed.
ROOTPATH = Path.cwd().parent
DATAPATH = ROOTPATH.joinpath('data', 'processed')

# Calendar year at execution time.
current_year = datetime.now().year

# Module-level list, initially empty.
dfs = list()

def load_recent_data(data_path=None, end_year=None, n_years=8):
    """
    Load the most recent yearly crime extracts into a single DataFrame.

    Reads ``chicago_crimes_<year>_processed.csv`` for each of the ``n_years``
    consecutive years ending at ``end_year`` (inclusive), parsing the ``date``
    and ``updated_on`` columns as datetimes, and concatenates the results with
    a fresh 0..N-1 index.

    Frames are collected in a list local to each call, so re-running the cell
    or calling the function again never returns duplicated rows.

    Args:
        data_path: Directory holding the yearly CSV files. Defaults to the
            module-level ``DATAPATH``.
        end_year: Last year to load (inclusive). Defaults to the module-level
            ``current_year``.
        n_years: Number of consecutive years to load. Defaults to 8.

    Returns:
        pd.DataFrame: Rows of all requested years, oldest year first.

    Raises:
        ValueError: If ``n_years`` is less than 1.
        FileNotFoundError: If the CSV for any requested year is missing
            (propagated from ``pd.read_csv``).
    """
    if n_years < 1:
        raise ValueError(f'n_years must be >= 1, got {n_years}')

    # Resolve defaults at call time so a re-run of the config cell is honoured.
    data_path = Path(DATAPATH if data_path is None else data_path)
    end_year = current_year if end_year is None else end_year

    # Local list: a module-level accumulator would keep growing across calls.
    yearly_frames = [
        pd.read_csv(data_path / f'chicago_crimes_{year}_processed.csv',
                    parse_dates=['date', 'updated_on'])
        for year in range(end_year - n_years + 1, end_year + 1)
    ]
    return pd.concat(yearly_frames, ignore_index=True)

# Build the working frame from the yearly extracts and report its row count.
df = load_recent_data()
print(f'Loaded {len(df.index)} records')

Loaded 8000 records


In [None]:
# Keep an independent (deep) copy of the loaded frame under a separate name.
chicago_bkp = df.copy(deep=True)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,id,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,...,year,updated_on,latitude,longitude,month,day,hour,day_of_week,time_of_day,has_location
0,11561837,JC110056,2018-12-31 23:59:00,W 72ND ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,...,2018,2019-01-17 14:26:36,41.763181,-87.657709,12,31,23,0,Night,True
1,13247802,JG464580,2018-12-31 23:59:00,S OGLESBY AVE,1754,OFFENSE INVOLVING CHILDREN,AGGRAVATED SEXUAL ASSAULT OF CHILD BY FAMILY M...,RESIDENCE,False,False,...,2018,2023-10-19 15:42:40,41.836183,-87.666105,12,31,23,0,Night,False
2,11556487,JC104662,2018-12-31 23:59:00,S SACRAMENTO AVE,1320,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,...,2018,2019-01-10 15:16:50,41.689079,-87.696064,12,31,23,0,Night,True
3,11552699,JC100043,2018-12-31 23:57:00,S SANGAMON ST,1310,CRIMINAL DAMAGE,TO PROPERTY,APARTMENT,False,False,...,2018,2019-01-10 15:16:50,41.740521,-87.647391,12,31,23,0,Night,True
4,11552724,JC100006,2018-12-31 23:56:00,S ALLPORT ST,440,BATTERY,AGG: HANDS/FIST/FEET NO/MINOR INJURY,OTHER,True,False,...,2018,2019-01-10 15:16:50,41.857068,-87.657625,12,31,23,0,Night,True


In [19]:
# Print the column labels, then end the cell with describe() so its summary
# table is rendered as the cell output.
columns = df.columns
print(columns)
df.describe()

Index(['id', 'case_number', 'date', 'block', 'iucr', 'primary_type',
       'description', 'location_description', 'arrest', 'domestic', 'beat',
       'district', 'ward', 'community_area', 'fbi_code', 'year', 'updated_on',
       'latitude', 'longitude', 'month', 'day', 'hour', 'day_of_week',
       'time_of_day', 'has_location'],
      dtype='object')


Unnamed: 0,id,date,beat,district,ward,community_area,year,updated_on,latitude,longitude,month,day,hour,day_of_week
count,8000.0,8000,8000.0,8000.0,8000.0,8000.0,8000.0,8000,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0
mean,12736810.0,2022-05-22 08:13:36.090000128,1142.890875,11.199875,22.91275,37.10125,2021.5,2022-08-04 22:59:09.739874816,41.841689,-87.668371,10.75,28.10725,14.318125,2.539375
min,24366.0,2018-12-30 14:00:00,111.0,1.0,1.0,1.0,2018.0,2019-01-10 15:16:50,41.645796,-87.909079,2.0,10.0,0.0,0.0
25%,12178350.0,2020-09-30 01:28:45,531.0,5.0,9.0,23.0,2019.75,2021-01-06 15:40:18,41.767538,-87.709431,12.0,30.0,11.0,1.0
50%,12937050.0,2022-07-01 19:59:30,1023.0,10.0,23.0,32.0,2021.5,2023-01-06 15:40:48,41.852863,-87.664043,12.0,31.0,15.0,2.0
75%,13521330.0,2024-04-01 02:37:15,1724.0,17.0,34.0,54.0,2023.25,2025-01-07 15:42:52,41.90275,-87.62733,12.0,31.0,20.0,5.0
max,13752840.0,2025-02-12 00:00:00,2535.0,25.0,50.0,77.0,2025.0,2025-02-19 15:43:45,42.022536,-87.525403,12.0,31.0,23.0,6.0
std,938591.7,,710.993064,7.102674,13.884865,21.505599,2.291431,,0.085908,0.057947,3.307396,6.667556,6.415455,2.16738


## 1 - Time-Based Patterns

In [20]:
# Number of records per calendar month (the `month` column), all years pooled.
# NOTE(review): in the output captured with this cell only months 2 and 12
# appear, so this is not a full seasonal profile - confirm the input extracts.
by_month = df.groupby('month')
monthly_crimes = by_month.size()

print(monthly_crimes)

month
2     1000
12    7000
dtype: int64
