In [2]:
import pandas as pd
import numpy as np
import re

In [3]:
df_arrests = pd.read_csv('arrest_data.csv')

In [4]:
df_arrests['Arrest Date'] = pd.to_datetime(df_arrests['Arrest Date'])

In [5]:
# Filter for 2022 and 2023 data
df_arrests = df_arrests[df_arrests['Arrest Date'].dt.year.isin([2022, 2023])]

In [6]:
# Function to validate zip codes
def is_valid_zip(zip_code):
    return bool(re.match(r'^\d{5}$', str(zip_code)))

# Remove invalid zips
df_arrests = df_arrests[df_arrests['Zip'].apply(is_valid_zip)]

In [7]:
# Create Year and Month columns
df_arrests['Year'] = df_arrests['Arrest Date'].dt.year
df_arrests['Month'] = df_arrests['Arrest Date'].dt.month

In [8]:
# Group by Year, Zip, and Month, then count arrests
arrests_by_zip_month = df_arrests.groupby(['Year', 'Zip', 'Month']).size().reset_index(name='ArrestCount')

In [9]:
# Calculate average monthly counts for each year and zipcode
arrests_by_zip_year = arrests_by_zip_month.groupby(['Year', 'Zip'])['ArrestCount'].mean().reset_index(name='AvgMonthlyArrests')

In [10]:
# Rename 'Zip' to 'ZipCode' for consistency
arrests_by_zip_year = arrests_by_zip_year.rename(columns={'Zip': 'ZipCode'})

In [11]:
# Sort the dataframe
arrests_by_zip_year = arrests_by_zip_year.sort_values(['ZipCode', 'Year'])

In [12]:
arrests_by_zip_year

Unnamed: 0,Year,ZipCode,AvgMonthlyArrests
0,2022,10001,45.666667
190,2023,10001,52.166667
1,2022,10002,157.750000
191,2023,10002,155.916667
2,2022,10003,151.583333
...,...,...,...
377,2023,11693,60.916667
188,2022,11694,13.083333
378,2023,11694,16.916667
189,2022,11697,1.833333


In [13]:
# Save to CSV
arrests_by_zip_year.to_csv('cleaned_arrest_data.csv', index=False)