In [16]:
import pandas as pd
from scipy.stats import zscore
import numpy as np

# Load the combined dataset from the working directory into `df`.
df = pd.read_csv("combined_data.csv")

In [17]:
# Show every column name of the loaded frame, one per line.
columns = list(df.columns)
for column_name in columns:
    print(column_name)

year
month
day
hour
date
dayofweek
city
population
male_officer
female_officer
total_officers
offense_name
offense_category_name
location_area


In [18]:
# Summarise the offense category column: non-null count, number of distinct
# categories, the most frequent category and how often it occurs.
df['offense_category_name'].describe()

count                     497273
unique                        24
top       Larceny/Theft Offenses
freq                      162101
Name: offense_category_name, dtype: object

In [19]:
# Columns screened for outliers.
features_considered = ['year', 'hour']

# Standard score of every value, computed per column (zscore's default axis).
# nan_policy='omit' keeps NaNs from influencing the mean / standard deviation.
feature_values = df[features_considered]
z_scores = zscore(feature_values, nan_policy='omit')

# Only the distance from the mean matters, not its direction.
abs_z_scores = np.absolute(z_scores)

In [20]:
# Wrap the z-scores in a DataFrame labelled with the feature names so that
# individual features can be addressed by column name.
z_scores_df = pd.DataFrame(z_scores, columns=features_considered)


In [21]:
# Preview the first rows of the z-score table.
z_scores_df.head()

Unnamed: 0,year,hour
0,-0.448833,0.055005
1,-0.448833,1.237834
2,-0.448833,1.237834
3,-0.448833,1.237834
4,-0.448833,0.055005


In [22]:
# Magnitude of each standard score.
abs_z_scores = np.absolute(z_scores)

# A row is kept when every considered feature has |z-score| < 3 ...
within_three_sigma = (abs_z_scores < 3).all(axis=1)
# ... and its offense category is present.
has_offense_category = df['offense_category_name'].notna()

# NOTE: a NaN score compares False against the threshold, so rows with a
# missing value in a considered feature are excluded by this mask as well.
filtered_entries = within_three_sigma & has_offense_category


In [23]:
# Inspect the boolean keep-mask (True = the row is retained when the mask is applied to `df`).
filtered_entries

0         True
1         True
2         True
3         True
4         True
          ... 
497268    True
497269    True
497270    True
497271    True
497272    True
Name: offense_category_name, Length: 497273, dtype: bool

In [24]:
# Report how many rows the mask rejects, then materialise the retained rows
# as an independent copy of the original frame.
rows_dropped = (~filtered_entries).sum()
print("Number of rows dropped:", rows_dropped)
df_without_outliers = df.loc[filtered_entries].copy()


Number of rows dropped: 0


In [25]:
# Compare the row counts before and after applying the outlier mask, then
# preview the filtered frame.
print('Number of rows before filtering outliers', len(df))

# Take an explicit copy (matching the earlier cell that first builds this
# frame) so that columns assigned to `df_without_outliers` later can never be
# treated as writes into a slice of `df`.
df_without_outliers = df[filtered_entries].copy()

print('Number of rows after filtering outliers', len(df_without_outliers))
df_without_outliers.head()

Number of rows before filtering outliers 497273
Number of rows after filtering outliers 497273


Unnamed: 0,year,month,day,hour,date,dayofweek,city,population,male_officer,female_officer,total_officers,offense_name,offense_category_name,location_area
0,2022,9,22,13,2022-09-22,3,Berlin,20109,37.0,7.0,44.0,Drug/Narcotic Violations,Drug/Narcotic Offenses,Abandoned/Condemned Structure
1,2022,6,4,20,2022-06-04,5,Norwich,40096,69.0,11.0,80.0,Weapon Law Violations,Weapon Law Violations,Abandoned/Condemned Structure
2,2022,7,10,20,2022-07-10,6,Norwich,40096,69.0,11.0,80.0,Drug/Narcotic Violations,Drug/Narcotic Offenses,Abandoned/Condemned Structure
3,2022,7,10,20,2022-07-10,6,Norwich,40096,69.0,11.0,80.0,Drug Equipment Violations,Drug/Narcotic Offenses,Abandoned/Condemned Structure
4,2022,10,14,13,2022-10-14,4,Bridgeport,148395,243.0,44.0,287.0,Drug/Narcotic Violations,Drug/Narcotic Offenses,Abandoned/Condemned Structure


In [26]:
# Per-city count of records whose hour is more than two standard deviations
# away from the mean hour.
extreme_hour_mask = z_scores_df['hour'].abs() > 2
df.loc[extreme_hour_mask, 'city'].value_counts()


city
Derby          2390
New Haven      2212
Hartford       1846
New Britain     926
Waterbury       859
               ... 
Montville         5
Ridgefield        4
Middlebury        3
Madison           3
Weston            1
Name: count, Length: 96, dtype: int64

In [27]:
# Derive two per-city features and attach them to every row of the filtered frame:
#   crime_rate_per_1000_people -- records for the city per 1000 residents (whole number)
#   officers_per_1000_people   -- total officers per 1000 residents (2 decimals)

# 1. Count number of crimes per city (counted on `df`, i.e. before the outlier filter)
crime_counts = df['city'].value_counts().rename_axis('city').reset_index(name='crime_count')

# 2. Get population per city (assuming one population value per city in your df;
#    if a city has several, the first one encountered is kept)
populations = df[['city', 'population']].drop_duplicates(subset='city')

# 3. Merge counts and population
city_stats = crime_counts.merge(populations, on='city')

# 4. Calculate crime rate per 1000 residents
city_stats['crime_rate_per_1000_people'] = ((city_stats['crime_count'] / city_stats['population']) * 1000).round(0).astype(int)

# 5. Attach the rate to each row. Any existing copies of the derived columns are
#    dropped first: the enriched frame is later saved to the same CSV this
#    notebook loads, so on a re-run they are already present and the merge would
#    otherwise emit suffixed duplicates (`crime_rate_per_1000_people_x` / `_y`).
derived_columns = ['crime_rate_per_1000_people', 'officers_per_1000_people']
df_without_outliers = (
    df_without_outliers
    .drop(columns=derived_columns, errors='ignore')
    .merge(city_stats[['city', 'crime_rate_per_1000_people']], on='city', how='left')
)

# 6. Officers per 1000 residents
df_without_outliers['officers_per_1000_people'] = ((df_without_outliers['total_officers'] / df_without_outliers['population']) * 1000).round(2)

In [28]:
# Persist the enriched frame as CSV, omitting the index column.
# NOTE(review): "combined_data.csv" is also the file this notebook loads `df`
# from, so the input is overwritten with the enriched data on every run --
# confirm this is intended.
df_without_outliers.to_csv("combined_data.csv", index=False)
print()




In [29]:
# List the columns of the enriched frame to confirm the derived columns are present.
df_without_outliers.columns.tolist()

['year',
 'month',
 'day',
 'hour',
 'date',
 'dayofweek',
 'city',
 'population',
 'male_officer',
 'female_officer',
 'total_officers',
 'offense_name',
 'offense_category_name',
 'location_area',
 'crime_rate_per_1000_people',
 'officers_per_1000_people']

In [30]:
# Display the enriched frame (the notebook renders a truncated view of the rows).
df_without_outliers

Unnamed: 0,year,month,day,hour,date,dayofweek,city,population,male_officer,female_officer,total_officers,offense_name,offense_category_name,location_area,crime_rate_per_1000_people,officers_per_1000_people
0,2022,9,22,13,2022-09-22,3,Berlin,20109,37.0,7.0,44.0,Drug/Narcotic Violations,Drug/Narcotic Offenses,Abandoned/Condemned Structure,145,2.19
1,2022,6,4,20,2022-06-04,5,Norwich,40096,69.0,11.0,80.0,Weapon Law Violations,Weapon Law Violations,Abandoned/Condemned Structure,214,2.00
2,2022,7,10,20,2022-07-10,6,Norwich,40096,69.0,11.0,80.0,Drug/Narcotic Violations,Drug/Narcotic Offenses,Abandoned/Condemned Structure,214,2.00
3,2022,7,10,20,2022-07-10,6,Norwich,40096,69.0,11.0,80.0,Drug Equipment Violations,Drug/Narcotic Offenses,Abandoned/Condemned Structure,214,2.00
4,2022,10,14,13,2022-10-14,4,Bridgeport,148395,243.0,44.0,287.0,Drug/Narcotic Violations,Drug/Narcotic Offenses,Abandoned/Condemned Structure,142,1.93
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
497268,2024,7,21,3,2024-07-21,6,New Haven,137243,279.0,52.0,331.0,Aggravated Assault,Assault Offenses,Other/Unknown,405,2.41
497269,2024,12,12,9,2024-12-12,3,Torrington,35612,63.0,7.0,70.0,Intimidation,Assault Offenses,Other/Unknown,119,1.97
497270,2024,11,21,15,2024-11-21,3,New Haven,137243,279.0,52.0,331.0,All Other Larceny,Larceny/Theft Offenses,Other/Unknown,405,2.41
497271,2024,12,5,22,2024-12-05,3,New Haven,137243,279.0,52.0,331.0,Robbery,Robbery,Other/Unknown,405,2.41
