In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy import stats
pd.set_option('display.max_colwidth', None)

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
def categorize_offence(offence):
    offence = str(offence)
    offence = offence.lower()
    if ('thef' in offence) or ('larceny' in offence) or ('stolen' in offence):
        offence = "THEFT"
    elif ('fraud' in offence) or ('fortune' in offence) or ('forgery' in offence) or ('gambling' in offence):
        offence = "FRAUD"
    elif ('burglar' in offence) or ('b&e' in offence) or ('break and enter' in offence) or ('unlawfully in' in offence) or ('trespass' in offence):
        offence = "BREAK AND ENTER"
    elif 'robbery' in offence:
        offence = "ROBBERY"
    elif ('assault' in offence) or ('disarming' in offence) or ('death' in offence) or ('off' in offence) or ('touching' in offence):
        offence = "ASSAULT"
    elif ('sex' in offence) or ('rape' in offence):
        offence = "SEX CRIMES"
    elif ('drug' in offence) or ('alcohol' in offence) or ('cannabis' in offence) or ('noxious' in offence):
        offence = "DRINKING/DRUG OFFENSES"
    elif ('homicide' in offence) or ('murder' in offence):
        offence = "MURDER"
    elif ('firearm' in offence) or ('gun' in offence) or ('bodily' in offence) or ('weap' in offence) or ('jostling' in offence):
        offence = "ASSAULT"
    elif 'kidnapping' in offence:
        offence = "KIDNAPPING"
    elif 'loitering' in offence:
        offence = "LOITERING"
    elif ('parking' in offence) or ('traffic' in offence) or ('vehicle' in offence) or ('driving' in offence):
        offence = "PARKING/TRAFFIC/VEHICLE OFFENSES"
    elif ('harassment' in offence) or ('disorderly' in offence) or ('disruption' in offence) or ('harrassment' in offence):
        offence = "DISORDERLY/DISRUPTION/HARASSMENT"
    elif ('child' in offence) or ('abortion' in offence):
        offence = "CHILD CRIMES"
    else:
        offence = 'OTHER'

    return offence

### READ NY ARRESTS

In [4]:
#Crimes in Toronto
file_name = 'NYPD_Arrests_Data__Historic__20250524.csv'

#Read dataset
NYPD_Arrests_df = pd.read_csv(file_name)

len(NYPD_Arrests_df) #5,986,025
NYPD_Arrests_df.head()
NYPD_Arrests_df.info()

5986025

Unnamed: 0,ARREST_KEY,ARREST_DATE,PD_CD,PD_DESC,KY_CD,OFNS_DESC,LAW_CODE,LAW_CAT_CD,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,AGE_GROUP,PERP_SEX,PERP_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lon_Lat
0,279197226,12/19/2023,105.0,STRANGULATION 1ST,106.0,FELONY ASSAULT,PL 1211200,F,M,18,0.0,25-44,M,WHITE,988210.0,218129.0,40.76539,-73.985702,POINT (-73.985702 40.76539)
1,278761840,12/09/2023,105.0,STRANGULATION 1ST,106.0,FELONY ASSAULT,PL 1211300,F,K,67,0.0,25-44,M,BLACK,997897.0,175676.0,40.648859,-73.95082,POINT (-73.95082 40.648859)
2,278506761,12/05/2023,153.0,RAPE 3,104.0,RAPE,PL 1302503,F,K,77,0.0,25-44,M,BLACK,1003509.0,185018.0,40.674496,-73.930571,POINT (-73.9305713255961 40.6744956865259)
3,278436408,12/03/2023,157.0,RAPE 1,104.0,RAPE,PL 1303501,F,B,46,0.0,45-64,M,BLACK,1011755.0,250279.0,40.853598,-73.900577,POINT (-73.9005768807295 40.8535983673823)
4,278248753,11/29/2023,660.0,(null),,(null),PL 2407800,M,Q,104,0.0,<18,M,WHITE HISPANIC,1011456.0,194092.0,40.699373,-73.901881,POINT (-73.901881 40.699373)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5986025 entries, 0 to 5986024
Data columns (total 19 columns):
 #   Column             Dtype  
---  ------             -----  
 0   ARREST_KEY         int64  
 1   ARREST_DATE        object 
 2   PD_CD              float64
 3   PD_DESC            object 
 4   KY_CD              float64
 5   OFNS_DESC          object 
 6   LAW_CODE           object 
 7   LAW_CAT_CD         object 
 8   ARREST_BORO        object 
 9   ARREST_PRECINCT    int64  
 10  JURISDICTION_CODE  float64
 11  AGE_GROUP          object 
 12  PERP_SEX           object 
 13  PERP_RACE          object 
 14  X_COORD_CD         float64
 15  Y_COORD_CD         float64
 16  Latitude           float64
 17  Longitude          float64
 18  Lon_Lat            object 
dtypes: float64(7), int64(2), object(10)
memory usage: 867.7+ MB


In [None]:
#Metropolitan area
NYPD_Arrests_df["Metropolitan"] = "NYC"

#Convert string to datetime
NYPD_Arrests_df["ARREST_DATE"] = pd.to_datetime(NYPD_Arrests_df["ARREST_DATE"])

# Extract the year
NYPD_Arrests_df["CRIME_YEAR"] = NYPD_Arrests_df["ARREST_DATE"].dt.year

#Analysis from 2014 to 2024
NYPD_Arrests_df = NYPD_Arrests_df[NYPD_Arrests_df["CRIME_YEAR"] >= 2014] 

# Extract the month
NYPD_Arrests_df["CRIME_MONTH"] = NYPD_Arrests_df["ARREST_DATE"].dt.month

# Extract the Day of Week
NYPD_Arrests_df["CRIME_DOW"] = NYPD_Arrests_df["ARREST_DATE"].dt.weekday

# Define a dictionary for mapping WEEK
week_mapping = {
    0: 'Monday',
    1: 'Tuesday',
    2: 'Wednesday',
    3: 'Thursday',
    4: 'Friday',
    5: 'Saturday',
    6: 'Sunday'
}

# Apply the mapping
NYPD_Arrests_df["CRIME_DOW"] = NYPD_Arrests_df["CRIME_DOW"].map(week_mapping)


# Define a dictionary for mapping MONTH
month_mapping = {
    1: 'January',
    2: 'February',
    3: 'March',
    4: 'April',
    5: 'May',
    6: 'June',
    7: 'July',
    8: 'August',
    9: 'September',
    10: 'October',
    11: 'November',
    12: 'December',

}

# Apply the mapping
NYPD_Arrests_df["CRIME_MONTH"] = NYPD_Arrests_df["CRIME_MONTH"].map(month_mapping)


#Pick the valid age groups
valid_age_group_lst = ['25-44', '18-24', '45-64', '<18', '65+']
len(NYPD_Arrests_df)
NYPD_Arrests_df = NYPD_Arrests_df[NYPD_Arrests_df['AGE_GROUP'].isin(valid_age_group_lst)]


# Define a dictionary for PERP_SEX
sex_mapping = {
    'M': 'Male',
    'F': 'Female',
    'U': 'Undisclosed'
}

# Apply the mapping
NYPD_Arrests_df["PERP_SEX"] = NYPD_Arrests_df["PERP_SEX"].map(sex_mapping)


#Law Category Code
# Define a dictionary for LAW_CAT_CD
law_cat_cd_mapping = {
    'F': 'Felony',
    'M': 'Misdemeanor',
    'V': 'Violation',
    'I':'Unclassified Misdemeanors',
    '9':'Resisting/Obstruction'
}

# Apply the mapping
NYPD_Arrests_df["LAW_CAT_CD"].value_counts(dropna=False)
NYPD_Arrests_df["LAW_CAT_CD"] = NYPD_Arrests_df["LAW_CAT_CD"].map(law_cat_cd_mapping)
NYPD_Arrests_df["LAW_CAT_CD"].value_counts(dropna=False)

#Refine the offense type
NYPD_Arrests_df['TYPE_OF_OFFENSE'] = NYPD_Arrests_df['OFNS_DESC'].apply(lambda s: categorize_offence(s))


NYPD_Arrests_df.head()


In [256]:
for col in ["CRIME_YEAR","CRIME_MONTH","CRIME_DOW",'AGE_GROUP','PERP_SEX','PERP_RACE','LAW_CAT_CD','TYPE_OF_OFFENSE']:
    NYPD_Arrests_df[col].value_counts(dropna=False)


CRIME_YEAR
2014    387727
2015    339470
2016    314864
2017    286225
2024    260503
2018    246773
2023    226872
2019    214617
2022    189774
2021    155507
2020    140413
Name: count, dtype: int64

CRIME_MONTH
March        247188
May          244374
October      240055
January      238356
August       238017
July         232202
April        230715
June         227742
February     226245
September    223798
November     217315
December     196738
Name: count, dtype: int64

CRIME_DOW
Wednesday    484058
Thursday     460321
Tuesday      435994
Friday       417553
Saturday     356720
Monday       322827
Sunday       285272
Name: count, dtype: int64

AGE_GROUP
25-44    1435658
18-24     609916
45-64     540995
<18       142193
65+        33983
Name: count, dtype: int64

PERP_SEX
Male           2278901
Female          480340
Undisclosed       3504
Name: count, dtype: int64

PERP_RACE
BLACK                             1326789
WHITE HISPANIC                     701832
WHITE                              318046
BLACK HISPANIC                     245351
ASIAN / PACIFIC ISLANDER           141011
UNKNOWN                             22414
AMERICAN INDIAN/ALASKAN NATIVE       7302
Name: count, dtype: int64

LAW_CAT_CD
Misdemeanor                  1684885
Felony                        968678
Violation                      82351
NaN                            15906
Unclassified Misdemeanors       9124
Resisting/Obstruction           1801
Name: count, dtype: int64

TYPE_OF_OFFENSE
ASSAULT                             880047
THEFT                               536772
DRINKING/DRUG OFFENSES              351638
OTHER                               334816
PARKING/TRAFFIC/VEHICLE OFFENSES    266106
BREAK AND ENTER                     117997
ROBBERY                             107876
FRAUD                                96627
SEX CRIMES                           48169
MURDER                               13863
DISORDERLY/DISRUPTION/HARASSMENT      8277
CHILD CRIMES                           403
LOITERING                              104
KIDNAPPING                              50
Name: count, dtype: int64

### TORONTO CRIMES

In [257]:
#Crimes in Toronto
file_name = 'toronto-major-crime-indicators.csv'

#Read dataset
toronto_crimes_df = pd.read_csv(file_name)
len(toronto_crimes_df) #420,200
toronto_crimes_df.head()
toronto_crimes_df.info()

420200

Unnamed: 0,_id,EVENT_UNIQUE_ID,REPORT_DATE,OCC_DATE,REPORT_YEAR,REPORT_MONTH,REPORT_DAY,REPORT_DOY,REPORT_DOW,REPORT_HOUR,...,UCR_CODE,UCR_EXT,OFFENCE,MCI_CATEGORY,HOOD_158,NEIGHBOURHOOD_158,HOOD_140,NEIGHBOURHOOD_140,LONG_WGS84,LAT_WGS84
0,1,GO-20141263217,2014-01-01,2013-12-31,2014,January,1,1,Wednesday,16,...,2135,210,Theft Of Motor Vehicle,Auto Theft,43,Victoria Village (43),43,Victoria Village (43),-79.306754,43.734654
1,2,GO-20141260715,2014-01-01,2014-01-01,2014,January,1,1,Wednesday,3,...,1430,100,Assault,Assault,92,Corso Italia-Davenport (92),92,Corso Italia-Davenport (92),-79.45577,43.677775
2,3,GO-20141260730,2014-01-01,2014-01-01,2014,January,1,1,Wednesday,3,...,1430,100,Assault,Assault,105,Lawrence Park North (105),105,Lawrence Park North (105),-79.406223,43.727681
3,4,GO-20141260597,2014-01-01,2014-01-01,2014,January,1,1,Wednesday,2,...,1430,100,Assault,Assault,80,Palmerston-Little Italy (80),80,Palmerston-Little Italy (80),-79.415594,43.654946
4,5,GO-20141259762,2014-01-01,2013-12-31,2014,January,1,1,Wednesday,2,...,1430,100,Assault,Assault,164,Wellington Place (164),77,Waterfront Communities-The Island (77),-79.390786,43.649125


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420200 entries, 0 to 420199
Data columns (total 29 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   _id                420200 non-null  int64  
 1   EVENT_UNIQUE_ID    420200 non-null  object 
 2   REPORT_DATE        420200 non-null  object 
 3   OCC_DATE           420200 non-null  object 
 4   REPORT_YEAR        420200 non-null  int64  
 5   REPORT_MONTH       420200 non-null  object 
 6   REPORT_DAY         420200 non-null  int64  
 7   REPORT_DOY         420200 non-null  int64  
 8   REPORT_DOW         420200 non-null  object 
 9   REPORT_HOUR        420200 non-null  int64  
 10  OCC_YEAR           420056 non-null  float64
 11  OCC_MONTH          420056 non-null  object 
 12  OCC_DAY            420056 non-null  float64
 13  OCC_DOY            420056 non-null  float64
 14  OCC_DOW            420056 non-null  object 
 15  OCC_HOUR           420200 non-null  int64  
 16  DI

In [258]:
#Metropolitan area
toronto_crimes_df["Metropolitan"] = "Toronto"

# Extract the year
toronto_crimes_df["CRIME_YEAR"] = toronto_crimes_df["OCC_YEAR"].astype('Int64')
toronto_crimes_df = toronto_crimes_df[toronto_crimes_df["CRIME_YEAR"] >= 2014]

# Extract the month
toronto_crimes_df["CRIME_MONTH"] = toronto_crimes_df["OCC_MONTH"]

# Extract the Day of Week
toronto_crimes_df["CRIME_DOW"] = toronto_crimes_df["OCC_DOW"]

# Extract the Hour of Day
toronto_crimes_df["CRIME_HOUR"] = toronto_crimes_df["OCC_HOUR"]

#Duration between occurrence data and report date
#Convert string to datetime
toronto_crimes_df["OCC_DATE"] = pd.to_datetime(toronto_crimes_df["OCC_DATE"])
toronto_crimes_df["REPORT_DATE"] = pd.to_datetime(toronto_crimes_df["REPORT_DATE"])
toronto_crimes_df["TIME_LAPSE_OCC_REPORT_DAYS"] = (toronto_crimes_df["REPORT_DATE"] - toronto_crimes_df["OCC_DATE"]).dt.days


#Category of Crime
toronto_crimes_df["CATEGORY_OF_CRIME"] = toronto_crimes_df["MCI_CATEGORY"]

#Redefine crime category
toronto_crimes_df['TYPE_OF_OFFENSE'] = toronto_crimes_df['OFFENCE'].apply(lambda s: categorize_offence(s))

toronto_crimes_df.head()

Unnamed: 0,_id,EVENT_UNIQUE_ID,REPORT_DATE,OCC_DATE,REPORT_YEAR,REPORT_MONTH,REPORT_DAY,REPORT_DOY,REPORT_DOW,REPORT_HOUR,...,LONG_WGS84,LAT_WGS84,Metropolitan,CRIME_YEAR,CRIME_MONTH,CRIME_DOW,CRIME_HOUR,TIME_LAPSE_OCC_REPORT_DAYS,CATEGORY_OF_CRIME,TYPE_OF_OFFENSE
1,2,GO-20141260715,2014-01-01,2014-01-01,2014,January,1,1,Wednesday,3,...,-79.45577,43.677775,Toronto,2014,January,Wednesday,3,0,Assault,ASSAULT
2,3,GO-20141260730,2014-01-01,2014-01-01,2014,January,1,1,Wednesday,3,...,-79.406223,43.727681,Toronto,2014,January,Wednesday,3,0,Assault,ASSAULT
3,4,GO-20141260597,2014-01-01,2014-01-01,2014,January,1,1,Wednesday,2,...,-79.415594,43.654946,Toronto,2014,January,Wednesday,2,0,Assault,ASSAULT
5,6,GO-20141260264,2014-01-01,2014-01-01,2014,January,1,1,Wednesday,1,...,-79.132915,43.780413,Toronto,2014,January,Wednesday,1,0,Assault,ASSAULT
6,7,GO-20141260264,2014-01-01,2014-01-01,2014,January,1,1,Wednesday,1,...,-79.132915,43.780413,Toronto,2014,January,Wednesday,1,0,Assault,ASSAULT


In [259]:
for col in ["CRIME_YEAR","CRIME_MONTH","CRIME_DOW",'PREMISES_TYPE',"CATEGORY_OF_CRIME",'TYPE_OF_OFFENSE']:
    toronto_crimes_df[col].value_counts(dropna=False)


CRIME_YEAR
2023    48992
2024    45218
2022    41674
2019    40165
2018    37580
2017    35565
2020    35302
2021    34911
2016    33673
2015    32957
2014    32500
Name: count, dtype: Int64

CRIME_MONTH
October      36990
August       36902
July         36867
May          36113
September    36008
June         35993
November     35807
December     34007
January      33681
March        33190
April        32846
February     30133
Name: count, dtype: int64

CRIME_DOW
Friday        63180
Saturday      61367
Thursday      59326
Sunday        59309
Wednesday     59031
Monday        58612
Tuesday       57712
Name: count, dtype: int64

PREMISES_TYPE
Outside        113708
Apartment       97648
Commercial      82851
House           75730
Other           25267
Transit         12896
Educational     10437
Name: count, dtype: int64

CATEGORY_OF_CRIME
Assault            222293
Break and Enter     76931
Auto Theft          68006
Robbery             37040
Theft Over          14267
Name: count, dtype: int64

TYPE_OF_OFFENSE
ASSAULT                   221589
THEFT                      82273
BREAK AND ENTER            76931
ROBBERY                    37040
DRINKING/DRUG OFFENSES       704
Name: count, dtype: int64

### COMBINE BOTH DATASETS FOR TABLEAU

In [260]:
#NYC
NYC_cols = ['Metropolitan','CRIME_YEAR','CRIME_MONTH','CRIME_DOW','AGE_GROUP','PERP_SEX','PERP_RACE','LAW_CAT_CD','TYPE_OF_OFFENSE']
NYPD_df = NYPD_Arrests_df[NYC_cols]
NYPD_df.head()

#Toronto
tor_cols = ['Metropolitan','CRIME_YEAR','CRIME_MONTH','CRIME_DOW','CRIME_HOUR','TIME_LAPSE_OCC_REPORT_DAYS',
            'PREMISES_TYPE','LONG_WGS84','LAT_WGS84','TYPE_OF_OFFENSE']
toronto_df = toronto_crimes_df[tor_cols]
toronto_df.head()

Unnamed: 0,Metropolitan,CRIME_YEAR,CRIME_MONTH,CRIME_DOW,AGE_GROUP,PERP_SEX,PERP_RACE,LAW_CAT_CD,TYPE_OF_OFFENSE
0,NYC,2023,December,Tuesday,25-44,Male,WHITE,Felony,ASSAULT
1,NYC,2023,December,Saturday,25-44,Male,BLACK,Felony,ASSAULT
2,NYC,2023,December,Tuesday,25-44,Male,BLACK,Felony,SEX CRIMES
3,NYC,2023,December,Sunday,45-64,Male,BLACK,Felony,SEX CRIMES
4,NYC,2023,November,Wednesday,<18,Male,WHITE HISPANIC,Misdemeanor,OTHER


Unnamed: 0,Metropolitan,CRIME_YEAR,CRIME_MONTH,CRIME_DOW,CRIME_HOUR,TIME_LAPSE_OCC_REPORT_DAYS,PREMISES_TYPE,LONG_WGS84,LAT_WGS84,TYPE_OF_OFFENSE
1,Toronto,2014,January,Wednesday,3,0,Apartment,-79.45577,43.677775,ASSAULT
2,Toronto,2014,January,Wednesday,3,0,Outside,-79.406223,43.727681,ASSAULT
3,Toronto,2014,January,Wednesday,2,0,Apartment,-79.415594,43.654946,ASSAULT
5,Toronto,2014,January,Wednesday,1,0,Transit,-79.132915,43.780413,ASSAULT
6,Toronto,2014,January,Wednesday,1,0,Transit,-79.132915,43.780413,ASSAULT


In [261]:
tableau_df = pd.concat([NYPD_df, toronto_df])
len(tableau_df)
tableau_df.info()
tableau_df.head()
tableau_df.to_csv("tableau_df.csv")

3181282

<class 'pandas.core.frame.DataFrame'>
Index: 3181282 entries, 0 to 420199
Data columns (total 14 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   Metropolitan                object 
 1   CRIME_YEAR                  Int64  
 2   CRIME_MONTH                 object 
 3   CRIME_DOW                   object 
 4   AGE_GROUP                   object 
 5   PERP_SEX                    object 
 6   PERP_RACE                   object 
 7   LAW_CAT_CD                  object 
 8   TYPE_OF_OFFENSE             object 
 9   CRIME_HOUR                  float64
 10  TIME_LAPSE_OCC_REPORT_DAYS  float64
 11  PREMISES_TYPE               object 
 12  LONG_WGS84                  float64
 13  LAT_WGS84                   float64
dtypes: Int64(1), float64(4), object(9)
memory usage: 367.1+ MB


Unnamed: 0,Metropolitan,CRIME_YEAR,CRIME_MONTH,CRIME_DOW,AGE_GROUP,PERP_SEX,PERP_RACE,LAW_CAT_CD,TYPE_OF_OFFENSE,CRIME_HOUR,TIME_LAPSE_OCC_REPORT_DAYS,PREMISES_TYPE,LONG_WGS84,LAT_WGS84
0,NYC,2023,December,Tuesday,25-44,Male,WHITE,Felony,ASSAULT,,,,,
1,NYC,2023,December,Saturday,25-44,Male,BLACK,Felony,ASSAULT,,,,,
2,NYC,2023,December,Tuesday,25-44,Male,BLACK,Felony,SEX CRIMES,,,,,
3,NYC,2023,December,Sunday,45-64,Male,BLACK,Felony,SEX CRIMES,,,,,
4,NYC,2023,November,Wednesday,<18,Male,WHITE HISPANIC,Misdemeanor,OTHER,,,,,


### MACHINE LEARNING DATASET

In [279]:
#Curate the dataset for building the ML classification model
ml_col_lst = ['CRIME_YEAR', 'CRIME_MONTH', 'CRIME_DOW','ARREST_BORO', 'ARREST_PRECINCT',
              'JURISDICTION_CODE', 'AGE_GROUP', 'PERP_SEX', 'PERP_RACE', 'TYPE_OF_OFFENSE']

NYPD_Arrests_ML_df = NYPD_Arrests_df[ml_col_lst]

#Drop na values
len(NYPD_Arrests_ML_df)
NYPD_Arrests_ML_df = NYPD_Arrests_ML_df.dropna()
len(NYPD_Arrests_ML_df)


NYPD_Arrests_ML_df.head()
NYPD_Arrests_ML_df.to_csv("NYPD_Arrests_ML_df.csv")

2762745

2762745

Unnamed: 0,CRIME_YEAR,CRIME_MONTH,CRIME_DOW,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,AGE_GROUP,PERP_SEX,PERP_RACE,TYPE_OF_OFFENSE
0,2023,December,Tuesday,M,18,0.0,25-44,Male,WHITE,ASSAULT
1,2023,December,Saturday,K,67,0.0,25-44,Male,BLACK,ASSAULT
2,2023,December,Tuesday,K,77,0.0,25-44,Male,BLACK,SEX CRIMES
3,2023,December,Sunday,B,46,0.0,45-64,Male,BLACK,SEX CRIMES
4,2023,November,Wednesday,Q,104,0.0,<18,Male,WHITE HISPANIC,OTHER


In [280]:
ml_col_lst = ['CRIME_YEAR', 'CRIME_MONTH', 'CRIME_DOW','ARREST_BORO', 'ARREST_PRECINCT',
              'JURISDICTION_CODE', 'AGE_GROUP', 'PERP_SEX', 'PERP_RACE', 'TYPE_OF_OFFENSE']

for col in ml_col_lst:
    NYPD_Arrests_ML_df[col].value_counts(dropna=False)


CRIME_YEAR
2014    387727
2015    339470
2016    314864
2017    286225
2024    260503
2018    246773
2023    226872
2019    214617
2022    189774
2021    155507
2020    140413
Name: count, dtype: int64

CRIME_MONTH
March        247188
May          244374
October      240055
January      238356
August       238017
July         232202
April        230715
June         227742
February     226245
September    223798
November     217315
December     196738
Name: count, dtype: int64

CRIME_DOW
Wednesday    484058
Thursday     460321
Tuesday      435994
Friday       417553
Saturday     356720
Monday       322827
Sunday       285272
Name: count, dtype: int64

ARREST_BORO
K    759739
M    712807
B    620038
Q    559734
S    110427
Name: count, dtype: int64

ARREST_PRECINCT
14     94260
40     87169
75     86313
44     79848
73     63542
       ...  
123    12918
111    11715
22      1877
116      111
483        3
Name: count, Length: 79, dtype: int64

JURISDICTION_CODE
0.0     2357560
1.0      202838
2.0      122787
3.0       25832
97.0      17168
72.0       9385
73.0       3840
4.0        3671
6.0        3112
11.0       2940
17.0       2731
15.0       2570
7.0        1732
71.0       1515
14.0       1465
69.0        871
87.0        799
85.0        297
88.0        281
9.0         261
12.0        260
51.0        258
13.0        198
16.0        183
52.0         83
79.0         51
74.0         48
76.0          6
8.0           3
Name: count, dtype: int64

AGE_GROUP
25-44    1435658
18-24     609916
45-64     540995
<18       142193
65+        33983
Name: count, dtype: int64

PERP_SEX
Male           2278901
Female          480340
Undisclosed       3504
Name: count, dtype: int64

PERP_RACE
BLACK                             1326789
WHITE HISPANIC                     701832
WHITE                              318046
BLACK HISPANIC                     245351
ASIAN / PACIFIC ISLANDER           141011
UNKNOWN                             22414
AMERICAN INDIAN/ALASKAN NATIVE       7302
Name: count, dtype: int64

TYPE_OF_OFFENSE
ASSAULT                             880047
THEFT                               536772
DRINKING/DRUG OFFENSES              351638
OTHER                               334816
PARKING/TRAFFIC/VEHICLE OFFENSES    266106
BREAK AND ENTER                     117997
ROBBERY                             107876
FRAUD                                96627
SEX CRIMES                           48169
MURDER                               13863
DISORDERLY/DISRUPTION/HARASSMENT      8277
CHILD CRIMES                           403
LOITERING                              104
KIDNAPPING                              50
Name: count, dtype: int64

2419095

TYPE_OF_OFFENSE
ASSAULT                             880047
THEFT                               536772
DRINKING/DRUG OFFENSES              351638
PARKING/TRAFFIC/VEHICLE OFFENSES    266106
BREAK AND ENTER                     117997
ROBBERY                             107876
FRAUD                                96627
SEX CRIMES                           48169
MURDER                               13863
Name: count, dtype: int64

#### HANDLE IMBALANCED DATASET

### EXTRA CODE

In [None]:
df = NYPD_Arrests_df[['OFNS_DESC']]
df = df[df["OFNS_DESC"] != '(null)']
df = df.dropna()
df['OFNS_DESC'] = df['OFNS_DESC'].apply(lambda s: categorize_offence(s))
df.sort_values(by="OFNS_DESC", inplace=True)
#list(df['OFNS_DESC'].unique())
[val for val in list(df['OFNS_DESC'].unique())]

# or ('aslt' in offence)
df['OFNS_DESC'].value_counts(dropna=False)




df = toronto_crimes_df[['OFFENCE']]
df = df.dropna()
df['OFFENCE'] = df['OFFENCE'].apply(lambda s: categorize_offence(s))

df.sort_values(by="OFFENCE", inplace=True)
[categorize_offence(val) for val in list(df['OFFENCE'].unique())]


toronto_crimes_df["MCI_CATEGORY"].value_counts(dropna=False)

for col in ['PD_DESC','OFNS_DESC']:
    NYPD_Arrests_df[col].nunique()
    NYPD_Arrests_df[col].value_counts(dropna=False)