In [5]:
from google.colab import drive

# Mount Google Drive at the path the rest of the notebook reads from and writes to.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
import pandas as pd

# Number of rows pandas loads per chunk; adjust this value based on your need.
chunk_size = 100000

# New York crime data CSV on the mounted Drive (replace with your file path).
csv_file = '/content/drive/MyDrive/MADS Capstone Team 23/Data/raw/CrimeData/NewYork/NewYork-CrimeData-2006-2024.csv'

# This pass only inspects the offense description, so read just that column
# instead of parsing the date column that is never used here.
chunk_iterator = pd.read_csv(csv_file, chunksize=chunk_size, usecols=["OFNS_DESC"])

# Collect every distinct offense description in the file, one chunk at a time,
# so the whole CSV never has to sit in memory at once.
crime_types = set()
for chunk in chunk_iterator:
    crime_types.update(chunk['OFNS_DESC'].unique())

In [10]:
# Show the distinct OFNS_DESC values gathered from all chunks.
crime_types

{'(null)',
 'ABORTION',
 'ADMINISTRATIVE CODE',
 'ADMINISTRATIVE CODES',
 'AGRICULTURE & MRKTS LAW-UNCLASSIFIED',
 'ALCOHOLIC BEVERAGE CONTROL LAW',
 'ANTICIPATORY OFFENSES',
 'ARSON',
 'ASSAULT 3 & RELATED OFFENSES',
 "BURGLAR'S TOOLS",
 'BURGLARY',
 'CANNABIS RELATED OFFENSES',
 'CHILD ABANDONMENT/NON SUPPORT',
 'CRIMINAL MISCHIEF & RELATED OF',
 'CRIMINAL TRESPASS',
 'DANGEROUS DRUGS',
 'DANGEROUS WEAPONS',
 'DISORDERLY CONDUCT',
 'DISRUPTION OF A RELIGIOUS SERV',
 'ENDAN WELFARE INCOMP',
 'ESCAPE 3',
 'FELONY ASSAULT',
 'FELONY SEX CRIMES',
 'FORGERY',
 'FORTUNE TELLING',
 'FRAUDS',
 'FRAUDULENT ACCOSTING',
 'GAMBLING',
 'GRAND LARCENY',
 'GRAND LARCENY OF MOTOR VEHICLE',
 'HARRASSMENT 2',
 'HOMICIDE-NEGLIGENT,UNCLASSIFIE',
 'HOMICIDE-NEGLIGENT-VEHICLE',
 'INTOXICATED & IMPAIRED DRIVING',
 'INTOXICATED/IMPAIRED DRIVING',
 'JOSTLING',
 'KIDNAPPING',
 'KIDNAPPING & RELATED OFFENSES',
 'KIDNAPPING AND RELATED OFFENSES',
 'LOITERING',
 'LOITERING FOR DRUG PURPOSES',
 'LOITERING/DEVIATE

In [11]:
# OFNS_DESC values counted as violent crime when filtering the complaint data.
violent_crimes = [
    'ARSON',
    'ASSAULT 3 & RELATED OFFENSES',
    'DANGEROUS WEAPONS',
    'FELONY ASSAULT',
    'FELONY SEX CRIMES',
    'HOMICIDE-NEGLIGENT,UNCLASSIFIE',
    'HOMICIDE-NEGLIGENT-VEHICLE',
    'KIDNAPPING',
    'KIDNAPPING & RELATED OFFENSES',
    'KIDNAPPING AND RELATED OFFENSES',
    'MURDER & NON-NEGL. MANSLAUGHTER',
    'RAPE',
    'ROBBERY',
    'SEX CRIMES',
]

# OFNS_DESC values counted as property crime when filtering the complaint data.
property_crimes = [
    "BURGLAR'S TOOLS",
    'BURGLARY',
    'CRIMINAL MISCHIEF & RELATED OF',
    'CRIMINAL TRESPASS',
    'FORGERY',
    'FRAUDS',
    'GRAND LARCENY',
    'GRAND LARCENY OF MOTOR VEHICLE',
    'PETIT LARCENY',
    'PETIT LARCENY OF MOTOR VEHICLE',
    'POSSESSION OF STOLEN PROPERTY',
    'THEFT OF SERVICES',
    'THEFT-FRAUD',
    'UNAUTHORIZED USE OF A VEHICLE',
]

In [19]:

# Second pass over the CSV: keep only the rows whose offense is in each category.
# Matching rows are collected per chunk and concatenated once at the end; calling
# pd.concat inside the loop re-copies every previously collected row on each chunk.
violent_chunks = []
property_chunks = []

chunk_iterator = pd.read_csv(csv_file, chunksize=chunk_size, usecols=["CMPLNT_FR_DT","OFNS_DESC"])
for chunk in chunk_iterator:
  violent_chunks.append(chunk[chunk['OFNS_DESC'].isin(violent_crimes)])
  property_chunks.append(chunk[chunk['OFNS_DESC'].isin(property_crimes)])

# pd.concat raises on an empty list, so fall back to an empty frame when no chunk was read.
violent_crime_df = pd.concat(violent_chunks) if violent_chunks else pd.DataFrame()
property_crime_df = pd.concat(property_chunks) if property_chunks else pd.DataFrame()

print(violent_crime_df.shape)
print(property_crime_df.shape)


(1965552, 2)
(4086143, 2)


In [20]:
# Preview the first rows of the filtered violent-crime frame.
violent_crime_df.head()

Unnamed: 0,CMPLNT_FR_DT,OFNS_DESC
0,12/05/2006,MURDER & NON-NEGL. MANSLAUGHTER
1,08/25/1973,MURDER & NON-NEGL. MANSLAUGHTER
2,04/22/2006,MURDER & NON-NEGL. MANSLAUGHTER
3,07/20/2006,MURDER & NON-NEGL. MANSLAUGHTER
4,04/14/2006,MURDER & NON-NEGL. MANSLAUGHTER


In [21]:

# Give the complaint-date column the shorter name 'Date' in both category frames (in place).
for crime_frame in (violent_crime_df, property_crime_df):
  crime_frame.rename(columns={'CMPLNT_FR_DT': 'Date'}, inplace=True)


In [23]:

def convert_date(df, date_format=None):
  """Return a new frame whose 'Date' column is parsed into datetimes.

  Values that cannot be parsed become NaT (``errors='coerce'``) instead of
  raising, so they can be dropped or inspected afterwards. The input frame
  is left unmodified; use the returned frame.

  Args:
    df: DataFrame with a 'Date' column of date strings.
    date_format: Optional strftime pattern (e.g. '%m/%d/%Y') forwarded to
      ``pd.to_datetime``. None (the default) lets pandas infer the format.

  Returns:
    A new DataFrame with the same columns, with 'Date' converted to datetime.
  """
  # assign() builds a new frame, so the caller's object is never mutated.
  return df.assign(Date=pd.to_datetime(df['Date'], errors='coerce', format=date_format))

# Parse the 'Date' column of both category frames and rebind each name to the result.
violent_crime_df, property_crime_df = map(convert_date, (violent_crime_df, property_crime_df))


In [25]:
# Drop, in place, every row that has a missing value in any column.
for crime_frame in (violent_crime_df, property_crime_df):
    crime_frame.dropna(inplace=True)

In [26]:
def count_incidents_by_month(crime_df):
    """Count the rows of ``crime_df`` per calendar month of its 'Date' column.

    Args:
        crime_df: DataFrame with a datetime 'Date' column, one row per incident.

    Returns:
        DataFrame with a 'Date' column formatted as 'YYYY-MM' strings and a
        'NumberOfIncident' column holding the row count for that month.
    """
    # Pass the offset object rather than the 'M' string alias: the alias is
    # deprecated in recent pandas releases, while MonthEnd() is valid everywhere.
    monthly_counts = (
        crime_df
        .groupby(pd.Grouper(key='Date', freq=pd.offsets.MonthEnd()))
        .size()
        .reset_index(name='NumberOfIncident')
    )
    # The grouper labels each bin with a month-end timestamp; keep only year-month.
    monthly_counts['Date'] = monthly_counts['Date'].dt.strftime('%Y-%m')
    return monthly_counts


violent_crimes_by_month = count_incidents_by_month(violent_crime_df)
property_crimes_by_month = count_incidents_by_month(property_crime_df)

In [29]:
# Write each monthly count table to its CSV in the processed-data folder, without the index.
monthly_outputs = [
    (violent_crimes_by_month, "/content/drive/MyDrive/MADS Capstone Team 23/Data/processed/newyork_violent_crimes_by_month.csv"),
    (property_crimes_by_month, "/content/drive/MyDrive/MADS Capstone Team 23/Data/processed/newyork_property_crimes_by_month.csv"),
]
for monthly_counts, destination in monthly_outputs:
    monthly_counts.to_csv(destination, index=False)