In [1]:
import pandas as pd

In [3]:
# Load the two incident tables from CSV files in the working directory.
chicago_df = pd.read_csv("Chicago_data.csv")
nibrs_df = pd.read_csv("NIBRS_timedata.csv")

# Show (rows, columns) for each table as a quick check on the load.
print(f"Chicago dataset shape: {chicago_df.shape}")
print(f"NIBRS dataset shape: {nibrs_df.shape}")

Chicago dataset shape: (1100000, 22)
NIBRS dataset shape: (1199713, 3)


In [8]:
# Parse the `date` columns into datetimes. errors='coerce' turns any value
# that cannot be parsed into NaT instead of raising, so report how many values
# were lost that way; nothing is printed when every value parses cleanly.
for label, frame in (("Chicago", chicago_df), ("NIBRS", nibrs_df)):
    missing_before = frame['date'].isna().sum()
    frame['date'] = pd.to_datetime(frame['date'], errors='coerce')
    coerced = frame['date'].isna().sum() - missing_before
    if coerced:
        print(f"{label}: {coerced} unparseable date value(s) coerced to NaT")

In [9]:
# Preview the first five NIBRS rows (head() defaults to 5).
nibrs_df.head()

Unnamed: 0,INCIDENT_ID,date,arrest
0,133791805,2020-12-22 23:00:00,False
1,133787561,2020-10-27 14:00:00,True
2,133791812,2020-12-20 04:00:00,True
3,133787564,2020-12-25 15:00:00,True
4,133791816,2020-09-26 18:00:00,False


In [10]:
# Preview the first five Chicago rows (head() defaults to 5).
chicago_df.head()

Unnamed: 0,id,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,...,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location
0,13777896,JJ183487,2025-03-16 03:00:00,040XX N KEYSTONE AVE,486,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,True,...,39.0,16.0,08B,1148566.0,1926623.0,2025,2025-03-19T15:41:08.000,41.954594,-87.729245,"\n, \n(41.954593897, -87.729244692)"
1,13776543,JJ182816,2025-03-12 00:00:00,037XX W NORTH AVE,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,...,26.0,23.0,07,1151340.0,1910377.0,2025,2025-03-19T15:42:01.000,41.909959,-87.719475,"\n, \n(41.909959416, -87.719474573)"
2,13772937,JJ178623,2025-03-12 00:00:00,076XX S EAST END AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,RESIDENCE,False,False,...,8.0,43.0,11,1188860.0,1854711.0,2025,2025-03-19T15:42:01.000,41.756389,-87.583428,"\n, \n(41.756389436, -87.583428355)"
3,13774108,JJ179898,2025-03-12 00:00:00,097XX S MERRILL AVE,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,...,7.0,51.0,07,1192381.0,1840724.0,2025,2025-03-19T15:42:01.000,41.717923,-87.570979,"\n, \n(41.717922891, -87.570978883)"
4,13772980,JJ178262,2025-03-12 00:00:00,095XX S HALSTED ST,560,ASSAULT,SIMPLE,LIBRARY,False,False,...,21.0,73.0,08A,1172656.0,1841600.0,2025,2025-03-19T15:42:01.000,41.720783,-87.643198,"\n, \n(41.720783347, -87.643197739)"


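Basic time features: the following components are extracted from each incident's `date` timestamp:
- month – the month of the incident (1–12)
- day – the day of the month of the incident
- hour – the hour of the incident (0–23)
- weekday – the day of the week (0 = Monday, 6 = Sunday)
- is_weekend – 1 if the incident falls on a Saturday or Sunday, otherwise 0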

In [11]:
def extractBasicTimeFeature(df):
    """Return a copy of ``df`` with calendar features derived from ``date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-typed ``date`` column (the ``.dt`` accessor is
        used, so a non-datetime column raises ``AttributeError``).

    Returns
    -------
    pandas.DataFrame
        A new frame with these columns added (or overwritten if present):
        ``month``, ``day``, ``hour``, ``weekday`` (0=Monday ... 6=Sunday) and
        ``is_weekend`` (1 for Saturday/Sunday, otherwise 0).

    Notes
    -----
    The input frame is left untouched; callers must use the return value.
    A row whose weekday is missing gets ``is_weekend == 0`` because the
    membership test is simply false for it.
    """
    stamps = df['date'].dt
    weekday = stamps.weekday  # 0=Monday, 6=Sunday

    # assign() builds a new frame, so the caller's DataFrame is not mutated.
    return df.assign(
        month=stamps.month,
        day=stamps.day,
        hour=stamps.hour,
        weekday=weekday,
        is_weekend=weekday.isin([5, 6]).astype(int),
    )

In [12]:
# Add the calendar feature columns to both datasets.
chicago_df = extractBasicTimeFeature(chicago_df)
nibrs_df = extractBasicTimeFeature(nibrs_df)

# Each shape should show five more columns than before this step.
print(f"Chicago dataset shape: {chicago_df.shape}")
print(f"NIBRS dataset shape: {nibrs_df.shape}")

Chicago dataset shape: (1100000, 27)
NIBRS dataset shape: (1199713, 8)


In [13]:
# Preview the first five NIBRS rows with the new time feature columns.
nibrs_df.head()

Unnamed: 0,INCIDENT_ID,date,arrest,month,day,hour,weekday,is_weekend
0,133791805,2020-12-22 23:00:00,False,12,22,23,1,0
1,133787561,2020-10-27 14:00:00,True,10,27,14,1,0
2,133791812,2020-12-20 04:00:00,True,12,20,4,6,1
3,133787564,2020-12-25 15:00:00,True,12,25,15,4,0
4,133791816,2020-09-26 18:00:00,False,9,26,18,5,1


Holiday feature extraction: adds a binary flag, `is_holiday`, that is 1 when the incident's calendar date is a US holiday (as defined by the `holidays` package) and 0 otherwise.

In [15]:
import holidays

def extractHolidayFeature(df):
    """Return a copy of ``df`` with an ``is_holiday`` flag for US holidays.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-typed ``date`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with an integer ``is_holiday`` column: 1 when the row's
        calendar date is in ``holidays.US()``, otherwise 0.

    Notes
    -----
    The input frame is left untouched; callers must use the return value.
    Rows with a missing date are flagged 0 and are never passed to the
    holiday lookup.
    """
    us_holidays = holidays.US()
    incident_days = df['date'].dt.date

    # Run the Python-level holiday lookup once per distinct calendar date
    # rather than once per row; the per-row work is then a vectorized isin().
    holiday_days = [
        day for day in incident_days.dropna().unique() if day in us_holidays
    ]

    return df.assign(is_holiday=incident_days.isin(holiday_days).astype(int))


In [16]:
# Add the is_holiday flag to both datasets.
chicago_df = extractHolidayFeature(chicago_df)
nibrs_df = extractHolidayFeature(nibrs_df)

# Each shape should show one more column than before this step.
print(f"Chicago dataset shape: {chicago_df.shape}")
print(f"NIBRS dataset shape: {nibrs_df.shape}")

# Last expression of the cell: display the first five NIBRS rows.
nibrs_df.head()

Chicago dataset shape: (1100000, 28)
NIBRS dataset shape: (1199713, 9)


Unnamed: 0,INCIDENT_ID,date,arrest,month,day,hour,weekday,is_weekend,is_holiday
0,133791805,2020-12-22 23:00:00,False,12,22,23,1,0,0
1,133787561,2020-10-27 14:00:00,True,10,27,14,1,0,0
2,133791812,2020-12-20 04:00:00,True,12,20,4,6,1,0
3,133787564,2020-12-25 15:00:00,True,12,25,15,4,0,1
4,133791816,2020-09-26 18:00:00,False,9,26,18,5,1,0


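Time since last crime (`time_since_last_crime`): incidents are sorted chronologically by `date`, and for each incident the time difference (in hours) from the immediately preceding incident in the same dataset is computed. The earliest incident has no predecessor, so its value is missing (NaN).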

In [20]:
# Combining this feature with spatial features may be useful for crime hot-spot analysis.
def extractTimeIntervalWithLastCrime(df):
    """Sort incidents by ``date`` and add the gap, in hours, to the previous row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-typed ``date`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame, ordered by ``date`` with the original index labels kept,
        carrying a ``time_since_last_crime`` column. The first row has no
        predecessor, so its value is NaN.
    """
    seconds_per_hour = 3600

    # sort_values returns a new frame, so the caller's frame is not modified.
    chronological = df.sort_values(by='date')
    gap_seconds = chronological['date'].diff().dt.total_seconds()
    chronological['time_since_last_crime'] = gap_seconds / seconds_per_hour
    return chronological


In [21]:
# Sort each dataset by date and add the time_since_last_crime column.
chicago_df = extractTimeIntervalWithLastCrime(chicago_df)
nibrs_df = extractTimeIntervalWithLastCrime(nibrs_df)

# Each shape should show one more column than before this step.
print(f"Chicago dataset shape: {chicago_df.shape}")
print(f"NIBRS dataset shape: {nibrs_df.shape}")

# Last expression of the cell: display the first five NIBRS rows.
nibrs_df.head()

Chicago dataset shape: (1100000, 29)
NIBRS dataset shape: (1199713, 10)


Unnamed: 0,INCIDENT_ID,date,arrest,month,day,hour,weekday,is_weekend,is_holiday,time_since_last_crime
10402,135341753,2020-01-01,False,1,1,0,2,0,1,
5631,133788609,2020-01-01,False,1,1,0,2,0,1,0.0
1706,122507148,2020-01-01,False,1,1,0,2,0,1,0.0
10186,122506302,2020-01-01,False,1,1,0,2,0,1,0.0
5413,133787212,2020-01-01,False,1,1,0,2,0,1,0.0
