In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer

In [2]:
# Names of the DataFrame variables this notebook works with. The second entry
# is spelled to match the `dfhomicides` variable that the loading cell binds,
# so a lookup by name resolves to an existing frame.
df_list = ['dfmajorcrimes', 'dfhomicides']

In [3]:
# Read the two raw CSV files (major crimes first, then homicides) into DataFrames.
dfmajorcrimes, dfhomicides = (
    pd.read_csv(csv_path)
    for csv_path in ('raw_data/majorcrimes.csv', 'raw_data/homicidies.csv')
)

In [4]:
def show_all_columns(df):
    """Display ``df`` with pandas' column limit lifted for this one call.

    ``display.max_columns`` is set to ``None`` only inside the context
    manager, so the previous setting is back in effect once this returns.
    """
    unlimited_columns = pd.option_context('display.max_columns', None)
    with unlimited_columns:
        display(df)
def show_all_rows(df):
    """Display ``df`` with pandas' row limit lifted for this one call.

    ``display.max_rows`` is set to ``None`` only inside the context
    manager, so the previous setting is back in effect once this returns.
    """
    unlimited_rows = pd.option_context('display.max_rows', None)
    with unlimited_rows:
        display(df)

In [5]:
# Target schema shared by both datasets, in output order. Columns that a
# source frame lacks are created and filled with None.
columns = ['EVENT_UNIQUE_ID','DATASET','OFFENCE', 'MCI_CATEGORY','OCC_HOUR','OCC_DAY','OCC_MONTH','OCC_YEAR','OCC_DOW','OCC_DOY','LAT_WGS84','LONG_WGS84']


def _select_target_columns(df, target_columns, dataset_label):
    """Return a new frame restricted to ``target_columns`` and tagged with its source.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw source frame. It is left unmodified.
    target_columns : list of str
        Column names in the desired output order (expected to include
        ``'DATASET'``). Names absent from ``df`` are added as all-``None``
        columns.
    dataset_label : str
        Value written to every row of the ``'DATASET'`` column.

    Returns
    -------
    pandas.DataFrame
        An independent copy of ``df`` holding the requested columns.
    """
    absent = {name: None for name in target_columns if name not in df.columns}
    # assign() builds a new frame, so the raw input keeps its original columns.
    # The explicit copy() makes the result independent of the selection it came
    # from, so the column write below cannot raise SettingWithCopyWarning.
    selected = df.assign(**absent)[target_columns].copy()
    selected['DATASET'] = dataset_label
    return selected


# Project each raw dataset onto the shared schema and label its origin
dfhomicides_s = _select_target_columns(dfhomicides, columns, 'HOMICIDES')
dfmajorcrimes_s = _select_target_columns(dfmajorcrimes, columns, 'MAJOR_CRIMES')

# Stack homicides on top of major crimes under a fresh 0..n-1 index
dfmatrix = pd.concat([dfhomicides_s, dfmajorcrimes_s], ignore_index=True)

# Display the resulting DataFrame
dfmatrix

Unnamed: 0,EVENT_UNIQUE_ID,DATASET,OFFENCE,MCI_CATEGORY,OCC_HOUR,OCC_DAY,OCC_MONTH,OCC_YEAR,OCC_DOW,OCC_DOY,LAT_WGS84,LONG_WGS84
0,GO-2004111878,HOMICIDES,,,,3.0,January,2004.0,Saturday,3.0,43.685026,-79.392828
1,GO-2004125755,HOMICIDES,,,,8.0,January,2004.0,Thursday,8.0,43.781782,-79.233852
2,GO-2004136086,HOMICIDES,,,,8.0,January,2004.0,Thursday,8.0,43.810544,-79.205574
3,GO-2004148623,HOMICIDES,,,,25.0,January,2004.0,Sunday,25.0,43.670467,-79.434387
4,GO-2004148619,HOMICIDES,,,,25.0,January,2004.0,Sunday,25.0,43.822997,-79.204958
...,...,...,...,...,...,...,...,...,...,...,...,...
386096,GO-2024688981,MAJOR_CRIMES,Theft Of Motor Vehicle,Auto Theft,16,30.0,March,2024.0,Saturday,90.0,43.755641,-79.196001
386097,GO-2024690900,MAJOR_CRIMES,Assault,Assault,16,31.0,March,2024.0,Sunday,91.0,43.595354,-79.529766
386098,GO-2024690985,MAJOR_CRIMES,Assault,Assault,16,31.0,March,2024.0,Sunday,91.0,43.688644,-79.391479
386099,GO-2024690995,MAJOR_CRIMES,Theft Of Motor Vehicle,Auto Theft,23,30.0,March,2024.0,Saturday,90.0,43.684335,-79.372581


In [6]:
# Label every row that came from the homicides dataset as 'Homicide' in both
# the offence and category columns.
is_homicide = dfmatrix['DATASET'].eq('HOMICIDES')
for label_column in ('OFFENCE', 'MCI_CATEGORY'):
    dfmatrix.loc[is_homicide, label_column] = 'Homicide'
dfmatrix

Unnamed: 0,EVENT_UNIQUE_ID,DATASET,OFFENCE,MCI_CATEGORY,OCC_HOUR,OCC_DAY,OCC_MONTH,OCC_YEAR,OCC_DOW,OCC_DOY,LAT_WGS84,LONG_WGS84
0,GO-2004111878,HOMICIDES,Homicide,Homicide,,3.0,January,2004.0,Saturday,3.0,43.685026,-79.392828
1,GO-2004125755,HOMICIDES,Homicide,Homicide,,8.0,January,2004.0,Thursday,8.0,43.781782,-79.233852
2,GO-2004136086,HOMICIDES,Homicide,Homicide,,8.0,January,2004.0,Thursday,8.0,43.810544,-79.205574
3,GO-2004148623,HOMICIDES,Homicide,Homicide,,25.0,January,2004.0,Sunday,25.0,43.670467,-79.434387
4,GO-2004148619,HOMICIDES,Homicide,Homicide,,25.0,January,2004.0,Sunday,25.0,43.822997,-79.204958
...,...,...,...,...,...,...,...,...,...,...,...,...
386096,GO-2024688981,MAJOR_CRIMES,Theft Of Motor Vehicle,Auto Theft,16,30.0,March,2024.0,Saturday,90.0,43.755641,-79.196001
386097,GO-2024690900,MAJOR_CRIMES,Assault,Assault,16,31.0,March,2024.0,Sunday,91.0,43.595354,-79.529766
386098,GO-2024690985,MAJOR_CRIMES,Assault,Assault,16,31.0,March,2024.0,Sunday,91.0,43.688644,-79.391479
386099,GO-2024690995,MAJOR_CRIMES,Theft Of Motor Vehicle,Auto Theft,23,30.0,March,2024.0,Saturday,90.0,43.684335,-79.372581


In [7]:
# Sanity-check coordinates: flag rows whose latitude/longitude fall inside a
# bounding box around Toronto.

# Bounding box edges (same units as the LAT_WGS84 / LONG_WGS84 columns)
south_boundary = 43.5810
north_boundary = 43.8554
west_boundary = -79.6393
east_boundary = -79.1161

# Inclusive range test on each axis; a row must pass both to count as inside
lat_ok = dfmatrix['LAT_WGS84'].between(south_boundary, north_boundary)
lon_ok = dfmatrix['LONG_WGS84'].between(west_boundary, east_boundary)
within_boundaries = lat_ok & lon_ok

# Rows that pass the check
data_within_toronto = dfmatrix[within_boundaries]

data_within_toronto

Unnamed: 0,EVENT_UNIQUE_ID,DATASET,OFFENCE,MCI_CATEGORY,OCC_HOUR,OCC_DAY,OCC_MONTH,OCC_YEAR,OCC_DOW,OCC_DOY,LAT_WGS84,LONG_WGS84
0,GO-2004111878,HOMICIDES,Homicide,Homicide,,3.0,January,2004.0,Saturday,3.0,43.685026,-79.392828
1,GO-2004125755,HOMICIDES,Homicide,Homicide,,8.0,January,2004.0,Thursday,8.0,43.781782,-79.233852
2,GO-2004136086,HOMICIDES,Homicide,Homicide,,8.0,January,2004.0,Thursday,8.0,43.810544,-79.205574
3,GO-2004148623,HOMICIDES,Homicide,Homicide,,25.0,January,2004.0,Sunday,25.0,43.670467,-79.434387
4,GO-2004148619,HOMICIDES,Homicide,Homicide,,25.0,January,2004.0,Sunday,25.0,43.822997,-79.204958
...,...,...,...,...,...,...,...,...,...,...,...,...
386096,GO-2024688981,MAJOR_CRIMES,Theft Of Motor Vehicle,Auto Theft,16,30.0,March,2024.0,Saturday,90.0,43.755641,-79.196001
386097,GO-2024690900,MAJOR_CRIMES,Assault,Assault,16,31.0,March,2024.0,Sunday,91.0,43.595354,-79.529766
386098,GO-2024690985,MAJOR_CRIMES,Assault,Assault,16,31.0,March,2024.0,Sunday,91.0,43.688644,-79.391479
386099,GO-2024690995,MAJOR_CRIMES,Theft Of Motor Vehicle,Auto Theft,23,30.0,March,2024.0,Saturday,90.0,43.684335,-79.372581


In [8]:
# Keep only the rows inside the Toronto bounding box and renumber them from 0.
# This is built from `data_within_toronto` instead of re-applying the
# `within_boundaries` mask to `dfmatrix`: once this cell has run, `dfmatrix`
# carries a fresh 0..n-1 index, so running the cell again would align the old
# mask by label and silently drop the wrong rows.
dfmatrix = data_within_toronto.reset_index(drop=True)
dfmatrix

Unnamed: 0,EVENT_UNIQUE_ID,DATASET,OFFENCE,MCI_CATEGORY,OCC_HOUR,OCC_DAY,OCC_MONTH,OCC_YEAR,OCC_DOW,OCC_DOY,LAT_WGS84,LONG_WGS84
0,GO-2004111878,HOMICIDES,Homicide,Homicide,,3.0,January,2004.0,Saturday,3.0,43.685026,-79.392828
1,GO-2004125755,HOMICIDES,Homicide,Homicide,,8.0,January,2004.0,Thursday,8.0,43.781782,-79.233852
2,GO-2004136086,HOMICIDES,Homicide,Homicide,,8.0,January,2004.0,Thursday,8.0,43.810544,-79.205574
3,GO-2004148623,HOMICIDES,Homicide,Homicide,,25.0,January,2004.0,Sunday,25.0,43.670467,-79.434387
4,GO-2004148619,HOMICIDES,Homicide,Homicide,,25.0,January,2004.0,Sunday,25.0,43.822997,-79.204958
...,...,...,...,...,...,...,...,...,...,...,...,...
380240,GO-2024688981,MAJOR_CRIMES,Theft Of Motor Vehicle,Auto Theft,16,30.0,March,2024.0,Saturday,90.0,43.755641,-79.196001
380241,GO-2024690900,MAJOR_CRIMES,Assault,Assault,16,31.0,March,2024.0,Sunday,91.0,43.595354,-79.529766
380242,GO-2024690985,MAJOR_CRIMES,Assault,Assault,16,31.0,March,2024.0,Sunday,91.0,43.688644,-79.391479
380243,GO-2024690995,MAJOR_CRIMES,Theft Of Motor Vehicle,Auto Theft,23,30.0,March,2024.0,Saturday,90.0,43.684335,-79.372581
