In [1]:
import pandas as pd
import numpy as np
import math
import geopandas as gpd
from shapely.geometry import Point
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

2024-06-01 16:49:25.361071: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-01 16:49:25.364026: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-01 16:49:25.407776: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Initialize Variables

In [2]:
df_list = ['dfmajorcrimes', 'dfhomicidies']

# Data Loading

In [3]:
def show_all_columns(df):
    with pd.option_context('display.max_columns', None):
        display(df)
def show_all_rows(df):
    with pd.option_context('display.max_rows', None):
        display(df)

In [4]:
dfmajorcrimes = pd.read_csv('raw_data/majorcrimes.csv')
dfhomicides = pd.read_csv('raw_data/homicidies.csv')


In [5]:
# Define the target columns including the ones that need to be added
columns = ['EVENT_UNIQUE_ID','DATASET','OFFENCE', 'MCI_CATEGORY','OCC_HOUR','OCC_DAY','OCC_MONTH','OCC_YEAR','OCC_DOW','OCC_DOY','LAT_WGS84','LONG_WGS84']

# Add missing columns with default values
for df in [dfhomicides, dfmajorcrimes]:
    for column in columns:
        if column not in df.columns:
            df[column] = None

# Select columns in the specified order for all DataFrames
dfhomicides_s = dfhomicides[columns]
dfmajorcrimes_s = dfmajorcrimes[columns]

# Set the 'DATASET' column for each DataFrame using .loc to avoid SettingWithCopyWarning
dfhomicides_s.loc[:, 'DATASET'] = 'HOMICIDES'
dfmajorcrimes_s.loc[:, 'DATASET'] = 'MAJOR_CRIMES'

# Concatenate the DataFrames into a single DataFrame
dfmatrix = pd.concat([dfhomicides_s, dfmajorcrimes_s], ignore_index=True)

# Display the resulting DataFrame
dfmatrix

Unnamed: 0,EVENT_UNIQUE_ID,DATASET,OFFENCE,MCI_CATEGORY,OCC_HOUR,OCC_DAY,OCC_MONTH,OCC_YEAR,OCC_DOW,OCC_DOY,LAT_WGS84,LONG_WGS84
0,GO-2004111878,HOMICIDES,,,,3.0,January,2004.0,Saturday,3.0,43.685026,-79.392828
1,GO-2004125755,HOMICIDES,,,,8.0,January,2004.0,Thursday,8.0,43.781782,-79.233852
2,GO-2004136086,HOMICIDES,,,,8.0,January,2004.0,Thursday,8.0,43.810544,-79.205574
3,GO-2004148623,HOMICIDES,,,,25.0,January,2004.0,Sunday,25.0,43.670467,-79.434387
4,GO-2004148619,HOMICIDES,,,,25.0,January,2004.0,Sunday,25.0,43.822997,-79.204958
...,...,...,...,...,...,...,...,...,...,...,...,...
386096,GO-2024688981,MAJOR_CRIMES,Theft Of Motor Vehicle,Auto Theft,16,30.0,March,2024.0,Saturday,90.0,43.755641,-79.196001
386097,GO-2024690900,MAJOR_CRIMES,Assault,Assault,16,31.0,March,2024.0,Sunday,91.0,43.595354,-79.529766
386098,GO-2024690985,MAJOR_CRIMES,Assault,Assault,16,31.0,March,2024.0,Sunday,91.0,43.688644,-79.391479
386099,GO-2024690995,MAJOR_CRIMES,Theft Of Motor Vehicle,Auto Theft,23,30.0,March,2024.0,Saturday,90.0,43.684335,-79.372581


In [6]:
dfmatrix.loc[dfmatrix['DATASET'] == 'HOMICIDES', 'OFFENCE'] = 'Homicide'
dfmatrix.loc[dfmatrix['DATASET'] == 'HOMICIDES', 'MCI_CATEGORY'] = 'Homicide'
dfmatrix

Unnamed: 0,EVENT_UNIQUE_ID,DATASET,OFFENCE,MCI_CATEGORY,OCC_HOUR,OCC_DAY,OCC_MONTH,OCC_YEAR,OCC_DOW,OCC_DOY,LAT_WGS84,LONG_WGS84
0,GO-2004111878,HOMICIDES,Homicide,Homicide,,3.0,January,2004.0,Saturday,3.0,43.685026,-79.392828
1,GO-2004125755,HOMICIDES,Homicide,Homicide,,8.0,January,2004.0,Thursday,8.0,43.781782,-79.233852
2,GO-2004136086,HOMICIDES,Homicide,Homicide,,8.0,January,2004.0,Thursday,8.0,43.810544,-79.205574
3,GO-2004148623,HOMICIDES,Homicide,Homicide,,25.0,January,2004.0,Sunday,25.0,43.670467,-79.434387
4,GO-2004148619,HOMICIDES,Homicide,Homicide,,25.0,January,2004.0,Sunday,25.0,43.822997,-79.204958
...,...,...,...,...,...,...,...,...,...,...,...,...
386096,GO-2024688981,MAJOR_CRIMES,Theft Of Motor Vehicle,Auto Theft,16,30.0,March,2024.0,Saturday,90.0,43.755641,-79.196001
386097,GO-2024690900,MAJOR_CRIMES,Assault,Assault,16,31.0,March,2024.0,Sunday,91.0,43.595354,-79.529766
386098,GO-2024690985,MAJOR_CRIMES,Assault,Assault,16,31.0,March,2024.0,Sunday,91.0,43.688644,-79.391479
386099,GO-2024690995,MAJOR_CRIMES,Theft Of Motor Vehicle,Auto Theft,23,30.0,March,2024.0,Saturday,90.0,43.684335,-79.372581


# Data Cleaning and Transformation

In [7]:
# checking for latitude and longitude errors

# Toronto boundaries
north_boundary = 43.8554
south_boundary = 43.5810
east_boundary = -79.1161
west_boundary = -79.6393


# Check if latitude and longitude are within the boundaries
within_boundaries = (dfmatrix['LAT_WGS84'] <= north_boundary) & (dfmatrix['LAT_WGS84'] >= south_boundary) & \
                    (dfmatrix['LONG_WGS84'] >= west_boundary) & (dfmatrix['LONG_WGS84'] <= east_boundary)

# Filter the DataFrame
dfmatrix = dfmatrix.loc[within_boundaries].reset_index(drop=True)
dfmatrix

Unnamed: 0,EVENT_UNIQUE_ID,DATASET,OFFENCE,MCI_CATEGORY,OCC_HOUR,OCC_DAY,OCC_MONTH,OCC_YEAR,OCC_DOW,OCC_DOY,LAT_WGS84,LONG_WGS84
0,GO-2004111878,HOMICIDES,Homicide,Homicide,,3.0,January,2004.0,Saturday,3.0,43.685026,-79.392828
1,GO-2004125755,HOMICIDES,Homicide,Homicide,,8.0,January,2004.0,Thursday,8.0,43.781782,-79.233852
2,GO-2004136086,HOMICIDES,Homicide,Homicide,,8.0,January,2004.0,Thursday,8.0,43.810544,-79.205574
3,GO-2004148623,HOMICIDES,Homicide,Homicide,,25.0,January,2004.0,Sunday,25.0,43.670467,-79.434387
4,GO-2004148619,HOMICIDES,Homicide,Homicide,,25.0,January,2004.0,Sunday,25.0,43.822997,-79.204958
...,...,...,...,...,...,...,...,...,...,...,...,...
380240,GO-2024688981,MAJOR_CRIMES,Theft Of Motor Vehicle,Auto Theft,16,30.0,March,2024.0,Saturday,90.0,43.755641,-79.196001
380241,GO-2024690900,MAJOR_CRIMES,Assault,Assault,16,31.0,March,2024.0,Sunday,91.0,43.595354,-79.529766
380242,GO-2024690985,MAJOR_CRIMES,Assault,Assault,16,31.0,March,2024.0,Sunday,91.0,43.688644,-79.391479
380243,GO-2024690995,MAJOR_CRIMES,Theft Of Motor Vehicle,Auto Theft,23,30.0,March,2024.0,Saturday,90.0,43.684335,-79.372581


In [8]:
# Display NA values

# Boolean mask where OCC_HOUR is NA
na_mask = dfmatrix['OCC_HOUR'].isna()

# Mask to filter the DataFrame
na_rows = dfmatrix[na_mask]

# Display the rows with NA values in OCC_HOUR column
na_rows

Unnamed: 0,EVENT_UNIQUE_ID,DATASET,OFFENCE,MCI_CATEGORY,OCC_HOUR,OCC_DAY,OCC_MONTH,OCC_YEAR,OCC_DOW,OCC_DOY,LAT_WGS84,LONG_WGS84
0,GO-2004111878,HOMICIDES,Homicide,Homicide,,3.0,January,2004.0,Saturday,3.0,43.685026,-79.392828
1,GO-2004125755,HOMICIDES,Homicide,Homicide,,8.0,January,2004.0,Thursday,8.0,43.781782,-79.233852
2,GO-2004136086,HOMICIDES,Homicide,Homicide,,8.0,January,2004.0,Thursday,8.0,43.810544,-79.205574
3,GO-2004148623,HOMICIDES,Homicide,Homicide,,25.0,January,2004.0,Sunday,25.0,43.670467,-79.434387
4,GO-2004148619,HOMICIDES,Homicide,Homicide,,25.0,January,2004.0,Sunday,25.0,43.822997,-79.204958
...,...,...,...,...,...,...,...,...,...,...,...,...
1409,GO-2024545700,HOMICIDES,Homicide,Homicide,,12.0,March,2024.0,Tuesday,72.0,43.659103,-79.364238
1410,GO-2024545700,HOMICIDES,Homicide,Homicide,,12.0,March,2024.0,Tuesday,72.0,43.659103,-79.364238
1411,GO-2024558575,HOMICIDES,Homicide,Homicide,,14.0,March,2024.0,Thursday,74.0,43.745156,-79.208993
1412,GO-2024671809,HOMICIDES,Homicide,Homicide,,28.0,March,2024.0,Thursday,88.0,43.768847,-79.466916


### OCC_HOUR Column

In [9]:
df_hour = dfmatrix[dfmatrix['DATASET']== 'MAJOR_CRIMES'].OCC_HOUR.value_counts()/len(dfmatrix[dfmatrix['DATASET']== 'MAJOR_CRIMES'])
df_hour

OCC_HOUR
0     0.069181
21    0.054481
12    0.054288
20    0.054272
18    0.054214
22    0.053892
23    0.052298
19    0.052179
17    0.050117
15    0.048177
16    0.046306
14    0.041018
1     0.040786
2     0.040556
13    0.038138
11    0.035145
3     0.033749
10    0.033440
9     0.032930
8     0.028205
4     0.027144
5     0.020833
7     0.020669
6     0.017982
Name: count, dtype: float64

In [10]:
df_hour = df_hour.to_frame().reset_index(drop=False)

df_hour

Unnamed: 0,OCC_HOUR,count
0,0,0.069181
1,21,0.054481
2,12,0.054288
3,20,0.054272
4,18,0.054214
5,22,0.053892
6,23,0.052298
7,19,0.052179
8,17,0.050117
9,15,0.048177


In [11]:
import math

for i, hour_row in df_hour.iterrows():
    size = len(dfmatrix[dfmatrix['DATASET']=='HOMICIDES'])
    for index, row in dfmatrix[((dfmatrix['DATASET']=='HOMICIDES') & (pd.isnull(dfmatrix.OCC_HOUR)))].head(math.trunc(size*hour_row['count'])).iterrows():
        dfmatrix.loc[index, 'OCC_HOUR'] = hour_row['OCC_HOUR']

In [12]:
len(dfmatrix[pd.isnull(dfmatrix['OCC_HOUR'])])

14

In [13]:
# Again for the 14 remainders still showing as Nan

for i, hour_row in df_hour.iterrows():
    size = len(dfmatrix[dfmatrix['DATASET']=='HOMICIDES'])
    for index, row in dfmatrix[((dfmatrix['DATASET']=='HOMICIDES') & (pd.isnull(dfmatrix.OCC_HOUR)))].head(math.trunc(size*hour_row['count'])).iterrows():
        dfmatrix.loc[index, 'OCC_HOUR'] = hour_row['OCC_HOUR']

In [14]:
len(dfmatrix[pd.isnull(dfmatrix['OCC_HOUR'])])

0

In [15]:
dfmatrix['OCC_HOUR'] = pd.to_numeric(dfmatrix['OCC_HOUR'], errors='coerce').fillna(0).astype(int)

In [16]:
# Apply np.floor and convert to int, while handling NaN values
dfmatrix['OCC_HOUR'] = dfmatrix['OCC_HOUR'].apply(lambda x: np.floor(x) if pd.notnull(x) else x).astype('Int64')

### Fixing OCC_DAY

In [17]:
#  Converting OCC_DAY from float to int
dfmatrix['OCC_DAY'] = pd.to_numeric(dfmatrix['OCC_DAY'], errors='coerce').fillna(0).astype(int)

In [18]:
# Apply np.floor and convert to int, while handling NaN values
dfmatrix['OCC_DAY'] = dfmatrix['OCC_DAY'].apply(lambda x: np.floor(x) if pd.notnull(x) else x).astype('Int64')

In [19]:
# Filter the DataFrame for non-zero OCC_DAY values
df_occ_day_non_zero = dfmatrix[dfmatrix['OCC_DAY'] != 0]

# Calculate the value counts and normalize them
occ_day_proportions = df_occ_day_non_zero['OCC_DAY'].value_counts(normalize=True).reset_index()
occ_day_proportions.columns = ['OCC_DAY', 'proportion']

# Total number of rows where OCC_DAY is 0
total_zeros = (dfmatrix['OCC_DAY'] == 0).sum()

# Track the index of the current proportion
current_proportion_index = 0

# Iterate over the rows where OCC_DAY is 0 and update OCC_DAY based on proportions
for index, row in dfmatrix[dfmatrix['OCC_DAY'] == 0].iterrows():
    # Get the current proportion
    current_proportion = occ_day_proportions.at[current_proportion_index, 'OCC_DAY']
    
    # Update OCC_DAY for the current row
    dfmatrix.at[index, 'OCC_DAY'] = current_proportion
    
    # Update the current proportion index
    current_proportion_index += 1
    
    # If we reach the end of the proportions, reset the index
    if current_proportion_index >= len(occ_day_proportions):
        current_proportion_index = 0

In [20]:
# Dropping the zero rows
# Total number of rows
total_rows = len(dfmatrix)

# Number of rows where OCC_DAY is 0
occ_day_zeros = (dfmatrix['OCC_DAY'] == 0).sum()

# Proportion of rows where OCC_DAY is 0
proportion_occ_day_zeros = occ_day_zeros / total_rows

print(f"Total rows: {total_rows}")
print(f"Rows with OCC_DAY = 0: {occ_day_zeros}")
print(f"Proportion of rows with OCC_DAY = 0: {proportion_occ_day_zeros:.2%}")

Total rows: 380245
Rows with OCC_DAY = 0: 0
Proportion of rows with OCC_DAY = 0: 0.00%


### OCC_DAY E DOY para Sin e COS


In [21]:
dfmatrix['OCC_DAY_SIN'] = np.sin(2 * np.pi * dfmatrix['OCC_DAY'] / 24)
dfmatrix['OCC_DAY_COS'] = np.cos(2 * np.pi * dfmatrix['OCC_DAY'] / 24)

dfmatrix['OCC_DOY_SIN'] = np.sin(2 * np.pi * dfmatrix['OCC_DOY'] / 24)
dfmatrix['OCC_DOY_COS'] = np.cos(2 * np.pi * dfmatrix['OCC_DOY'] / 24)

### OCC_MONTH

In [22]:
dfmatrix = dfmatrix.dropna(subset=['OCC_MONTH'])

In [23]:
months = {'January':1,
          'February': 2,
          'March': 3,
          'April':4,
          'May': 5,
          'June': 6,
          'July': 7,
          'August': 8,
          'September': 9,
          'October': 10,
          'November': 11,
          'December': 12}

In [24]:
dfmatrix['OCC_MONTH_NUM'] = dfmatrix.OCC_MONTH.apply(lambda x: months[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfmatrix['OCC_MONTH_NUM'] = dfmatrix.OCC_MONTH.apply(lambda x: months[x])


In [25]:
# Apply sine and cosine transformations
dfmatrix['MONTH_SIN'] = np.sin(2 * np.pi * dfmatrix['OCC_MONTH_NUM'] / 12)
dfmatrix['MONTH_COS'] = np.cos(2 * np.pi * dfmatrix['OCC_MONTH_NUM'] / 12)
dfmatrix['HOUR_SIN'] = np.sin(2 * np.pi * dfmatrix['OCC_HOUR'] / 24)
dfmatrix['HOUR_COS'] = np.cos(2 * np.pi * dfmatrix['OCC_HOUR'] / 24)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfmatrix['MONTH_SIN'] = np.sin(2 * np.pi * dfmatrix['OCC_MONTH_NUM'] / 12)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfmatrix['MONTH_COS'] = np.cos(2 * np.pi * dfmatrix['OCC_MONTH_NUM'] / 12)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfmatrix['HOUR_SIN'] = np.sin(2 * np.pi * dfmatrix['OCC

In [26]:
days_of_week = {'Friday': 6,  
                'Saturday': 7,
                'Sunday': 1,  
                'Thursday': 5,
                'Wednesday': 4, 
                'Monday': 2,
                'Tuesday': 3}

dfmatrix['OCC_DOW_NUM'] = dfmatrix.OCC_DOW.apply(lambda x: days_of_week[x.strip()])

dfmatrix['OCC_DOW_SIN'] = np.sin(2 * np.pi * dfmatrix['OCC_DOW_NUM'] / 24)
dfmatrix['OCC_DOW_COS'] = np.cos(2 * np.pi * dfmatrix['OCC_DOW_NUM'] / 24)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfmatrix['OCC_DOW_NUM'] = dfmatrix.OCC_DOW.apply(lambda x: days_of_week[x.strip()])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfmatrix['OCC_DOW_SIN'] = np.sin(2 * np.pi * dfmatrix['OCC_DOW_NUM'] / 24)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfmatrix['OCC_DOW_COS'] = np.cos(2 * np.pi * d

### OCC_YEAR

In [27]:
dfmatrix.loc['OCC_YEAR'] = pd.to_numeric(dfmatrix['OCC_YEAR'], errors='raise').fillna(0).astype(int)

  dfmatrix.loc['OCC_YEAR'] = pd.to_numeric(dfmatrix['OCC_YEAR'], errors='raise').fillna(0).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfmatrix.loc['OCC_YEAR'] = pd.to_numeric(dfmatrix['OCC_YEAR'], errors='raise').fillna(0).astype(int)


In [28]:
dfmatrix = dfmatrix.dropna(subset=['OCC_YEAR'])

In [29]:
# Apply np.floor and convert to int, while handling NaN values
dfmatrix['OCC_YEAR'] = dfmatrix['OCC_YEAR'].apply(lambda x: np.floor(x) if pd.notnull(x) else x).astype('Int64')

In [30]:
# Filter rows where 'OCC_YEAR' is 2014 or later
dfmatrix = dfmatrix[dfmatrix['OCC_YEAR'] >= 2014]

### Dropping unnecessary columns

In [31]:
# List of columns to drop
columns_to_drop = ['DATASET', 'OFFENCE', 'DIVISION', 'HOOD_140', 'NEIGHBOURHOOD_140', 'HOOD_158', 'NEIGHBOURHOOD_158']

# Filter the list to include only existing columns in the DataFrame
existing_columns_to_drop = [col for col in columns_to_drop if col in dfmatrix.columns]

# Drop the existing columns
dfmatrix = dfmatrix.drop(columns=existing_columns_to_drop)

In [32]:
# Drop rows with NaN values
dfmatrix = dfmatrix.dropna(subset=['OCC_DOY'])

# Now convert the column to integers
dfmatrix['OCC_DOY'] = dfmatrix['OCC_DOY'].astype(int)

In [33]:
dfmatrix.head()

Unnamed: 0,EVENT_UNIQUE_ID,MCI_CATEGORY,OCC_HOUR,OCC_DAY,OCC_MONTH,OCC_YEAR,OCC_DOW,OCC_DOY,LAT_WGS84,LONG_WGS84,...,OCC_DOY_SIN,OCC_DOY_COS,OCC_MONTH_NUM,MONTH_SIN,MONTH_COS,HOUR_SIN,HOUR_COS,OCC_DOW_NUM,OCC_DOW_SIN,OCC_DOW_COS
662,GO-20141272537,Homicide,17,3,January,2014,Friday,3,43.667288,-79.373767,...,0.707107,0.7071068,1.0,0.5,0.866025,-0.965926,-0.258819,6.0,1.0,6.123234000000001e-17
663,GO-20141326860,Homicide,17,14,January,2014,Tuesday,14,43.75914,-79.289604,...,-0.5,-0.8660254,1.0,0.5,0.866025,-0.965926,-0.258819,3.0,0.707107,0.7071068
664,GO-20141431240,Homicide,17,29,January,2014,Wednesday,29,43.769478,-79.220423,...,0.965926,0.258819,1.0,0.5,0.866025,-0.965926,-0.258819,4.0,0.866025,0.5
665,GO-20141428622,Homicide,17,29,January,2014,Wednesday,29,43.694614,-79.517334,...,0.965926,0.258819,1.0,0.5,0.866025,-0.965926,-0.258819,4.0,0.866025,0.5
666,GO-20141435425,Homicide,17,30,January,2014,Thursday,30,43.654318,-79.378757,...,1.0,1.19434e-15,1.0,0.5,0.866025,-0.965926,-0.258819,5.0,0.965926,0.258819


### Prepare for geographic data

In [34]:
# # Prepare geographic data
# df['geometry'] = dfmatrix.apply(lambda row: Point(row['LONG_WGS84'], row['LAT_WGS84']), axis=1)
# geo_df = gpd.GeoDataFrame(df, geometry='geometry')

# # Drop rows with missing coordinates
# geo_df = geo_df.dropna(subset=['LAT_WGS84', 'LONG_WGS84'])

In [35]:
#geo_df

### Aggrupate by 'EVENT_UNIQUE_ID', 'LAT_WGS84' e 'LONG_WGS84'and count occurrences per category

In [36]:
# # Agrupar por 'EVENT_UNIQUE_ID', 'LAT_WGS84' e 'LONG_WGS84' e contar as ocorrências de cada categoria
# dfmatrix['MCI_CATEGORY'] = dfmatrix['MCI_CATEGORY'].apply(lambda x: [x])
# dfmatrix = dfmatrix.groupby(['EVENT_UNIQUE_ID', 'LAT_WGS84', 'LONG_WGS84']).agg({
#     'MCI_CATEGORY': lambda x: sum(x, []),
#     'OCC_YEAR': 'first',
#     'OCC_MONTH': 'first',
#     'OCC_DAY': 'first',
#     'OCC_HOUR': 'first',
#     'OCC_DOW': 'first',
#     'OCC_DOY': 'first',
#     'OCC_MONTH_NUM': 'first',
#     'MONTH_SIN': 'first',
#     'MONTH_COS': 'first',
#     'HOUR_SIN': 'first',
#     'HOUR_COS':'first',
#     'OCC_DAY_SIN':'first',
#     'OCC_DAY_COS':'first',
#     'OCC_DOY_SIN':'first',
#     'OCC_DOY_COS':'first',
#     'OCC_DOW_NUM':'first',
#     'OCC_DOW_SIN':'first',
#     'OCC_DOW_COS':'first'
# }).reset_index()

In [37]:
# # Binarizar as categorias de crimes usando MultiLabelBinarizer
# mlb = MultiLabelBinarizer()
# crime_counts = pd.DataFrame(mlb.fit_transform(dfmatrix['MCI_CATEGORY']), columns=mlb.classes_)
# crime_counts

In [38]:
## aqui conta os MCI CATEGORY PARA FIACR IGUAL A PLANILHA
# # Ajustar a contagem de crimes
# for column in crime_counts.columns:
#     crime_counts[column] = dfmatrix['MCI_CATEGORY'].apply(lambda x: x.count(column))

In [39]:
## aqui conta os MCI CATEGORY PARA FIACR IGUAL A PLANILHA
# # Concatenar as contagens de crimes com o DataFrame original
# dfmatrix = pd.concat([dfmatrix.drop(columns=['MCI_CATEGORY']), crime_counts], axis=1)

# # Inferir tipos de objeto e preencher valores nulos
# dfmatrix = dfmatrix.infer_objects().fillna(0)

## Label encoder

In [40]:
# Criar as colunas one-hot encoded
df_onehot = pd.get_dummies(dfmatrix, columns=['MCI_CATEGORY'], prefix='', prefix_sep='')


# Renomear todas as colunas para maiúsculas
df_onehot.columns = [col.upper() for col in df_onehot.columns]

# Identificar automaticamente as colunas criadas para MCI_CATEGORY para depois eu jogar no y sem ser manualmente
original_columns = set(dfmatrix.columns)
onehot_columns = set(df_onehot.columns) - {col.upper() for col in original_columns}

# Converter os valores booleanos para inteiros (0 e 1) apenas nas colunas one-hot - apesar que não precisa pq o boolean funciona bem
for col in onehot_columns:
    df_onehot[col] = df_onehot[col].astype(int)

# Verificar o resultado
dfmatrix = df_onehot
dfmatrix

Unnamed: 0,EVENT_UNIQUE_ID,OCC_HOUR,OCC_DAY,OCC_MONTH,OCC_YEAR,OCC_DOW,OCC_DOY,LAT_WGS84,LONG_WGS84,OCC_DAY_SIN,...,HOUR_COS,OCC_DOW_NUM,OCC_DOW_SIN,OCC_DOW_COS,ASSAULT,AUTO THEFT,BREAK AND ENTER,HOMICIDE,ROBBERY,THEFT OVER
662,GO-20141272537,17,3,January,2014,Friday,3,43.667288,-79.373767,0.707107,...,-0.258819,6.0,1.000000,6.123234e-17,0,0,0,1,0,0
663,GO-20141326860,17,14,January,2014,Tuesday,14,43.759140,-79.289604,-0.5,...,-0.258819,3.0,0.707107,7.071068e-01,0,0,0,1,0,0
664,GO-20141431240,17,29,January,2014,Wednesday,29,43.769478,-79.220423,0.965926,...,-0.258819,4.0,0.866025,5.000000e-01,0,0,0,1,0,0
665,GO-20141428622,17,29,January,2014,Wednesday,29,43.694614,-79.517334,0.965926,...,-0.258819,4.0,0.866025,5.000000e-01,0,0,0,1,0,0
666,GO-20141435425,17,30,January,2014,Thursday,30,43.654318,-79.378757,1.0,...,-0.258819,5.0,0.965926,2.588190e-01,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380240,GO-2024688981,16,30,March,2024,Saturday,90,43.755641,-79.196001,1.0,...,-0.5,7.0,0.965926,-2.588190e-01,0,1,0,0,0,0
380241,GO-2024690900,16,31,March,2024,Sunday,91,43.595354,-79.529766,0.965926,...,-0.5,1.0,0.258819,9.659258e-01,1,0,0,0,0,0
380242,GO-2024690985,16,31,March,2024,Sunday,91,43.688644,-79.391479,0.965926,...,-0.5,1.0,0.258819,9.659258e-01,1,0,0,0,0,0
380243,GO-2024690995,23,30,March,2024,Saturday,90,43.684335,-79.372581,1.0,...,0.965926,7.0,0.965926,-2.588190e-01,0,1,0,0,0,0


In [41]:
# # le = LabelEncoder()

# # dfmatrix['CATEGORY ENCODED'] = le.fit_transform(dfmatrix['MCI_CATEGORY'])
# dfmatrix


In [42]:
#dfmatrix['MCI_CATEGORY'].value_counts()

In [43]:
# le = LabelEncoder()

# dfmatrix['CATEGORY ENCODED'] = le.fit_transform(dfmatrix['MCI_CATEGORY'])
# dfmatrix

In [44]:
# y = to_categorical(dfmatrix['CATEGORY ENCODED'])
# y

In [45]:
# label_map = dict(zip(dfmatrix['MCI_CATEGORY'], dfmatrix['CATEGORY ENCODED'])) # get the mapping between the original labels and encoded labels
# label_map

In [46]:
# y_df = pd.DataFrame(y, columns=['Assault', 'Auto Theft', 'Break and Enter', 'Homicide', 'Robbery', 'Theft Over'])
# y_df

In [47]:
# y_df.sum()

In [48]:
# dfmatrix['MCI_CATEGORY'].value_counts()

In [49]:
# # Realizar o one-hot encoding da coluna MCI_CATEGORY
# df_one_hot = pd.get_dummies(dfmatrix, columns=['MCI_CATEGORY'], prefix='', prefix_sep='')

# # Rearranjar as colunas para que as colunas one-hot estejam no final
# columns = [col for col in df_one_hot.columns if col not in ['Assault', 'Auto Theft', 'Break and Enter', 'Homicide', 'Robbery', 'Theft Over']] + ['Assault', 'Auto Theft', 'Break and Enter', 'Homicide', 'Robbery', 'Theft Over']
# df_one_hot = df_one_hot[columns]

# # Exibir o DataFrame resultante
# df_one_hot

In [50]:
# # Realizar o one-hot encoding da coluna MCI_CATEGORY
# df_one_hot = pd.get_dummies(dfmatrix, columns=['MCI_CATEGORY'], prefix='', prefix_sep='')

# # Obter a lista de colunas que não são parte do one-hot encoding
# original_columns = [col for col in dfmatrix.columns if col != 'MCI_CATEGORY']

# # Obter a lista de novas colunas geradas pelo one-hot encoding
# one_hot_columns = df_one_hot.columns.difference(original_columns).tolist()

# # Combinar as colunas originais com as novas colunas
# final_columns = original_columns + one_hot_columns

# # Rearranjar as colunas do DataFrame resultante
# df_one_hot = df_one_hot[final_columns]

# # Exibir o DataFrame resultante
# dfmatrix = df_one_hot

In [51]:
# dftest = df_one_hot[['Assault', 'Auto Theft', 'Break and Enter', 'Homicide', 'Robbery', 'Theft Over']]
# dftest.sum()

### Drop Rows with NaN values and conversion of collumns to integers

In [52]:
# Drop rows with NaN values
dfmatrix = dfmatrix.dropna(subset=['OCC_DOY'])

# Now convert the column to integers
dfmatrix['OCC_DOY'] = dfmatrix['OCC_DOY'].astype(int)

# Drop rows with NaN values²
dfmatrix = dfmatrix.dropna(subset=['OCC_MONTH_NUM'])
# Now convert the column to integers²
dfmatrix['OCC_MONTH_NUM'] = dfmatrix['OCC_MONTH_NUM'].astype(int)

In [53]:
show_all_columns(dfmatrix)

Unnamed: 0,EVENT_UNIQUE_ID,OCC_HOUR,OCC_DAY,OCC_MONTH,OCC_YEAR,OCC_DOW,OCC_DOY,LAT_WGS84,LONG_WGS84,OCC_DAY_SIN,OCC_DAY_COS,OCC_DOY_SIN,OCC_DOY_COS,OCC_MONTH_NUM,MONTH_SIN,MONTH_COS,HOUR_SIN,HOUR_COS,OCC_DOW_NUM,OCC_DOW_SIN,OCC_DOW_COS,ASSAULT,AUTO THEFT,BREAK AND ENTER,HOMICIDE,ROBBERY,THEFT OVER
662,GO-20141272537,17,3,January,2014,Friday,3,43.667288,-79.373767,0.707107,0.707107,0.707107,7.071068e-01,1,0.5,8.660254e-01,-0.965926,-0.258819,6.0,1.000000,6.123234e-17,0,0,0,1,0,0
663,GO-20141326860,17,14,January,2014,Tuesday,14,43.759140,-79.289604,-0.5,-0.866025,-0.500000,-8.660254e-01,1,0.5,8.660254e-01,-0.965926,-0.258819,3.0,0.707107,7.071068e-01,0,0,0,1,0,0
664,GO-20141431240,17,29,January,2014,Wednesday,29,43.769478,-79.220423,0.965926,0.258819,0.965926,2.588190e-01,1,0.5,8.660254e-01,-0.965926,-0.258819,4.0,0.866025,5.000000e-01,0,0,0,1,0,0
665,GO-20141428622,17,29,January,2014,Wednesday,29,43.694614,-79.517334,0.965926,0.258819,0.965926,2.588190e-01,1,0.5,8.660254e-01,-0.965926,-0.258819,4.0,0.866025,5.000000e-01,0,0,0,1,0,0
666,GO-20141435425,17,30,January,2014,Thursday,30,43.654318,-79.378757,1.0,0.0,1.000000,1.194340e-15,1,0.5,8.660254e-01,-0.965926,-0.258819,5.0,0.965926,2.588190e-01,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380240,GO-2024688981,16,30,March,2024,Saturday,90,43.755641,-79.196001,1.0,0.0,-1.000000,8.578717e-16,3,1.0,6.123234e-17,-0.866025,-0.5,7.0,0.965926,-2.588190e-01,0,1,0,0,0,0
380241,GO-2024690900,16,31,March,2024,Sunday,91,43.595354,-79.529766,0.965926,-0.258819,-0.965926,2.588190e-01,3,1.0,6.123234e-17,-0.866025,-0.5,1.0,0.258819,9.659258e-01,1,0,0,0,0,0
380242,GO-2024690985,16,31,March,2024,Sunday,91,43.688644,-79.391479,0.965926,-0.258819,-0.965926,2.588190e-01,3,1.0,6.123234e-17,-0.866025,-0.5,1.0,0.258819,9.659258e-01,1,0,0,0,0,0
380243,GO-2024690995,23,30,March,2024,Saturday,90,43.684335,-79.372581,1.0,0.0,-1.000000,8.578717e-16,3,1.0,6.123234e-17,-0.258819,0.965926,7.0,0.965926,-2.588190e-01,0,1,0,0,0,0


In [62]:
dfmatrix = dfmatrix.reset_index(drop=True)
dfmatrix

Unnamed: 0,EVENT_UNIQUE_ID,OCC_HOUR,OCC_DAY,OCC_MONTH,OCC_YEAR,OCC_DOW,OCC_DOY,LAT_WGS84,LONG_WGS84,OCC_DAY_SIN,...,HOUR_COS,OCC_DOW_NUM,OCC_DOW_SIN,OCC_DOW_COS,ASSAULT,AUTO THEFT,BREAK AND ENTER,HOMICIDE,ROBBERY,THEFT OVER
0,GO-20141272537,17,3,January,2014,Friday,3,43.667288,-79.373767,0.707107,...,-0.258819,6.0,1.000000,6.123234e-17,0,0,0,1,0,0
1,GO-20141326860,17,14,January,2014,Tuesday,14,43.759140,-79.289604,-0.5,...,-0.258819,3.0,0.707107,7.071068e-01,0,0,0,1,0,0
2,GO-20141431240,17,29,January,2014,Wednesday,29,43.769478,-79.220423,0.965926,...,-0.258819,4.0,0.866025,5.000000e-01,0,0,0,1,0,0
3,GO-20141428622,17,29,January,2014,Wednesday,29,43.694614,-79.517334,0.965926,...,-0.258819,4.0,0.866025,5.000000e-01,0,0,0,1,0,0
4,GO-20141435425,17,30,January,2014,Thursday,30,43.654318,-79.378757,1.0,...,-0.258819,5.0,0.965926,2.588190e-01,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378085,GO-2024688981,16,30,March,2024,Saturday,90,43.755641,-79.196001,1.0,...,-0.5,7.0,0.965926,-2.588190e-01,0,1,0,0,0,0
378086,GO-2024690900,16,31,March,2024,Sunday,91,43.595354,-79.529766,0.965926,...,-0.5,1.0,0.258819,9.659258e-01,1,0,0,0,0,0
378087,GO-2024690985,16,31,March,2024,Sunday,91,43.688644,-79.391479,0.965926,...,-0.5,1.0,0.258819,9.659258e-01,1,0,0,0,0,0
378088,GO-2024690995,23,30,March,2024,Saturday,90,43.684335,-79.372581,1.0,...,0.965926,7.0,0.965926,-2.588190e-01,0,1,0,0,0,0


In [54]:
dfmatrix.columns

Index(['EVENT_UNIQUE_ID', 'OCC_HOUR', 'OCC_DAY', 'OCC_MONTH', 'OCC_YEAR',
       'OCC_DOW', 'OCC_DOY', 'LAT_WGS84', 'LONG_WGS84', 'OCC_DAY_SIN',
       'OCC_DAY_COS', 'OCC_DOY_SIN', 'OCC_DOY_COS', 'OCC_MONTH_NUM',
       'MONTH_SIN', 'MONTH_COS', 'HOUR_SIN', 'HOUR_COS', 'OCC_DOW_NUM',
       'OCC_DOW_SIN', 'OCC_DOW_COS', 'ASSAULT', 'AUTO THEFT',
       'BREAK AND ENTER', 'HOMICIDE', 'ROBBERY', 'THEFT OVER'],
      dtype='object')

In [55]:
# # Selecionar as colunas de features e de rótulos
# feature_columns = dfmatrix[['LAT_WGS84', 'LONG_WGS84', 'OCC_YEAR', 'MONTH_SIN', 'MONTH_COS', 'HOUR_SIN', 'HOUR_COS', 'OCC_DAY_SIN', 'OCC_DAY_COS', 'OCC_DOY_SIN',  'OCC_DOY_COS', 'OCC_DOW_SIN','OCC_DOW_COS']]
# #label_columns = list(crime_counts.columns)
# # label_columns = dfmatrix[['CATEGORY ENCODED']]
# label_columns = dfmatrix[list(onehot_columns)]

In [57]:

# Separar features e rótulos
X = dfmatrix[['LAT_WGS84', 'LONG_WGS84', 'OCC_YEAR', 'MONTH_SIN', 'MONTH_COS', 'HOUR_SIN', 'HOUR_COS', 'OCC_DAY_SIN', 'OCC_DAY_COS', 'OCC_DOY_SIN',  'OCC_DOY_COS', 'OCC_DOW_SIN','OCC_DOW_COS']]
y = dfmatrix[list(onehot_columns)]

# Normalizar as features
scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)

In [61]:
y.value_counts()

BREAK AND ENTER  HOMICIDE  AUTO THEFT  ASSAULT  ROBBERY  THEFT OVER
0                0         0           1        0        0             199103
1                0         0           0        0        0              71538
0                0         1           0        0        0              60473
                           0           0        1        0              33693
                                                0        1              12531
                 1         0           0        0        0                752
Name: count, dtype: int64

In [59]:
# Dividir os dados em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Reshape os dados para LSTM (samples, timesteps, features)
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

In [60]:
# Construir o modelo LSTM ajustado
model = Sequential()
model.add(LSTM(128, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(32))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dense(6, activation='softmax'))

# Compilar o modelo
adam = Adam(learning_rate=0.001)
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

# Treinar o modelo com ajustes
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

# Avaliar o modelo
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

2024-06-01 16:50:36.355709: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:08:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-06-01 16:50:36.356105: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
  super().__init__(**kwargs)


Epoch 1/100
[1m7562/7562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 6ms/step - accuracy: 0.5268 - loss: 1.2661 - val_accuracy: 0.5365 - val_loss: 1.2197
Epoch 2/100
[1m1157/7562[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m36s[0m 6ms/step - accuracy: 0.5413 - loss: 1.2170

KeyboardInterrupt: 

In [None]:
#3296/3296 ━━━━━━━━━━━━━━━━━━━━ 11s 3ms/step - accuracy: 0.5090 - loss: 1.7930 - val_accuracy: 0.5101 - val_loss: 1.7616
#Epoch 11/50
3781/3781 ━━━━━━━━━━━━━━━━━━━━ 24s 6ms/step - accuracy: 0.5448 - loss: 1.1972 - val_accuracy: 0.5460 - val_loss: 1.1926