## Imports

In [1]:
import altair as alt
from pylab import rcParams
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_validate,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC

## Reading Data

In [2]:
df = pd.read_csv("data/crimedata_csv_AllNeighbourhoods_AllYears.csv", encoding="utf-8")
df_2023 = df[df["YEAR"] == 2023]
df_2023.head()

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y
25,Break and Enter Commercial,2023,9,14,3,32,10XX ALBERNI ST,West End,491065.2962,5459130.0
102,Break and Enter Commercial,2023,4,1,4,7,10XX BEACH AVE,West End,490197.8719,5458239.0
135,Break and Enter Commercial,2023,4,3,0,50,10XX BEACH AVE,Central Business District,490249.2307,5458167.0
136,Break and Enter Commercial,2023,5,11,18,0,10XX BEACH AVE,Central Business District,490249.2307,5458167.0
185,Break and Enter Commercial,2023,8,9,4,31,10XX BEACH AVE,Central Business District,490268.432,5458143.0


In [3]:
df_2023.info()

<class 'pandas.core.frame.DataFrame'>
Index: 31352 entries, 25 to 879855
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   TYPE           31352 non-null  object 
 1   YEAR           31352 non-null  int64  
 2   MONTH          31352 non-null  int64  
 3   DAY            31352 non-null  int64  
 4   HOUR           31352 non-null  int64  
 5   MINUTE         31352 non-null  int64  
 6   HUNDRED_BLOCK  31352 non-null  object 
 7   NEIGHBOURHOOD  31347 non-null  object 
 8   X              31349 non-null  float64
 9   Y              31349 non-null  float64
dtypes: float64(2), int64(5), object(3)
memory usage: 2.6+ MB


In [4]:
df_2023.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
YEAR,31352.0,2023.0,0.0,2023.0,2023.0,2023.0,2023.0,2023.0
MONTH,31352.0,5.736923,2.883964,1.0,3.0,6.0,8.0,11.0
DAY,31352.0,15.1731,8.779039,1.0,8.0,15.0,23.0,31.0
HOUR,31352.0,11.7093,7.502034,0.0,5.0,13.0,18.0,23.0
MINUTE,31352.0,17.92099,18.91126,0.0,0.0,13.0,30.0,59.0
X,31349.0,439205.6,152854.4,0.0,490452.9,491647.0,493098.9,498325.516
Y,31349.0,4867558.0,1693831.0,0.0,5454246.0,5457363.0,5458742.0,5462265.0


In [5]:
def missing_zero_values_table(df):
        zero_val = (df == 0.00).astype(int).sum(axis=0)
        mis_val = df.isnull().sum()
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        mz_table = pd.concat([zero_val, mis_val, mis_val_percent], axis=1)
        mz_table = mz_table.rename(
        columns = {0 : 'Zero Values', 1 : 'Missing Values', 2 : '% of Total Values'})
        mz_table['Total Zero Missing Values'] = mz_table['Zero Values'] + mz_table['Missing Values']
        mz_table['% Total Zero Missing Values'] = 100 * mz_table['Total Zero Missing Values'] / len(df)
        mz_table['Data Type'] = df.dtypes
        mz_table = mz_table[
            mz_table.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns and " + str(df.shape[0]) + " Rows.\n"      
            "There are " + str(mz_table.shape[0]) +
              " columns that have missing values.")
        return mz_table

missing_zero_values_table(df_2023)

Your selected dataframe has 10 columns and 31352 Rows.
There are 3 columns that have missing values.


Unnamed: 0,Zero Values,Missing Values,% of Total Values,Total Zero Missing Values,% Total Zero Missing Values,Data Type
NEIGHBOURHOOD,0,5,0.0,5,0.0,object
X,3386,3,0.0,3389,10.8,float64
Y,3386,3,0.0,3389,10.8,float64


In [6]:
df_2023["TYPE"].value_counts()

TYPE
Other Theft                                               10860
Theft from Vehicle                                         6433
Mischief                                                   5461
Offence Against a Person                                   3373
Break and Enter Commercial                                 1563
Break and Enter Residential/Other                          1064
Theft of Bicycle                                            979
Vehicle Collision or Pedestrian Struck (with Injury)        933
Theft of Vehicle                                            658
Vehicle Collision or Pedestrian Struck (with Fatality)       15
Homicide                                                     13
Name: count, dtype: int64

In [7]:
df_2023["NEIGHBOURHOOD"].value_counts()

NEIGHBOURHOOD
Central Business District    10463
West End                      2430
Strathcona                    2415
Renfrew-Collingwood           2224
Mount Pleasant                1949
Fairview                      1563
Grandview-Woodland            1460
Kensington-Cedar Cottage      1351
Sunset                        1259
Kitsilano                     1054
Hastings-Sunrise               787
Marpole                        752
Riley Park                     574
Victoria-Fraserview            479
Killarney                      467
Kerrisdale                     360
Dunbar-Southlands              330
South Cambie                   288
Oakridge                       286
West Point Grey                246
Arbutus Ridge                  233
Shaughnessy                    188
Stanley Park                   155
Musqueam                        34
Name: count, dtype: int64

In [8]:
alt.data_transformers.enable('vegafusion')
numeric_cols = ["MONTH", "DAY", "HOUR", "MINUTE"]
numeric_cols_dist = alt.Chart(df_2023).mark_bar().encode(
    alt.X(alt.repeat(), type = "quantitative", bin = alt.Bin(maxbins = 30)),
    y ="count()",
).properties(
        width = 200,
        height = 150
).repeat(
    numeric_cols,
    columns = 1
)

numeric_cols_dist

In [9]:

categ_cols_dist = alt.Chart(df_2023).mark_bar().encode(
    y = alt.X(alt.repeat(),type= "nominal").sort("x"),
    x =alt.Y("count()"),
).properties(
        width = 500,
        height = 300
).repeat(
     ["TYPE", "NEIGHBOURHOOD"],
    columns = 1
)
categ_cols_dist

In [10]:
def get_redundant_pairs(df):
    pairs_to_drop = set()
    cols = df.columns
    for i in range(0, df.shape[1]):
        for j in range(0, i+1):
            pairs_to_drop.add((cols[i], cols[j]))
    return pairs_to_drop

def get_top_abs_correlations(df, n=5):
    au_corr = df.corr().abs().unstack()
    labels_to_drop = get_redundant_pairs(df)
    au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
    return au_corr[0:n]

print("Top Absolute Correlations !")
print(get_top_abs_correlations(df_2023.select_dtypes(include=['int32','int64']), 10))

Top Absolute Correlations !
HOUR   MINUTE    0.198873
MONTH  DAY       0.026610
       MINUTE    0.016947
DAY    MINUTE    0.007530
       HOUR      0.001796
MONTH  HOUR      0.000429
YEAR   MONTH          NaN
       DAY            NaN
       HOUR           NaN
       MINUTE         NaN
dtype: float64


In [11]:
import folium
from folium.plugins import HeatMap
CBD_district=df_2023.loc[df_2023.NEIGHBOURHOOD=='Central Business District'][['X','Y']]
CBD_district.X.fillna(0, inplace = True)
CBD_district.Y.fillna(0, inplace = True) 

map_1=folium.Map(location=[49.2827, -123.1207], 
                tiles = "OpenStreetMap",
                zoom_start=11)

folium.CircleMarker([49.2727, -123.1307],
                        radius=70,
                        fill_color="#b22222",
                        popup='Other Theft',
                        color='red',
                       ).add_to(map_1)


map_2 = HeatMap(data=CBD_district, radius=20)

map_2.add_to(map_1)
map_1
