In [12]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import scipy.stats as stats

In [2]:

# Read the crime records from the relative path ./crime.csv into a DataFrame.
data = pd.read_csv(filepath_or_buffer='./crime.csv')
# Preview the first five rows as the cell output.
data.head(n=5)

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y,Latitude,Longitude
0,Other Theft,2003,5,12,16.0,15.0,9XX TERMINAL AVE,Strathcona,493906.5,5457452.47,49.269802,-123.083763
1,Other Theft,2003,5,7,15.0,20.0,9XX TERMINAL AVE,Strathcona,493906.5,5457452.47,49.269802,-123.083763
2,Other Theft,2003,4,23,16.0,40.0,9XX TERMINAL AVE,Strathcona,493906.5,5457452.47,49.269802,-123.083763
3,Other Theft,2003,4,20,11.0,15.0,9XX TERMINAL AVE,Strathcona,493906.5,5457452.47,49.269802,-123.083763
4,Other Theft,2003,4,12,17.0,45.0,9XX TERMINAL AVE,Strathcona,493906.5,5457452.47,49.269802,-123.083763


# Missing Values

In [3]:
# Data-quality overview of the loaded frame: number of rows that repeat an
# earlier row across all columns, the dtype of each column, and the count of
# missing entries per column.
duplicate_count = data.duplicated().sum()
data_types = data.dtypes
missing_data = data.isna().sum()  # isna() is the method isnull() aliases

# Display the three summaries together, in the same order as before.
(missing_data, data_types, duplicate_count)

(TYPE                 0
 YEAR                 0
 MONTH                0
 DAY                  0
 HOUR             54362
 MINUTE           54362
 HUNDRED_BLOCK       13
 NEIGHBOURHOOD    56624
 X                    0
 Y                    0
 Latitude             0
 Longitude            0
 dtype: int64,
 TYPE              object
 YEAR               int64
 MONTH              int64
 DAY                int64
 HOUR             float64
 MINUTE           float64
 HUNDRED_BLOCK     object
 NEIGHBOURHOOD     object
 X                float64
 Y                float64
 Latitude         float64
 Longitude        float64
 dtype: object,
 48838)

## Handling Missing Values

In [4]:
# Keep only the rows where both of these columns are populated; missing
# values in any other column (e.g. HOUR / MINUTE) are left in place.
required_columns = ['NEIGHBOURHOOD', 'HUNDRED_BLOCK']
cleaned_data = data.dropna(subset=required_columns)

# Capture (rows, columns) before and after the filter so the number of
# dropped rows can be read off the cell output.
original_shape, cleaned_shape = data.shape, cleaned_data.shape

(original_shape, cleaned_shape)

((530652, 12), (474015, 12))

## Handling Duplicate Values

In [11]:
# Drop rows that exactly repeat an earlier row (all columns compared),
# keeping the first occurrence of each.
cleaned_data_unique = cleaned_data.drop_duplicates(subset=None, keep='first')

# (rows, columns) of the de-duplicated frame, shown as the cell output.
unique_cleaned_shape = cleaned_data_unique.shape
unique_cleaned_shape

(474014, 12)

In [6]:
# Distinct values of the two categorical columns, in order of first appearance.
type_categories, neighbourhood_categories = (
    cleaned_data_unique[column].unique()
    for column in ('TYPE', 'NEIGHBOURHOOD')
)

(type_categories, neighbourhood_categories)

(array(['Other Theft', 'Break and Enter Residential/Other', 'Mischief',
        'Break and Enter Commercial', 'Theft from Vehicle',
        'Vehicle Collision or Pedestrian Struck (with Injury)',
        'Vehicle Collision or Pedestrian Struck (with Fatality)',
        'Theft of Vehicle', 'Theft of Bicycle'], dtype=object),
 array(['Strathcona', 'Kerrisdale', 'Dunbar-Southlands',
        'Grandview-Woodland', 'Sunset', 'West End',
        'Central Business District', 'Hastings-Sunrise',
        'Victoria-Fraserview', 'Fairview', 'Kensington-Cedar Cottage',
        'West Point Grey', 'Shaughnessy', 'Renfrew-Collingwood',
        'Killarney', 'Riley Park', 'Arbutus Ridge', 'Musqueam',
        'Mount Pleasant', 'Kitsilano', 'Stanley Park', 'South Cambie',
        'Marpole', 'Oakridge'], dtype=object))

In [10]:
# Number of records per crime type, most frequent first.
crime_type_counts = (
    cleaned_data_unique.loc[:, 'TYPE']
    .value_counts(sort=True, ascending=False)
)

# Number of records per neighbourhood, most frequent first.
crime_neighbourhood_counts = (
    cleaned_data_unique.loc[:, 'NEIGHBOURHOOD']
    .value_counts(sort=True, ascending=False)
)

(crime_type_counts, crime_neighbourhood_counts)

(TYPE
 Theft from Vehicle                                        170888
 Mischief                                                   70157
 Break and Enter Residential/Other                          60856
 Other Theft                                                52160
 Theft of Vehicle                                           38351
 Break and Enter Commercial                                 33841
 Theft of Bicycle                                           25620
 Vehicle Collision or Pedestrian Struck (with Injury)       21887
 Vehicle Collision or Pedestrian Struck (with Fatality)       254
 Name: count, dtype: int64,
 NEIGHBOURHOOD
 Central Business District    110944
 West End                      41352
 Fairview                      32161
 Mount Pleasant                30534
 Grandview-Woodland            27180
 Renfrew-Collingwood           26761
 Kitsilano                     26698
 Kensington-Cedar Cottage      24941
 Strathcona                    20917
 Hastings-Sunrise       

In [8]:
# Drop every row that has a missing value in ANY column (including HOUR and
# MINUTE), which is stricter than the NEIGHBOURHOOD / HUNDRED_BLOCK filter
# used to build `cleaned_data`.
# NOTE(review): this rebinds `data`, so the raw frame is no longer reachable
# without re-reading the CSV, and re-running the earlier cells that read
# `data` after this one gives different results. The name is kept because
# the NaN check in the next cell reads `data`.
data = data.dropna()

# Show the frame as the cell's last expression instead of print(): the
# notebook then renders a truncated table with the row/column count rather
# than a plain-text dump with wrapped column groups.
data

                                                     TYPE  YEAR  MONTH  DAY  \
0                                             Other Theft  2003      5   12   
1                                             Other Theft  2003      5    7   
2                                             Other Theft  2003      4   23   
3                                             Other Theft  2003      4   20   
4                                             Other Theft  2003      4   12   
...                                                   ...   ...    ...  ...   
530646                                           Mischief  2017      1   18   
530647                  Break and Enter Residential/Other  2017      3    3   
530648                                           Mischief  2017      5   29   
530650                                 Theft from Vehicle  2017      6    5   
530651  Vehicle Collision or Pedestrian Struck (with I...  2017      6    6   

        HOUR  MINUTE       HUNDRED_BLOCK           

In [9]:
# Total number of missing cells across the whole frame. Both reported figures
# come from the same missing-value check, so it is evaluated once and bound
# to both names.
nan_count = null_count = data.isna().sum().sum()

print('Number of NaN values:', nan_count)
print('Number of null values:', null_count)

Number of NaN values: 0
Number of null values: 0
